From 5f7996845f8b7b3c230eee7ba7569b9b6158a2d7 Mon Sep 17 00:00:00 2001
From: Alexander Nesterov <nesterov.alexander@outlook.com>
Date: Mon, 1 Jun 2026 02:33:08 +0200
Subject: [PATCH] cpu: aarch64: align ACL integration with oneDNN v3.10.x

---
 .github/automation/aarch64/build.sh           |   4 +-
 .github/workflows/aarch64-acl.yml             | 129 +++++++++
 .github/workflows/ci-aarch64.yml              | 240 ++++++++++++++++
 .github/workflows/nightly-aarch64.yml         | 199 ++++++++++++++
 .github/workflows/performance-aarch64.yml     | 258 ++++++++++++++++++
 CMakeLists.txt                                |   2 -
 cmake/ACL.cmake                               |   4 +-
 cmake/options.cmake                           |   2 +-
 cmake/platform.cmake                          |   4 +-
 src/common/utils.cpp                          |   2 +-
 src/cpu/CMakeLists.txt                        |   3 -
 src/cpu/README.md                             |   1 -
 src/cpu/aarch64/CMakeLists.txt                |   9 +-
 .../acl_batch_normalization.cpp               |   6 +-
 .../acl_batch_normalization.hpp               |  12 +-
 .../acl_benchmark_scheduler.cpp               |   6 +-
 .../acl_benchmark_scheduler.hpp               |  10 +-
 src/cpu/{acl => aarch64}/acl_binary.cpp       |   4 +-
 src/cpu/{acl => aarch64}/acl_binary.hpp       |   8 +-
 .../acl_convolution_utils.cpp                 |  24 +-
 .../acl_convolution_utils.hpp                 |  10 +-
 .../{acl => aarch64}/acl_deconvolution.cpp    |   6 +-
 .../{acl => aarch64}/acl_deconvolution.hpp    |  10 +-
 .../acl_depthwise_convolution.cpp             |   6 +-
 .../acl_depthwise_convolution.hpp             |  10 +-
 src/cpu/{acl => aarch64}/acl_eltwise.cpp      |   4 +-
 src/cpu/{acl => aarch64}/acl_eltwise.hpp      |  10 +-
 .../{acl => aarch64}/acl_gemm_convolution.cpp |   4 +-
 .../{acl => aarch64}/acl_gemm_convolution.hpp |   8 +-
 .../acl_indirect_gemm_convolution.cpp         |   4 +-
 .../acl_indirect_gemm_convolution.hpp         |  10 +-
 .../{acl => aarch64}/acl_inner_product.cpp    |   6 +-
 .../{acl => aarch64}/acl_inner_product.hpp    |  14 +-
 .../acl_layer_normalization.cpp               |   6 +-
 .../acl_layer_normalization.hpp               |  12 +-
 src/cpu/{acl => aarch64}/acl_pooling.cpp      |   6 +-
 src/cpu/{acl => aarch64}/acl_pooling.hpp      |  12 +-
 src/cpu/{acl => aarch64}/acl_post_ops.cpp     |   6 +-
 src/cpu/{acl => aarch64}/acl_post_ops.hpp     |  14 +-
 src/cpu/{acl => aarch64}/acl_prelu.cpp        |   6 +-
 src/cpu/{acl => aarch64}/acl_prelu.hpp        |  12 +-
 src/cpu/aarch64/acl_reorder.cpp               |   4 +-
 src/cpu/aarch64/acl_reorder.hpp               |  12 +-
 src/cpu/{acl => aarch64}/acl_softmax.cpp      |   6 +-
 src/cpu/{acl => aarch64}/acl_softmax.hpp      |  10 +-
 src/cpu/{acl => aarch64}/acl_thread.cpp       |  10 +-
 src/cpu/{acl => aarch64}/acl_thread.hpp       |  10 +-
 .../acl_threadpool_scheduler.cpp              |   8 +-
 .../acl_threadpool_scheduler.hpp              |  10 +-
 src/cpu/{acl => aarch64}/acl_utils.cpp        |   6 +-
 src/cpu/{acl => aarch64}/acl_utils.hpp        |  10 +-
 .../acl_winograd_convolution.cpp              |   6 +-
 .../acl_winograd_convolution.hpp              |  14 +-
 .../matmul/acl_lowp_matmul.cpp                |  26 +-
 .../matmul/acl_lowp_matmul.hpp                |  14 +-
 .../matmul/acl_lowp_matmul_sq.cpp             |  13 +-
 .../matmul/acl_lowp_matmul_sq.hpp             |  12 +-
 .../{acl => aarch64}/matmul/acl_matmul.cpp    |  13 +-
 .../{acl => aarch64}/matmul/acl_matmul.hpp    |   8 +-
 .../matmul/acl_matmul_utils.cpp               |   8 +-
 .../matmul/acl_matmul_utils.hpp               |  10 +-
 src/cpu/acl/CMakeLists.txt                    |  33 ---
 src/cpu/cpu_batch_normalization_list.cpp      |   6 +-
 src/cpu/cpu_binary_list.cpp                   |   6 +-
 src/cpu/cpu_convolution_list.cpp              |  30 +-
 src/cpu/cpu_deconvolution_list.cpp            |   6 +-
 src/cpu/cpu_eltwise_list.cpp                  |   6 +-
 src/cpu/cpu_engine.hpp                        |   8 +-
 src/cpu/cpu_inner_product_list.cpp            |  12 +-
 src/cpu/cpu_layer_normalization_list.cpp      |   6 +-
 src/cpu/cpu_pooling_list.cpp                  |   6 +-
 src/cpu/cpu_prelu_list.cpp                    |   6 +-
 src/cpu/cpu_softmax_list.cpp                  |   6 +-
 src/cpu/matmul/cpu_matmul_list.cpp            |  16 +-
 src/cpu/platform.hpp                          |  21 +-
 .../reorder/cpu_reorder_regular_f32_bf16.cpp  |   2 +-
 .../reorder/cpu_reorder_regular_f32_f32.cpp   |   4 +-
 tests/benchdnn/rnn/rnn.cpp                    |   2 +-
 tests/benchdnn/softmax/softmax.cpp            |   4 +-
 79 files changed, 1139 insertions(+), 354 deletions(-)
 create mode 100644 .github/workflows/aarch64-acl.yml
 create mode 100644 .github/workflows/ci-aarch64.yml
 create mode 100644 .github/workflows/nightly-aarch64.yml
 create mode 100644 .github/workflows/performance-aarch64.yml
 rename src/cpu/{acl => aarch64}/acl_batch_normalization.cpp (96%)
 rename src/cpu/{acl => aarch64}/acl_batch_normalization.hpp (98%)
 rename src/cpu/{acl => aarch64}/acl_benchmark_scheduler.cpp (96%)
 rename src/cpu/{acl => aarch64}/acl_benchmark_scheduler.hpp (92%)
 rename src/cpu/{acl => aarch64}/acl_binary.cpp (99%)
 rename src/cpu/{acl => aarch64}/acl_binary.hpp (95%)
 rename src/cpu/{acl => aarch64}/acl_convolution_utils.cpp (96%)
 rename src/cpu/{acl => aarch64}/acl_convolution_utils.hpp (98%)
 rename src/cpu/{acl => aarch64}/acl_deconvolution.cpp (96%)
 rename src/cpu/{acl => aarch64}/acl_deconvolution.hpp (98%)
 rename src/cpu/{acl => aarch64}/acl_depthwise_convolution.cpp (97%)
 rename src/cpu/{acl => aarch64}/acl_depthwise_convolution.hpp (91%)
 rename src/cpu/{acl => aarch64}/acl_eltwise.cpp (98%)
 rename src/cpu/{acl => aarch64}/acl_eltwise.hpp (93%)
 rename src/cpu/{acl => aarch64}/acl_gemm_convolution.cpp (99%)
 rename src/cpu/{acl => aarch64}/acl_gemm_convolution.hpp (94%)
 rename src/cpu/{acl => aarch64}/acl_indirect_gemm_convolution.cpp (99%)
 rename src/cpu/{acl => aarch64}/acl_indirect_gemm_convolution.hpp (90%)
 rename src/cpu/{acl => aarch64}/acl_inner_product.cpp (99%)
 rename src/cpu/{acl => aarch64}/acl_inner_product.hpp (91%)
 rename src/cpu/{acl => aarch64}/acl_layer_normalization.cpp (98%)
 rename src/cpu/{acl => aarch64}/acl_layer_normalization.hpp (89%)
 rename src/cpu/{acl => aarch64}/acl_pooling.cpp (99%)
 rename src/cpu/{acl => aarch64}/acl_pooling.hpp (93%)
 rename src/cpu/{acl => aarch64}/acl_post_ops.cpp (97%)
 rename src/cpu/{acl => aarch64}/acl_post_ops.hpp (96%)
 rename src/cpu/{acl => aarch64}/acl_prelu.cpp (96%)
 rename src/cpu/{acl => aarch64}/acl_prelu.hpp (96%)
 rename src/cpu/{acl => aarch64}/acl_softmax.cpp (98%)
 rename src/cpu/{acl => aarch64}/acl_softmax.hpp (92%)
 rename src/cpu/{acl => aarch64}/acl_thread.cpp (95%)
 rename src/cpu/{acl => aarch64}/acl_thread.hpp (92%)
 rename src/cpu/{acl => aarch64}/acl_threadpool_scheduler.cpp (97%)
 rename src/cpu/{acl => aarch64}/acl_threadpool_scheduler.hpp (91%)
 rename src/cpu/{acl => aarch64}/acl_utils.cpp (99%)
 rename src/cpu/{acl => aarch64}/acl_utils.hpp (97%)
 rename src/cpu/{acl => aarch64}/acl_winograd_convolution.cpp (94%)
 rename src/cpu/{acl => aarch64}/acl_winograd_convolution.hpp (94%)
 rename src/cpu/{acl => aarch64}/matmul/acl_lowp_matmul.cpp (96%)
 rename src/cpu/{acl => aarch64}/matmul/acl_lowp_matmul.hpp (91%)
 rename src/cpu/{acl => aarch64}/matmul/acl_lowp_matmul_sq.cpp (97%)
 rename src/cpu/{acl => aarch64}/matmul/acl_lowp_matmul_sq.hpp (92%)
 rename src/cpu/{acl => aarch64}/matmul/acl_matmul.cpp (98%)
 rename src/cpu/{acl => aarch64}/matmul/acl_matmul.hpp (94%)
 rename src/cpu/{acl => aarch64}/matmul/acl_matmul_utils.cpp (98%)
 rename src/cpu/{acl => aarch64}/matmul/acl_matmul_utils.hpp (94%)
 delete mode 100644 src/cpu/acl/CMakeLists.txt

diff --git a/.github/automation/aarch64/build.sh b/.github/automation/aarch64/build.sh
index 2a244a41090..1be9c55be03 100755
--- a/.github/automation/aarch64/build.sh
+++ b/.github/automation/aarch64/build.sh
@@ -45,7 +45,7 @@ if [[ "$ONEDNN_ACTION" == "configure" ]]; then
         cmake \
             "${GENERATOR_ARGS[@]}" \
             -Bbuild -S. \
-            -DDNNL_USE_ACL=ON \
+            -DDNNL_AARCH64_USE_ACL=ON \
             -DONEDNN_BUILD_GRAPH=OFF \
             -DDNNL_CPU_RUNTIME=OMP \
             -DDNNL_WERROR=ON \
@@ -61,7 +61,7 @@ if [[ "$ONEDNN_ACTION" == "configure" ]]; then
         cmake \
             "${GENERATOR_ARGS[@]}" \
             -Bbuild -S. \
-            -DDNNL_USE_ACL=ON \
+            -DDNNL_AARCH64_USE_ACL=ON \
             -DONEDNN_BUILD_GRAPH=$ONEDNN_BUILD_GRAPH \
             -DDNNL_CPU_RUNTIME=$ONEDNN_THREADING \
             -DDNNL_WERROR=ON \
diff --git a/.github/workflows/aarch64-acl.yml b/.github/workflows/aarch64-acl.yml
new file mode 100644
index 00000000000..3e9bb6c90d8
--- /dev/null
+++ b/.github/workflows/aarch64-acl.yml
@@ -0,0 +1,129 @@
+# *******************************************************************************
+# Copyright 2025 Arm Limited and affiliates.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# *******************************************************************************
+
+name: "Build ACL cache"
+
+#* To avoid duplicate jobs running when both push and PR triggers are satisfied, we use this:
+#* https://github.com/orgs/community/discussions/26940#discussioncomment-5686753
+on:
+  workflow_call:
+    inputs:
+      acl_hash:
+        required: false
+        type: string
+
+  workflow_dispatch:
+
+# Declare default permissions as read only.
+permissions: read-all
+
+jobs:
+  # Cache is built sequentially to avoid cache-hit race conditions
+  build-cache:
+    strategy:
+      max-parallel: 1
+      matrix:
+        config: [
+          { name: MacOS, label: macos-14, threading: SEQ, toolset: clang, build: RelWithAssert },
+          { name: cb100, label: ubuntu-24.04-arm, threading: OMP, toolset: gcc, build: RelWithAssert },
+          { name: cb100, label: ubuntu-24.04-arm, threading: OMP, toolset: gcc, build: Release }
+        ]
+
+    name: ${{ matrix.config.name }}, ${{ matrix.config.toolset }}, ${{ matrix.config.threading }}, ${{ matrix.config.build }}
+    runs-on: ${{ matrix.config.label }}
+    steps:
+      - name: Checkout oneDNN
+        uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
+        with:
+          path: oneDNN
+
+      - name: Read version file
+        id: get-versions
+        run: |
+          content=$(cat "${{ github.workspace }}/oneDNN/.github/automation/aarch64/ci.json")
+          content="${content//[$'\t\r\n$ ']}"
+          echo "output=$content" >> "$GITHUB_OUTPUT"
+
+      - name: Clone ACL
+        run: ${{ github.workspace }}/oneDNN/.github/automation/aarch64/build_acl.sh
+        env:
+          ACL_ACTION: clone
+          ACL_ROOT_DIR: ${{ github.workspace }}/ComputeLibrary
+          ACL_VERSION: ${{ inputs.acl_hash || fromJson(steps.get-versions.outputs.output).dependencies.acl }}
+
+      - name: Get ACL commit hash for cache key
+        id: get_acl_commit_hash
+        run: (cd "${{ github.workspace }}/ComputeLibrary" && echo "ACLCommitHash=$(git rev-parse --short HEAD)") >> "$GITHUB_OUTPUT"
+
+      - name: Get system name
+        id: get_system_name
+        run: (echo "SystemName=$(uname)") >> "$GITHUB_OUTPUT"
+
+      - name: Restore cached ACL
+        id: cache-acl-restore
+        uses: actions/cache/restore@0057852bfaa89a56745cba8c7296529d2fc39830 # v4.3.0
+        with:
+          key: ${{ steps.get_system_name.outputs.SystemName }}-acl-${{ matrix.config.toolset }}-${{ matrix.config.build }}-${{ steps.get_acl_commit_hash.outputs.ACLCommitHash }}
+          path: ${{ github.workspace }}/ComputeLibrary/build
+          lookup-only: true
+
+      - name: Install Scons (MacOS)
+        if: ${{ matrix.config.name == 'MacOS' && (steps.cache-acl-restore.outputs.cache-hit != 'true') }}
+        run: brew install scons
+
+      - name: Install scons (Linux)
+        if: ${{ matrix.config.name != 'MacOS' && (steps.cache-acl-restore.outputs.cache-hit != 'true') }}
+        run: |
+          sudo apt update -y
+          sudo apt install -y scons
+
+      - if: ${{ contains(matrix.config.label,'ubuntu') && (matrix.config.threading == 'OMP') && (steps.cache-acl-restore.outputs.cache-hit != 'true') }}
+        name: Install openmp
+        run: |
+          sudo apt install -y libomp-dev
+
+      - if: ${{ contains(matrix.config.label,'ubuntu') && (matrix.config.toolset == 'gcc') && (steps.cache-acl-restore.outputs.cache-hit != 'true') }}
+        name: Install gcc
+        run: |
+          sudo add-apt-repository ppa:ubuntu-toolchain-r/test -y
+          sudo apt update -y
+          sudo apt install -y g++-${{ fromJson(steps.get-versions.outputs.output).dependencies.gcc }}
+
+      - if: ${{ contains(matrix.config.label,'ubuntu') && (matrix.config.toolset == 'clang') && (steps.cache-acl-restore.outputs.cache-hit != 'true') }}
+        name: Install clang
+        uses: KyleMayes/install-llvm-action@a7a1a882e2d06ebe05d5bb97c3e1f8c984ae96fc
+        with:
+          version: ${{ fromJson(steps.get-versions.outputs.output).dependencies.clang }}
+
+      - name: Build ACL
+        if: ${{ steps.cache-acl-restore.outputs.cache-hit != 'true' }}
+        run: ${{ github.workspace }}/oneDNN/.github/automation/aarch64/build_acl.sh
+        env:
+          ACL_ACTION: build
+          ACL_ROOT_DIR: ${{ github.workspace }}/ComputeLibrary
+          ACL_THREADING: ${{ matrix.config.threading }}
+          BUILD_TOOLSET: ${{ matrix.config.toolset }}
+          ACL_BUILD_TYPE: ${{ matrix.config.build }}
+          GCC_VERSION: ${{ fromJson(steps.get-versions.outputs.output).dependencies.gcc }}
+
+      - name: Save ACL in cache
+        id: cache-acl_build-save
+        if: ${{ steps.cache-acl-restore.outputs.cache-hit != 'true' }}
+        uses: actions/cache/save@0057852bfaa89a56745cba8c7296529d2fc39830 # v4.3.0
+        with:
+          key: ${{ steps.get_system_name.outputs.SystemName }}-acl-${{ matrix.config.toolset }}-${{ matrix.config.build }}-${{ steps.get_acl_commit_hash.outputs.ACLCommitHash }}
+          path: ${{ github.workspace }}/ComputeLibrary/build
diff --git a/.github/workflows/ci-aarch64.yml b/.github/workflows/ci-aarch64.yml
new file mode 100644
index 00000000000..47cb807037a
--- /dev/null
+++ b/.github/workflows/ci-aarch64.yml
@@ -0,0 +1,240 @@
+# *******************************************************************************
+# Copyright 2024-2025 Arm Limited and affiliates.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# *******************************************************************************
+
+name: "CI AArch64"
+
+#* To avoid duplicate jobs running when both push and PR triggers are satisfied, we use this:
+#* https://github.com/orgs/community/discussions/26940#discussioncomment-5686753
+on:
+  push:
+    branches: [main, "rls-*"]
+    paths:
+      - ".github/automation/performance/**"
+      - ".github/automation/*"
+      - ".github/automation/aarch64/**"
+      - ".github/workflows/aarch64-acl.yml"
+      - ".github/workflows/ci-aarch64.yml"
+      - "cmake/**"
+      - "examples/**"
+      - "include/**"
+      - "src/common/**"
+      - "src/cpu/*"
+      - "src/cpu/aarch64/**"
+      - "tests/**"
+      - "CMakeLists.txt"
+  pull_request:
+    types: [opened, synchronize, reopened]
+    paths:
+      - ".github/automation/performance/**"
+      - ".github/automation/*"
+      - ".github/automation/aarch64/**"
+      - ".github/workflows/aarch64-acl.yml"
+      - ".github/workflows/ci-aarch64.yml"
+      - "cmake/**"
+      - "examples/**"
+      - "include/**"
+      - "src/common/**"
+      - "src/cpu/*"
+      - "src/cpu/aarch64/**"
+      - "tests/**"
+      - "CMakeLists.txt"
+  #* allow manual trigger of workflow when needed.
+  workflow_dispatch:
+
+#* Stop stale workflows when pull requests are updated: https://stackoverflow.com/a/70972844
+#* Does not apply to the main branch.
+concurrency:
+  group: ${{ github.workflow }}-${{ github.ref }}
+  cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
+
+# Declare default permissions as read only.
+permissions: read-all
+
+jobs:
+  build-acl-cache:
+    uses: ./.github/workflows/aarch64-acl.yml
+
+  build-and-test:
+    needs: build-acl-cache
+    strategy:
+      matrix:
+        config: [
+          { name: MacOS, label: macos-14, threading: SEQ, toolset: clang, build: RelWithAssert, testset: SMOKE },
+          { name: cb100, label: ubuntu-24.04-arm, threading: OMP, toolset: gcc, build: RelWithAssert, testset: SMOKE },
+          { name: cb100, label: ubuntu-24.04-arm, threading: OMP, toolset: gcc, build: Release, testset: CI }
+        ]
+
+    name: ${{ matrix.config.name }}, ${{ matrix.config.toolset }}, ${{ matrix.config.threading }}, ${{ matrix.config.build }}
+    runs-on: ${{ matrix.config.label }}
+    steps:
+      - name: Checkout oneDNN
+        uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
+        with:
+          path: oneDNN
+
+      - name: Read version file
+        id: get-versions
+        run: |
+          content=$(cat "${{ github.workspace }}/oneDNN/.github/automation/aarch64/ci.json")
+          content="${content//[$'\t\r\n$ ']}"
+          echo "output=$content" >> "$GITHUB_OUTPUT"
+
+      # Note: This will create a github actions cache
+      - name: Get latest CMake and Ninja
+        uses: lukka/get-cmake@56d043d188c3612951d8755da8f4b709ec951ad6 # v3.31.6
+        with:
+          cmakeVersion: 3.31.0
+          ninjaVersion: 1.12.0
+
+      - if: ${{ (contains(matrix.config.label,'ubuntu') && (matrix.config.threading == 'OMP')) }}
+        name: Install openmp
+        run: |
+          sudo apt install -y libomp-dev
+
+      - if: ${{ (contains(matrix.config.label,'ubuntu') && (matrix.config.toolset == 'gcc')) }}
+        name: Install gcc
+        run: |
+          sudo add-apt-repository ppa:ubuntu-toolchain-r/test -y
+          sudo apt update -y
+          sudo apt install -y g++-${{ fromJson(steps.get-versions.outputs.output).dependencies.gcc }}
+
+      - if: ${{ (contains(matrix.config.label,'ubuntu') && (matrix.config.toolset == 'clang')) }}
+        name: Install clang
+        uses: KyleMayes/install-llvm-action@a7a1a882e2d06ebe05d5bb97c3e1f8c984ae96fc
+        with:
+          version: ${{ fromJson(steps.get-versions.outputs.output).dependencies.clang }}
+
+      - name: setup python
+        uses: actions/setup-python@e797f83bcb11b83ae66e0230d6156d7c80228e7c # v6.0.0
+        with:
+          python-version: '3.10'
+
+      - name: Install dependencies
+        if: ${{ matrix.config.build == 'Release' }}
+        run: pip install scipy statistics gitpython
+
+      - name: Clone ACL
+        run: ${{ github.workspace }}/oneDNN/.github/automation/aarch64/build_acl.sh
+        env:
+          ACL_ACTION: clone
+          ACL_ROOT_DIR: ${{ github.workspace }}/ComputeLibrary
+          ACL_VERSION: ${{ fromJson(steps.get-versions.outputs.output).dependencies.acl }}
+
+      - name: Get ACL commit hash for cache key
+        id: get_acl_commit_hash
+        run: (cd "${{ github.workspace }}/ComputeLibrary" && echo "ACLCommitHash=$(git rev-parse --short HEAD)") >> "$GITHUB_OUTPUT"
+
+      - name: Get system name
+        id: get_system_name
+        run: (echo "SystemName=$(uname)") >> "$GITHUB_OUTPUT"
+
+      - name: Restore cached ACL
+        id: cache-acl-restore
+        uses: actions/cache/restore@0057852bfaa89a56745cba8c7296529d2fc39830 # v4.3.0
+        with:
+          key: ${{ steps.get_system_name.outputs.SystemName }}-acl-${{ matrix.config.toolset }}-${{ matrix.config.build }}-${{ steps.get_acl_commit_hash.outputs.ACLCommitHash }}
+          path: ${{ github.workspace }}/ComputeLibrary/build
+          fail-on-cache-miss: true
+
+      - name: Configure oneDNN
+        run: ${{ github.workspace }}/oneDNN/.github/automation/aarch64/build.sh
+        working-directory: ${{ github.workspace }}/oneDNN
+        env:
+          ACL_ROOT_DIR: ${{ github.workspace }}/ComputeLibrary
+          BUILD_TOOLSET: ${{ matrix.config.toolset }}
+          CMAKE_BUILD_TYPE: ${{ matrix.config.build }}
+          CMAKE_GENERATOR: Ninja
+          GCC_VERSION: ${{ fromJson(steps.get-versions.outputs.output).dependencies.gcc }}
+          ONEDNN_ACTION: configure
+          ONEDNN_TEST_SET: ${{ matrix.config.testset }}
+          ONEDNN_THREADING: ${{ matrix.config.threading }}
+
+      - name: Build oneDNN
+        run: ${{ github.workspace }}/oneDNN/.github/automation/aarch64/build.sh
+        working-directory: ${{ github.workspace }}/oneDNN
+        env:
+          ONEDNN_ACTION: build
+
+      - name: Run oneDNN tests
+        run: ${{ github.workspace }}/oneDNN/.github/automation/aarch64/test.sh ${{ github.workspace }}/test_results.xml
+        working-directory: ${{ github.workspace }}/oneDNN/build
+        env:
+          CTEST_PARALLEL_LEVEL: 6
+          DYLD_LIBRARY_PATH: ${{ github.workspace }}/ComputeLibrary/build
+          ONEDNN_THREADING: ${{ matrix.config.threading }}
+
+      ## Performance test steps ##
+      - name: Checkout oneDNN base
+        if: ${{ github.event_name == 'pull_request' && matrix.config.build == 'Release' && matrix.config.name != 'cb100' && matrix.config.name != 'MacOS' }}
+        uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
+        with:
+          ref: ${{ github.base_ref }}
+          path: oneDNN_base
+
+      - name: Configure oneDNN base
+        if: ${{ github.event_name == 'pull_request' && matrix.config.build == 'Release' && matrix.config.name != 'cb100' && matrix.config.name != 'MacOS' }}
+        run: ${{ github.workspace }}/oneDNN/.github/automation/aarch64/build.sh
+        working-directory: ${{ github.workspace }}/oneDNN_base
+        env:
+          ACL_ROOT_DIR: ${{ github.workspace }}/ComputeLibrary
+          BUILD_TOOLSET: ${{ matrix.config.toolset }}
+          CMAKE_BUILD_TYPE: ${{ matrix.config.build }}
+          CMAKE_GENERATOR: Ninja
+          GCC_VERSION: ${{ fromJson(steps.get-versions.outputs.output).dependencies.gcc }}
+          ONEDNN_ACTION: configure
+          ONEDNN_TEST_SET: ${{ matrix.config.testset }}
+          ONEDNN_THREADING: ${{ matrix.config.threading }}
+
+      - name: Build oneDNN base
+        if: ${{ github.event_name == 'pull_request' && matrix.config.build == 'Release' && matrix.config.name != 'cb100' && matrix.config.name != 'MacOS' }}
+        run: ${{ github.workspace }}/oneDNN/.github/automation/aarch64/build.sh
+        working-directory: ${{ github.workspace }}/oneDNN_base
+        env:
+          ONEDNN_ACTION: build
+
+      - name: Run performance tests
+        shell: bash
+        if: ${{ github.event_name == 'pull_request' && matrix.config.build == 'Release' && matrix.config.name != 'cb100' && matrix.config.name != 'MacOS' }}
+        run: |
+          OMP_NUM_THREADS=4 bash ${{ github.workspace }}/oneDNN/.github/automation/performance/bench_pr_performance.sh ${{ github.workspace }}/oneDNN_base/build/tests/benchdnn/benchdnn ${{ github.workspace }}/oneDNN/build/tests/benchdnn/benchdnn base_4.txt new_4.txt
+          OMP_NUM_THREADS=16 bash ${{ github.workspace }}/oneDNN/.github/automation/performance/bench_pr_performance.sh ${{ github.workspace }}/oneDNN_base/build/tests/benchdnn/benchdnn ${{ github.workspace }}/oneDNN/build/tests/benchdnn/benchdnn base_16.txt new_16.txt
+        env:
+          DYLD_LIBRARY_PATH: ${{ github.workspace }}/ComputeLibrary/build
+
+      - name: Compare performance test results
+        if: ${{ github.event_name == 'pull_request' && matrix.config.build == 'Release' && matrix.config.name != 'cb100' && matrix.config.name != 'MacOS' }}
+        id: performance-test
+        continue-on-error: true
+        run: |
+          echo "4 threads:"
+          python ${{ github.workspace }}/oneDNN/.github/automation/performance/benchdnn_comparison.py base_4.txt new_4.txt
+          echo "16 threads:"
+          python ${{ github.workspace }}/oneDNN/.github/automation/performance/benchdnn_comparison.py base_16.txt new_16.txt
+
+      - name: Check performance test failure
+        if: ${{ steps.performance-test.outputs.pass != 'True' && github.event_name == 'pull_request' && matrix.config.build == 'Release' && matrix.config.name != 'cb100' && matrix.config.name != 'MacOS' }}
+        run: echo "::warning file=.github/workflows/ci-aarch64.yml,line=1,col=1::${{ steps.performance-test.outputs.message }}"
+
+  # This job adds a check named "CI AArch64" that represents overall
+  # workflow status and can be used in branch rulesets
+  status:
+    needs: build-and-test
+    runs-on: ubuntu-latest
+    name: "CI AArch64"
+    steps:
+      - name: Print success
+        run: echo Success
diff --git a/.github/workflows/nightly-aarch64.yml b/.github/workflows/nightly-aarch64.yml
new file mode 100644
index 00000000000..ab667214864
--- /dev/null
+++ b/.github/workflows/nightly-aarch64.yml
@@ -0,0 +1,199 @@
+# *******************************************************************************
+# Copyright 2024-2025 Arm Limited and affiliates.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# *******************************************************************************
+
+name: "Nightly AArch64"
+
+on:
+  #* allow manual trigger of workflow when needed. Useful for a nightly.
+  workflow_dispatch:
+  schedule:
+    #* minute (0-59) hour (0-23) day (1-31) month (1-12)  day of the week (0 - 6)
+    #* cron jobs run on the default (main) branch.
+    #* set to run at 5am UTC
+  - cron: "0 5 * * *"
+
+#* Stop stale workflows, though we should never hit this unless it hangs for a whole day.
+concurrency:
+  group: ${{ github.workflow }}-${{ github.ref }}
+  cancel-in-progress: true
+
+# Default permissions are write-all, NOT read only (presumably needed by the "Update wiki" step; confirm).
+permissions: write-all
+
+jobs:
+  build-acl-cache:
+    uses: ./.github/workflows/aarch64-acl.yml
+
+  test-performance:
+    uses: ./.github/workflows/performance-aarch64.yml
+
+  build-and-test:
+    needs: build-acl-cache
+    strategy:
+      matrix:
+        config: [
+          { name: cb100, label: ubuntu-24.04-arm, threading: OMP, toolset: gcc, build: RelWithAssert, testset: NIGHTLY }
+        ]
+
+    name: ${{ matrix.config.name }}, ${{ matrix.config.toolset }}, ${{ matrix.config.threading }}, ${{ matrix.config.build }}
+    runs-on: ${{ matrix.config.label }}
+    steps:
+
+      - name: Checkout oneDNN
+        uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
+        with:
+          path: oneDNN
+          fetch-tags: true
+          fetch-depth: 0
+
+      # Note: This will create a github actions cache
+      - name: Get latest CMake and Ninja
+        uses: lukka/get-cmake@56d043d188c3612951d8755da8f4b709ec951ad6 # v3.31.6
+        with:
+          cmakeVersion: 3.31.0
+          ninjaVersion: 1.12.0
+
+      - if: ${{ matrix.config.threading == 'OMP' }}
+        name: Install openmp
+        run: |
+          sudo apt install -y libomp-dev
+
+      - name: Read version file
+        id: get-versions
+        run: |
+          content=$(cat "${{ github.workspace }}/oneDNN/.github/automation/aarch64/ci.json")
+          content="${content//[$'\t\r\n$ ']}"
+          echo "output=$content" >> "$GITHUB_OUTPUT"
+
+      - name: Install gcc
+        run: |
+          sudo add-apt-repository ppa:ubuntu-toolchain-r/test -y
+          sudo apt update -y
+          sudo apt install -y g++-${{ fromJson(steps.get-versions.outputs.output).dependencies.gcc }}
+
+      - name: setup python
+        uses: actions/setup-python@e797f83bcb11b83ae66e0230d6156d7c80228e7c # v6.0.0
+        with:
+          python-version: '3.10'
+
+      - name: Clone ACL
+        run: ${{ github.workspace }}/oneDNN/.github/automation/aarch64/build_acl.sh
+        env:
+          ACL_ACTION: clone
+          ACL_ROOT_DIR: ${{ github.workspace }}/ComputeLibrary
+          ACL_VERSION: ${{ fromJson(steps.get-versions.outputs.output).dependencies.acl }}
+
+      - name: Get ACL commit hash for cache key
+        id: get_acl_commit_hash
+        run: (cd "${{ github.workspace }}/ComputeLibrary" && echo "ACLCommitHash=$(git rev-parse --short HEAD)") >> "$GITHUB_OUTPUT"
+
+      - name: Get system name
+        id: get_system_name
+        run: (echo "SystemName=$(uname)") >> "$GITHUB_OUTPUT"
+
+      - name: Restore cached ACL
+        id: cache-acl-restore
+        uses: actions/cache/restore@0057852bfaa89a56745cba8c7296529d2fc39830 # v4.3.0
+        with:
+          key: ${{ steps.get_system_name.outputs.SystemName }}-acl-${{ matrix.config.toolset }}-${{ matrix.config.build }}-${{ steps.get_acl_commit_hash.outputs.ACLCommitHash }}
+          path: ${{ github.workspace }}/ComputeLibrary/build
+
+      - name: Configure oneDNN
+        run: ${{ github.workspace }}/oneDNN/.github/automation/aarch64/build.sh
+        working-directory: ${{ github.workspace }}/oneDNN
+        env:
+          ACL_ROOT_DIR: ${{ github.workspace }}/ComputeLibrary
+          BUILD_TOOLSET: ${{ matrix.config.toolset }}
+          CMAKE_BUILD_TYPE: ${{ matrix.config.build }}
+          CMAKE_GENERATOR: Ninja
+          GCC_VERSION: ${{ fromJson(steps.get-versions.outputs.output).dependencies.gcc }}
+          ONEDNN_ACTION: configure
+          ONEDNN_TEST_SET: ${{ matrix.config.testset }}
+          ONEDNN_THREADING: ${{ matrix.config.threading }}
+
+      - name: Build oneDNN
+        run: ${{ github.workspace }}/oneDNN/.github/automation/aarch64/build.sh
+        working-directory: ${{ github.workspace }}/oneDNN
+        env:
+          ONEDNN_ACTION: build
+
+      - name: Run oneDNN tests
+        run: |
+          set -o pipefail
+          ${{ github.workspace }}/oneDNN/.github/automation/aarch64/test.sh ${{ github.workspace }}/test_results.xml
+        working-directory: ${{ github.workspace }}/oneDNN/build
+        env:
+          BUILD_TOOLSET: ${{ matrix.config.toolset }}
+          CMAKE_BUILD_TYPE: ${{ matrix.config.build }}
+          CTEST_PARALLEL_LEVEL: 8
+          DYLD_LIBRARY_PATH: ${{ github.workspace }}/ComputeLibrary/build
+          ONEDNN_THREADING: ${{ matrix.config.threading }}
+
+      - name: Create hash file
+        working-directory: ${{ github.workspace }}/oneDNN
+        run: git rev-parse --short HEAD > .github/automation/aarch64/stable.sha
+
+      - name: Save hash
+        uses: actions/cache/save@0057852bfaa89a56745cba8c7296529d2fc39830 # v4.3.0
+        with:
+          key: latest-nightly-success-sha
+          path: ${{ github.workspace }}/oneDNN/.github/automation/aarch64/stable.sha
+
+      - name: Find last successful run
+        if: failure()
+        id: get-stable-cache
+        uses: actions/cache/restore@0057852bfaa89a56745cba8c7296529d2fc39830 # v4.3.0
+        with:
+          key: latest-nightly-success-sha
+          path: ${{ github.workspace }}/oneDNN/.github/automation/aarch64/stable.sha
+
+      - name: Use backup stable hash
+        if: failure() && steps.get-stable-cache.outputs.cache-hit != 'true'
+        run: echo ${{ fromJson(steps.get-versions.outputs.output).dependencies.onednn-base }} > ${{ github.workspace }}/oneDNN/.github/automation/aarch64/stable.sha
+
+      - name: Get stable hash
+        if: failure()
+        id: get-stable
+        run: |
+          stable_hash=$(cat "${{ github.workspace }}/oneDNN/.github/automation/aarch64/stable.sha")
+          echo "stable-hash=$stable_hash" >> "$GITHUB_OUTPUT"
+
+      - name: Run git bisect
+        if: failure()
+        shell: bash
+        working-directory: ${{ github.workspace }}/oneDNN
+        run: python .github/automation/aarch64/bisect_ctest.py --unique ${{ steps.get-stable.outputs.stable-hash }} ${{ github.workspace }}/test_results.xml
+        env:
+          ACL_ROOT_DIR: ${{ github.workspace }}/ComputeLibrary
+
+      - name: Update wiki
+        if: ${{ (success() || failure()) && (github.ref == 'refs/heads/main' || github.ref == 'refs/heads/ryo-not-rio/wiki') }}
+        uses: ./oneDNN/.github/actions/update-wiki
+        with:
+          command: add-unit
+          title: ${{ matrix.config.name }}
+          in-file: ${{ github.workspace }}/test_results.xml
+
+  #* This job adds a check named "Nightly AArch64" that represents overall
+  #* workflow status and can be used in branch rulesets
+  status:
+    needs: build-and-test
+    runs-on: ubuntu-latest
+    name: "Nightly AArch64"
+    steps:
+      - name: Print success
+        run: echo Success
diff --git a/.github/workflows/performance-aarch64.yml b/.github/workflows/performance-aarch64.yml
new file mode 100644
index 00000000000..482ce4dd56b
--- /dev/null
+++ b/.github/workflows/performance-aarch64.yml
@@ -0,0 +1,258 @@
+# *******************************************************************************
+# Copyright 2024-2025 Arm Limited and affiliates.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# *******************************************************************************
+
+name: "Performance AArch64"
+
+on:
+  workflow_dispatch:
+    inputs:
+      onednn_base_hash:
+        required: false
+        type: string
+        description: 'Baseline oneDNN commit'
+      onednn_new_hash:
+        required: false
+        type: string
+        description: 'New oneDNN commit'
+      acl_base_hash:
+        required: false
+        type: string
+        description: 'Baseline ACL commit'
+      acl_new_hash:
+        required: false
+        type: string
+        description: 'New ACL commit'
+      num_threads:
+        required: false
+        type: string
+        description: 'Number of threads to use'
+      benchdnn_command:
+        required: false
+        type: string
+        description: 'benchdnn command to run'
+
+  workflow_call:
+
+#* Stop stale workflows
+concurrency:
+  group: ${{ github.workflow }}-${{ github.ref }}-performance
+  cancel-in-progress: true
+
+# NOTE: grants write access to every permission scope (not read-only); presumably needed by the "Update wiki" step.
+permissions: write-all
+
+jobs:
+  build-acl-base:
+    uses: ./.github/workflows/aarch64-acl.yml
+    with:
+      acl_hash: ${{ inputs.acl_base_hash }}
+
+  build-acl-new:
+    uses: ./.github/workflows/aarch64-acl.yml
+    with:
+      acl_hash: ${{ inputs.acl_new_hash }}
+
+  build-and-test-performance:
+    needs: [build-acl-base, build-acl-new]
+    strategy:
+      matrix:
+        config: [
+          { name: cb100, label: ubuntu-24.04-arm, threading: OMP, toolset: gcc, build: Release, testset: NIGHTLY }
+        ]
+
+    name: ${{ matrix.config.name }}, ${{ matrix.config.toolset }}, ${{ matrix.config.threading }}, ${{ matrix.config.build }}
+    runs-on: ${{ matrix.config.label }}
+    steps:
+
+      - name: Checkout oneDNN
+        uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
+        with:
+          path: oneDNN
+
+      # Note: This will create a github actions cache
+      - name: Get latest CMake and Ninja
+        uses: lukka/get-cmake@56d043d188c3612951d8755da8f4b709ec951ad6 # v3.31.6
+        with:
+          cmakeVersion: 3.31.0
+          ninjaVersion: 1.12.0
+
+      - if: ${{ matrix.config.threading == 'OMP' }}
+        name: Install openmp
+        run: |
+          sudo apt install -y libomp-dev
+
+      - name: Read version file
+        id: get-versions
+        run: |
+          content=$(cat "${{ github.workspace }}/oneDNN/.github/automation/aarch64/ci.json")
+          content="${content//[$'\t\r\n$ ']}"
+          echo "output=$content" >> "$GITHUB_OUTPUT"
+
+      - name: Install gcc
+        run: |
+          sudo add-apt-repository ppa:ubuntu-toolchain-r/test -y
+          sudo apt update -y
+          sudo apt install -y g++-${{ fromJson(steps.get-versions.outputs.output).dependencies.gcc }}
+
+      - name: setup python
+        uses: actions/setup-python@e797f83bcb11b83ae66e0230d6156d7c80228e7c # v6.0.0
+        with:
+          python-version: '3.10'
+
+      - name: Install scipy
+        if: ${{ matrix.config.build == 'Release' }}
+        run: pip install scipy statistics GitPython
+
+      - name: Clone base ACL
+        run: ${{ github.workspace }}/oneDNN/.github/automation/aarch64/build_acl.sh
+        env:
+          ACL_ACTION: clone
+          ACL_ROOT_DIR: ${{ github.workspace }}/ComputeLibrary
+          ACL_VERSION: ${{ inputs.acl_base_hash || fromJson(steps.get-versions.outputs.output).dependencies.acl }}
+
+      - name: Get system name
+        id: get_system_name
+        run: (echo "SystemName=$(uname)") >> "$GITHUB_OUTPUT"
+
+      - name: Get base ACL commit hash for cache key
+        id: get_acl_commit_hash_base # output ACLCommitHash feeds the key of "Restore base cached ACL"
+        run: |
+          cd "${{ github.workspace }}/ComputeLibrary" && echo "ACLCommitHash=$(git rev-parse --short HEAD)" >> "$GITHUB_OUTPUT"
+
+      - name: Restore base cached ACL
+        id: cache-acl-restore-base
+        uses: actions/cache/restore@0057852bfaa89a56745cba8c7296529d2fc39830 # v4.3.0
+        with:
+          key: ${{ steps.get_system_name.outputs.SystemName }}-acl-${{ matrix.config.toolset }}-${{ matrix.config.build }}-${{ steps.get_acl_commit_hash_base.outputs.ACLCommitHash }}
+          path: ${{ github.workspace }}/ComputeLibrary/build
+          fail-on-cache-miss: true
+
+      - name: Rename to ComputeLibrary_base
+        run: mv ${{ github.workspace }}/ComputeLibrary ${{ github.workspace }}/ComputeLibrary_base
+
+      - name: Clone new ACL
+        run: ${{ github.workspace }}/oneDNN/.github/automation/aarch64/build_acl.sh
+        env:
+          ACL_ACTION: clone
+          ACL_ROOT_DIR: ${{ github.workspace }}/ComputeLibrary
+          ACL_VERSION: ${{ inputs.acl_new_hash || fromJson(steps.get-versions.outputs.output).dependencies.acl }}
+
+      - name: Get new ACL commit hash for cache key
+        id: get_acl_commit_hash_new # output ACLCommitHash feeds the key of "Restore new cached ACL"
+        run: |
+          cd "${{ github.workspace }}/ComputeLibrary" && echo "ACLCommitHash=$(git rev-parse --short HEAD)" >> "$GITHUB_OUTPUT"
+
+      - name: Restore new cached ACL
+        id: cache-acl-restore-new
+        uses: actions/cache/restore@0057852bfaa89a56745cba8c7296529d2fc39830 # v4.3.0
+        with:
+          key: ${{ steps.get_system_name.outputs.SystemName }}-acl-${{ matrix.config.toolset }}-${{ matrix.config.build }}-${{ steps.get_acl_commit_hash_new.outputs.ACLCommitHash }}
+          path: ${{ github.workspace }}/ComputeLibrary/build
+          fail-on-cache-miss: true
+
+      - name: Move to ComputeLibrary_new
+        run: mv ${{ github.workspace }}/ComputeLibrary ${{ github.workspace }}/ComputeLibrary_new
+
+      - name: Checkout oneDNN base
+        uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
+        with:
+          ref: ${{inputs.onednn_base_hash || fromJson(steps.get-versions.outputs.output).dependencies.onednn-base }}
+          path: oneDNN_base
+
+      - name: Checkout oneDNN new
+        uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
+        with:
+          # Use onednn_new_hash when it is non-empty; otherwise fall back to github.sha.
+          ref: ${{ inputs.onednn_new_hash != '' && inputs.onednn_new_hash || github.sha }}
+          path: oneDNN_new
+
+      - name: Configure oneDNN base
+        run: .github/automation/aarch64/build.sh
+        working-directory: ${{ github.workspace }}/oneDNN_base
+        env:
+          ACL_ROOT_DIR: ${{ github.workspace }}/ComputeLibrary_base
+          BUILD_TOOLSET: ${{ matrix.config.toolset }}
+          CMAKE_BUILD_TYPE: ${{ matrix.config.build }}
+          CMAKE_GENERATOR: Ninja
+          GCC_VERSION: ${{ fromJson(steps.get-versions.outputs.output).dependencies.gcc }}
+          ONEDNN_ACTION: configure
+          ONEDNN_TEST_SET: ${{ matrix.config.testset }}
+          ONEDNN_THREADING: ${{ matrix.config.threading }}
+
+      - name: Build oneDNN base
+        run: .github/automation/aarch64/build.sh
+        working-directory: ${{ github.workspace }}/oneDNN_base
+        env:
+          ONEDNN_ACTION: build
+
+      - name: Configure oneDNN new
+        run: .github/automation/aarch64/build.sh
+        working-directory: ${{ github.workspace }}/oneDNN_new
+        env:
+          ACL_ROOT_DIR: ${{ github.workspace }}/ComputeLibrary_new
+          BUILD_TOOLSET: ${{ matrix.config.toolset }}
+          CMAKE_BUILD_TYPE: ${{ matrix.config.build }}
+          CMAKE_GENERATOR: Ninja
+          GCC_VERSION: ${{ fromJson(steps.get-versions.outputs.output).dependencies.gcc }}
+          ONEDNN_ACTION: configure
+          ONEDNN_TEST_SET: ${{ matrix.config.testset }}
+          ONEDNN_THREADING: ${{ matrix.config.threading }}
+
+      - name: Build oneDNN new
+        run: .github/automation/aarch64/build.sh
+        working-directory: ${{ github.workspace }}/oneDNN_new
+        env:
+          ONEDNN_ACTION: build
+
+      - name: Run nightly performance tests
+        if: ${{ inputs.benchdnn_command == '' }}
+        shell: bash
+        run: |
+          OMP_NUM_THREADS=${{ inputs.num_threads || 16 }} bash ${{ github.workspace }}/oneDNN/.github/automation/performance/bench_nightly_performance.sh ${{ github.workspace }}/oneDNN_base/build/tests/benchdnn/benchdnn ${{ github.workspace }}/oneDNN_new/build/tests/benchdnn/benchdnn base.txt new.txt
+          python ${{ github.workspace }}/oneDNN/.github/automation/performance/benchdnn_comparison.py base.txt new.txt --out-file perf_table.md
+
+      - name: Update wiki
+        if: ${{ (success() || failure()) && inputs.benchdnn_command == '' && (github.ref == 'refs/heads/main' || github.ref == 'refs/heads/ryo-not-rio/wiki') }}
+        uses: ./oneDNN/.github/actions/update-wiki
+        with:
+          command: add-perf
+          title: ${{ matrix.config.name }}
+          in-file: perf_table.md
+
+      - name: Run custom performance tests
+        if: ${{ inputs.benchdnn_command != '' }}
+        shell: bash
+        run: |
+          OMP_NUM_THREADS=${{ inputs.num_threads || 16 }} \
+          bash ${{ github.workspace }}/oneDNN/.github/automation/performance/run_benchdnn_compare.sh \
+            "${{ github.workspace }}/oneDNN_base/build/tests/benchdnn/benchdnn" \
+            "${{ github.workspace }}/oneDNN_new/build/tests/benchdnn/benchdnn" \
+            base.txt new.txt ${{ inputs.benchdnn_command }}
+
+      - name: Print speed comparisons
+        if: ${{ (success() || failure()) && inputs.benchdnn_command != '' }}
+        run: python3 "${{ github.workspace }}/oneDNN/.github/automation/performance/benchdnn_comparison.py" base.txt new.txt --out-file perf_results.md; cat perf_results.md >> "$GITHUB_STEP_SUMMARY"
+
+  #* This job adds a check named "Nightly Performance AArch64" that represents overall
+  #* workflow status and can be used in branch rulesets
+  status:
+    needs: build-and-test-performance
+    runs-on: ubuntu-latest
+    name: "Nightly Performance AArch64"
+    steps:
+      - name: Print success
+        run: echo Success
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 4af933ec000..9d745709639 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -64,8 +64,6 @@ endif()
 if(NOT DNNL_TARGET_ARCH)
     if(CMAKE_SYSTEM_PROCESSOR MATCHES "^(aarch64.*|AARCH64.*|arm64.*|ARM64.*)")
         set(DNNL_TARGET_ARCH "AARCH64")
-    elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(arm.*|ARM.*)")
-        set(DNNL_TARGET_ARCH "ARM")
     elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(ppc64.*|PPC64.*|powerpc64.*)")
         set(DNNL_TARGET_ARCH "PPC64")
     elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(s390x.*|S390X.*)")
diff --git a/cmake/ACL.cmake b/cmake/ACL.cmake
index 22b203ebabb..9622db5e65f 100644
--- a/cmake/ACL.cmake
+++ b/cmake/ACL.cmake
@@ -21,11 +21,11 @@ endif()
 set(acl_cmake_included true)
 include("cmake/options.cmake")
 
-if(NOT DNNL_TARGET_ARCH MATCHES "^(AARCH64|ARM)$")
+if(NOT DNNL_TARGET_ARCH STREQUAL "AARCH64")
     return()
 endif()
 
-if(NOT DNNL_USE_ACL)
+if(NOT DNNL_AARCH64_USE_ACL)
     return()
 endif()
 
diff --git a/cmake/options.cmake b/cmake/options.cmake
index 22441104296..720612c8066 100644
--- a/cmake/options.cmake
+++ b/cmake/options.cmake
@@ -416,7 +416,7 @@ set(DNNL_BLAS_VENDOR "NONE" CACHE STRING
 # AArch64 optimizations with Arm Compute Library
 # ==============================================
 
-option(DNNL_USE_ACL "Enables use of ARM optimised functions
+option(DNNL_AARCH64_USE_ACL "Enables use of AArch64 optimised functions
     from Arm Compute Library.
     This is only supported on AArch64 builds and assumes there is a
     functioning Compute Library build available at the location specified by the
diff --git a/cmake/platform.cmake b/cmake/platform.cmake
index 52944c18198..e6b69db8f9b 100644
--- a/cmake/platform.cmake
+++ b/cmake/platform.cmake
@@ -340,7 +340,7 @@ elseif(UNIX OR MINGW)
     platform_unix_and_mingw_noexcept_ccxx_flags(CMAKE_CMAKE_CCXX_NOEXCEPT_FLAGS)
     # compiler specific settings
     if(CMAKE_CXX_COMPILER_ID MATCHES "(Apple)?[Cc]lang")
-        if(DNNL_TARGET_ARCH MATCHES "^(AARCH64|ARM)$")
+        if(DNNL_TARGET_ARCH STREQUAL "AARCH64")
              if (NOT CMAKE_BUILD_TYPE STREQUAL "Debug")
                  set(DEF_ARCH_OPT_FLAGS "-O3")
              endif()
@@ -449,7 +449,7 @@ elseif(UNIX OR MINGW)
             append(CMAKE_CCXX_FLAGS "-Wno-ignored-attributes")
         endif()
 
-        if(DNNL_TARGET_ARCH MATCHES "^(AARCH64|ARM)$")
+        if(DNNL_TARGET_ARCH STREQUAL "AARCH64")
             if (NOT CMAKE_BUILD_TYPE STREQUAL "Debug")
                 set(DEF_ARCH_OPT_FLAGS "-O3")
             endif()
diff --git a/src/common/utils.cpp b/src/common/utils.cpp
index 90a8bf9e904..c7219be59e3 100644
--- a/src/common/utils.cpp
+++ b/src/common/utils.cpp
@@ -230,7 +230,7 @@ bool get_jit_dump() {
     return jit_dump.get();
 }
 
-#if defined(DNNL_AARCH64) && (DNNL_AARCH64 == 1) || defined(DNNL_ARM) && (DNNL_ARM == 1)
+#if defined(DNNL_AARCH64) && (DNNL_AARCH64 == 1)
 static setting_t<unsigned> jit_profiling_flags {DNNL_JIT_PROFILE_LINUX_PERFMAP};
 #else
 static setting_t<unsigned> jit_profiling_flags {DNNL_JIT_PROFILE_VTUNE};
diff --git a/src/cpu/CMakeLists.txt b/src/cpu/CMakeLists.txt
index ab791ee7b2c..49e73c7676a 100644
--- a/src/cpu/CMakeLists.txt
+++ b/src/cpu/CMakeLists.txt
@@ -139,9 +139,6 @@ endif()
 if (DNNL_TARGET_ARCH STREQUAL "AARCH64")
     add_subdirectory(aarch64)
 endif()
-if (DNNL_USE_ACL)
-    add_subdirectory(acl)
-endif()
 if (DNNL_TARGET_ARCH STREQUAL "PPC64")
     add_subdirectory(ppc64)
 endif()
diff --git a/src/cpu/README.md b/src/cpu/README.md
index 7641f9e825b..81718eae221 100644
--- a/src/cpu/README.md
+++ b/src/cpu/README.md
@@ -46,7 +46,6 @@ enable or disable parts of code. There the following macros defined:
 - `DNNL_X64` is 1 on x64 architecture;
 - `DNNL_X86` is 1 on x86 architecture;
 - `DNNL_AARCH64` is 1 on Arm AArch64 architecture;
-- `DNNL_ARM` is 1 on Arm 32 architecture;
 - `DNNL_PPC64` is 1 on OpenPOWER / IBM Power architecture;
 - `DNNL_S390X` is 1 on IBMz / s390x architecture;
 - `DNNL_RV64` is 1 on RISC-V architecture;
diff --git a/src/cpu/aarch64/CMakeLists.txt b/src/cpu/aarch64/CMakeLists.txt
index 32eec64988c..75ba8e2705c 100644
--- a/src/cpu/aarch64/CMakeLists.txt
+++ b/src/cpu/aarch64/CMakeLists.txt
@@ -20,6 +20,14 @@ file(GLOB_RECURSE SOURCES
     ${CMAKE_CURRENT_SOURCE_DIR}/*.[ch]pp
     )
 
+if(NOT DNNL_AARCH64_USE_ACL)
+    file(GLOB_RECURSE ACL_FILES
+        ${CMAKE_CURRENT_SOURCE_DIR}/acl_*.[ch]
+        ${CMAKE_CURRENT_SOURCE_DIR}/acl_*.[ch]pp
+        )
+    list(REMOVE_ITEM SOURCES ${ACL_FILES})
+endif()
+
 # If the runtime is not THREADPOOL remove threadpool_scheduler sources.
 if(NOT DNNL_CPU_RUNTIME STREQUAL "THREADPOOL")
     list(APPEND ACL_THREADPOOL_FILES
@@ -33,6 +41,5 @@ set(OBJ_LIB ${LIB_PACKAGE_NAME}_cpu_aarch64)
 add_library(${OBJ_LIB} OBJECT ${SOURCES})
 set_property(GLOBAL APPEND PROPERTY DNNL_LIB_DEPS
     $<TARGET_OBJECTS:${OBJ_LIB}>)
-enable_conditional_compilation4(${OBJ_LIB})
 
 add_subdirectory(${PROJECT_SOURCE_DIR}/third_party/xbyak_aarch64 xbyak_aarch64)
diff --git a/src/cpu/acl/acl_batch_normalization.cpp b/src/cpu/aarch64/acl_batch_normalization.cpp
similarity index 96%
rename from src/cpu/acl/acl_batch_normalization.cpp
rename to src/cpu/aarch64/acl_batch_normalization.cpp
index 83f4c5061a0..77a723207fc 100644
--- a/src/cpu/acl/acl_batch_normalization.cpp
+++ b/src/cpu/aarch64/acl_batch_normalization.cpp
@@ -14,12 +14,12 @@
 * limitations under the License.
 *******************************************************************************/
 
-#include "cpu/acl/acl_batch_normalization.hpp"
+#include "cpu/aarch64/acl_batch_normalization.hpp"
 
 namespace dnnl {
 namespace impl {
 namespace cpu {
-namespace acl {
+namespace aarch64 {
 
 status_t acl_batch_normalization_fwd_t::execute_forward(
         const exec_ctx_t &ctx) const {
@@ -72,7 +72,7 @@ status_t acl_batch_normalization_fwd_t::execute_forward(
     return status::success;
 }
 
-} // namespace acl
+} // namespace aarch64
 } // namespace cpu
 } // namespace impl
 } // namespace dnnl
diff --git a/src/cpu/acl/acl_batch_normalization.hpp b/src/cpu/aarch64/acl_batch_normalization.hpp
similarity index 98%
rename from src/cpu/acl/acl_batch_normalization.hpp
rename to src/cpu/aarch64/acl_batch_normalization.hpp
index ef7e4c22cbd..df8cd1223f7 100644
--- a/src/cpu/acl/acl_batch_normalization.hpp
+++ b/src/cpu/aarch64/acl_batch_normalization.hpp
@@ -14,18 +14,18 @@
 * limitations under the License.
 *******************************************************************************/
 
-#ifndef CPU_ACL_BATCH_NORMALIZATION_HPP
-#define CPU_ACL_BATCH_NORMALIZATION_HPP
+#ifndef CPU_AARCH64_ACL_BATCH_NORMALIZATION_HPP
+#define CPU_AARCH64_ACL_BATCH_NORMALIZATION_HPP
 
 #include "cpu/cpu_batch_normalization_pd.hpp"
 
-#include "cpu/acl/acl_post_ops.hpp"
-#include "cpu/acl/acl_utils.hpp"
+#include "cpu/aarch64/acl_post_ops.hpp"
+#include "cpu/aarch64/acl_utils.hpp"
 
 namespace dnnl {
 namespace impl {
 namespace cpu {
-namespace acl {
+namespace aarch64 {
 
 struct acl_batch_normalization_obj_t {
     arm_compute::NEBatchNormalizationLayer bnorm;
@@ -266,7 +266,7 @@ struct acl_batch_normalization_fwd_t : public primitive_t {
     const pd_t *pd() const { return (const pd_t *)primitive_t::pd().get(); }
 }; // acl_batch_normalization_fwd_t
 
-} // namespace acl
+} // namespace aarch64
 } // namespace cpu
 } // namespace impl
 } // namespace dnnl
diff --git a/src/cpu/acl/acl_benchmark_scheduler.cpp b/src/cpu/aarch64/acl_benchmark_scheduler.cpp
similarity index 96%
rename from src/cpu/acl/acl_benchmark_scheduler.cpp
rename to src/cpu/aarch64/acl_benchmark_scheduler.cpp
index 9c41e0a488f..4f453f7eabf 100644
--- a/src/cpu/acl/acl_benchmark_scheduler.cpp
+++ b/src/cpu/aarch64/acl_benchmark_scheduler.cpp
@@ -14,13 +14,13 @@
 * limitations under the License.
 *******************************************************************************/
 
-#include "cpu/acl/acl_benchmark_scheduler.hpp"
+#include "cpu/aarch64/acl_benchmark_scheduler.hpp"
 #include "common/verbose.hpp"
 
 namespace dnnl {
 namespace impl {
 namespace cpu {
-namespace acl {
+namespace aarch64 {
 using namespace arm_compute;
 
 benchmark_scheduler_t::benchmark_scheduler_t(IScheduler &real_scheduler)
@@ -72,7 +72,7 @@ void benchmark_scheduler_t::run_workloads(std::vector<Workload> &workloads) {
     ARM_COMPUTE_ERROR("Can't be reached");
 }
 
-} // namespace acl
+} // namespace aarch64
 } // namespace cpu
 } // namespace impl
 } // namespace dnnl
diff --git a/src/cpu/acl/acl_benchmark_scheduler.hpp b/src/cpu/aarch64/acl_benchmark_scheduler.hpp
similarity index 92%
rename from src/cpu/acl/acl_benchmark_scheduler.hpp
rename to src/cpu/aarch64/acl_benchmark_scheduler.hpp
index 59de1d26ea4..0b5d0b929e5 100644
--- a/src/cpu/acl/acl_benchmark_scheduler.hpp
+++ b/src/cpu/aarch64/acl_benchmark_scheduler.hpp
@@ -13,8 +13,8 @@
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *******************************************************************************/
-#ifndef CPU_ACL_BENCHMARK_SCHEDULER_HPP
-#define CPU_ACL_BENCHMARK_SCHEDULER_HPP
+#ifndef CPU_AARCH64_ACL_BENCHMARK_SCHEDULER_HPP
+#define CPU_AARCH64_ACL_BENCHMARK_SCHEDULER_HPP
 
 #include "arm_compute/core/CPP/ICPPKernel.h"
 #include "arm_compute/runtime/IScheduler.h"
@@ -22,7 +22,7 @@
 namespace dnnl {
 namespace impl {
 namespace cpu {
-namespace acl {
+namespace aarch64 {
 // BenchmarkScheduler implement's ACL IScheduler interface and acts as an interceptor scheduler
 // when DNNL_VERBOSE=profile,profile_externals. It intercepts calls made by the actual scheduler used by ACL and adds
 // timers to benchmark execution time of ACL kernels and store kernel information.
@@ -52,9 +52,9 @@ class benchmark_scheduler_t final : public arm_compute::IScheduler {
     IScheduler &_real_scheduler;
 };
 
-#endif // CPU_ACL_BENCHMARK_SCHEDULER_HPP
-
-} // namespace acl
+} // namespace aarch64
 } // namespace cpu
 } // namespace impl
 } // namespace dnnl
+
+#endif // CPU_AARCH64_ACL_BENCHMARK_SCHEDULER_HPP
diff --git a/src/cpu/acl/acl_binary.cpp b/src/cpu/aarch64/acl_binary.cpp
similarity index 99%
rename from src/cpu/acl/acl_binary.cpp
rename to src/cpu/aarch64/acl_binary.cpp
index 703ddac3f0a..8795d9d1ec7 100644
--- a/src/cpu/acl/acl_binary.cpp
+++ b/src/cpu/aarch64/acl_binary.cpp
@@ -27,7 +27,7 @@
 namespace dnnl {
 namespace impl {
 namespace cpu {
-namespace acl {
+namespace aarch64 {
 
 status_t acl_binary_t::pd_t::init(engine_t *engine) {
     using namespace acl_utils;
@@ -232,7 +232,7 @@ const acl_binary_t::pd_t *acl_binary_t::pd() const {
     return static_cast<const pd_t *>(primitive_t::pd().get());
 }
 
-} // namespace acl
+} // namespace aarch64
 } // namespace cpu
 } // namespace impl
 } // namespace dnnl
diff --git a/src/cpu/acl/acl_binary.hpp b/src/cpu/aarch64/acl_binary.hpp
similarity index 95%
rename from src/cpu/acl/acl_binary.hpp
rename to src/cpu/aarch64/acl_binary.hpp
index 7040fe8aa42..41ecdded523 100644
--- a/src/cpu/acl/acl_binary.hpp
+++ b/src/cpu/aarch64/acl_binary.hpp
@@ -14,8 +14,8 @@
 * limitations under the License.
 *******************************************************************************/
 
-#ifndef CPU_ACL_BINARY_HPP
-#define CPU_ACL_BINARY_HPP
+#ifndef CPU_AARCH64_ACL_BINARY_HPP
+#define CPU_AARCH64_ACL_BINARY_HPP
 
 #include "acl_utils.hpp"
 #include "cpu/cpu_binary_pd.hpp"
@@ -28,7 +28,7 @@
 namespace dnnl {
 namespace impl {
 namespace cpu {
-namespace acl {
+namespace aarch64 {
 
 struct acl_binary_conf_t {
     arm_compute::TensorInfo src0_info;
@@ -73,7 +73,7 @@ struct acl_binary_t : public primitive_t {
 
 }; // acl_binary_t
 
-} // namespace acl
+} // namespace aarch64
 } // namespace cpu
 } // namespace impl
 } // namespace dnnl
diff --git a/src/cpu/acl/acl_convolution_utils.cpp b/src/cpu/aarch64/acl_convolution_utils.cpp
similarity index 96%
rename from src/cpu/acl/acl_convolution_utils.cpp
rename to src/cpu/aarch64/acl_convolution_utils.cpp
index 9cb9146bc4b..eb71f230908 100644
--- a/src/cpu/acl/acl_convolution_utils.cpp
+++ b/src/cpu/aarch64/acl_convolution_utils.cpp
@@ -14,7 +14,7 @@
 * limitations under the License.
 *******************************************************************************/
 
-#include "cpu/acl/acl_convolution_utils.hpp"
+#include "cpu/aarch64/acl_convolution_utils.hpp"
 #include "common/convolution_pd.hpp"
 #include "common/utils.hpp"
 #include "oneapi/dnnl/dnnl.h"
@@ -22,7 +22,7 @@
 namespace dnnl {
 namespace impl {
 namespace cpu {
-namespace acl {
+namespace aarch64 {
 
 namespace acl_convolution_utils {
 
@@ -289,21 +289,19 @@ status_t init_conf_wino(acl_conv_conf_t &acp, memory_desc_t &src_md,
         memory_desc_t &bias_md, const convolution_desc_t &cd,
         const primitive_attr_t &attr) {
 
-    // Under these conditions, fallback to faster GEMM-based convolution
-    // unless the user explicitly specifies Winograd algorithm
-    // clang-format off
-
     // Heuristic only for servers
-    if (dnnl_get_max_threads() > 28 && cd.alg_kind == alg_kind::convolution_auto) {
-            return status::unimplemented;
+    // clang-format off
+    if (dnnl_get_max_threads() > 28
+            && cd.alg_kind == alg_kind::convolution_auto) {
+        return status::unimplemented;
     }
+
     // Heuristic for other devices
     if (one_of(true, src_md.dims[1] < 64, // ic
-                     dst_md.dims[1] < 64) // oc
-                  && cd.alg_kind == alg_kind::convolution_auto) {
-            return status::unimplemented;
+                 dst_md.dims[1] < 64) // oc
+            && cd.alg_kind == alg_kind::convolution_auto) {
+        return status::unimplemented;
     }
-
     // clang-format on
 
     // General Compute Library checks, memory tags are also set there
@@ -339,7 +337,7 @@ status_t init_conf_wino(acl_conv_conf_t &acp, memory_desc_t &src_md,
 
 } // namespace acl_convolution_utils
 
-} // namespace acl
+} // namespace aarch64
 } // namespace cpu
 } // namespace impl
 } // namespace dnnl
diff --git a/src/cpu/acl/acl_convolution_utils.hpp b/src/cpu/aarch64/acl_convolution_utils.hpp
similarity index 98%
rename from src/cpu/acl/acl_convolution_utils.hpp
rename to src/cpu/aarch64/acl_convolution_utils.hpp
index fb616e71a7c..c438cf9574b 100644
--- a/src/cpu/acl/acl_convolution_utils.hpp
+++ b/src/cpu/aarch64/acl_convolution_utils.hpp
@@ -14,8 +14,8 @@
 * limitations under the License.
 *******************************************************************************/
 
-#ifndef CPU_ACL_CONVOLUTION_UTILS_HPP
-#define CPU_ACL_CONVOLUTION_UTILS_HPP
+#ifndef CPU_AARCH64_ACL_CONVOLUTION_UTILS_HPP
+#define CPU_AARCH64_ACL_CONVOLUTION_UTILS_HPP
 
 #include <map>
 #include "acl_post_ops.hpp"
@@ -26,7 +26,7 @@
 namespace dnnl {
 namespace impl {
 namespace cpu {
-namespace acl {
+namespace aarch64 {
 
 template <typename ConvOp>
 struct acl_obj_t {
@@ -231,9 +231,9 @@ status_t execute_forward_conv_acl(
     return status::success;
 }
 
-} // namespace acl
+} // namespace aarch64
 } // namespace cpu
 } // namespace impl
 } // namespace dnnl
 
-#endif // CPU_ACL_CONVOLUTION_UTILS_HPP
+#endif // CPU_AARCH64_ACL_CONVOLUTION_UTILS_HPP
diff --git a/src/cpu/acl/acl_deconvolution.cpp b/src/cpu/aarch64/acl_deconvolution.cpp
similarity index 96%
rename from src/cpu/acl/acl_deconvolution.cpp
rename to src/cpu/aarch64/acl_deconvolution.cpp
index 0eef20dbabc..cdeca9cb8bb 100644
--- a/src/cpu/acl/acl_deconvolution.cpp
+++ b/src/cpu/aarch64/acl_deconvolution.cpp
@@ -14,12 +14,12 @@
 * limitations under the License.
 *******************************************************************************/
 
-#include "cpu/acl/acl_deconvolution.hpp"
+#include "cpu/aarch64/acl_deconvolution.hpp"
 
 namespace dnnl {
 namespace impl {
 namespace cpu {
-namespace acl {
+namespace aarch64 {
 
 status_t acl_deconvolution_fwd_t::execute_forward(const exec_ctx_t &ctx) const {
 
@@ -64,7 +64,7 @@ status_t acl_deconvolution_fwd_t::execute_forward(const exec_ctx_t &ctx) const {
     return status::success;
 }
 
-} // namespace acl
+} // namespace aarch64
 } // namespace cpu
 } // namespace impl
 } // namespace dnnl
diff --git a/src/cpu/acl/acl_deconvolution.hpp b/src/cpu/aarch64/acl_deconvolution.hpp
similarity index 98%
rename from src/cpu/acl/acl_deconvolution.hpp
rename to src/cpu/aarch64/acl_deconvolution.hpp
index 18c8c1f1a67..e4379cb0d99 100644
--- a/src/cpu/acl/acl_deconvolution.hpp
+++ b/src/cpu/aarch64/acl_deconvolution.hpp
@@ -14,16 +14,16 @@
 * limitations under the License.
 *******************************************************************************/
 
-#ifndef CPU_ACL_DECONVOLUTION_HPP
-#define CPU_ACL_DECONVOLUTION_HPP
+#ifndef CPU_AARCH64_ACL_DECONVOLUTION_HPP
+#define CPU_AARCH64_ACL_DECONVOLUTION_HPP
 
-#include "cpu/acl/acl_post_ops.hpp"
+#include "cpu/aarch64/acl_post_ops.hpp"
 #include "cpu/cpu_deconvolution_pd.hpp"
 
 namespace dnnl {
 namespace impl {
 namespace cpu {
-namespace acl {
+namespace aarch64 {
 
 struct acl_deconv_obj_t {
     arm_compute::NEDeconvolutionLayer deconv;
@@ -330,7 +330,7 @@ struct acl_deconvolution_fwd_t : public primitive_t {
     const pd_t *pd() const { return (const pd_t *)primitive_t::pd().get(); }
 }; // acl_deconvolution_fwd_t
 
-} // namespace acl
+} // namespace aarch64
 } // namespace cpu
 } // namespace impl
 } // namespace dnnl
diff --git a/src/cpu/acl/acl_depthwise_convolution.cpp b/src/cpu/aarch64/acl_depthwise_convolution.cpp
similarity index 97%
rename from src/cpu/acl/acl_depthwise_convolution.cpp
rename to src/cpu/aarch64/acl_depthwise_convolution.cpp
index 2f6d5756cc0..976cf096408 100644
--- a/src/cpu/acl/acl_depthwise_convolution.cpp
+++ b/src/cpu/aarch64/acl_depthwise_convolution.cpp
@@ -14,12 +14,12 @@
 * limitations under the License.
 *******************************************************************************/
 
-#include "cpu/acl/acl_depthwise_convolution.hpp"
+#include "cpu/aarch64/acl_depthwise_convolution.hpp"
 
 namespace dnnl {
 namespace impl {
 namespace cpu {
-namespace acl {
+namespace aarch64 {
 
 namespace {
 using data_t = prec_traits_t<data_type::f32>::type;
@@ -88,7 +88,7 @@ status_t acl_depthwise_convolution_fwd_t::init(engine_t *engine) {
     acl_obj_->aux_mem_req = acl_obj_->conv.workspace();
     return status::success;
 }
-} // namespace acl
+} // namespace aarch64
 } // namespace cpu
 } // namespace impl
 } // namespace dnnl
diff --git a/src/cpu/acl/acl_depthwise_convolution.hpp b/src/cpu/aarch64/acl_depthwise_convolution.hpp
similarity index 91%
rename from src/cpu/acl/acl_depthwise_convolution.hpp
rename to src/cpu/aarch64/acl_depthwise_convolution.hpp
index 61c39332a67..6fa37e98e4f 100644
--- a/src/cpu/acl/acl_depthwise_convolution.hpp
+++ b/src/cpu/aarch64/acl_depthwise_convolution.hpp
@@ -14,8 +14,8 @@
 * limitations under the License.
 *******************************************************************************/
 
-#ifndef CPU_ACL_DEPTHWISE_CONVOLUTION_HPP
-#define CPU_ACL_DEPTHWISE_CONVOLUTION_HPP
+#ifndef CPU_AARCH64_ACL_DEPTHWISE_CONVOLUTION_HPP
+#define CPU_AARCH64_ACL_DEPTHWISE_CONVOLUTION_HPP
 
 #include "acl_convolution_utils.hpp"
 #include "arm_compute/runtime/experimental/operators/CpuDepthwiseConv2d.h"
@@ -24,7 +24,7 @@
 namespace dnnl {
 namespace impl {
 namespace cpu {
-namespace acl {
+namespace aarch64 {
 
 struct acl_depthwise_convolution_fwd_t : public primitive_t {
 
@@ -57,9 +57,9 @@ struct acl_depthwise_convolution_fwd_t : public primitive_t {
     std::unique_ptr<acl_obj_t<Op>> acl_obj_;
 };
 
-} // namespace acl
+} // namespace aarch64
 } // namespace cpu
 } // namespace impl
 } // namespace dnnl
 
-#endif // CPU_ACL_DEPTHWISE_CONVOLUTION_HPP
+#endif // CPU_AARCH64_ACL_DEPTHWISE_CONVOLUTION_HPP
diff --git a/src/cpu/acl/acl_eltwise.cpp b/src/cpu/aarch64/acl_eltwise.cpp
similarity index 98%
rename from src/cpu/acl/acl_eltwise.cpp
rename to src/cpu/aarch64/acl_eltwise.cpp
index b0cf7181cd0..64d0ce6b3f6 100644
--- a/src/cpu/acl/acl_eltwise.cpp
+++ b/src/cpu/aarch64/acl_eltwise.cpp
@@ -19,7 +19,7 @@
 namespace dnnl {
 namespace impl {
 namespace cpu {
-namespace acl {
+namespace aarch64 {
 
 status_t acl_eltwise_fwd_t::execute(const exec_ctx_t &ctx) const {
     return execute_forward(ctx);
@@ -109,7 +109,7 @@ status_t acl_eltwise_fwd_t::pd_t::init(engine_t *engine) {
 
     return status::success;
 }
-} // namespace acl
+} // namespace aarch64
 } // namespace cpu
 } // namespace impl
 } // namespace dnnl
diff --git a/src/cpu/acl/acl_eltwise.hpp b/src/cpu/aarch64/acl_eltwise.hpp
similarity index 93%
rename from src/cpu/acl/acl_eltwise.hpp
rename to src/cpu/aarch64/acl_eltwise.hpp
index 45869414bec..bd64eac1936 100644
--- a/src/cpu/acl/acl_eltwise.hpp
+++ b/src/cpu/aarch64/acl_eltwise.hpp
@@ -14,8 +14,8 @@
 * limitations under the License.
 *******************************************************************************/
 
-#ifndef CPU_ACL_ELTWISE_HPP
-#define CPU_ACL_ELTWISE_HPP
+#ifndef CPU_AARCH64_ACL_ELTWISE_HPP
+#define CPU_AARCH64_ACL_ELTWISE_HPP
 
 #include <memory>
 #include "cpu/cpu_eltwise_pd.hpp"
@@ -27,7 +27,7 @@
 namespace dnnl {
 namespace impl {
 namespace cpu {
-namespace acl {
+namespace aarch64 {
 
 struct acl_eltwise_conf_t {
     arm_compute::ActivationLayerInfo act_info;
@@ -71,9 +71,9 @@ struct acl_eltwise_fwd_t : public primitive_t {
     friend struct acl_post_ops_t;
 }; // acl_eltwise_fwd_t
 
-} // namespace acl
+} // namespace aarch64
 } // namespace cpu
 } // namespace impl
 } // namespace dnnl
 
-#endif // CPU_ACL_ELTWISE_HPP
+#endif // CPU_AARCH64_ACL_ELTWISE_HPP
diff --git a/src/cpu/acl/acl_gemm_convolution.cpp b/src/cpu/aarch64/acl_gemm_convolution.cpp
similarity index 99%
rename from src/cpu/acl/acl_gemm_convolution.cpp
rename to src/cpu/aarch64/acl_gemm_convolution.cpp
index 8392bd29676..e299644eee2 100644
--- a/src/cpu/acl/acl_gemm_convolution.cpp
+++ b/src/cpu/aarch64/acl_gemm_convolution.cpp
@@ -21,7 +21,7 @@
 namespace dnnl {
 namespace impl {
 namespace cpu {
-namespace acl {
+namespace aarch64 {
 
 namespace {
 // Keys are anonymous. So deduce the type automagically.
@@ -113,7 +113,7 @@ template struct acl_gemm_convolution_fwd_t<f32>;
 template struct acl_gemm_convolution_fwd_t<f16>;
 template struct acl_gemm_convolution_fwd_t<s8, s8, s8, s32>;
 
-} // namespace acl
+} // namespace aarch64
 } // namespace cpu
 } // namespace impl
 } // namespace dnnl
diff --git a/src/cpu/acl/acl_gemm_convolution.hpp b/src/cpu/aarch64/acl_gemm_convolution.hpp
similarity index 94%
rename from src/cpu/acl/acl_gemm_convolution.hpp
rename to src/cpu/aarch64/acl_gemm_convolution.hpp
index 14d0050c7ab..58ff3e5fd9a 100644
--- a/src/cpu/acl/acl_gemm_convolution.hpp
+++ b/src/cpu/aarch64/acl_gemm_convolution.hpp
@@ -14,8 +14,8 @@
 * limitations under the License.
 *******************************************************************************/
 
-#ifndef CPU_ACL_GEMM_CONVOLUTION_HPP
-#define CPU_ACL_GEMM_CONVOLUTION_HPP
+#ifndef CPU_AARCH64_ACL_GEMM_CONVOLUTION_HPP
+#define CPU_AARCH64_ACL_GEMM_CONVOLUTION_HPP
 
 #include "common/memory_tracking.hpp"
 #include "cpu/cpu_convolution_pd.hpp"
@@ -27,7 +27,7 @@
 namespace dnnl {
 namespace impl {
 namespace cpu {
-namespace acl {
+namespace aarch64 {
 
 template <data_type_t src_type, data_type_t wei_type = src_type,
         data_type_t dst_type = src_type, data_type_t bia_type = dst_type>
@@ -67,7 +67,7 @@ struct acl_gemm_convolution_fwd_t : public primitive_t {
     std::unique_ptr<acl_obj_t<Op>> acl_obj_;
 }; // acl_gemm_convolution_fwd_t
 
-} // namespace acl
+} // namespace aarch64
 } // namespace cpu
 } // namespace impl
 } // namespace dnnl
diff --git a/src/cpu/acl/acl_indirect_gemm_convolution.cpp b/src/cpu/aarch64/acl_indirect_gemm_convolution.cpp
similarity index 99%
rename from src/cpu/acl/acl_indirect_gemm_convolution.cpp
rename to src/cpu/aarch64/acl_indirect_gemm_convolution.cpp
index f53304b7c35..0e080c6e6ca 100644
--- a/src/cpu/acl/acl_indirect_gemm_convolution.cpp
+++ b/src/cpu/aarch64/acl_indirect_gemm_convolution.cpp
@@ -22,7 +22,7 @@
 namespace dnnl {
 namespace impl {
 namespace cpu {
-namespace acl {
+namespace aarch64 {
 
 namespace {
 using data_t = typename prec_traits_t<data_type::f32>::type;
@@ -123,7 +123,7 @@ status_t acl_indirect_gemm_convolution_fwd_t::pd_t::init(engine_t *engine) {
             dst_md_);
 }
 
-} // namespace acl
+} // namespace aarch64
 } // namespace cpu
 } // namespace impl
 } // namespace dnnl
diff --git a/src/cpu/acl/acl_indirect_gemm_convolution.hpp b/src/cpu/aarch64/acl_indirect_gemm_convolution.hpp
similarity index 90%
rename from src/cpu/acl/acl_indirect_gemm_convolution.hpp
rename to src/cpu/aarch64/acl_indirect_gemm_convolution.hpp
index 7286cc3ced6..6c91ac5e35c 100644
--- a/src/cpu/acl/acl_indirect_gemm_convolution.hpp
+++ b/src/cpu/aarch64/acl_indirect_gemm_convolution.hpp
@@ -14,8 +14,8 @@
 * limitations under the License.
 *******************************************************************************/
 
-#ifndef CPU_ACL_INDIRECT_GEMM_CONVOLUTION_HPP
-#define CPU_ACL_INDIRECT_GEMM_CONVOLUTION_HPP
+#ifndef CPU_AARCH64_ACL_INDIRECT_GEMM_CONVOLUTION_HPP
+#define CPU_AARCH64_ACL_INDIRECT_GEMM_CONVOLUTION_HPP
 
 #include "cpu/cpu_convolution_pd.hpp"
 
@@ -25,7 +25,7 @@
 namespace dnnl {
 namespace impl {
 namespace cpu {
-namespace acl {
+namespace aarch64 {
 
 struct acl_indirect_gemm_convolution_fwd_t : public primitive_t {
 
@@ -61,9 +61,9 @@ struct acl_indirect_gemm_convolution_fwd_t : public primitive_t {
     std::unique_ptr<acl_obj_t<Op>> acl_obj_;
 };
 
-} // namespace acl
+} // namespace aarch64
 } // namespace cpu
 } // namespace impl
 } // namespace dnnl
 
-#endif // CPU_ACL_INDIRECT_GEMM_CONVOLUTION_HPP
+#endif // CPU_AARCH64_ACL_INDIRECT_GEMM_CONVOLUTION_HPP
diff --git a/src/cpu/acl/acl_inner_product.cpp b/src/cpu/aarch64/acl_inner_product.cpp
similarity index 99%
rename from src/cpu/acl/acl_inner_product.cpp
rename to src/cpu/aarch64/acl_inner_product.cpp
index d51030c4381..16bd5318f28 100644
--- a/src/cpu/acl/acl_inner_product.cpp
+++ b/src/cpu/aarch64/acl_inner_product.cpp
@@ -14,12 +14,12 @@
 * limitations under the License.
 *******************************************************************************/
 
-#include "cpu/acl/acl_inner_product.hpp"
+#include "cpu/aarch64/acl_inner_product.hpp"
 
 namespace dnnl {
 namespace impl {
 namespace cpu {
-namespace acl {
+namespace aarch64 {
 
 status_t acl_inner_product_fwd_t::init(engine_t *engine) {
     auto aip = pd()->aip_;
@@ -283,7 +283,7 @@ status_t acl_inner_product_fwd_t::pd_t::init_conf_ip(
     return status::success;
 }
 
-} // namespace acl
+} // namespace aarch64
 } // namespace cpu
 } // namespace impl
 } // namespace dnnl
diff --git a/src/cpu/acl/acl_inner_product.hpp b/src/cpu/aarch64/acl_inner_product.hpp
similarity index 91%
rename from src/cpu/acl/acl_inner_product.hpp
rename to src/cpu/aarch64/acl_inner_product.hpp
index 46e9acb8313..8ed5f858b9a 100644
--- a/src/cpu/acl/acl_inner_product.hpp
+++ b/src/cpu/aarch64/acl_inner_product.hpp
@@ -14,11 +14,11 @@
 * limitations under the License.
 *******************************************************************************/
 
-#ifndef CPU_ACL_INNER_PRODUCT_HPP
-#define CPU_ACL_INNER_PRODUCT_HPP
+#ifndef CPU_AARCH64_ACL_INNER_PRODUCT_HPP
+#define CPU_AARCH64_ACL_INNER_PRODUCT_HPP
 
-#include "cpu/acl/acl_post_ops.hpp"
-#include "cpu/acl/acl_utils.hpp"
+#include "cpu/aarch64/acl_post_ops.hpp"
+#include "cpu/aarch64/acl_utils.hpp"
 #include "cpu/cpu_inner_product_pd.hpp"
 
 #include "arm_compute/runtime/experimental/operators/CpuFullyConnected.h"
@@ -26,7 +26,7 @@
 namespace dnnl {
 namespace impl {
 namespace cpu {
-namespace acl {
+namespace aarch64 {
 
 struct acl_ip_conf_t {
     bool with_bias;
@@ -75,9 +75,9 @@ struct acl_inner_product_fwd_t : public primitive_t {
             inner_product_op_;
 }; // acl_inner_product_fwd_t
 
-} // namespace acl
+} // namespace aarch64
 } // namespace cpu
 } // namespace impl
 } // namespace dnnl
 
-#endif // CPU_ACL_INNER_PRODUCT_HPP
+#endif // CPU_AARCH64_ACL_INNER_PRODUCT_HPP
diff --git a/src/cpu/acl/acl_layer_normalization.cpp b/src/cpu/aarch64/acl_layer_normalization.cpp
similarity index 98%
rename from src/cpu/acl/acl_layer_normalization.cpp
rename to src/cpu/aarch64/acl_layer_normalization.cpp
index 1f244585571..18324de0c80 100644
--- a/src/cpu/acl/acl_layer_normalization.cpp
+++ b/src/cpu/aarch64/acl_layer_normalization.cpp
@@ -14,12 +14,12 @@
 * limitations under the License.
 *******************************************************************************/
 
-#include "cpu/acl/acl_layer_normalization.hpp"
+#include "cpu/aarch64/acl_layer_normalization.hpp"
 
 namespace dnnl {
 namespace impl {
 namespace cpu {
-namespace acl {
+namespace aarch64 {
 
 acl_layer_normalization_fwd_t::acl_layer_normalization_fwd_t(const pd_t *apd)
     : primitive_t(apd)
@@ -200,7 +200,7 @@ status_t acl_layer_normalization_fwd_t::execute_forward(
     return status::success;
 }
 
-} // namespace acl
+} // namespace aarch64
 } // namespace cpu
 } // namespace impl
 } // namespace dnnl
diff --git a/src/cpu/acl/acl_layer_normalization.hpp b/src/cpu/aarch64/acl_layer_normalization.hpp
similarity index 89%
rename from src/cpu/acl/acl_layer_normalization.hpp
rename to src/cpu/aarch64/acl_layer_normalization.hpp
index 133ea2be4a4..c3dfaf3b214 100644
--- a/src/cpu/acl/acl_layer_normalization.hpp
+++ b/src/cpu/aarch64/acl_layer_normalization.hpp
@@ -14,18 +14,18 @@
 * limitations under the License.
 *******************************************************************************/
 
-#ifndef CPU_ACL_LAYER_NORMALIZATION_HPP
-#define CPU_ACL_LAYER_NORMALIZATION_HPP
+#ifndef CPU_AARCH64_ACL_LAYER_NORMALIZATION_HPP
+#define CPU_AARCH64_ACL_LAYER_NORMALIZATION_HPP
 
 #include "arm_compute/runtime/experimental/operators/CpuMeanStdDevNormalization.h"
 
-#include "cpu/acl/acl_utils.hpp"
+#include "cpu/aarch64/acl_utils.hpp"
 #include "cpu/cpu_layer_normalization_pd.hpp"
 
 namespace dnnl {
 namespace impl {
 namespace cpu {
-namespace acl {
+namespace aarch64 {
 struct acl_layer_normalization_fwd_t : public primitive_t {
     struct pd_t : public cpu_layer_normalization_fwd_pd_t {
         using cpu_layer_normalization_fwd_pd_t::
@@ -55,9 +55,9 @@ struct acl_layer_normalization_fwd_t : public primitive_t {
             acl_obj_;
 };
 
-} // namespace acl
+} // namespace aarch64
 } // namespace cpu
 } // namespace impl
 } // namespace dnnl
 
-#endif // CPU_ACL_LAYER_NORMALIZATION_HPP
+#endif // CPU_AARCH64_ACL_LAYER_NORMALIZATION_HPP
diff --git a/src/cpu/acl/acl_pooling.cpp b/src/cpu/aarch64/acl_pooling.cpp
similarity index 99%
rename from src/cpu/acl/acl_pooling.cpp
rename to src/cpu/aarch64/acl_pooling.cpp
index 4c14193ce1c..6728f0a2395 100644
--- a/src/cpu/acl/acl_pooling.cpp
+++ b/src/cpu/aarch64/acl_pooling.cpp
@@ -14,13 +14,13 @@
 * limitations under the License.
 *******************************************************************************/
 
-#include "cpu/acl/acl_pooling.hpp"
+#include "cpu/aarch64/acl_pooling.hpp"
 #include "common/memory_tracking.hpp"
 
 namespace dnnl {
 namespace impl {
 namespace cpu {
-namespace acl {
+namespace aarch64 {
 
 status_t acl_pooling_fwd_t::pd_t::init(engine_t *engine) {
 
@@ -297,7 +297,7 @@ status_t acl_pooling_fwd_t::execute_forward(const exec_ctx_t &ctx) const {
     return status;
 }
 
-} // namespace acl
+} // namespace aarch64
 } // namespace cpu
 } // namespace impl
 } // namespace dnnl
diff --git a/src/cpu/acl/acl_pooling.hpp b/src/cpu/aarch64/acl_pooling.hpp
similarity index 93%
rename from src/cpu/acl/acl_pooling.hpp
rename to src/cpu/aarch64/acl_pooling.hpp
index b47fdfa175b..f8b540cd023 100644
--- a/src/cpu/acl/acl_pooling.hpp
+++ b/src/cpu/aarch64/acl_pooling.hpp
@@ -14,12 +14,12 @@
 * limitations under the License.
 *******************************************************************************/
 
-#ifndef CPU_ACL_POOLING_HPP
-#define CPU_ACL_POOLING_HPP
+#ifndef CPU_AARCH64_ACL_POOLING_HPP
+#define CPU_AARCH64_ACL_POOLING_HPP
 
 #include "cpu/cpu_pooling_pd.hpp"
 
-#include "cpu/acl/acl_utils.hpp"
+#include "cpu/aarch64/acl_utils.hpp"
 
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/runtime/IOperator.h"
@@ -28,7 +28,7 @@
 namespace dnnl {
 namespace impl {
 namespace cpu {
-namespace acl {
+namespace aarch64 {
 
 struct acl_pooling_conf_t {
     arm_compute::TensorInfo src_info;
@@ -74,9 +74,9 @@ struct acl_pooling_fwd_t : public primitive_t {
     std::unique_ptr<arm_compute::experimental::op::CpuPool2d> pooling_op_;
 }; // acl_pooling_fwd_t
 
-} // namespace acl
+} // namespace aarch64
 } // namespace cpu
 } // namespace impl
 } // namespace dnnl
 
-#endif // CPU_ACL_POOLING_HPP
+#endif // CPU_AARCH64_ACL_POOLING_HPP
diff --git a/src/cpu/acl/acl_post_ops.cpp b/src/cpu/aarch64/acl_post_ops.cpp
similarity index 97%
rename from src/cpu/acl/acl_post_ops.cpp
rename to src/cpu/aarch64/acl_post_ops.cpp
index 816d195a920..dbb1bf2d53c 100644
--- a/src/cpu/acl/acl_post_ops.cpp
+++ b/src/cpu/aarch64/acl_post_ops.cpp
@@ -15,12 +15,12 @@
 *******************************************************************************/
 
 #include "common/float16.hpp"
-#include "cpu/acl/acl_gemm_convolution.hpp"
+#include "cpu/aarch64/acl_gemm_convolution.hpp"
 
 namespace dnnl {
 namespace impl {
 namespace cpu {
-namespace acl {
+namespace aarch64 {
 
 status_t acl_post_ops_t::execute(
         const exec_ctx_t &ctx, void *src, void *dst) const {
@@ -97,7 +97,7 @@ status_t acl_post_ops_t::execute(
     return status::success;
 }
 
-} // namespace acl
+} // namespace aarch64
 } // namespace cpu
 } // namespace impl
 } // namespace dnnl
diff --git a/src/cpu/acl/acl_post_ops.hpp b/src/cpu/aarch64/acl_post_ops.hpp
similarity index 96%
rename from src/cpu/acl/acl_post_ops.hpp
rename to src/cpu/aarch64/acl_post_ops.hpp
index d5e470e4578..9fd4650456a 100644
--- a/src/cpu/acl/acl_post_ops.hpp
+++ b/src/cpu/aarch64/acl_post_ops.hpp
@@ -14,16 +14,16 @@
 * limitations under the License.
 *******************************************************************************/
 
-#ifndef CPU_ACL_POST_OPS_HPP
-#define CPU_ACL_POST_OPS_HPP
+#ifndef CPU_AARCH64_ACL_POST_OPS_HPP
+#define CPU_AARCH64_ACL_POST_OPS_HPP
 
-#include "cpu/acl/acl_binary.hpp"
-#include "cpu/acl/acl_eltwise.hpp"
+#include "cpu/aarch64/acl_binary.hpp"
+#include "cpu/aarch64/acl_eltwise.hpp"
 
 namespace dnnl {
 namespace impl {
 namespace cpu {
-namespace acl {
+namespace aarch64 {
 
 struct acl_post_ops_t {
 
@@ -142,7 +142,7 @@ struct acl_post_ops_t {
 
         CHECK(base_post_ops.set_default_formats(&dst_md));
         dst_data_type = dst_md.data_type;
-        // If the first entry is eltwise, we fuse it
+        // If the first entry is eltwise, we fuse it.
         if (base_post_ops.len() >= 1 && base_post_ops.entry_[0].is_eltwise()) {
 
             const auto &first_po = base_post_ops.entry_[0].eltwise;
@@ -176,7 +176,7 @@ struct acl_post_ops_t {
     std::vector<std::shared_ptr<primitive_t>> post_op_primitives;
 };
 
-} // namespace acl
+} // namespace aarch64
 } // namespace cpu
 } // namespace impl
 } // namespace dnnl
diff --git a/src/cpu/acl/acl_prelu.cpp b/src/cpu/aarch64/acl_prelu.cpp
similarity index 96%
rename from src/cpu/acl/acl_prelu.cpp
rename to src/cpu/aarch64/acl_prelu.cpp
index b118fe20811..e2aae9392c0 100644
--- a/src/cpu/acl/acl_prelu.cpp
+++ b/src/cpu/aarch64/acl_prelu.cpp
@@ -14,12 +14,12 @@
 * limitations under the License.
 *******************************************************************************/
 
-#include "cpu/acl/acl_prelu.hpp"
+#include "cpu/aarch64/acl_prelu.hpp"
 
 namespace dnnl {
 namespace impl {
 namespace cpu {
-namespace acl {
+namespace aarch64 {
 
 status_t acl_prelu_fwd_t::execute_forward(const exec_ctx_t &ctx) const {
 
@@ -51,7 +51,7 @@ status_t acl_prelu_fwd_t::execute_forward(const exec_ctx_t &ctx) const {
     return status::success;
 }
 
-} // namespace acl
+} // namespace aarch64
 } // namespace cpu
 } // namespace impl
 } // namespace dnnl
diff --git a/src/cpu/acl/acl_prelu.hpp b/src/cpu/aarch64/acl_prelu.hpp
similarity index 96%
rename from src/cpu/acl/acl_prelu.hpp
rename to src/cpu/aarch64/acl_prelu.hpp
index a7b70402687..8517d1bb3ee 100644
--- a/src/cpu/acl/acl_prelu.hpp
+++ b/src/cpu/aarch64/acl_prelu.hpp
@@ -13,16 +13,16 @@
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *******************************************************************************/
-#ifndef CPU_ACL_PRELU_HPP
-#define CPU_ACL_PRELU_HPP
+#ifndef CPU_AARCH64_ACL_PRELU_HPP
+#define CPU_AARCH64_ACL_PRELU_HPP
 
-#include "cpu/acl/acl_utils.hpp"
+#include "cpu/aarch64/acl_utils.hpp"
 #include "cpu/cpu_prelu_pd.hpp"
 
 namespace dnnl {
 namespace impl {
 namespace cpu {
-namespace acl {
+namespace aarch64 {
 
 struct acl_prelu_obj_t {
     arm_compute::NEPReluLayer prelu;
@@ -151,9 +151,9 @@ struct acl_prelu_fwd_t : public primitive_t {
     const pd_t *pd() const { return (const pd_t *)primitive_t::pd().get(); }
 }; // acl_prelu_fwd_t
 
-} // namespace acl
+} // namespace aarch64
 } // namespace cpu
 } // namespace impl
 } // namespace dnnl
 
-#endif // CPU_ACL_PRELU_HPP
+#endif // CPU_AARCH64_ACL_PRELU_HPP
diff --git a/src/cpu/aarch64/acl_reorder.cpp b/src/cpu/aarch64/acl_reorder.cpp
index 9f3c062cb82..7158c5fb19d 100644
--- a/src/cpu/aarch64/acl_reorder.cpp
+++ b/src/cpu/aarch64/acl_reorder.cpp
@@ -39,7 +39,7 @@ int find_innermost_dense_idx(const dnnl::impl::memory_desc_t *md) {
 namespace dnnl {
 namespace impl {
 namespace cpu {
-namespace acl {
+namespace aarch64 {
 
 status_t acl_reorder_resource_t::configure(const acl_reorder_conf_t &app) {
     if (!acl_obj_) return status::out_of_memory;
@@ -255,7 +255,7 @@ status_t acl_reorder_fwd_t::execute_forward(const exec_ctx_t &ctx) const {
     return status::success;
 }
 
-} // namespace acl
+} // namespace aarch64
 } // namespace cpu
 } // namespace impl
 } // namespace dnnl
diff --git a/src/cpu/aarch64/acl_reorder.hpp b/src/cpu/aarch64/acl_reorder.hpp
index e6bcad62f3c..50c058ed99b 100644
--- a/src/cpu/aarch64/acl_reorder.hpp
+++ b/src/cpu/aarch64/acl_reorder.hpp
@@ -13,19 +13,19 @@
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *******************************************************************************/
-#ifndef CPU_ACL_REORDER_HPP
-#define CPU_ACL_REORDER_HPP
+#ifndef CPU_AARCH64_ACL_REORDER_HPP
+#define CPU_AARCH64_ACL_REORDER_HPP
 
 #include "arm_compute/core/Types.h"
 #include "common/utils.hpp"
-#include "cpu/acl/acl_utils.hpp"
+#include "cpu/aarch64/acl_utils.hpp"
 #include "cpu/aarch64/cpu_isa_traits.hpp"
 #include "cpu/reorder/cpu_reorder_pd.hpp"
 
 namespace dnnl {
 namespace impl {
 namespace cpu {
-namespace acl {
+namespace aarch64 {
 
 struct acl_reorder_obj_t {
     arm_compute::NEReorderLayer reorder;
@@ -91,9 +91,9 @@ struct acl_reorder_fwd_t : public primitive_t {
 
 }; // acl_reorder_fwd_t
 
-} // namespace acl
+} // namespace aarch64
 } // namespace cpu
 } // namespace impl
 } // namespace dnnl
 
-#endif // CPU_ACL_REORDER_HPP
+#endif // CPU_AARCH64_ACL_REORDER_HPP
diff --git a/src/cpu/acl/acl_softmax.cpp b/src/cpu/aarch64/acl_softmax.cpp
similarity index 98%
rename from src/cpu/acl/acl_softmax.cpp
rename to src/cpu/aarch64/acl_softmax.cpp
index 50966d33e4e..47517f1de8a 100644
--- a/src/cpu/acl/acl_softmax.cpp
+++ b/src/cpu/aarch64/acl_softmax.cpp
@@ -14,12 +14,12 @@
 * limitations under the License.
 *******************************************************************************/
 
-#include "cpu/acl/acl_softmax.hpp"
+#include "cpu/aarch64/acl_softmax.hpp"
 
 namespace dnnl {
 namespace impl {
 namespace cpu {
-namespace acl {
+namespace aarch64 {
 
 const acl_softmax_fwd_t::pd_t *acl_softmax_fwd_t::pd() const {
     return static_cast<const pd_t *>(primitive_t::pd().get());
@@ -168,7 +168,7 @@ status_t acl_softmax_fwd_t::execute_forward(const exec_ctx_t &ctx) const {
     return status::success;
 }
 
-} // namespace acl
+} // namespace aarch64
 } // namespace cpu
 } // namespace impl
 } // namespace dnnl
diff --git a/src/cpu/acl/acl_softmax.hpp b/src/cpu/aarch64/acl_softmax.hpp
similarity index 92%
rename from src/cpu/acl/acl_softmax.hpp
rename to src/cpu/aarch64/acl_softmax.hpp
index 470eea9a1a3..59a16f23b43 100644
--- a/src/cpu/acl/acl_softmax.hpp
+++ b/src/cpu/aarch64/acl_softmax.hpp
@@ -14,12 +14,12 @@
 * limitations under the License.
 *******************************************************************************/
 
-#ifndef CPU_ACL_SOFTMAX_HPP
-#define CPU_ACL_SOFTMAX_HPP
+#ifndef CPU_AARCH64_ACL_SOFTMAX_HPP
+#define CPU_AARCH64_ACL_SOFTMAX_HPP
 
 #include "cpu/cpu_softmax_pd.hpp"
 
-#include "cpu/acl/acl_utils.hpp"
+#include "cpu/aarch64/acl_utils.hpp"
 
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/runtime/IOperator.h"
@@ -28,7 +28,7 @@
 namespace dnnl {
 namespace impl {
 namespace cpu {
-namespace acl {
+namespace aarch64 {
 
 struct acl_softmax_conf_t {
     arm_compute::TensorInfo src_info;
@@ -63,7 +63,7 @@ struct acl_softmax_fwd_t : public primitive_t {
     std::unique_ptr<arm_compute::experimental::op::CpuSoftmax> softmax_op_;
 }; // acl_softmax_fwd_t
 
-} // namespace acl
+} // namespace aarch64
 } // namespace cpu
 } // namespace impl
 } // namespace dnnl
diff --git a/src/cpu/acl/acl_thread.cpp b/src/cpu/aarch64/acl_thread.cpp
similarity index 95%
rename from src/cpu/acl/acl_thread.cpp
rename to src/cpu/aarch64/acl_thread.cpp
index 383fd283176..040e338c08d 100644
--- a/src/cpu/acl/acl_thread.cpp
+++ b/src/cpu/aarch64/acl_thread.cpp
@@ -14,16 +14,16 @@
 * limitations under the License.
 *******************************************************************************/
 
-#include "cpu/acl/acl_thread.hpp"
+#include "cpu/aarch64/acl_thread.hpp"
 #if DNNL_CPU_THREADING_RUNTIME == DNNL_RUNTIME_THREADPOOL
-#include "cpu/acl/acl_threadpool_scheduler.hpp"
+#include "cpu/aarch64/acl_threadpool_scheduler.hpp"
 #endif
-#include "cpu/acl/acl_benchmark_scheduler.hpp"
+#include "cpu/aarch64/acl_benchmark_scheduler.hpp"
 
 namespace dnnl {
 namespace impl {
 namespace cpu {
-namespace acl {
+namespace aarch64 {
 
 namespace acl_thread_utils {
 
@@ -119,7 +119,7 @@ void set_acl_threading() {
 
 } // namespace acl_thread_utils
 
-} // namespace acl
+} // namespace aarch64
 } // namespace cpu
 } // namespace impl
 } // namespace dnnl
diff --git a/src/cpu/acl/acl_thread.hpp b/src/cpu/aarch64/acl_thread.hpp
similarity index 92%
rename from src/cpu/acl/acl_thread.hpp
rename to src/cpu/aarch64/acl_thread.hpp
index 26b65564d79..f073376e63a 100644
--- a/src/cpu/acl/acl_thread.hpp
+++ b/src/cpu/aarch64/acl_thread.hpp
@@ -14,8 +14,8 @@
 * limitations under the License.
 *******************************************************************************/
 
-#ifndef CPU_ACL_THREAD_HPP
-#define CPU_ACL_THREAD_HPP
+#ifndef CPU_AARCH64_ACL_THREAD_HPP
+#define CPU_AARCH64_ACL_THREAD_HPP
 
 #include "common/dnnl_thread.hpp"
 #include "common/verbose.hpp"
@@ -25,7 +25,7 @@
 namespace dnnl {
 namespace impl {
 namespace cpu {
-namespace acl {
+namespace aarch64 {
 
 namespace acl_thread_utils {
 
@@ -49,9 +49,9 @@ void acl_set_tp_benchmark_scheduler();
 void set_acl_threading();
 } // namespace acl_thread_utils
 
-} // namespace acl
+} // namespace aarch64
 } // namespace cpu
 } // namespace impl
 } // namespace dnnl
 
-#endif // CPU_ACL_THREAD_HPP
+#endif // CPU_AARCH64_ACL_THREAD_HPP
diff --git a/src/cpu/acl/acl_threadpool_scheduler.cpp b/src/cpu/aarch64/acl_threadpool_scheduler.cpp
similarity index 97%
rename from src/cpu/acl/acl_threadpool_scheduler.cpp
rename to src/cpu/aarch64/acl_threadpool_scheduler.cpp
index ae559c5ead9..34cf44b7e25 100644
--- a/src/cpu/acl/acl_threadpool_scheduler.cpp
+++ b/src/cpu/aarch64/acl_threadpool_scheduler.cpp
@@ -14,13 +14,13 @@
 * limitations under the License.
 *******************************************************************************/
 
-#include "cpu/acl/acl_threadpool_scheduler.hpp"
+#include "cpu/aarch64/acl_threadpool_scheduler.hpp"
 
 #if DNNL_CPU_THREADING_RUNTIME == DNNL_RUNTIME_THREADPOOL
 
 #include "common/counting_barrier.hpp"
 #include "common/dnnl_thread.hpp"
-#include "cpu/acl/acl_thread.hpp"
+#include "cpu/aarch64/acl_thread.hpp"
 
 #include "arm_compute/core/CPP/ICPPKernel.h"
 #include "arm_compute/core/Error.h"
@@ -33,7 +33,7 @@
 namespace dnnl {
 namespace impl {
 namespace cpu {
-namespace acl {
+namespace aarch64 {
 
 using namespace arm_compute;
 
@@ -135,7 +135,7 @@ void ThreadpoolScheduler::run_workloads(
     if (is_async) b.wait();
 }
 
-} // namespace acl
+} // namespace aarch64
 } // namespace cpu
 } // namespace impl
 } // namespace dnnl
diff --git a/src/cpu/acl/acl_threadpool_scheduler.hpp b/src/cpu/aarch64/acl_threadpool_scheduler.hpp
similarity index 91%
rename from src/cpu/acl/acl_threadpool_scheduler.hpp
rename to src/cpu/aarch64/acl_threadpool_scheduler.hpp
index 0bdd068bde5..1c7d054c08d 100644
--- a/src/cpu/acl/acl_threadpool_scheduler.hpp
+++ b/src/cpu/aarch64/acl_threadpool_scheduler.hpp
@@ -14,8 +14,8 @@
 * limitations under the License.
 *******************************************************************************/
 
-#ifndef CPU_ACL_THREADPOOL_SCHEDULER_HPP
-#define CPU_ACL_THREADPOOL_SCHEDULER_HPP
+#ifndef CPU_AARCH64_ACL_THREADPOOL_SCHEDULER_HPP
+#define CPU_AARCH64_ACL_THREADPOOL_SCHEDULER_HPP
 
 #include "oneapi/dnnl/dnnl_config.h"
 
@@ -28,7 +28,7 @@
 namespace dnnl {
 namespace impl {
 namespace cpu {
-namespace acl {
+namespace aarch64 {
 
 class ThreadpoolScheduler final : public arm_compute::IScheduler {
 public:
@@ -59,11 +59,11 @@ class ThreadpoolScheduler final : public arm_compute::IScheduler {
     std::mutex _mtx;
 };
 
-} // namespace acl
+} // namespace aarch64
 } // namespace cpu
 } // namespace impl
 } // namespace dnnl
 
 #endif // DNNL_CPU_THREADING_RUNTIME == DNNL_RUNTIME_THREADPOOL
 
-#endif // CPU_ACL_THREADPOOL_SCHEDULER_HPP
+#endif // CPU_AARCH64_ACL_THREADPOOL_SCHEDULER_HPP
diff --git a/src/cpu/acl/acl_utils.cpp b/src/cpu/aarch64/acl_utils.cpp
similarity index 99%
rename from src/cpu/acl/acl_utils.cpp
rename to src/cpu/aarch64/acl_utils.cpp
index 66494672aaf..ec7f162891f 100644
--- a/src/cpu/acl/acl_utils.cpp
+++ b/src/cpu/aarch64/acl_utils.cpp
@@ -14,13 +14,13 @@
 * limitations under the License.
 *******************************************************************************/
 
-#include "cpu/acl/acl_utils.hpp"
+#include "cpu/aarch64/acl_utils.hpp"
 #include <limits>
 
 namespace dnnl {
 namespace impl {
 namespace cpu {
-namespace acl {
+namespace aarch64 {
 
 namespace acl_utils {
 
@@ -362,7 +362,7 @@ status_t reorder_to_weight_format(arm_compute::TensorInfo &info,
 
 } // namespace acl_utils
 
-} // namespace acl
+} // namespace aarch64
 } // namespace cpu
 } // namespace impl
 } // namespace dnnl
diff --git a/src/cpu/acl/acl_utils.hpp b/src/cpu/aarch64/acl_utils.hpp
similarity index 97%
rename from src/cpu/acl/acl_utils.hpp
rename to src/cpu/aarch64/acl_utils.hpp
index 1aba0d4644b..b1ec3f345da 100644
--- a/src/cpu/acl/acl_utils.hpp
+++ b/src/cpu/aarch64/acl_utils.hpp
@@ -14,8 +14,8 @@
 * limitations under the License.
 *******************************************************************************/
 
-#ifndef CPU_ACL_UTILS_HPP
-#define CPU_ACL_UTILS_HPP
+#ifndef CPU_AARCH64_ACL_UTILS_HPP
+#define CPU_AARCH64_ACL_UTILS_HPP
 
 #include <mutex>
 
@@ -33,7 +33,7 @@
 namespace dnnl {
 namespace impl {
 namespace cpu {
-namespace acl {
+namespace aarch64 {
 
 namespace acl_utils {
 
@@ -125,9 +125,9 @@ status_t reorder_to_weight_format(arm_compute::TensorInfo &info,
 
 } // namespace acl_utils
 
-} // namespace acl
+} // namespace aarch64
 } // namespace cpu
 } // namespace impl
 } // namespace dnnl
 
-#endif // CPU_ACL_UTILS_HPP
+#endif // CPU_AARCH64_ACL_UTILS_HPP
diff --git a/src/cpu/acl/acl_winograd_convolution.cpp b/src/cpu/aarch64/acl_winograd_convolution.cpp
similarity index 94%
rename from src/cpu/acl/acl_winograd_convolution.cpp
rename to src/cpu/aarch64/acl_winograd_convolution.cpp
index eb2e0bd9883..b801fa28aa1 100644
--- a/src/cpu/acl/acl_winograd_convolution.cpp
+++ b/src/cpu/aarch64/acl_winograd_convolution.cpp
@@ -14,12 +14,12 @@
 * limitations under the License.
 *******************************************************************************/
 
-#include "cpu/acl/acl_winograd_convolution.hpp"
+#include "cpu/aarch64/acl_winograd_convolution.hpp"
 
 namespace dnnl {
 namespace impl {
 namespace cpu {
-namespace acl {
+namespace aarch64 {
 using data_t = prec_traits_t<data_type::f32>::type;
 
 status_t acl_wino_convolution_fwd_t::execute_forward(
@@ -38,7 +38,7 @@ status_t acl_wino_convolution_fwd_t::execute_forward(
             ctx, acl_wino_obj, pd());
 }
 
-} // namespace acl
+} // namespace aarch64
 } // namespace cpu
 } // namespace impl
 } // namespace dnnl
diff --git a/src/cpu/acl/acl_winograd_convolution.hpp b/src/cpu/aarch64/acl_winograd_convolution.hpp
similarity index 94%
rename from src/cpu/acl/acl_winograd_convolution.hpp
rename to src/cpu/aarch64/acl_winograd_convolution.hpp
index 9c29ea376a3..69aee76e4b7 100644
--- a/src/cpu/acl/acl_winograd_convolution.hpp
+++ b/src/cpu/aarch64/acl_winograd_convolution.hpp
@@ -14,22 +14,22 @@
 * limitations under the License.
 *******************************************************************************/
 
-#ifndef CPU_ACL_WINOGRAD_CONVOLUTION_HPP
-#define CPU_ACL_WINOGRAD_CONVOLUTION_HPP
+#ifndef CPU_AARCH64_ACL_WINOGRAD_CONVOLUTION_HPP
+#define CPU_AARCH64_ACL_WINOGRAD_CONVOLUTION_HPP
 
 #include "cpu/cpu_convolution_pd.hpp"
 
-#include "cpu/acl/acl_convolution_utils.hpp"
+#include "cpu/aarch64/acl_convolution_utils.hpp"
 
 namespace dnnl {
 namespace impl {
 namespace cpu {
-namespace acl {
+namespace aarch64 {
 
 struct acl_wino_resource_t : public resource_t {
     acl_wino_resource_t()
         : acl_wino_obj_(utils::make_unique<
-                acl_obj_t<arm_compute::NEWinogradConvolutionLayer>>()) {}
+                  acl_obj_t<arm_compute::NEWinogradConvolutionLayer>>()) {}
 
     status_t configure(const acl_conv_conf_t &acp) {
         if (!acl_wino_obj_) return status::out_of_memory;
@@ -144,9 +144,9 @@ struct acl_wino_convolution_fwd_t : public primitive_t {
     const pd_t *pd() const { return (const pd_t *)primitive_t::pd().get(); }
 }; // acl_wino_convolution_fwd_t
 
-} // namespace acl
+} // namespace aarch64
 } // namespace cpu
 } // namespace impl
 } // namespace dnnl
 
-#endif // CPU_ACL_WINOGRAD_CONVOLUTION_HPP
+#endif // CPU_AARCH64_ACL_WINOGRAD_CONVOLUTION_HPP
diff --git a/src/cpu/acl/matmul/acl_lowp_matmul.cpp b/src/cpu/aarch64/matmul/acl_lowp_matmul.cpp
similarity index 96%
rename from src/cpu/acl/matmul/acl_lowp_matmul.cpp
rename to src/cpu/aarch64/matmul/acl_lowp_matmul.cpp
index 8b4dca04b76..9f6209df55e 100644
--- a/src/cpu/acl/matmul/acl_lowp_matmul.cpp
+++ b/src/cpu/aarch64/matmul/acl_lowp_matmul.cpp
@@ -14,15 +14,13 @@
 * limitations under the License.
 *******************************************************************************/
 
-#include "cpu/acl/matmul/acl_lowp_matmul.hpp"
+#include "cpu/aarch64/matmul/acl_lowp_matmul.hpp"
 #include "cpu/cpu_primitive.hpp"
 
-#include "src/cpu/CpuTypes.h"
-
 namespace dnnl {
 namespace impl {
 namespace cpu {
-namespace acl {
+namespace aarch64 {
 namespace matmul {
 
 namespace {
@@ -253,11 +251,11 @@ status_t acl_lowp_matmul_t::pd_t::init_scratchpad(
     const memory_desc_wrapper dst_d(&dst_md_);
     if (almc_.use_dst_acc) {
         scratchpad.book(memory_tracking::names::key_matmul_dst_in_acc_dt,
-                dst_d.nelems(), sizeof(arm_compute::float32_t));
+                dst_d.nelems(), sizeof(float32_t));
     }
     if (almc_.use_cast_acc) {
         scratchpad.book(memory_tracking::names::key_matmul_dst_cast_acc,
-                dst_d.nelems(), sizeof(arm_compute::float32_t));
+                dst_d.nelems(), sizeof(float32_t));
     }
     return status::success;
 }
@@ -335,15 +333,17 @@ status_t acl_lowp_matmul_t::execute(const exec_ctx_t &ctx) const {
         bia_tensor.allocator()->import_memory(const_cast<float *>(bias));
     }
 
-    auto dst = pd()->almc_.use_dst_acc ? scratchpad.get<void>(
-                       memory_tracking::names::key_matmul_dst_in_acc_dt)
-                                       : CTX_OUT_MEM(float *, DNNL_ARG_DST);
+    auto dst = pd()->almc_.use_dst_acc
+            ? scratchpad.get<void>(
+                      memory_tracking::names::key_matmul_dst_in_acc_dt)
+            : CTX_OUT_MEM(float *, DNNL_ARG_DST);
     dst_tensor.allocator()->init(alcm.dst_tensor_info);
     dst_tensor.allocator()->import_memory(dst);
 
-    auto dst_cast = pd()->almc_.use_cast_acc ? scratchpad.get<void>(
-                            memory_tracking::names::key_matmul_dst_cast_acc)
-                                             : nullptr;
+    auto dst_cast = pd()->almc_.use_cast_acc
+            ? scratchpad.get<void>(
+                      memory_tracking::names::key_matmul_dst_cast_acc)
+            : nullptr;
     if (dst_cast) {
         dst_cast_tensor.allocator()->init(alcm.dst_cast_tensor_info);
         dst_cast_tensor.allocator()->import_memory(dst_cast);
@@ -438,7 +438,7 @@ status_t acl_lowp_matmul_t::execute(const exec_ctx_t &ctx) const {
 };
 
 } // namespace matmul
-} // namespace acl
+} // namespace aarch64
 } // namespace cpu
 } // namespace impl
 } // namespace dnnl
diff --git a/src/cpu/acl/matmul/acl_lowp_matmul.hpp b/src/cpu/aarch64/matmul/acl_lowp_matmul.hpp
similarity index 91%
rename from src/cpu/acl/matmul/acl_lowp_matmul.hpp
rename to src/cpu/aarch64/matmul/acl_lowp_matmul.hpp
index bde5fade7fd..46d005515a9 100644
--- a/src/cpu/acl/matmul/acl_lowp_matmul.hpp
+++ b/src/cpu/aarch64/matmul/acl_lowp_matmul.hpp
@@ -14,21 +14,21 @@
 * limitations under the License.
 *******************************************************************************/
 
-#ifndef ACL_LOWP_MATMUL_HPP
-#define ACL_LOWP_MATMUL_HPP
+#ifndef CPU_AARCH64_MATMUL_ACL_LOWP_MATMUL_HPP
+#define CPU_AARCH64_MATMUL_ACL_LOWP_MATMUL_HPP
 
 #include "arm_compute/runtime/experimental/operators/CpuDequantize.h"
 #include "arm_compute/runtime/experimental/operators/CpuGEMMLowp.h"
 #include "arm_compute/runtime/experimental/operators/CpuQuantize.h"
-#include "cpu/acl/acl_post_ops.hpp"
-#include "cpu/acl/acl_utils.hpp"
+#include "cpu/aarch64/acl_post_ops.hpp"
+#include "cpu/aarch64/acl_utils.hpp"
 #include "cpu/matmul/cpu_matmul_pd.hpp"
 #include "cpu/matmul/matmul_utils.hpp"
 
 namespace dnnl {
 namespace impl {
 namespace cpu {
-namespace acl {
+namespace aarch64 {
 namespace matmul {
 
 using arm_compute::experimental::MemoryLifetime;
@@ -81,9 +81,9 @@ struct acl_lowp_matmul_t : public primitive_t {
 };
 
 } // namespace matmul
-} // namespace acl
+} // namespace aarch64
 } // namespace cpu
 } // namespace impl
 } // namespace dnnl
 
-#endif // ACL_LOWP_MATMUL_HPP
+#endif // CPU_AARCH64_MATMUL_ACL_LOWP_MATMUL_HPP
diff --git a/src/cpu/acl/matmul/acl_lowp_matmul_sq.cpp b/src/cpu/aarch64/matmul/acl_lowp_matmul_sq.cpp
similarity index 97%
rename from src/cpu/acl/matmul/acl_lowp_matmul_sq.cpp
rename to src/cpu/aarch64/matmul/acl_lowp_matmul_sq.cpp
index 0884a123ba9..e58b0823360 100644
--- a/src/cpu/acl/matmul/acl_lowp_matmul_sq.cpp
+++ b/src/cpu/aarch64/matmul/acl_lowp_matmul_sq.cpp
@@ -14,17 +14,16 @@
 * limitations under the License.
 *******************************************************************************/
 
-#include "cpu/acl/matmul/acl_lowp_matmul_sq.hpp"
+#include "cpu/aarch64/matmul/acl_lowp_matmul_sq.hpp"
 
 #include "arm_compute/core/utils/quantization/AsymmHelpers.h"
 
-#include "cpu/acl/acl_utils.hpp"
-#include "src/cpu/CpuTypes.h"
+#include "cpu/aarch64/acl_utils.hpp"
 
 namespace dnnl {
 namespace impl {
 namespace cpu {
-namespace acl {
+namespace aarch64 {
 namespace matmul {
 
 namespace {
@@ -279,10 +278,10 @@ status_t acl_lowp_matmul_sq_t::execute(const exec_ctx_t &ctx) const {
     if (with_bias) {
         auto bia_s32_base = scratchpad.get<uint32_t>(
                 memory_tracking::names::key_conv_bias_s32_convert);
-        auto bia_f32_base = CTX_IN_MEM(const arm_compute::float32_t *, DNNL_ARG_BIAS);
+        auto bia_f32_base = CTX_IN_MEM(const float32_t *, DNNL_ARG_BIAS);
         const float bias_scale = 1 / (*src_scale * (*wei_scale));
         const int num_elements
-                = almc.bia_tensor_info.total_size() / sizeof(arm_compute::float32_t);
+                = almc.bia_tensor_info.total_size() / sizeof(float32_t);
         parallel_nd(num_elements, [&](dim_t e) {
             const auto b = int32_t(std::round(bia_f32_base[e] * bias_scale));
             bia_s32_base[e] = b;
@@ -340,7 +339,7 @@ status_t acl_lowp_matmul_sq_t::execute(const exec_ctx_t &ctx) const {
     return status::success;
 };
 } // namespace matmul
-} // namespace acl
+} // namespace aarch64
 } // namespace cpu
 } // namespace impl
 } // namespace dnnl
diff --git a/src/cpu/acl/matmul/acl_lowp_matmul_sq.hpp b/src/cpu/aarch64/matmul/acl_lowp_matmul_sq.hpp
similarity index 92%
rename from src/cpu/acl/matmul/acl_lowp_matmul_sq.hpp
rename to src/cpu/aarch64/matmul/acl_lowp_matmul_sq.hpp
index 24b60ab3450..52b83c6d964 100644
--- a/src/cpu/acl/matmul/acl_lowp_matmul_sq.hpp
+++ b/src/cpu/aarch64/matmul/acl_lowp_matmul_sq.hpp
@@ -14,12 +14,12 @@
 * limitations under the License.
 *******************************************************************************/
 
-#ifndef ACL_LOWP_MATMUL_SQ_HPP
-#define ACL_LOWP_MATMUL_SQ_HPP
+#ifndef CPU_AARCH64_MATMUL_ACL_LOWP_MATMUL_SQ_HPP
+#define CPU_AARCH64_MATMUL_ACL_LOWP_MATMUL_SQ_HPP
 
 #include <memory>
 
-#include "cpu/acl/acl_post_ops.hpp"
+#include "cpu/aarch64/acl_post_ops.hpp"
 #include "cpu/cpu_primitive.hpp"
 #include "cpu/matmul/cpu_matmul_pd.hpp"
 #include "cpu/matmul/matmul_utils.hpp"
@@ -30,7 +30,7 @@
 namespace dnnl {
 namespace impl {
 namespace cpu {
-namespace acl {
+namespace aarch64 {
 namespace matmul {
 
 struct acl_lowp_matmul_sq_conf_t {
@@ -84,9 +84,9 @@ struct acl_lowp_matmul_sq_t : public primitive_t {
 };
 
 } // namespace matmul
-} // namespace acl
+} // namespace aarch64
 } // namespace cpu
 } // namespace impl
 } // namespace dnnl
 
-#endif // ACL_LOWP_MATMUL_SQ_HPP
+#endif // CPU_AARCH64_MATMUL_ACL_LOWP_MATMUL_SQ_HPP
diff --git a/src/cpu/acl/matmul/acl_matmul.cpp b/src/cpu/aarch64/matmul/acl_matmul.cpp
similarity index 98%
rename from src/cpu/acl/matmul/acl_matmul.cpp
rename to src/cpu/aarch64/matmul/acl_matmul.cpp
index bbaa3f37832..8bf7bcd757b 100644
--- a/src/cpu/acl/matmul/acl_matmul.cpp
+++ b/src/cpu/aarch64/matmul/acl_matmul.cpp
@@ -14,14 +14,14 @@
 * limitations under the License.
 *******************************************************************************/
 
-#include "cpu/acl/matmul/acl_matmul.hpp"
+#include "cpu/aarch64/matmul/acl_matmul.hpp"
 
 #include <mutex>
 
 namespace dnnl {
 namespace impl {
 namespace cpu {
-namespace acl {
+namespace aarch64 {
 namespace matmul {
 
 using namespace data_type;
@@ -229,9 +229,10 @@ status_t acl_matmul_t::execute_forward(const exec_ctx_t &ctx) const {
 
     // If we have an unfused sum post op, put the result in a scratchpad tensor.
     // Result will be summed to the dst during acl_post_ops.execute
-    auto dst_base = use_dst_acc_for_sum ? scratchpad.get<void>(
-                            memory_tracking::names::key_matmul_dst_in_acc_dt)
-                                        : CTX_OUT_MEM(data_t *, DNNL_ARG_DST);
+    auto dst_base = use_dst_acc_for_sum
+            ? scratchpad.get<void>(
+                      memory_tracking::names::key_matmul_dst_in_acc_dt)
+            : CTX_OUT_MEM(data_t *, DNNL_ARG_DST);
     dst_tensor.allocator()->import_memory(dst_base);
 
     // Run transpose kernel
@@ -377,7 +378,7 @@ template status_t acl_matmul_t::execute_forward<false>(
         const exec_ctx_t &ctx) const;
 
 } // namespace matmul
-} // namespace acl
+} // namespace aarch64
 } // namespace cpu
 } // namespace impl
 } // namespace dnnl
diff --git a/src/cpu/acl/matmul/acl_matmul.hpp b/src/cpu/aarch64/matmul/acl_matmul.hpp
similarity index 94%
rename from src/cpu/acl/matmul/acl_matmul.hpp
rename to src/cpu/aarch64/matmul/acl_matmul.hpp
index d78351941c2..cbfc33c5ed8 100644
--- a/src/cpu/acl/matmul/acl_matmul.hpp
+++ b/src/cpu/aarch64/matmul/acl_matmul.hpp
@@ -17,8 +17,8 @@
 #ifndef CPU_AARCH64_MATMUL_ACL_MATMUL_HPP
 #define CPU_AARCH64_MATMUL_ACL_MATMUL_HPP
 
-#include "cpu/acl/acl_post_ops.hpp"
-#include "cpu/acl/matmul/acl_matmul_utils.hpp"
+#include "cpu/aarch64/acl_post_ops.hpp"
+#include "cpu/aarch64/matmul/acl_matmul_utils.hpp"
 #include "cpu/matmul/cpu_matmul_pd.hpp"
 
 #include <mutex>
@@ -26,7 +26,7 @@
 namespace dnnl {
 namespace impl {
 namespace cpu {
-namespace acl {
+namespace aarch64 {
 namespace matmul {
 
 struct acl_matmul_t : public primitive_t {
@@ -66,7 +66,7 @@ struct acl_matmul_t : public primitive_t {
 }; // acl_matmul_t
 
 } // namespace matmul
-} // namespace acl
+} // namespace aarch64
 } // namespace cpu
 } // namespace impl
 } // namespace dnnl
diff --git a/src/cpu/acl/matmul/acl_matmul_utils.cpp b/src/cpu/aarch64/matmul/acl_matmul_utils.cpp
similarity index 98%
rename from src/cpu/acl/matmul/acl_matmul_utils.cpp
rename to src/cpu/aarch64/matmul/acl_matmul_utils.cpp
index 11e61715940..8b3becd3e1b 100644
--- a/src/cpu/acl/matmul/acl_matmul_utils.cpp
+++ b/src/cpu/aarch64/matmul/acl_matmul_utils.cpp
@@ -14,15 +14,15 @@
 * limitations under the License.
 *******************************************************************************/
 
-#include "cpu/acl/matmul/acl_matmul_utils.hpp"
-#include "cpu/acl/acl_utils.hpp"
+#include "cpu/aarch64/matmul/acl_matmul_utils.hpp"
+#include "cpu/aarch64/acl_utils.hpp"
 #include "cpu/matmul/gemm_based_common.hpp"
 #include "cpu/matmul/matmul_utils.hpp"
 
 namespace dnnl {
 namespace impl {
 namespace cpu {
-namespace acl {
+namespace aarch64 {
 
 bool batch_dims_have_default_order(const memory_desc_wrapper &mdw) {
     assert(mdw.is_blocking_desc());
@@ -267,7 +267,7 @@ template status_t init_conf_matmul<false>(acl_matmul_conf_t &amp,
 
 } // namespace acl_matmul_utils
 
-} // namespace acl
+} // namespace aarch64
 } // namespace cpu
 } // namespace impl
 } // namespace dnnl
diff --git a/src/cpu/acl/matmul/acl_matmul_utils.hpp b/src/cpu/aarch64/matmul/acl_matmul_utils.hpp
similarity index 94%
rename from src/cpu/acl/matmul/acl_matmul_utils.hpp
rename to src/cpu/aarch64/matmul/acl_matmul_utils.hpp
index d55cf71263f..c154852a866 100644
--- a/src/cpu/acl/matmul/acl_matmul_utils.hpp
+++ b/src/cpu/aarch64/matmul/acl_matmul_utils.hpp
@@ -14,8 +14,8 @@
 * limitations under the License.
 *******************************************************************************/
 
-#ifndef CPU_ACL_MATMUL_UTILS_HPP
-#define CPU_ACL_MATMUL_UTILS_HPP
+#ifndef CPU_AARCH64_MATMUL_ACL_MATMUL_UTILS_HPP
+#define CPU_AARCH64_MATMUL_ACL_MATMUL_UTILS_HPP
 
 #include "arm_compute/runtime/experimental/low_level/CpuGemmAssemblyDispatch.h"
 #include "arm_compute/runtime/experimental/operators/CpuActivation.h"
@@ -26,7 +26,7 @@
 namespace dnnl {
 namespace impl {
 namespace cpu {
-namespace acl {
+namespace aarch64 {
 
 namespace {
 // Keys are anonymous. So deduce the type automagically.
@@ -80,9 +80,9 @@ status_t init_scratchpad(memory_tracking::registrar_t &scratchpad,
 
 } // namespace acl_matmul_utils
 
-} // namespace acl
+} // namespace aarch64
 } // namespace cpu
 } // namespace impl
 } // namespace dnnl
 
-#endif // CPU_ACL_MATMUL_UTILS_HPP
+#endif // CPU_AARCH64_MATMUL_ACL_MATMUL_UTILS_HPP
diff --git a/src/cpu/acl/CMakeLists.txt b/src/cpu/acl/CMakeLists.txt
deleted file mode 100644
index abe0a5c49eb..00000000000
--- a/src/cpu/acl/CMakeLists.txt
+++ /dev/null
@@ -1,33 +0,0 @@
-#*******************************************************************************
-# Copyright 2020-2022 Arm Ltd. and affiliates
-# Copyright 2020-2021 FUJITSU LIMITED
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#*******************************************************************************
-file(GLOB_RECURSE SOURCES
-    ${CMAKE_CURRENT_SOURCE_DIR}/*.[ch]
-    ${CMAKE_CURRENT_SOURCE_DIR}/*.[ch]pp
-    )
-# If the runtime is not THREADPOOL remove threadpool_scheduler sources.
-if(NOT DNNL_CPU_RUNTIME STREQUAL "THREADPOOL")
-    list(APPEND ACL_THREADPOOL_FILES
-        ${CMAKE_CURRENT_SOURCE_DIR}/acl_threadpool_scheduler.cpp
-        ${CMAKE_CURRENT_SOURCE_DIR}/acl_threadpool_scheduler.hpp
-    )
-    list(REMOVE_ITEM SOURCES ${ACL_THREADPOOL_FILES})
-endif()
-set(OBJ_LIB ${DNNL_LIBRARY_NAME}_cpu_acl)
-add_library(${OBJ_LIB} OBJECT ${SOURCES})
-set_property(GLOBAL APPEND PROPERTY DNNL_LIB_DEPS
-    $<TARGET_OBJECTS:${OBJ_LIB}>)
-enable_conditional_compilation4(${OBJ_LIB})
\ No newline at end of file
diff --git a/src/cpu/cpu_batch_normalization_list.cpp b/src/cpu/cpu_batch_normalization_list.cpp
index e56d39c134f..c41de337514 100644
--- a/src/cpu/cpu_batch_normalization_list.cpp
+++ b/src/cpu/cpu_batch_normalization_list.cpp
@@ -33,8 +33,8 @@ using namespace dnnl::impl::cpu::x64;
 using namespace dnnl::impl::cpu::aarch64;
 #endif
 #if defined(DNNL_AARCH64_USE_ACL)
-#include "cpu/acl/acl_batch_normalization.hpp"
-using namespace dnnl::impl::cpu::acl;
+#include "cpu/aarch64/acl_batch_normalization.hpp"
+using namespace dnnl::impl::cpu::aarch64;
 #endif
 
 namespace dnnl {
@@ -59,7 +59,7 @@ const std::map<pk_impl_key_t, std::vector<impl_list_item_t>> &impl_list_map() {
             CPU_INSTANCE_AARCH64(jit_uni_batch_normalization_fwd_t, sve_512)
             CPU_INSTANCE_AARCH64(jit_uni_batch_normalization_fwd_t, sve_256)
             CPU_INSTANCE_AARCH64(jit_uni_batch_normalization_fwd_t, asimd)
-            DNNL_AARCH64_ONLY(DNNL_ACL_ONLY(CPU_INSTANCE(acl::acl_batch_normalization_fwd_t)))
+            DNNL_AARCH64_ACL_ONLY(CPU_INSTANCE(aarch64::acl_batch_normalization_fwd_t))
             CPU_INSTANCE(ncsp_batch_normalization_fwd_t, f32)
             CPU_INSTANCE(ncsp_batch_normalization_fwd_t, bf16)
             CPU_INSTANCE(ncsp_batch_normalization_fwd_t, f16)
diff --git a/src/cpu/cpu_binary_list.cpp b/src/cpu/cpu_binary_list.cpp
index 1cb4692f451..c83c39268a3 100644
--- a/src/cpu/cpu_binary_list.cpp
+++ b/src/cpu/cpu_binary_list.cpp
@@ -28,8 +28,8 @@ using namespace dnnl::impl::cpu::x64;
 using namespace dnnl::impl::cpu::aarch64;
 #endif
 #if defined(DNNL_AARCH64_USE_ACL)
-#include "cpu/acl/acl_binary.hpp"
-using namespace dnnl::impl::cpu::acl;
+#include "cpu/aarch64/acl_binary.hpp"
+using namespace dnnl::impl::cpu::aarch64;
 #endif
 
 namespace dnnl {
@@ -43,7 +43,7 @@ using namespace dnnl::impl::data_type;
 const impl_list_item_t impl_list[] = REG_BINARY_P({
         CPU_INSTANCE_X64(jit_uni_binary_t)
         CPU_INSTANCE_AARCH64(jit_uni_binary_t)
-        CPU_INSTANCE_ACL(acl_binary_t)
+        CPU_INSTANCE_AARCH64_ACL(acl_binary_t)
         CPU_INSTANCE(ref_binary_t)
         /* eol */
         nullptr,
diff --git a/src/cpu/cpu_convolution_list.cpp b/src/cpu/cpu_convolution_list.cpp
index 431f6ba60e3..73076c62cb0 100644
--- a/src/cpu/cpu_convolution_list.cpp
+++ b/src/cpu/cpu_convolution_list.cpp
@@ -71,11 +71,11 @@ using namespace dnnl::impl::cpu::x64;
 using namespace dnnl::impl::cpu::aarch64;
 #endif
 #if defined(DNNL_AARCH64_USE_ACL)
-#include "cpu/acl/acl_depthwise_convolution.hpp"
-#include "cpu/acl/acl_gemm_convolution.hpp"
-#include "cpu/acl/acl_indirect_gemm_convolution.hpp"
-#include "cpu/acl/acl_winograd_convolution.hpp"
-using namespace dnnl::impl::cpu::acl;
+#include "cpu/aarch64/acl_depthwise_convolution.hpp"
+#include "cpu/aarch64/acl_gemm_convolution.hpp"
+#include "cpu/aarch64/acl_indirect_gemm_convolution.hpp"
+#include "cpu/aarch64/acl_winograd_convolution.hpp"
+using namespace dnnl::impl::cpu::aarch64;
 #endif
 
 namespace dnnl {
@@ -150,7 +150,7 @@ const std::map<pk_dt_impl_key_t, std::vector<impl_list_item_t>> &impl_list_map()
             CPU_INSTANCE_SSE41(jit_sse41_1x1_convolution_fwd_t)
             CPU_INSTANCE_AVX2(jit_avx2_convolution_fwd_t)
             CPU_INSTANCE_SSE41(jit_sse41_convolution_fwd_t)
-            CPU_INSTANCE_ACL(acl_wino_convolution_fwd_t)
+            CPU_INSTANCE_AARCH64_ACL(acl_wino_convolution_fwd_t)
             CPU_INSTANCE_AARCH64(brdgmm_dw_convolution_fwd_t, sve_512)
             CPU_INSTANCE_AARCH64(brgemm_1x1_convolution_fwd_t, sve_512)
             CPU_INSTANCE_AARCH64(brgemm_convolution_fwd_t, sve_512)
@@ -168,9 +168,9 @@ const std::map<pk_dt_impl_key_t, std::vector<impl_list_item_t>> &impl_list_map()
             CPU_INSTANCE_AARCH64(brdgmm_dw_convolution_fwd_t, sve_128)
             CPU_INSTANCE_AARCH64(brdgmm_dw_convolution_fwd_t, sve_256)
             CPU_INSTANCE_AARCH64(brdgmm_dw_convolution_fwd_t, sve_128)
-            CPU_INSTANCE_ACL(acl_depthwise_convolution_fwd_t)
-            CPU_INSTANCE_ACL(acl_indirect_gemm_convolution_fwd_t)
-            CPU_INSTANCE_ACL(acl_gemm_convolution_fwd_t, f32)
+            CPU_INSTANCE_AARCH64_ACL(acl_depthwise_convolution_fwd_t)
+            CPU_INSTANCE_AARCH64_ACL(acl_indirect_gemm_convolution_fwd_t)
+            CPU_INSTANCE_AARCH64_ACL(acl_gemm_convolution_fwd_t, f32)
             CPU_INSTANCE_AARCH64(brgemm_convolution_fwd_t, sve_256)
             CPU_INSTANCE_AARCH64(brgemm_1x1_convolution_fwd_t, sve_128)
             CPU_INSTANCE_AARCH64(brgemm_convolution_fwd_t, sve_128)
@@ -243,7 +243,7 @@ const std::map<pk_dt_impl_key_t, std::vector<impl_list_item_t>> &impl_list_map()
             CPU_INSTANCE_AARCH64(brdgmm_dw_convolution_fwd_t, sve_256)
             CPU_INSTANCE_AARCH64(brdgmm_dw_convolution_fwd_t, sve_128)
             CPU_INSTANCE_AARCH64(jit_uni_dw_convolution_fwd_t, sve_128, bf16, bf16)
-            CPU_INSTANCE_ACL(acl_indirect_gemm_convolution_fwd_t)
+            CPU_INSTANCE_AARCH64_ACL(acl_indirect_gemm_convolution_fwd_t)
             CPU_INSTANCE_AARCH64(brgemm_1x1_convolution_fwd_t, sve_256)
             CPU_INSTANCE_AARCH64(brgemm_convolution_fwd_t, sve_256)
             CPU_INSTANCE_AARCH64(brgemm_1x1_convolution_fwd_t, sve_128)
@@ -279,10 +279,10 @@ const std::map<pk_dt_impl_key_t, std::vector<impl_list_item_t>> &impl_list_map()
             CPU_INSTANCE_AVX512(brgemm_convolution_fwd_t, avx512_core_fp16)
             CPU_INSTANCE_AVX2(brgemm_1x1_convolution_fwd_t, avx2_vnni_2)
             CPU_INSTANCE_AVX2(brgemm_convolution_fwd_t, avx2_vnni_2)
-            CPU_INSTANCE_ACL(acl_wino_convolution_fwd_t)
-            CPU_INSTANCE_ACL(acl_depthwise_convolution_fwd_t)
-            CPU_INSTANCE_ACL(acl_indirect_gemm_convolution_fwd_t)
-            CPU_INSTANCE_ACL(acl_gemm_convolution_fwd_t, f16)
+            CPU_INSTANCE_AARCH64_ACL(acl_wino_convolution_fwd_t)
+            CPU_INSTANCE_AARCH64_ACL(acl_depthwise_convolution_fwd_t)
+            CPU_INSTANCE_AARCH64_ACL(acl_indirect_gemm_convolution_fwd_t)
+            CPU_INSTANCE_AARCH64_ACL(acl_gemm_convolution_fwd_t, f16)
             CPU_INSTANCE(ref_convolution_fwd_t)
             CPU_INSTANCE(ref_fused_convolution_fwd_t)
             nullptr,
@@ -646,7 +646,7 @@ const std::map<pk_dt_impl_key_t, std::vector<impl_list_item_t>> &impl_list_map()
             CPU_INSTANCE_SSE41(jit_uni_x8s8s32x_1x1_convolution_fwd_t, sse41)
             CPU_INSTANCE_SSE41(jit_uni_x8s8s32x_convolution_fwd_t, sse41)
             CPU_INSTANCE_AARCH64(jit_sve_512_x8s8s32x_convolution_fwd_t, s8, s8)
-            CPU_INSTANCE_ACL(acl_gemm_convolution_fwd_t, s8, s8, s8, s32)
+            CPU_INSTANCE_AARCH64_ACL(acl_gemm_convolution_fwd_t, s8, s8, s8, s32)
             CPU_INSTANCE_AARCH64(brgemm_1x1_convolution_fwd_t, sve_256)
             CPU_INSTANCE_AARCH64(brgemm_convolution_fwd_t, sve_256)
             CPU_INSTANCE(gemm_x8s8s32x_convolution_fwd_t)
diff --git a/src/cpu/cpu_deconvolution_list.cpp b/src/cpu/cpu_deconvolution_list.cpp
index 917eaa28ace..e916c9249f3 100644
--- a/src/cpu/cpu_deconvolution_list.cpp
+++ b/src/cpu/cpu_deconvolution_list.cpp
@@ -33,8 +33,8 @@ using namespace dnnl::impl::cpu::x64;
 using namespace dnnl::impl::cpu::aarch64;
 #endif
 #if defined(DNNL_AARCH64_USE_ACL)
-#include "cpu/acl/acl_deconvolution.hpp"
-using namespace dnnl::impl::cpu::acl;
+#include "cpu/aarch64/acl_deconvolution.hpp"
+using namespace dnnl::impl::cpu::aarch64;
 #endif
 
 namespace dnnl {
@@ -70,7 +70,7 @@ const std::map<pk_impl_key_t, std::vector<impl_list_item_t>> &impl_list_map() {
             CPU_INSTANCE_AARCH64(brgemm_deconvolution_fwd_t, sve_256)
             CPU_INSTANCE_AARCH64(brgemm_deconvolution_fwd_t, sve_128)
             CPU_INSTANCE_AARCH64(jit_sve_512_core_x8s8s32x_deconvolution_fwd_t)
-            CPU_INSTANCE_ACL(acl_deconvolution_fwd_t)
+            CPU_INSTANCE_AARCH64_ACL(acl_deconvolution_fwd_t)
             CPU_INSTANCE(ref_deconvolution_fwd_t)
             nullptr,
         }},
diff --git a/src/cpu/cpu_eltwise_list.cpp b/src/cpu/cpu_eltwise_list.cpp
index eec7a054399..c4164953f9f 100644
--- a/src/cpu/cpu_eltwise_list.cpp
+++ b/src/cpu/cpu_eltwise_list.cpp
@@ -30,8 +30,8 @@ using namespace dnnl::impl::cpu::x64;
 using namespace dnnl::impl::cpu::aarch64;
 #endif
 #if defined(DNNL_AARCH64_USE_ACL)
-#include "cpu/acl/acl_eltwise.hpp"
-using namespace dnnl::impl::cpu::acl;
+#include "cpu/aarch64/acl_eltwise.hpp"
+using namespace dnnl::impl::cpu::aarch64;
 #endif
 
 namespace dnnl {
@@ -76,7 +76,7 @@ const std::map<pk_impl_key_t, std::vector<impl_list_item_t>> &impl_list_map() {
             CPU_INSTANCE_AARCH64(jit_uni_eltwise_int_fwd_t, sve_512, s32)
             CPU_INSTANCE_AARCH64(jit_uni_eltwise_int_fwd_t, sve_512, s8)
             CPU_INSTANCE_AARCH64(jit_uni_eltwise_int_fwd_t, sve_512, u8)
-            CPU_INSTANCE_ACL(acl_eltwise_fwd_t)
+            CPU_INSTANCE_AARCH64_ACL(acl_eltwise_fwd_t)
             CPU_INSTANCE(ref_eltwise_fwd_t, f32)
             CPU_INSTANCE(ref_eltwise_fwd_t, bf16)
             // CPU_INSTANCE(ref_eltwise_fwd_t, f16)
diff --git a/src/cpu/cpu_engine.hpp b/src/cpu/cpu_engine.hpp
index cbaff592afa..8fc890d24c5 100644
--- a/src/cpu/cpu_engine.hpp
+++ b/src/cpu/cpu_engine.hpp
@@ -30,7 +30,7 @@
 #include "cpu/platform.hpp"
 
 #if DNNL_AARCH64 && defined(DNNL_AARCH64_USE_ACL)
-#include "cpu/acl/acl_thread.hpp"
+#include "cpu/aarch64/acl_thread.hpp"
 #endif
 
 #define CPU_INSTANCE_IMPL(...) \
@@ -46,8 +46,8 @@
 #define CPU_INSTANCE_AVX512(...) REG_AVX512_ISA(CPU_INSTANCE(__VA_ARGS__))
 #define CPU_INSTANCE_AMX(...) REG_AMX_ISA(CPU_INSTANCE(__VA_ARGS__))
 #define CPU_INSTANCE_AARCH64(...) DNNL_AARCH64_ONLY(CPU_INSTANCE(__VA_ARGS__))
-#define CPU_INSTANCE_ARM(...) DNNL_ARM_ONLY(CPU_INSTANCE(__VA_ARGS__))
-#define CPU_INSTANCE_ACL(...) DNNL_ACL_ONLY(CPU_INSTANCE(__VA_ARGS__))
+#define CPU_INSTANCE_AARCH64_ACL(...) \
+    DNNL_AARCH64_ACL_ONLY(CPU_INSTANCE(__VA_ARGS__))
 #define CPU_INSTANCE_RV64GCV(...) DNNL_RV64GCV_ONLY(CPU_INSTANCE(__VA_ARGS__))
 #define CPU_INSTANCE_RV64GCV_ZVFH(...) \
     DNNL_RV64GCV_ZVFH_ONLY(CPU_INSTANCE(__VA_ARGS__))
@@ -164,7 +164,7 @@ class cpu_engine_factory_t : public engine_factory_t {
                 engine_kind::cpu, get_cpu_native_runtime(), 0));
 
 #if DNNL_AARCH64 && defined(DNNL_AARCH64_USE_ACL)
-        dnnl::impl::cpu::acl::acl_thread_utils::set_acl_threading();
+        dnnl::impl::cpu::aarch64::acl_thread_utils::set_acl_threading();
 #endif
         return status::success;
     };
diff --git a/src/cpu/cpu_inner_product_list.cpp b/src/cpu/cpu_inner_product_list.cpp
index 96047c232c4..d96c619d860 100644
--- a/src/cpu/cpu_inner_product_list.cpp
+++ b/src/cpu/cpu_inner_product_list.cpp
@@ -28,8 +28,8 @@
 #include "cpu/x64/matmul_inner_product.hpp"
 using namespace dnnl::impl::cpu::x64;
 #elif defined(DNNL_AARCH64_USE_ACL)
-#include "cpu/acl/acl_inner_product.hpp"
-using namespace dnnl::impl::cpu::acl;
+#include "cpu/aarch64/acl_inner_product.hpp"
+using namespace dnnl::impl::cpu::aarch64;
 #endif
 
 namespace dnnl {
@@ -57,7 +57,7 @@ const std::map<pk_dt_impl_key_t, std::vector<impl_list_item_t>> &impl_list_map()
             CPU_INSTANCE_AMX(brgemm_inner_product_fwd_t, avx512_core_amx) // bf32
             CPU_INSTANCE_AVX512(brgemm_inner_product_fwd_t, avx512_core)
             CPU_INSTANCE_AVX2(brgemm_inner_product_fwd_t, avx2)
-            CPU_INSTANCE_ACL(acl_inner_product_fwd_t)
+            CPU_INSTANCE_AARCH64_ACL(acl_inner_product_fwd_t)
             CPU_INSTANCE(gemm_inner_product_fwd_t, f32)
             CPU_INSTANCE(ref_inner_product_fwd_t)
             nullptr,
@@ -130,7 +130,7 @@ const std::map<pk_dt_impl_key_t, std::vector<impl_list_item_t>> &impl_list_map()
             CPU_INSTANCE_AVX512(brgemm_inner_product_fwd_t, avx512_core_bf16)
             CPU_INSTANCE_AVX512(gemm_bf16_inner_product_fwd_t, bf16)
             CPU_INSTANCE_AVX2(brgemm_inner_product_fwd_t, avx2_vnni_2)
-            CPU_INSTANCE_ACL(acl_inner_product_fwd_t)
+            CPU_INSTANCE_AARCH64_ACL(acl_inner_product_fwd_t)
             CPU_INSTANCE(ref_inner_product_fwd_t)
             nullptr,
         }},
@@ -230,7 +230,7 @@ const std::map<pk_dt_impl_key_t, std::vector<impl_list_item_t>> &impl_list_map()
             CPU_INSTANCE_AVX512(brgemm_inner_product_fwd_t, avx10_2_512)
             CPU_INSTANCE_AVX512(brgemm_inner_product_fwd_t, avx512_core_fp16)
             CPU_INSTANCE_AVX2(brgemm_inner_product_fwd_t, avx2_vnni_2)
-            CPU_INSTANCE_ACL(acl_inner_product_fwd_t)
+            CPU_INSTANCE_AARCH64_ACL(acl_inner_product_fwd_t)
             CPU_INSTANCE(ref_inner_product_fwd_t)
             nullptr,
         }},
@@ -240,7 +240,7 @@ const std::map<pk_dt_impl_key_t, std::vector<impl_list_item_t>> &impl_list_map()
          * in fp32 and weights are in bf16
          */
         {{forward, f32, bf16, f32}, {
-            CPU_INSTANCE_ACL(acl_inner_product_fwd_t)
+            CPU_INSTANCE_AARCH64_ACL(acl_inner_product_fwd_t)
             nullptr,
         }},
 
diff --git a/src/cpu/cpu_layer_normalization_list.cpp b/src/cpu/cpu_layer_normalization_list.cpp
index 4f1e5d178a2..b521fb63fba 100644
--- a/src/cpu/cpu_layer_normalization_list.cpp
+++ b/src/cpu/cpu_layer_normalization_list.cpp
@@ -24,8 +24,8 @@
 #include "cpu/x64/jit_uni_layer_normalization.hpp"
 using namespace dnnl::impl::cpu::x64;
 #elif defined(DNNL_AARCH64_USE_ACL)
-#include "cpu/acl/acl_layer_normalization.hpp"
-using namespace dnnl::impl::cpu::acl;
+#include "cpu/aarch64/acl_layer_normalization.hpp"
+using namespace dnnl::impl::cpu::aarch64;
 #endif
 
 namespace dnnl {
@@ -41,7 +41,7 @@ const std::map<pk_impl_key_t, std::vector<impl_list_item_t>> &impl_list_map() {
     static const std::map<pk_impl_key_t, std::vector<impl_list_item_t>> the_map = REG_LNORM_P({
         {{forward}, {
             CPU_INSTANCE_X64(jit_uni_layer_normalization_fwd_t)
-            CPU_INSTANCE_ACL(acl_layer_normalization_fwd_t)
+            CPU_INSTANCE_AARCH64_ACL(acl_layer_normalization_fwd_t)
             CPU_INSTANCE(simple_layer_normalization_fwd_t)
             CPU_INSTANCE(ref_layer_normalization_fwd_t)
             nullptr,
diff --git a/src/cpu/cpu_pooling_list.cpp b/src/cpu/cpu_pooling_list.cpp
index 6284cdeb858..79afb1ccdd2 100644
--- a/src/cpu/cpu_pooling_list.cpp
+++ b/src/cpu/cpu_pooling_list.cpp
@@ -37,8 +37,8 @@ using namespace dnnl::impl::cpu::rv64;
 #endif // DNNL_RISCV_USE_RVV_INTRINSICS
 #endif
 #if defined(DNNL_AARCH64_USE_ACL)
-#include "cpu/acl/acl_pooling.hpp"
-using namespace dnnl::impl::cpu::acl;
+#include "cpu/aarch64/acl_pooling.hpp"
+using namespace dnnl::impl::cpu::aarch64;
 #endif
 
 namespace dnnl {
@@ -66,7 +66,7 @@ const std::map<pk_impl_key_t, std::vector<impl_list_item_t>> &impl_list_map() {
             CPU_INSTANCE_X64(jit_uni_pooling_fwd_t, sse41, f32)
             CPU_INSTANCE_AARCH64(jit_uni_pooling_fwd_t, sve_512, f32)
             CPU_INSTANCE_AARCH64(jit_uni_pooling_fwd_t, sve_256, f32)
-            CPU_INSTANCE_ACL(acl_pooling_fwd_t)
+            CPU_INSTANCE_AARCH64_ACL(acl_pooling_fwd_t)
             CPU_INSTANCE_RV64GCV(riscv_nchw_pooling_fwd_t)
             CPU_INSTANCE(nchw_pooling_fwd_t, bf16)
             CPU_INSTANCE(nchw_pooling_fwd_t, f32)
diff --git a/src/cpu/cpu_prelu_list.cpp b/src/cpu/cpu_prelu_list.cpp
index e55f18203b8..c88974bc323 100644
--- a/src/cpu/cpu_prelu_list.cpp
+++ b/src/cpu/cpu_prelu_list.cpp
@@ -24,8 +24,8 @@
 
 using namespace dnnl::impl::cpu::x64;
 #elif defined(DNNL_AARCH64_USE_ACL)
-#include "cpu/acl/acl_prelu.hpp"
-using namespace dnnl::impl::cpu::acl;
+#include "cpu/aarch64/acl_prelu.hpp"
+using namespace dnnl::impl::cpu::aarch64;
 #endif
 
 namespace dnnl {
@@ -41,7 +41,7 @@ const std::map<pk_impl_key_t, std::vector<impl_list_item_t>> &impl_list_map() {
     static const std::map<pk_impl_key_t, std::vector<impl_list_item_t>> the_map = REG_PRELU_P({
         {{forward}, {
             CPU_INSTANCE_X64(jit_prelu_fwd_t)
-            CPU_INSTANCE_ACL(acl_prelu_fwd_t)
+            CPU_INSTANCE_AARCH64_ACL(acl_prelu_fwd_t)
             CPU_INSTANCE(ref_prelu_fwd_t)
             nullptr,
         }},
diff --git a/src/cpu/cpu_softmax_list.cpp b/src/cpu/cpu_softmax_list.cpp
index c4c2954b235..efd4e6a28db 100644
--- a/src/cpu/cpu_softmax_list.cpp
+++ b/src/cpu/cpu_softmax_list.cpp
@@ -29,8 +29,8 @@ using namespace dnnl::impl::cpu::x64;
 using namespace dnnl::impl::cpu::aarch64;
 #endif
 #if defined(DNNL_AARCH64_USE_ACL)
-#include "cpu/acl/acl_softmax.hpp"
-using namespace dnnl::impl::cpu::acl;
+#include "cpu/aarch64/acl_softmax.hpp"
+using namespace dnnl::impl::cpu::aarch64;
 #endif
 
 namespace dnnl {
@@ -52,7 +52,7 @@ const std::map<pk_impl_key_t, std::vector<impl_list_item_t>> &impl_list_map() {
             CPU_INSTANCE_AARCH64(jit_uni_softmax_fwd_t, sve_512)
             CPU_INSTANCE_AARCH64(jit_uni_softmax_fwd_t, sve_256)
             CPU_INSTANCE_AARCH64(jit_uni_softmax_fwd_t, sve_128)
-            CPU_INSTANCE_ACL(acl_softmax_fwd_t)
+            CPU_INSTANCE_AARCH64_ACL(acl_softmax_fwd_t)
             CPU_INSTANCE(ref_softmax_fwd_t)
             nullptr,
         }},
diff --git a/src/cpu/matmul/cpu_matmul_list.cpp b/src/cpu/matmul/cpu_matmul_list.cpp
index e0e98c23e91..5107770b453 100644
--- a/src/cpu/matmul/cpu_matmul_list.cpp
+++ b/src/cpu/matmul/cpu_matmul_list.cpp
@@ -45,11 +45,11 @@ using namespace dnnl::impl::cpu::rv64;
 #endif
 
 #ifdef DNNL_AARCH64_USE_ACL
-#include "cpu/acl/matmul/acl_lowp_matmul.hpp"
-#include "cpu/acl/matmul/acl_lowp_matmul_sq.hpp"
-#include "cpu/acl/matmul/acl_matmul.hpp"
-using namespace dnnl::impl::cpu::acl::matmul;
-using namespace dnnl::impl::cpu::acl;
+#include "cpu/aarch64/matmul/acl_lowp_matmul.hpp"
+#include "cpu/aarch64/matmul/acl_lowp_matmul_sq.hpp"
+#include "cpu/aarch64/matmul/acl_matmul.hpp"
+using namespace dnnl::impl::cpu::aarch64::matmul;
+using namespace dnnl::impl::cpu::aarch64;
 #endif
 
 namespace dnnl {
@@ -63,9 +63,9 @@ using namespace dnnl::impl::cpu::matmul;
 // clang-format off
 const impl_list_item_t impl_list[] = REG_MATMUL_P({
         CPU_INSTANCE_AARCH64(brgemm_matmul_t, sve_512)
-        CPU_INSTANCE_ACL(acl_lowp_matmul_sq_t)
-        CPU_INSTANCE_ACL(acl_lowp_matmul_t)
-        CPU_INSTANCE_ACL(acl_matmul_t)
+        CPU_INSTANCE_AARCH64_ACL(acl_lowp_matmul_sq_t)
+        CPU_INSTANCE_AARCH64_ACL(acl_lowp_matmul_t)
+        CPU_INSTANCE_AARCH64_ACL(acl_matmul_t)
         CPU_INSTANCE_AARCH64(jit_bf16_matmul_t)
         CPU_INSTANCE_AARCH64(brgemm_matmul_t, sve_256)
         CPU_INSTANCE_AARCH64(jit_int8_matmul_t)
diff --git a/src/cpu/platform.hpp b/src/cpu/platform.hpp
index 1052367b028..b6645725dd5 100644
--- a/src/cpu/platform.hpp
+++ b/src/cpu/platform.hpp
@@ -28,7 +28,6 @@
 // - DNNL_X64
 // - DNNL_X86
 // - DNNL_AARCH64
-// - DNNL_ARM
 // - DNNL_PPC64
 // - DNNL_S390X
 // - DNNL_RV64
@@ -36,8 +35,8 @@
 // Target architecture macro is set to 1, others to 0. All macros are defined.
 
 #if defined(DNNL_X64) + defined(DNNL_AARCH64) + defined(DNNL_PPC64) \
-                + defined(DNNL_S390X) + defined(DNNL_RV64) + defined(DNNL_ARM) \
-                + defined(DNNL_X86) + defined(DNNL_ARCH_GENERIC) \
+                + defined(DNNL_S390X) + defined(DNNL_RV64) + defined(DNNL_X86) \
+                + defined(DNNL_ARCH_GENERIC) \
         == 0
 #if defined(__amd64__) || defined(__amd64) || defined(__x86_64__) \
         || defined(__x86_64) || defined(_M_X64) || defined(_M_AMD64)
@@ -49,8 +48,6 @@
 #define DNNL_X86 1
 #elif defined(__aarch64__) || defined(_M_ARM64)
 #define DNNL_AARCH64 1
-#elif defined(__arm__) || defined(_M_ARM) || defined(__ARMEL__)
-#define DNNL_ARM 1
 #elif defined(__powerpc64__) || defined(__PPC64__) || defined(_ARCH_PPC64)
 #define DNNL_PPC64 1
 #elif defined(__s390x__)
@@ -63,8 +60,8 @@
 #endif // defined(DNNL_X64) + ... == 0
 
 #if defined(DNNL_X64) + defined(DNNL_AARCH64) + defined(DNNL_PPC64) \
-                + defined(DNNL_S390X) + defined(DNNL_RV64) + defined(DNNL_ARM) \
-                + defined(DNNL_X86) + defined(DNNL_ARCH_GENERIC) \
+                + defined(DNNL_S390X) + defined(DNNL_RV64) + defined(DNNL_X86) \
+                + defined(DNNL_ARCH_GENERIC) \
         != 1
 #error One and only one architecture should be defined at a time
 #endif
@@ -78,9 +75,6 @@
 #if !defined(DNNL_AARCH64)
 #define DNNL_AARCH64 0
 #endif
-#if !defined(DNNL_ARM)
-#define DNNL_ARM 0
-#endif
 #if !defined(DNNL_PPC64)
 #define DNNL_PPC64 0
 #endif
@@ -100,7 +94,6 @@
 #define DNNL_PPC64_ONLY(...) Z_CONDITIONAL_DO(DNNL_PPC64, __VA_ARGS__)
 #define DNNL_S390X_ONLY(...) Z_CONDITIONAL_DO(DNNL_S390X_ONLY, __VA_ARGS__)
 #define DNNL_AARCH64_ONLY(...) Z_CONDITIONAL_DO(DNNL_AARCH64, __VA_ARGS__)
-#define DNNL_ARM_ONLY(...) Z_CONDITIONAL_DO(DNNL_ARM, __VA_ARGS__)
 
 // Using RISC-V implementations optimized with RVV Intrinsics is optional for RISC-V builds
 // and can be enabled with DNNL_ARCH_OPT_FLAGS="-march=<ISA-string>" option, where <ISA-string>
@@ -122,11 +115,11 @@
 #define DNNL_NON_X64_ONLY(...) Z_CONDITIONAL_DO(Z_NOT(DNNL_X64), __VA_ARGS__)
 
 // Using Arm Compute Library kernels is optional for AArch64 builds
-// and can be enabled with the DNNL_USE_ACL CMake option
+// and can be enabled with the DNNL_AARCH64_USE_ACL CMake option
 #if defined(DNNL_AARCH64) && defined(DNNL_AARCH64_USE_ACL)
-#define DNNL_ACL_ONLY(...) __VA_ARGS__
+#define DNNL_AARCH64_ACL_ONLY(...) __VA_ARGS__
 #else
-#define DNNL_ACL_ONLY(...)
+#define DNNL_AARCH64_ACL_ONLY(...)
 #endif
 
 // Primitive ISA section for configuring knobs.
diff --git a/src/cpu/reorder/cpu_reorder_regular_f32_bf16.cpp b/src/cpu/reorder/cpu_reorder_regular_f32_bf16.cpp
index 9b6d5cd4f2d..6c2c76a8451 100644
--- a/src/cpu/reorder/cpu_reorder_regular_f32_bf16.cpp
+++ b/src/cpu/reorder/cpu_reorder_regular_f32_bf16.cpp
@@ -36,7 +36,7 @@ const impl_list_map_t &regular_f32_bf16_impl_list_map() {
             DNNL_NON_X64_ONLY(REG_SR_BIDIR(f32, any, bf16, nChw16c))
             DNNL_NON_X64_ONLY(REG_SR_BIDIR(f32, any, bf16, nCdhw16c))
 
-            DNNL_AARCH64_ONLY(DNNL_ACL_ONLY(CPU_REORDER_INSTANCE(acl::acl_reorder_fwd_t)))
+            DNNL_AARCH64_ACL_ONLY(CPU_REORDER_INSTANCE(aarch64::acl_reorder_fwd_t))
             DNNL_AARCH64_ONLY(CPU_REORDER_INSTANCE(aarch64::jit_uni_reorder_t))
 
             DNNL_NON_X64_ONLY(REG_SR(f32, oihw, bf16, OIhw8i16o2i, fmt_order::keep))
diff --git a/src/cpu/reorder/cpu_reorder_regular_f32_f32.cpp b/src/cpu/reorder/cpu_reorder_regular_f32_f32.cpp
index b7e34aaf92e..dfe6e96553d 100644
--- a/src/cpu/reorder/cpu_reorder_regular_f32_f32.cpp
+++ b/src/cpu/reorder/cpu_reorder_regular_f32_f32.cpp
@@ -33,7 +33,7 @@ const impl_list_map_t &regular_f32_f32_impl_list_map() {
             DNNL_X64_ONLY(CPU_REORDER_INSTANCE(x64_jit_blk_reorder_t))
             DNNL_X64_ONLY(CPU_REORDER_INSTANCE(x64_jit_uni_reorder_t))
 
-            DNNL_AARCH64_ONLY(DNNL_ACL_ONLY(CPU_REORDER_INSTANCE(acl::acl_reorder_fwd_t)))
+            DNNL_AARCH64_ACL_ONLY(CPU_REORDER_INSTANCE(aarch64::acl_reorder_fwd_t))
             DNNL_AARCH64_ONLY(CPU_REORDER_INSTANCE(aarch64::brgemm_matmul_copy_reorder_t))
             DNNL_AARCH64_ONLY(CPU_REORDER_INSTANCE(aarch64_jit_blk_reorder_t))
             DNNL_AARCH64_ONLY(CPU_REORDER_INSTANCE(aarch64_jit_uni_reorder_t))
@@ -85,7 +85,7 @@ const impl_list_map_t &regular_f32_f32_impl_list_map() {
             DNNL_X64_ONLY(CPU_REORDER_INSTANCE(x64_jit_blk_reorder_t))
             DNNL_X64_ONLY(CPU_REORDER_INSTANCE(x64_jit_uni_reorder_t))
 
-            DNNL_AARCH64_ONLY(DNNL_ACL_ONLY(CPU_REORDER_INSTANCE(acl::acl_reorder_fwd_t)))
+            DNNL_AARCH64_ACL_ONLY(CPU_REORDER_INSTANCE(aarch64::acl_reorder_fwd_t))
             DNNL_AARCH64_ONLY(CPU_REORDER_INSTANCE(aarch64_jit_blk_reorder_t))
             DNNL_AARCH64_ONLY(CPU_REORDER_INSTANCE(aarch64_jit_uni_reorder_t))
 
diff --git a/tests/benchdnn/rnn/rnn.cpp b/tests/benchdnn/rnn/rnn.cpp
index 8fcf2b4385c..ba325bda930 100644
--- a/tests/benchdnn/rnn/rnn.cpp
+++ b/tests/benchdnn/rnn/rnn.cpp
@@ -816,7 +816,7 @@ void skip_unimplemented_prb(const prb_t *prb_, res_t *res) {
             return;
         }
 
-#ifdef DNNL_USE_ACL
+#ifdef DNNL_AARCH64_USE_ACL
         const bool is_acl_f16_not_ok = prb.cfg[SRC_LAYER].dt == dnnl_f16
                 && dnnl::impl::cpu::platform::has_data_type_support(dnnl_f16);
         if (is_acl_f16_not_ok) {
diff --git a/tests/benchdnn/softmax/softmax.cpp b/tests/benchdnn/softmax/softmax.cpp
index 2a64e0b787f..66bd2a1b0da 100644
--- a/tests/benchdnn/softmax/softmax.cpp
+++ b/tests/benchdnn/softmax/softmax.cpp
@@ -266,7 +266,7 @@ void setup_cmp(compare::compare_t &cmp, const prb_t *prb, data_kind_t kind,
     const float trh_coeff_bwd = (prb->dir & FLAG_FWD) ? 1.f : 4.f;
     const float trh_f32 = trh_coeff_log * trh_coeff_bwd * trh_coeff_f32
             * epsilon_dt(trh_dt);
-#if defined(DNNL_USE_ACL) || defined(DNNL_SYCL_HIP) || defined(DNNL_SYCL_CUDA)
+#if defined(DNNL_AARCH64_USE_ACL) || defined(DNNL_SYCL_HIP) || defined(DNNL_SYCL_CUDA)
     // MIOpen and ACL softmax accumulate in F16, but oneDNN now expects accumulation in
     // F32, this partially reverts 6727bbe8. For more information on ACL softmax, see
     // https://github.com/uxlfoundation/oneDNN/issues/1819
@@ -306,7 +306,7 @@ void setup_cmp(compare::compare_t &cmp, const prb_t *prb, data_kind_t kind,
 
     const auto softmax_add_check
             = [&](const compare::compare_t::driver_check_func_args_t &args) {
-#if defined(DNNL_USE_ACL)
+#if defined(DNNL_AARCH64_USE_ACL)
                   auto diff_trh = epsilon_dt(args.dt);
 #else
                   auto diff_trh = epsilon_dt(dnnl_f32);