diff --git a/.github/workflows/build-upload-wheels.yml b/.github/workflows/build-upload-wheels.yml index 91b590afaf9c..5dd79d39eb37 100644 --- a/.github/workflows/build-upload-wheels.yml +++ b/.github/workflows/build-upload-wheels.yml @@ -35,13 +35,13 @@ jobs: - uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 - name: Install cibuildwheel - run: python -m pip install cibuildwheel==3.1.4 + run: python -m pip install cibuildwheel==3.3.1 - name: Build wheels {{ matrix.os }} # Set LLVM_VERSION for the host to forward to the cibuildwheel # environment. env: - LLVM_VERSION: "15.0.7" + LLVM_VERSION: "20.1.8" run: python -m cibuildwheel --output-dir wheelhouse - uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 @@ -58,7 +58,7 @@ jobs: - name: Build sdist env: - LLVM_VERSION: "15.0.7" + LLVM_VERSION: "20.1.8" run: pipx run build --sdist - uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 @@ -73,8 +73,14 @@ jobs: strategy: matrix: os: [ubuntu-latest, macos-latest, ubuntu-24.04-arm] - python-version: ['3.10', '3.11', '3.12', '3.13'] - numba-version: ['0.61.0', '0.61.2'] + python-version: ['3.10', '3.11', '3.12', '3.13', '3.14'] + numba-version: ['0.62.0', '0.62.1', '0.63.0', '0.63.1'] + exclude: + - python-version: '3.14' + numba-version: '0.62.0' + - python-version: '3.14' + numba-version: '0.62.1' + steps: - name: Download built wheels uses: actions/download-artifact@634f93cb2916e3fdff6788551b99b062d0335ce0 diff --git a/.gitlab/jobs/tioga.yml b/.gitlab/jobs/tioga.yml index 0568a68382db..3914eb5c310f 100644 --- a/.gitlab/jobs/tioga.yml +++ b/.gitlab/jobs/tioga.yml @@ -56,6 +56,7 @@ variables: - "3.11" - "3.12" - "3.13" + - "3.14" build-and-test-tioga: extends: [.base-job, .python-variants] diff --git a/MANIFEST.in b/MANIFEST.in index fd6c57c169fa..e3842d878410 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1 +1 @@ -recursive-include src/numba/openmp/libs * +recursive-include src/numba/openmp/libs *.py *.so *.a *.bc 
diff --git a/README.md b/README.md index f8778004cea9..4c8f064f65e6 100644 --- a/README.md +++ b/README.md @@ -45,6 +45,7 @@ conda install -c python-for-hpc -c conda-forge pyomp | PyOMP | Numba | | ----- | --------------- | +| 0.5.x | 0.62.x - 0.63.x | | 0.4.x | 0.61.x | | 0.3.x | 0.57.x - 0.60.x | diff --git a/buildscripts/cibuildwheel/setup-miniconda3.sh b/buildscripts/cibuildwheel/setup-miniconda3.sh index c69a8bfbb890..b50cae6f3647 100644 --- a/buildscripts/cibuildwheel/setup-miniconda3.sh +++ b/buildscripts/cibuildwheel/setup-miniconda3.sh @@ -23,6 +23,6 @@ echo "Miniconda installed" source "_stage/miniconda3/bin/activate" base export CONDA_PLUGINS_AUTO_ACCEPT_TOS=true -# Create clangdev ${LLVM_VERSION} -echo "Installing manylinux llvmdev ${LLVM_VERSION}..." -conda create -n llvmdev-${LLVM_VERSION} -c conda-forge -q -y clang=${LLVM_VERSION} clang-tools=${LLVM_VERSION} llvmdev=${LLVM_VERSION} +# Create conda environment with tools and libraries for the LLVM_VERSION. +echo "Installing llvmdev ${LLVM_VERSION}..." 
+conda create -n llvmdev-${LLVM_VERSION} -c conda-forge -q -y clang=${LLVM_VERSION} clangxx=${LLVM_VERSION} clang-tools=${LLVM_VERSION} llvmdev=${LLVM_VERSION} zstd diff --git a/buildscripts/conda-recipes/pyomp/meta.yaml b/buildscripts/conda-recipes/pyomp/meta.yaml index b23506b196e1..1b97fea7d36c 100644 --- a/buildscripts/conda-recipes/pyomp/meta.yaml +++ b/buildscripts/conda-recipes/pyomp/meta.yaml @@ -1,5 +1,5 @@ {% set version = environ.get('GIT_DESCRIBE_TAG', '0.0.0').lstrip('v') %} -{% set LLVM_VERSION = environ.get('LLVM_VERSION', '15.0.7') %} +{% set LLVM_VERSION = environ.get('LLVM_VERSION', '20.1.8') %} package: name: pyomp @@ -15,8 +15,11 @@ build: script: - export LLVM_VERSION={{ LLVM_VERSION }} - export LLVM_DIR=${PREFIX} + - export CC=${PREFIX}/bin/clang + - export CXX=${PREFIX}/bin/clang++ - export VERBOSE=1 - export CPPFLAGS="-mmacosx-version-min=${MACOSX_DEPLOYMENT_TARGET} -isystem ${PREFIX}/include -D_FORTIFY_SOURCE=2" # [osx] + - export ENABLE_BUNDLED_LIBOMPTARGET=1 # [linux] - rm -rf build dist src/*.egg-info - {{ PYTHON }} -m pip install -v . @@ -28,6 +31,7 @@ requirements: - sysroot_linux-64 # [linux64] - sysroot_linux-aarch64 # [aarch64] - cmake + - ninja - setuptools_scm - elfutils # [linux] - libffi # [linux] @@ -39,55 +43,32 @@ requirements: - sysroot_linux-aarch64 # [aarch64] - setuptools - setuptools_scm - - numba >=0.61, <0.62 + - numba >=0.62, <0.64 - clang {{ LLVM_VERSION }} + - clangxx {{ LLVM_VERSION }} - clang-tools {{ LLVM_VERSION }} - llvmdev {{ LLVM_VERSION }} - zlib + # require llvm-openmp for the openmp cpu runtime. + - llvm-openmp {{ LLVM_VERSION }} - elfutils # [linux] - libffi # [linux] run: - python - setuptools - - numba >=0.61, <0.62 + - numba >=0.62, <0.64 + # require llvm-openmp for the openmp cpu runtime. 
+ - llvm-openmp {{ LLVM_VERSION }} - lark - cffi test: commands: - - test -f $SP_DIR/numba/openmp/libs/libomp/lib/libomp.dylib # [osx] - - test -f $SP_DIR/numba/openmp/libs/libomp/lib/libomp.so # [linux] - - test -f $SP_DIR/numba/openmp/libs/libomp/lib/libomptarget-amdgpu-gfx1010.bc # [linux] - - test -f $SP_DIR/numba/openmp/libs/libomp/lib/libomptarget-amdgpu-gfx1030.bc # [linux] - - test -f $SP_DIR/numba/openmp/libs/libomp/lib/libomptarget-amdgpu-gfx1031.bc # [linux] - - test -f $SP_DIR/numba/openmp/libs/libomp/lib/libomptarget-amdgpu-gfx700.bc # [linux] - - test -f $SP_DIR/numba/openmp/libs/libomp/lib/libomptarget-amdgpu-gfx701.bc # [linux] - - test -f $SP_DIR/numba/openmp/libs/libomp/lib/libomptarget-amdgpu-gfx801.bc # [linux] - - test -f $SP_DIR/numba/openmp/libs/libomp/lib/libomptarget-amdgpu-gfx803.bc # [linux] - - test -f $SP_DIR/numba/openmp/libs/libomp/lib/libomptarget-amdgpu-gfx900.bc # [linux] - - test -f $SP_DIR/numba/openmp/libs/libomp/lib/libomptarget-amdgpu-gfx902.bc # [linux] - - test -f $SP_DIR/numba/openmp/libs/libomp/lib/libomptarget-amdgpu-gfx906.bc # [linux] - - test -f $SP_DIR/numba/openmp/libs/libomp/lib/libomptarget-amdgpu-gfx908.bc # [linux] - - test -f $SP_DIR/numba/openmp/libs/libomp/lib/libomptarget-amdgpu-gfx90a.bc # [linux] - - test -f $SP_DIR/numba/openmp/libs/libomp/lib/libomptarget-nvptx-sm_35.bc # [linux] - - test -f $SP_DIR/numba/openmp/libs/libomp/lib/libomptarget-nvptx-sm_37.bc # [linux] - - test -f $SP_DIR/numba/openmp/libs/libomp/lib/libomptarget-nvptx-sm_50.bc # [linux] - - test -f $SP_DIR/numba/openmp/libs/libomp/lib/libomptarget-nvptx-sm_52.bc # [linux] - - test -f $SP_DIR/numba/openmp/libs/libomp/lib/libomptarget-nvptx-sm_53.bc # [linux] - - test -f $SP_DIR/numba/openmp/libs/libomp/lib/libomptarget-nvptx-sm_60.bc # [linux] - - test -f $SP_DIR/numba/openmp/libs/libomp/lib/libomptarget-nvptx-sm_61.bc # [linux] - - test -f $SP_DIR/numba/openmp/libs/libomp/lib/libomptarget-nvptx-sm_62.bc # [linux] - - test -f 
$SP_DIR/numba/openmp/libs/libomp/lib/libomptarget-nvptx-sm_70.bc # [linux] - - test -f $SP_DIR/numba/openmp/libs/libomp/lib/libomptarget-nvptx-sm_72.bc # [linux] - - test -f $SP_DIR/numba/openmp/libs/libomp/lib/libomptarget-nvptx-sm_75.bc # [linux] - - test -f $SP_DIR/numba/openmp/libs/libomp/lib/libomptarget-nvptx-sm_80.bc # [linux] - - test -f $SP_DIR/numba/openmp/libs/libomp/lib/libomptarget-nvptx-sm_86.bc # [linux] - - test -f $SP_DIR/numba/openmp/libs/libomp/lib/libomptarget.rtl.amdgpu.so # [linux] - - test -f $SP_DIR/numba/openmp/libs/libomp/lib/libomptarget.rtl.cuda.so # [linux] - - test -f $SP_DIR/numba/openmp/libs/libomp/lib/libomptarget.rtl.ppc64.so # [linux and ppc64le] - - test -f $SP_DIR/numba/openmp/libs/libomp/lib/libomptarget.rtl.x86_64.so # [linux and x86_64] - - test -f $SP_DIR/numba/openmp/libs/libomp/lib/libomptarget.rtl.aarch64.so # [linux and aarch64] - - test -f $SP_DIR/numba/openmp/libs/libomp/lib/libomptarget.so # [linux] + - test -f $SP_DIR/numba/openmp/libs/pass/libIntrinsicsOpenMP.dylib # [osx] + - test -f $SP_DIR/numba/openmp/libs/pass/libIntrinsicsOpenMP.so # [linux] + - test -f $SP_DIR/numba/openmp/libs/openmp/lib/libomptarget-amdgpu.bc # [linux] + - test -f $SP_DIR/numba/openmp/libs/openmp/lib/libomptarget-nvptx.bc # [linux] + - test -f $SP_DIR/numba/openmp/libs/openmp/lib/libomptarget.so # [linux] about: home: https://github.com/Python-for-HPC/PyOMP diff --git a/buildscripts/gitlab/build-and-test.sh b/buildscripts/gitlab/build-and-test.sh index 4121e4402cfa..ab734f48d036 100644 --- a/buildscripts/gitlab/build-and-test.sh +++ b/buildscripts/gitlab/build-and-test.sh @@ -4,11 +4,12 @@ set -e # Create a unique temporary directory for this job. TMPDIR=/tmp/pyomp/${CI_JOB_ID} +rm -rf ${TMPDIR} mkdir -p ${TMPDIR} pushd ${TMPDIR} # Set the LLVM_VERSION to use. -export LLVM_VERSION="15.0.7" +export LLVM_VERSION="20.1.8" # Set the envs directory under the temporary directory. 
export CONDA_ENVS_DIRS="${TMPDIR}/_stage/miniconda3/envs" @@ -17,14 +18,18 @@ export CONDA_ENVS_DIRS="${TMPDIR}/_stage/miniconda3/envs" source ${CI_PROJECT_DIR}/buildscripts/cibuildwheel/setup-miniconda3.sh # Export environment variables for building and testing. +export ENABLE_BUNDLED_LIBOMP="1" +export ENABLE_BUNDLED_LIBOMPTARGET="1" export LLVM_DIR="${CONDA_ENVS_DIRS}/llvmdev-${LLVM_VERSION}" -export PATH="${CONDA_ENVS_DIRS}/llvmdev-${LLVM_VERSION}/bin:${PATH}" +export CMAKE_PREFIX_PATH="${CONDA_PREFIX}" export USE_CXX11_ABI="1" export PIP_NO_INPUT="1" # Create and activate a conda environment with the desired Python version. conda create -n py-${PYOMP_CI_PYTHON_VERSION} -c conda-forge -y python=${PYOMP_CI_PYTHON_VERSION} conda activate py-${PYOMP_CI_PYTHON_VERSION} +# Add extra packages needed to build openmp libraries. +conda install -c conda-forge -y zstd libffi # Clone and fetch the commit with history for package versioning. git clone https://github.com/${GITHUB_PROJECT_ORG}/${GITHUB_PROJECT_NAME}.git --single-branch @@ -33,7 +38,7 @@ git fetch origin ${CI_COMMIT_SHA} git checkout ${CI_COMMIT_SHA} # Install pyomp. -CC=gcc CXX=g++ python -m pip install -v . +CC=clang CXX=clang++ python -m pip install -v . # Run host OpenMP tests. 
TEST_DEVICES=0 RUN_TARGET=0 python -m numba.runtests -v -- numba.openmp.tests.test_openmp diff --git a/examples/ploop.1.py b/examples/ploop.1.py index 19f3113892a0..28a21e4a11ea 100644 --- a/examples/ploop.1.py +++ b/examples/ploop.1.py @@ -1,13 +1,14 @@ -import numba +from numba.openmp import njit from numba.openmp import openmp_context as openmp -from numba.openmp import omp_set_num_threads, omp_get_thread_num, omp_get_num_threads, omp_get_wtime import numpy as np -@numba.njit + +@njit def simple(n, a, b): with openmp("parallel for"): for i in range(1, n): - b[i] = (a[i] + a[i-1]) / 2.0 + b[i] = (a[i] + a[i - 1]) / 2.0 + a = np.ones(100) b = np.empty(len(a)) diff --git a/pyproject.toml b/pyproject.toml index 03874900678d..41f8782b8523 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,5 +1,5 @@ [build-system] -requires = ["setuptools>=75.3", "wheel", "setuptools-scm>=8", "cmake>=3.20"] +requires = ["setuptools>=75.3", "setuptools-scm>=8", "cmake>=3.20", "ninja"] build-backend = "setuptools.build_meta" [project] @@ -7,7 +7,7 @@ name = "pyomp" dynamic = ["version"] description = "Python OpenMP library based on Numba" readme = "README.md" -requires-python = ">=3.10, <3.14" +requires-python = ">=3.10, <3.15" license = "BSD-2-Clause" license-files = ["LICENSE", "LICENSE-OPENMP.txt"] classifiers = [ @@ -17,7 +17,7 @@ classifiers = [ "Intended Audience :: Developers", "Topic :: Software Development :: Compilers", ] -dependencies = ["numba>=0.61, <0.62", "lark", "cffi", "setuptools"] +dependencies = ["numba>=0.62, <0.64", "lark", "cffi", "setuptools"] maintainers = [ { name = "Giorgis Georgakoudis", email = "georgakoudis1@llnl.gov" }, ] @@ -37,7 +37,7 @@ include = ["numba.openmp*"] # Bundle the CMake-installed artifacts into the wheel. 
[tool.setuptools.package-data] -"numba.openmp.libs" = ["pass/*", "libomp/**/*"] +"numba.openmp.libs" = ["pass/*", "openmp/**/*"] # setuptools-scm config [tool.setuptools_scm] @@ -49,7 +49,6 @@ archs = ["native"] # Pass LLVM_VERSION from the host environment to cibuildwheel. environment-pass = ["LLVM_VERSION"] # We use miniconda3 to get the clang/llvm toolchain on Linux. -before-all = ["bash buildscripts/cibuildwheel/setup-miniconda3.sh"] before-build = ["rm -rf build dist src/*.egg-info"] skip = ["*-musllinux_*", "cp38-*"] test-command = [ @@ -59,14 +58,32 @@ test-command = [ "OMP_TARGET_OFFLOAD=mandatory TEST_DEVICES=1 RUN_TARGET=1 python -m numba.runtests -v -- numba.openmp.tests.test_openmp.TestOpenmpTarget", ] +[tool.cibuildwheel.environment] +USE_CXX11_ABI = "1" +PIP_NO_INPUT = "1" + [tool.cibuildwheel.linux] before-all = [ - "yum install -y elfutils-libelf-devel libffi-devel", - "bash buildscripts/cibuildwheel/setup-miniconda3.sh", + "yum install -y elfutils-libelf-devel libffi-devel clang-devel-20.1.8 llvm-devel-20.1.8", ] -[tool.cibuildwheel.environment] -LLVM_DIR = "${PWD}/_stage/miniconda3/envs/llvmdev-${LLVM_VERSION}" -PATH = "${PWD}/_stage/miniconda3/envs/llvmdev-${LLVM_VERSION}/bin:${PATH}" -USE_CXX11_ABI = "1" -PIP_NO_INPUT = "1" +[tool.cibuildwheel.linux.environment] +ENABLE_BUNDLED_LIBOMP = "1" +ENABLE_BUNDLED_LIBOMPTARGET = "1" +LLVM_DIR = "/usr/lib64/cmake/llvm" +CC = "/usr/bin/clang" +CXX = "/usr/bin/clang++" + +[tool.cibuildwheel.macos] +before-all = ["bash buildscripts/cibuildwheel/setup-miniconda3.sh"] + +[tool.cibuildwheel.macos.environment] +ENABLE_BUNDLED_LIBOMP = "1" +LLVM_DIR = "${PWD}/_stage/miniconda3/envs/llvmdev-${LLVM_VERSION}/lib/cmake/llvm" +CC = "${PWD}/_stage/miniconda3/envs/llvmdev-${LLVM_VERSION}/bin/clang" +CXX = "${PWD}/_stage/miniconda3/envs/llvmdev-${LLVM_VERSION}/bin/clang++" +# Set the deployment target to macOS 11.0.
+MACOSX_DEPLOYMENT_TARGET = "11.0" +# Set the cmake prefix path to find libraries in the conda environment which is +# compatible with the macos target. +CMAKE_PREFIX_PATH = "${PWD}/_stage/miniconda3/envs/llvmdev-${LLVM_VERSION}/" diff --git a/setup.py b/setup.py index 57f5b8fd9510..dc5404195390 100644 --- a/setup.py +++ b/setup.py @@ -8,11 +8,7 @@ from setuptools import setup, Extension from setuptools import Command from setuptools.command.build_ext import build_ext - -try: - from wheel.bdist_wheel import bdist_wheel as _bdist_wheel -except ImportError: - _bdist_wheel = None +from setuptools.command.bdist_wheel import bdist_wheel class CleanCommand(Command): @@ -33,26 +29,32 @@ def run(self): shutil.rmtree(egg_info, ignore_errors=True) -if _bdist_wheel: - - class CustomBdistWheel(_bdist_wheel): - def run(self): - # Ensure all build steps are run before bdist_wheel - self.run_command("build_ext") - super().run() -else: - CustomBdistWheel = None +class CustomBdistWheel(bdist_wheel): + def run(self): + # Ensure all build steps are run before bdist_wheel + self.run_command("build_ext") + super().run() class CMakeExtension(Extension): - def __init__(self, name, *, sourcedir=None, url=None, sha256=None, cmake_args=[]): + def __init__( + self, + name, + *, + source_dir=None, + install_dir=None, + url=None, + sha256=None, + cmake_args=[], + ): # Don't invoke the original build_ext for this special extension. super().__init__(name, sources=[]) - if sourcedir and url: + if source_dir and url: raise ValueError( - "CMakeExtension should have either a sourcedir or a url, not both." + "CMakeExtension should have either a source_dir or a url, not both." 
) - self.sourcedir = sourcedir + self.source_dir = source_dir + self.install_dir = install_dir self.url = url self.sha256 = sha256 self.cmake_args = cmake_args @@ -66,7 +68,13 @@ def run(self): else: super().run() + def finalize_options(self): + super().finalize_options() + # Create placeholder directories for package-data validation. + Path("src/numba/openmp/libs/openmp/lib").mkdir(parents=True, exist_ok=True) + def _build_cmake(self, ext: CMakeExtension): + print("Build CMake extension:", ext.name) # Delete build directory if it exists to avoid errors with stale # CMakeCache.txt leftovers. build_dir = Path(self.build_temp) / ext.name @@ -84,32 +92,41 @@ def _build_cmake(self, ext: CMakeExtension): elif sys.platform == "darwin": extra_cmake_args.append(r"-DCMAKE_INSTALL_RPATH=@loader_path") - install_dir = Path(lib_dir) / ext.name + if ext.install_dir is None: + install_dir = Path(lib_dir) / ext.name + else: + install_dir = Path(lib_dir) / ext.install_dir install_dir.mkdir(parents=True, exist_ok=True) + cfg = ( [ "cmake", "-S", - ext.sourcedir, + ext.source_dir, "-B", build_dir, + "-G", + "Ninja", "-DCMAKE_BUILD_TYPE=Release", f"-DCMAKE_INSTALL_PREFIX={install_dir}", ] + ext.cmake_args + extra_cmake_args ) + print("Configure cmake with args:", cfg) subprocess.run(cfg, check=True, stdin=subprocess.DEVNULL) + print("Build at dir ", build_dir) subprocess.run( ["cmake", "--build", build_dir, "-j"], check=True, stdin=subprocess.DEVNULL ) + print("Install at dir ", install_dir) subprocess.run( ["cmake", "--install", build_dir], check=True, stdin=subprocess.DEVNULL ) # Remove unnecessary files after installing libomp. - if ext.name == "libomp": + if ext.name.startswith("libomp"): # Remove include directory after install. include_dir = install_dir / "include" if include_dir.exists(): @@ -124,40 +141,30 @@ def _env_toolchain_args(self, ext): # Forward LLVM_DIR if provided. 
if os.environ.get("LLVM_DIR"): args.append(f"-DLLVM_DIR={os.environ['LLVM_DIR']}") + # Forward CC, CXX if provided. + if os.environ.get("CC"): + args.append(f"-DCMAKE_C_COMPILER={os.environ['CC']}") + if os.environ.get("CXX"): + args.append(f"-DCMAKE_CXX_COMPILER={os.environ['CXX']}") return args def _prepare_source_openmp(sha256=None): LLVM_VERSION = os.environ.get("LLVM_VERSION", None) assert LLVM_VERSION is not None, "LLVM_VERSION environment variable must be set." - url = f"https://github.com/llvm/llvm-project/releases/download/llvmorg-{LLVM_VERSION}/openmp-{LLVM_VERSION}.src.tar.xz" + url = f"https://github.com/llvm/llvm-project/releases/download/llvmorg-{LLVM_VERSION}/llvm-project-{LLVM_VERSION}.src.tar.xz" - tmp = Path("_downloads/libomp") / f"openmp-{LLVM_VERSION}.tar.gz" + tmp = Path("_downloads/libomp") / f"llvm-project-{LLVM_VERSION}.tar.gz" tmp.parent.mkdir(parents=True, exist_ok=True) # Download the source tarball if it does not exist. if not tmp.exists(): - print(f"download openmp version {LLVM_VERSION} url:", url) + print(f"Downloading llvm-project version {LLVM_VERSION} url:", url) with urllib.request.urlopen(url) as r: with tmp.open("wb") as f: f.write(r.read()) - - # Extract only the major version. - llvm_major_version = tuple(map(int, LLVM_VERSION.split(".")))[0] - # For LLVM versions > 14, we also need to download CMake files. 
- if llvm_major_version > 14: - cmake_url = f"https://github.com/llvm/llvm-project/releases/download/llvmorg-{LLVM_VERSION}/cmake-{LLVM_VERSION}.src.tar.xz" - cmake_file = Path("_downloads/libomp") / f"cmake-{LLVM_VERSION}.tar.gz" - if not cmake_file.exists(): - with urllib.request.urlopen(cmake_url) as r: - with cmake_file.open("wb") as f: - f.write(r.read()) - with tarfile.open(cmake_file) as tf: - tf.extractall(cmake_file.parent) - src = cmake_file.parent / tf.getnames()[0] - dst = cmake_file.parent / "cmake" - if not dst.exists(): - src.rename(dst) + else: + print(f"Using downloaded llvm-project at {tmp}") if sha256: import hashlib @@ -168,40 +175,111 @@ def _prepare_source_openmp(sha256=None): if hasher.hexdigest() != sha256: raise ValueError(f"SHA256 mismatch for {url}") + print("Extracting llvm-project...") with tarfile.open(tmp) as tf: - # We assume the tarball contains a single directory with the source files. - sourcedir = tmp.parent / tf.getnames()[0] - tf.extractall(tmp.parent) - - for patch in ( - Path(f"src/numba/openmp/libs/libomp/patches/{LLVM_VERSION}") + # The root dir llvm-project-20.1.8.src + root_name = tf.getnames()[0] + + # Extract only needed subdirectories + members = [ + m + for m in tf.getmembers() + if m.name.startswith(f"{root_name}/openmp/") + or m.name.startswith(f"{root_name}/offload/") + or m.name.startswith(f"{root_name}/runtimes/") + or m.name.startswith(f"{root_name}/cmake/") + or m.name.startswith(f"{root_name}/llvm/cmake/") + or m.name.startswith(f"{root_name}/llvm/utils/") + or m.name.startswith(f"{root_name}/libc/") + ] + + parentdir = tmp.parent + # Base arguments for extractall. + kwargs = {"path": parentdir, "members": members} + + # Check if data filter is available. 
+ if hasattr(tarfile, "data_filter"): + # If this exists, the 'filter' argument is guaranteed to work + kwargs["filter"] = "data" + + tf.extractall(**kwargs) + + source_dir = parentdir / root_name + print("Extracted llvm-project to:", source_dir) + + print("Applying patches to llvm-project...") + for patch in sorted( + Path(f"src/numba/openmp/libs/openmp/patches/{LLVM_VERSION}") .absolute() .glob("*.patch") ): print("applying patch", patch) subprocess.run( ["patch", "-p1", "-i", str(patch)], - cwd=sourcedir, + cwd=source_dir, check=True, stdin=subprocess.DEVNULL, ) - return sourcedir + return f"{source_dir}/runtimes" -setup( - ext_modules=[ - CMakeExtension("pass", sourcedir="src/numba/openmp/libs/pass"), +def _check_true(env_var): + val = os.environ.get(env_var, "0") + return val.lower() in ("1", "true", "yes", "on") + + +# Build extensions: always include 'pass', conditionally include 'openmp' +# libraries. +ext_modules = [CMakeExtension("pass", source_dir="src/numba/openmp/libs/pass")] + + +# Prepare source directory if either bundled libomp or libomptarget is enabled. +if _check_true("ENABLE_BUNDLED_LIBOMP") or _check_true("ENABLE_BUNDLED_LIBOMPTARGET"): + openmp_source_dir = _prepare_source_openmp() + +# Optionally enable bundled libomp build via ENABLE_BUNDLED_LIBOMP=1. We want +# to avoid bundling for conda builds to avoid duplicate OpenMP runtime conflicts +# (e.g., numba 0.62+ and libopenblas already require llvm-openmp). +if _check_true("ENABLE_BUNDLED_LIBOMP"): + ext_modules.append( CMakeExtension( "libomp", - sourcedir=_prepare_source_openmp(), + source_dir=openmp_source_dir, + install_dir="openmp", cmake_args=[ + "-DOPENMP_STANDALONE_BUILD=ON", + "-DLLVM_ENABLE_RUNTIMES=openmp", "-DLIBOMP_OMPD_SUPPORT=OFF", - "-DLIBOMP_OMPT_SUPPORT=OFF", - "-DCMAKE_INSTALL_LIBDIR=lib", + "-DOPENMP_ENABLE_OMPT_TOOLS=OFF", + # Avoid conflicts in manylinux builds with packaged clang/llvm + # under /usr/include and its gcc-toolset provided header files. 
+ "-DCMAKE_NO_SYSTEM_FROM_IMPORTED=ON", + ], + ) + ) + +# Optionally enable bundled libomptarget build via ENABLE_BUNDLED_LIBOMPTARGET=1. +# We avoid building and bundling for unsupported platforms. +if _check_true("ENABLE_BUNDLED_LIBOMPTARGET"): + ext_modules.append( + CMakeExtension( + "libomptarget", + source_dir=openmp_source_dir, + install_dir="openmp", + cmake_args=[ + "-DOPENMP_STANDALONE_BUILD=ON", + "-DLLVM_ENABLE_RUNTIMES=offload", + # Avoid conflicts in manylinux builds with packaged clang/llvm + # under /usr/include and its gcc-toolset provided header files. + "-DCMAKE_NO_SYSTEM_FROM_IMPORTED=ON", ], - ), - ], + ) + ) + + +setup( + ext_modules=ext_modules, cmdclass={ "clean": CleanCommand, "build_ext": BuildCMakeExt, diff --git a/src/numba/openmp/__init__.py b/src/numba/openmp/__init__.py index 031c0a637616..1396f52ae9f7 100644 --- a/src/numba/openmp/__init__.py +++ b/src/numba/openmp/__init__.py @@ -60,25 +60,45 @@ def _init(): sys_platform = sys.platform + from ctypes.util import find_library omplib = ( libpath - / "libomp" + / "openmp" / "lib" / f"libomp{'.dylib' if sys_platform == 'darwin' else '.so'}" ) - if DEBUG_OPENMP >= 1: - print("Found OpenMP runtime library at", omplib) - ll.load_library_permanently(str(omplib)) + + # Prefer bundled libomp if it exists. + if omplib.exists(): + if DEBUG_OPENMP >= 1: + print("Found bundled OpenMP runtime library at", omplib) + ll.load_library_permanently(str(omplib)) + else: + # There is no bundled libomp, try to find it in standard library paths. + system_omplib = find_library("omp") + if system_omplib: + if DEBUG_OPENMP >= 1: + print(f"Found system OpenMP runtime library: {system_omplib}") + ll.load_library_permanently(system_omplib) + else: + raise RuntimeError( + f"OpenMP runtime not found. Bundled library missing at {omplib} " + "and no system libomp found via ctypes.util.find_library('omp'). " + "Ensure libomp is available in library paths." + ) # libomptarget is unavailable on apple, windows, so return. 
if sys_platform.startswith("darwin") or sys_platform.startswith("win32"): return - omptargetlib = libpath / "libomp" / "lib" / "libomptarget.so" - if DEBUG_OPENMP >= 1: - print("Found OpenMP target runtime library at", omptargetlib) - ll.load_library_permanently(str(omptargetlib)) + omptargetlib = libpath / "openmp" / "lib" / "libomptarget.so" + if omptargetlib.exists(): + if DEBUG_OPENMP >= 1: + print("Found OpenMP target runtime library at", omptargetlib) + ll.load_library_permanently(str(omptargetlib)) + else: + raise RuntimeError(f"OpenMP target runtime not found at {omptargetlib}") _init() diff --git a/src/numba/openmp/compiler.py b/src/numba/openmp/compiler.py index 3eb39509f0b7..ed71888c226f 100644 --- a/src/numba/openmp/compiler.py +++ b/src/numba/openmp/compiler.py @@ -306,7 +306,19 @@ def add_llvm_module(self, ll_module): def _finalize_specific(self): super()._finalize_specific() - ll.ExecutionEngine.run_static_constructors(self._codegen._engine._ee) + # Run target offloading descriptor registration functions, if there are any. 
+ import ctypes + + ee = self._codegen._engine._ee + for func in self.get_defined_functions(): + if not func.name.startswith(".omp_offloading.descriptor_reg"): + continue + addr = ee.get_function_address(func.name) + reg = ctypes.CFUNCTYPE(None)(addr) + try: + reg() + except Exception: + raise RuntimeError("error registering OpenMP offloading descriptor") class CustomFunctionCompiler(_FunctionCompiler): diff --git a/src/numba/openmp/config.py b/src/numba/openmp/config.py index d4f380b5069c..84e2edc48545 100644 --- a/src/numba/openmp/config.py +++ b/src/numba/openmp/config.py @@ -23,4 +23,5 @@ def _safe_readenv(name, ctor, default): config.DEBUG_ARRAY_OPT = 1 DEBUG_OPENMP_LLVM_PASS = _safe_readenv("NUMBA_DEBUG_OPENMP_LLVM_PASS", int, 0) OPENMP_DISABLED = _safe_readenv("NUMBA_OPENMP_DISABLED", int, 0) -OPENMP_DEVICE_TOOLCHAIN = _safe_readenv("NUMBA_OPENMP_DEVICE_TOOLCHAIN", int, 0) +# Use toolchain for device code compilation by default to avoid issues with libomptarget compatibility checks. 
+OPENMP_DEVICE_TOOLCHAIN = _safe_readenv("NUMBA_OPENMP_DEVICE_TOOLCHAIN", int, 1) diff --git a/src/numba/openmp/libs/libomp/patches/14.0.6/0001-BACKPORT-Fix-for-CUDA-OpenMP-RTL.patch b/src/numba/openmp/libs/openmp/patches/14.0.6/0001-BACKPORT-Fix-for-CUDA-OpenMP-RTL.patch similarity index 100% rename from src/numba/openmp/libs/libomp/patches/14.0.6/0001-BACKPORT-Fix-for-CUDA-OpenMP-RTL.patch rename to src/numba/openmp/libs/openmp/patches/14.0.6/0001-BACKPORT-Fix-for-CUDA-OpenMP-RTL.patch diff --git a/src/numba/openmp/libs/libomp/patches/14.0.6/0002-Fix-missing-includes.patch b/src/numba/openmp/libs/openmp/patches/14.0.6/0002-Fix-missing-includes.patch similarity index 100% rename from src/numba/openmp/libs/libomp/patches/14.0.6/0002-Fix-missing-includes.patch rename to src/numba/openmp/libs/openmp/patches/14.0.6/0002-Fix-missing-includes.patch diff --git a/src/numba/openmp/libs/libomp/patches/14.0.6/0003-Link-static-LLVM-libs.patch b/src/numba/openmp/libs/openmp/patches/14.0.6/0003-Link-static-LLVM-libs.patch similarity index 100% rename from src/numba/openmp/libs/libomp/patches/14.0.6/0003-Link-static-LLVM-libs.patch rename to src/numba/openmp/libs/openmp/patches/14.0.6/0003-Link-static-LLVM-libs.patch diff --git a/src/numba/openmp/libs/libomp/patches/15.0.7/0001-Fix-missing-includes.patch b/src/numba/openmp/libs/openmp/patches/15.0.7/0001-Fix-missing-includes.patch similarity index 100% rename from src/numba/openmp/libs/libomp/patches/15.0.7/0001-Fix-missing-includes.patch rename to src/numba/openmp/libs/openmp/patches/15.0.7/0001-Fix-missing-includes.patch diff --git a/src/numba/openmp/libs/libomp/patches/15.0.7/0002-Link-LLVM-statically.patch b/src/numba/openmp/libs/openmp/patches/15.0.7/0002-Link-LLVM-statically.patch similarity index 100% rename from src/numba/openmp/libs/libomp/patches/15.0.7/0002-Link-LLVM-statically.patch rename to src/numba/openmp/libs/openmp/patches/15.0.7/0002-Link-LLVM-statically.patch diff --git 
a/src/numba/openmp/libs/libomp/patches/15.0.7/0003-Disable-opaque-pointers-DeviceRTL-bitcode.patch b/src/numba/openmp/libs/openmp/patches/15.0.7/0003-Disable-opaque-pointers-DeviceRTL-bitcode.patch similarity index 100% rename from src/numba/openmp/libs/libomp/patches/15.0.7/0003-Disable-opaque-pointers-DeviceRTL-bitcode.patch rename to src/numba/openmp/libs/openmp/patches/15.0.7/0003-Disable-opaque-pointers-DeviceRTL-bitcode.patch diff --git a/src/numba/openmp/libs/libomp/patches/16.0.6/0001-Load-plugins-from-install-directory.patch b/src/numba/openmp/libs/openmp/patches/16.0.6/0001-Load-plugins-from-install-directory.patch similarity index 100% rename from src/numba/openmp/libs/libomp/patches/16.0.6/0001-Load-plugins-from-install-directory.patch rename to src/numba/openmp/libs/openmp/patches/16.0.6/0001-Load-plugins-from-install-directory.patch diff --git a/src/numba/openmp/libs/libomp/patches/16.0.6/0002-Link-LLVM-statically.patch b/src/numba/openmp/libs/openmp/patches/16.0.6/0002-Link-LLVM-statically.patch similarity index 100% rename from src/numba/openmp/libs/libomp/patches/16.0.6/0002-Link-LLVM-statically.patch rename to src/numba/openmp/libs/openmp/patches/16.0.6/0002-Link-LLVM-statically.patch diff --git a/src/numba/openmp/libs/openmp/patches/20.1.8/0001-Enable-standalone-build.patch b/src/numba/openmp/libs/openmp/patches/20.1.8/0001-Enable-standalone-build.patch new file mode 100644 index 000000000000..9f73e9d3fb0c --- /dev/null +++ b/src/numba/openmp/libs/openmp/patches/20.1.8/0001-Enable-standalone-build.patch @@ -0,0 +1,13 @@ +diff --git a/offload/CMakeLists.txt b/offload/CMakeLists.txt +index f6e894d39..9096d4ed5 100644 +--- a/offload/CMakeLists.txt ++++ b/offload/CMakeLists.txt +@@ -4,7 +4,7 @@ + cmake_minimum_required(VERSION 3.20.0) + set(LLVM_SUBPROJECT_TITLE "liboffload") + +-if ("${CMAKE_SOURCE_DIR}" STREQUAL "${CMAKE_CURRENT_SOURCE_DIR}") ++if (OPENMP_STANDALONE_BUILD OR "${CMAKE_SOURCE_DIR}" STREQUAL "${CMAKE_CURRENT_SOURCE_DIR}") + 
set(OPENMP_STANDALONE_BUILD TRUE) + project(offload C CXX ASM) + else() diff --git a/src/numba/openmp/libs/openmp/patches/20.1.8/0002-Link-statically-LLVM.patch b/src/numba/openmp/libs/openmp/patches/20.1.8/0002-Link-statically-LLVM.patch new file mode 100644 index 000000000000..43a9aaff0d74 --- /dev/null +++ b/src/numba/openmp/libs/openmp/patches/20.1.8/0002-Link-statically-LLVM.patch @@ -0,0 +1,24 @@ +diff --git a/offload/plugins-nextgen/CMakeLists.txt b/offload/plugins-nextgen/CMakeLists.txt +index 9b5b12bea..78dde405b 100644 +--- a/offload/plugins-nextgen/CMakeLists.txt ++++ b/offload/plugins-nextgen/CMakeLists.txt +@@ -3,6 +3,7 @@ set(common_dir ${CMAKE_CURRENT_SOURCE_DIR}/common) + add_subdirectory(common) + function(add_target_library target_name lib_name) + add_llvm_library(${target_name} STATIC ++ DISABLE_LLVM_LINK_LLVM_DYLIB + LINK_COMPONENTS + AggressiveInstCombine + Analysis +diff --git a/offload/src/CMakeLists.txt b/offload/src/CMakeLists.txt +index c5f5d902f..ca5135b13 100644 +--- a/offload/src/CMakeLists.txt ++++ b/offload/src/CMakeLists.txt +@@ -8,6 +8,7 @@ endif() + + add_llvm_library(omptarget + SHARED ++ DISABLE_LLVM_LINK_LLVM_DYLIB + + device.cpp + interface.cpp diff --git a/src/numba/openmp/libs/openmp/patches/20.1.8/0003-Do-not-build-liboffload.patch b/src/numba/openmp/libs/openmp/patches/20.1.8/0003-Do-not-build-liboffload.patch new file mode 100644 index 000000000000..bc64017d94cf --- /dev/null +++ b/src/numba/openmp/libs/openmp/patches/20.1.8/0003-Do-not-build-liboffload.patch @@ -0,0 +1,12 @@ +diff --git a/offload/CMakeLists.txt b/offload/CMakeLists.txt +index 9096d4e..a0aff92 100644 +--- a/offload/CMakeLists.txt ++++ b/offload/CMakeLists.txt +@@ -380,7 +380,6 @@ add_subdirectory(tools) + add_subdirectory(src) + + add_subdirectory(tools/offload-tblgen) +-add_subdirectory(liboffload) + + # Add tests. 
+ add_subdirectory(test) diff --git a/src/numba/openmp/libs/pass/CGIntrinsicsOpenMP.cpp b/src/numba/openmp/libs/pass/CGIntrinsicsOpenMP.cpp index 309b23cb3466..553483e9581d 100644 --- a/src/numba/openmp/libs/pass/CGIntrinsicsOpenMP.cpp +++ b/src/numba/openmp/libs/pass/CGIntrinsicsOpenMP.cpp @@ -1,7 +1,7 @@ #include "CGIntrinsicsOpenMP.h" #include "DebugOpenMP.h" -#include +#include #include #include #include @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include @@ -248,10 +249,10 @@ OutlinedInfoStruct CGIntrinsicsOpenMP::createOutlinedFunction( assert(SinkingCands.empty() && "Expected empty alloca sinking candidates"); auto IsTempOrDefaultPrivate = [](Value *V) { - if (V->getName().startswith(".")) + if (V->getName().starts_with(".")) return true; - if (V->getName().startswith("excinfo")) + if (V->getName().starts_with("excinfo")) return true; if (V->getName() == "quot") @@ -292,8 +293,8 @@ OutlinedInfoStruct CGIntrinsicsOpenMP::createOutlinedFunction( DSAType DSA = DSAValueMap[V].Type; - DEBUG_ENABLE(dbgs() << "V " << *V << " from DSAValueMap Type " << DSA - << "\n"); + DEBUG_ENABLE(dbgs() << "V " << *V << " from DSAValueMap Type " + << toString(DSA) << "\n"); switch (DSA) { case DSA_PRIVATE: Privates.push_back(V); @@ -514,8 +515,10 @@ OutlinedInfoStruct CGIntrinsicsOpenMP::createOutlinedFunction( // Deterministic insertion of BBs, BlockVector needs ExitBB to move to the // outlined function. 
BlockVector.push_back(OI.ExitBB); - for (auto *BB : BlockVector) - BB->moveBefore(OutlinedExitBB); + for (auto *BB : BlockVector) { + BB->removeFromParent(); + BB->insertInto(OutlinedFn, OutlinedExitBB); + } DEBUG_ENABLE(dbgs() << "=== Dump OutlinedFn\n" << *OutlinedFn << "=== End of Dump OutlinedFn\n"); @@ -741,11 +744,8 @@ void CGIntrinsicsOpenMP::emitOMPParallelDeviceRuntime( OMPBuilder.Builder.CreateLoad(OMPBuilder.Int8PtrPtr, GlobalArgs); Value *GEP = OMPBuilder.Builder.CreateConstInBoundsGEP1_64( OMPBuilder.Int8Ptr, LoadGlobalArgs, Idx); - - Value *Bitcast = OMPBuilder.Builder.CreateBitCast( - GEP, CapturedVars[Idx]->getType()->getPointerTo()); Value *Load = - OMPBuilder.Builder.CreateLoad(CapturedVars[Idx]->getType(), Bitcast); + OMPBuilder.Builder.CreateLoad(CapturedVars[Idx]->getType(), GEP); OutlinedFnArgs.push_back(Load); } @@ -1377,11 +1377,13 @@ void CGIntrinsicsOpenMP::emitLoop(DSAValueMapTy &DSAValueMap, } OMPBuilder.Builder.SetInsertPoint(ExitBB->getTerminator()); - OMPBuilder.createBarrier(OpenMPIRBuilder::LocationDescription( - OMPBuilder.Builder.saveIP(), Loc.DL), - omp::Directive::OMPD_for, - /* ForceSimpleCall */ false, - /* CheckCancelFlag */ false); + auto IP = OMPBuilder.createBarrier(OpenMPIRBuilder::LocationDescription( + OMPBuilder.Builder.saveIP(), Loc.DL), + omp::Directive::OMPD_for, + /* ForceSimpleCall */ false, + /* CheckCancelFlag */ false); + if (auto E = IP.takeError()) + FATAL_ERROR("Failed to create barrier: " + toString(std::move(E))); } if (verifyFunction(*PreHeader->getParent(), &errs())) @@ -1425,7 +1427,7 @@ void CGIntrinsicsOpenMP::emitOMPTask(DSAValueMapTy &DSAValueMap, Function *Fn, StructType::create({OMPBuilder.VoidPtr, OMPBuilder.TaskRoutineEntryPtr, OMPBuilder.Int32, KmpCmplrdataTy, KmpCmplrdataTy}, "struct.kmp_task_t"); - Type *KmpTaskTPtrTy = KmpTaskTTy->getPointerTo(); + Type *KmpTaskTPtrTy = PointerType::getUnqual(M.getContext()); FunctionCallee KmpcOmpTaskAlloc = OMPBuilder.getOrCreateRuntimeFunction(M, 
OMPRTL___kmpc_omp_task_alloc); @@ -1452,13 +1454,13 @@ void CGIntrinsicsOpenMP::emitOMPTask(DSAValueMapTy &DSAValueMap, Function *Fn, else KmpSharedsTTy = StructType::create(SharedsTy, "struct.kmp_shareds"); assert(KmpSharedsTTy && "Expected non-null KmpSharedsTTy"); - Type *KmpSharedsTPtrTy = KmpSharedsTTy->getPointerTo(); + Type *KmpSharedsTPtrTy = PointerType::getUnqual(M.getContext()); StructType *KmpPrivatesTTy = StructType::create(PrivatesTy, "struct.kmp_privates"); - Type *KmpPrivatesTPtrTy = KmpPrivatesTTy->getPointerTo(); + Type *KmpPrivatesTPtrTy = PointerType::getUnqual(M.getContext()); StructType *KmpTaskTWithPrivatesTy = StructType::create( {KmpTaskTTy, KmpPrivatesTTy}, "struct.kmp_task_t_with_privates"); - Type *KmpTaskTWithPrivatesPtrTy = KmpTaskTWithPrivatesTy->getPointerTo(); + Type *KmpTaskTWithPrivatesPtrTy = PointerType::getUnqual(M.getContext()); // Declare the task entry function. Function *TaskEntryFn = Function::Create( @@ -1615,11 +1617,14 @@ void CGIntrinsicsOpenMP::emitOMPTask(DSAValueMapTy &DSAValueMap, Function *Fn, BasicBlock::Create(M.getContext(), "entry", TaskOutlinedFn); BasicBlock *TaskOutlinedExitBB = BasicBlock::Create(M.getContext(), "exit", TaskOutlinedFn); - for (BasicBlock *BB : OutlinedBlockVector) - BB->moveBefore(TaskOutlinedExitBB); + for (BasicBlock *BB : OutlinedBlockVector) { + BB->removeFromParent(); + BB->insertInto(TaskOutlinedFn, TaskOutlinedExitBB); + } // Explicitly move EndBB to the outlined functions, since OutlineInfo // does not contain it in the OutlinedBlockVector. 
- EndBB->moveBefore(TaskOutlinedExitBB); + EndBB->removeFromParent(); + EndBB->insertInto(TaskOutlinedFn, TaskOutlinedExitBB); EndBB->getTerminator()->setSuccessor(0, TaskOutlinedExitBB); OMPBuilder.Builder.SetInsertPoint(TaskOutlinedEntryBB); @@ -1727,20 +1732,6 @@ void CGIntrinsicsOpenMP::emitOMPOffloadingMappings( SmallVector OffloadMapTypes; SmallVector OffloadMapNames; - if (DSAValueMap.empty()) { - OffloadingMappingArgs.Size = 0; - OffloadingMappingArgs.BasePtrs = - Constant::getNullValue(OMPBuilder.VoidPtrPtr); - OffloadingMappingArgs.Ptrs = Constant::getNullValue(OMPBuilder.VoidPtrPtr); - OffloadingMappingArgs.Sizes = Constant::getNullValue(OMPBuilder.Int64Ptr); - OffloadingMappingArgs.MapTypes = - Constant::getNullValue(OMPBuilder.Int64Ptr); - OffloadingMappingArgs.MapNames = - Constant::getNullValue(OMPBuilder.VoidPtrPtr); - - return; - } - auto EmitMappingEntry = [&](Value *Size, uint64_t MapType, Value *BasePtr, Value *Ptr) { OffloadMapTypes.push_back(ConstantInt::get(OMPBuilder.SizeTy, MapType)); @@ -1749,8 +1740,8 @@ void CGIntrinsicsOpenMP::emitOMPOffloadingMappings( OffloadMapNames.push_back(OMPBuilder.getOrCreateSrcLocStr( BasePtr->getName(), "", 0, 0, SrcLocStrSize)); DEBUG_ENABLE(dbgs() << "Emit mapping entry BasePtr " << *BasePtr << " Ptr " - << *Ptr << " Size " << *Size << " MapType " << MapType - << "\n"); + << *Ptr << " Size " << *Size << " MapType 0x" + << toHex(MapType) << "\n"); MapperInfos.push_back({BasePtr, Ptr, Size}); }; @@ -1968,25 +1959,19 @@ void CGIntrinsicsOpenMP::emitOMPOffloadingMappings( auto *GEP = OMPBuilder.Builder.CreateInBoundsGEP( BasePtrsAlloca->getAllocatedType(), BasePtrsAlloca, {OMPBuilder.Builder.getInt32(0), OMPBuilder.Builder.getInt32(Idx)}); - auto *Bitcast = OMPBuilder.Builder.CreateBitCast( - GEP, MI.BasePtr->getType()->getPointerTo()); - OMPBuilder.Builder.CreateStore(MI.BasePtr, Bitcast); + OMPBuilder.Builder.CreateStore(MI.BasePtr, GEP); // Store in the pointers alloca. 
GEP = OMPBuilder.Builder.CreateInBoundsGEP( PtrsAlloca->getAllocatedType(), PtrsAlloca, {OMPBuilder.Builder.getInt32(0), OMPBuilder.Builder.getInt32(Idx)}); - Bitcast = OMPBuilder.Builder.CreateBitCast( - GEP, MI.Ptr->getType()->getPointerTo()); - OMPBuilder.Builder.CreateStore(MI.Ptr, Bitcast); + OMPBuilder.Builder.CreateStore(MI.Ptr, GEP); // Store in the sizes alloca. GEP = OMPBuilder.Builder.CreateInBoundsGEP( SizesAlloca->getAllocatedType(), SizesAlloca, {OMPBuilder.Builder.getInt32(0), OMPBuilder.Builder.getInt32(Idx)}); - Bitcast = OMPBuilder.Builder.CreateBitCast( - GEP, MI.Size->getType()->getPointerTo()); - OMPBuilder.Builder.CreateStore(MI.Size, Bitcast); + OMPBuilder.Builder.CreateStore(MI.Size, GEP); Idx++; } @@ -1994,12 +1979,9 @@ void CGIntrinsicsOpenMP::emitOMPOffloadingMappings( OffloadingMappingArgs.Size = MapperInfos.size(); // These operations could be also implemented with GEPs on the allocas, not // sure what's best, revisit. - OffloadingMappingArgs.BasePtrs = - OMPBuilder.Builder.CreateBitCast(BasePtrsAlloca, OMPBuilder.VoidPtrPtr); - OffloadingMappingArgs.Ptrs = - OMPBuilder.Builder.CreateBitCast(PtrsAlloca, OMPBuilder.VoidPtrPtr); - OffloadingMappingArgs.Sizes = OMPBuilder.Builder.CreateBitCast( - SizesAlloca, OMPBuilder.SizeTy->getPointerTo()); + OffloadingMappingArgs.BasePtrs = BasePtrsAlloca; + OffloadingMappingArgs.Ptrs = PtrsAlloca; + OffloadingMappingArgs.Sizes = SizesAlloca; } void CGIntrinsicsOpenMP::emitOMPSingle(Function *Fn, BasicBlock *BBEntry, @@ -2019,8 +2001,7 @@ void CGIntrinsicsOpenMP::emitOMPSingle(Function *Fn, BasicBlock *BBEntry, #else auto IPOrError = - OMPBuilder.createSingle(Loc, BodyGenCB, FiniCB, /* IsNoWait*/ false, - /*DidIt*/ nullptr); + OMPBuilder.createSingle(Loc, BodyGenCB, FiniCB, /* IsNoWait*/ false); if (auto E = IPOrError.takeError()) { FATAL_ERROR("Error creating OpenMP single region: " + toString(std::move(E))); @@ -2072,9 +2053,12 @@ void CGIntrinsicsOpenMP::emitOMPBarrier(Function *Fn, BasicBlock 
*BBEntry, InsertPointTy(BBEntry, BBEntry->getTerminator()->getIterator()), DL); // TODO: check ForceSimpleCall usage. - OMPBuilder.createBarrier(Loc, DK, - /*ForceSimpleCall*/ false, - /*CheckCancelFlag*/ true); + auto IP = OMPBuilder.createBarrier(Loc, DK, + /*ForceSimpleCall*/ false, + /*CheckCancelFlag*/ true); + if (auto E = IP.takeError()) { + FATAL_ERROR("Error creating OpenMP barrier: " + toString(std::move(E))); + } DEBUG_ENABLE(dbgs() << "=== Barrier Fn\n" << *Fn << "=== End of Barrier Fn\n"); } @@ -2094,8 +2078,6 @@ CGIntrinsicsOpenMP::emitOffloadingGlobals(StringRef DevWrapperFuncName, GlobalVariable *OMPRegionId = nullptr; GlobalVariable *OMPOffloadEntries = nullptr; - // TODO: assumes 1 target region, can we call tgt_register_lib - // multiple times? OMPRegionId = new GlobalVariable( M, OMPBuilder.Int8, /* isConstant */ true, GlobalValue::WeakAnyLinkage, ConstantInt::get(OMPBuilder.Int8, 0), DevWrapperFuncName + ".region_id", @@ -2127,14 +2109,16 @@ CGIntrinsicsOpenMP::emitOffloadingGlobals(StringRef DevWrapperFuncName, ".omp_offloading.device_image"); GV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global); + auto &Ctx = M.getContext(); + StructType *TgtDeviceImageTy = StructType::create( - {OMPBuilder.Int8Ptr, OMPBuilder.Int8Ptr, - TgtOffloadEntryTy->getPointerTo(), TgtOffloadEntryTy->getPointerTo()}, + {OMPBuilder.Int8Ptr, OMPBuilder.Int8Ptr, PointerType::getUnqual(Ctx), + PointerType::getUnqual(Ctx)}, "struct.__tgt_device_image"); StructType *TgtBinDescTy = StructType::create( - {OMPBuilder.Int32, TgtDeviceImageTy->getPointerTo(), - TgtOffloadEntryTy->getPointerTo(), TgtOffloadEntryTy->getPointerTo()}, + {OMPBuilder.Int32, PointerType::getUnqual(Ctx), + PointerType::getUnqual(Ctx), PointerType::getUnqual(Ctx)}, "struct.__tgt_bin_desc"); auto *ArrayTy = ArrayType::get(TgtDeviceImageTy, 1); @@ -2174,67 +2158,8 @@ CGIntrinsicsOpenMP::emitOffloadingGlobals(StringRef DevWrapperFuncName, /* isConstant */ true, GlobalValue::InternalLinkage, DescInit, 
".omp_offloading.descriptor"); - // Add tgt_register_requires, tgt_register_lib, - // tgt_unregister_lib. - { - // tgt_register_requires. - auto *FuncTy = FunctionType::get(OMPBuilder.Void, /*isVarArg*/ false); - auto *Func = Function::Create(FuncTy, GlobalValue::InternalLinkage, - ".omp_offloading.requires_reg", &M); - Func->setSection(".text.startup"); - - // Get __tgt_register_lib function declaration. - auto *RegFuncTy = FunctionType::get(OMPBuilder.Void, OMPBuilder.Int64, - /*isVarArg*/ false); - FunctionCallee RegFuncC = - M.getOrInsertFunction("__tgt_register_requires", RegFuncTy); - - // Construct function body - IRBuilder<> Builder(BasicBlock::Create(M.getContext(), "entry", Func)); - // TODO: fix to pass the requirements enum value. - Builder.CreateCall(RegFuncC, ConstantInt::get(OMPBuilder.Int64, 1)); - Builder.CreateRetVoid(); - - // Add this function to constructors. - // Set priority to 1 so that __tgt_register_lib is executed - // AFTER - // __tgt_register_requires (we want to know what requirements - // have been asked for before we load a libomptarget plugin so - // that by the time the plugin is loaded it can report how - // many devices there are which can satisfy these - // requirements). - appendToGlobalCtors(M, Func, /*Priority*/ 0); - } - { - // ctor - auto *FuncTy = FunctionType::get(OMPBuilder.Void, /*isVarArg*/ false); - auto *Func = Function::Create(FuncTy, GlobalValue::InternalLinkage, - ".omp_offloading.descriptor_reg", &M); - Func->setSection(".text.startup"); - - // Get __tgt_register_lib function declaration. - auto *RegFuncTy = - FunctionType::get(OMPBuilder.Void, TgtBinDescTy->getPointerTo(), - /*isVarArg*/ false); - FunctionCallee RegFuncC = - M.getOrInsertFunction("__tgt_register_lib", RegFuncTy); - - // Construct function body - IRBuilder<> Builder(BasicBlock::Create(M.getContext(), "entry", Func)); - Builder.CreateCall(RegFuncC, BinDesc); - Builder.CreateRetVoid(); - - // Add this function to constructors. 
- // Set priority to 1 so that __tgt_register_lib is executed - // AFTER - // __tgt_register_requires (we want to know what requirements - // have been asked for before we load a libomptarget plugin so - // that by the time the plugin is loaded it can report how - // many devices there are which can satisfy these - // requirements). - appendToGlobalCtors(M, Func, /*Priority*/ 1); - } - { + // Add tgt_register_lib in global ctors and tgt_unregister_lib in atexit. + auto CreateUnregFunction = [&]() { auto *FuncTy = FunctionType::get(OMPBuilder.Void, /*isVarArg*/ false); auto *Func = Function::Create(FuncTy, GlobalValue::InternalLinkage, ".omp_offloading.descriptor_unreg", &M); @@ -2242,7 +2167,7 @@ CGIntrinsicsOpenMP::emitOffloadingGlobals(StringRef DevWrapperFuncName, // Get __tgt_unregister_lib function declaration. auto *UnRegFuncTy = - FunctionType::get(OMPBuilder.Void, TgtBinDescTy->getPointerTo(), + FunctionType::get(OMPBuilder.Void, PointerType::getUnqual(Ctx), /*isVarArg*/ false); FunctionCallee UnRegFuncC = M.getOrInsertFunction("__tgt_unregister_lib", UnRegFuncTy); @@ -2252,10 +2177,41 @@ CGIntrinsicsOpenMP::emitOffloadingGlobals(StringRef DevWrapperFuncName, Builder.CreateCall(UnRegFuncC, BinDesc); Builder.CreateRetVoid(); - // Add this function to global destructors. - // Match priority of __tgt_register_lib - appendToGlobalDtors(M, Func, /*Priority*/ 1); - } + return Func; + }; + + // Create the registration function constructor. + auto *FuncTy = FunctionType::get(OMPBuilder.Void, /*isVarArg*/ false); + auto *Func = Function::Create(FuncTy, GlobalValue::InternalLinkage, + ".omp_offloading.descriptor_reg", &M); + Func->setSection(".text.startup"); + + // Get __tgt_register_lib function declaration. + auto *RegFuncTy = + FunctionType::get(OMPBuilder.Void, PointerType::getUnqual(Ctx), + /*isVarArg*/ false); + FunctionCallee RegFuncC = + M.getOrInsertFunction("__tgt_register_lib", RegFuncTy); + + // Get atexit function declaration. 
+ auto *AtExitTy = + FunctionType::get(OMPBuilder.Int32, PointerType::getUnqual(Ctx), + /*isVarArg=*/false); + FunctionCallee AtExit = M.getOrInsertFunction("atexit", AtExitTy); + + // Construct function body. + IRBuilder<> Builder(BasicBlock::Create(M.getContext(), "entry", Func)); + Builder.CreateCall(RegFuncC, BinDesc); + + Function *UnregFunc = CreateUnregFunction(); + Builder.CreateCall(AtExit, UnregFunc); + + Builder.CreateRetVoid(); + + // Add this function to constructors. + // Set priority to 101 so that __tgt_register_lib is executed after system + // constructors but before user constructors. + appendToGlobalCtors(M, Func, /*Priority*/ 101); }; EmitOffloadingBinaryGlobals(); @@ -2296,17 +2252,6 @@ void CGIntrinsicsOpenMP::emitOMPTargetHost( Constant *SrcLocStr = OMPBuilder.getOrCreateSrcLocStr(Loc, SrcLocStrSize); Value *Ident = OMPBuilder.getOrCreateIdent(SrcLocStr, SrcLocStrSize); - // TODO: should we use target_mapper without teams or the more general - // target_teams_mapper. Does the former buy us anything (less overhead?) - // FunctionCallee TargetMapper = - // OMPBuilder.getOrCreateRuntimeFunction(M, OMPRTL___tgt_target_mapper); - // TODO: For nowait we need to enclose the host code in a task for async - // execution. - FunctionCallee TargetMapper = - (TargetInfo.NoWait ? OMPBuilder.getOrCreateRuntimeFunction( - M, OMPRTL___tgt_target_teams_nowait_mapper) - : OMPBuilder.getOrCreateRuntimeFunction( - M, OMPRTL___tgt_target_teams_mapper)); OMPBuilder.Builder.SetInsertPoint(EntryBB->getTerminator()); // Emit mappings. @@ -2316,19 +2261,13 @@ void CGIntrinsicsOpenMP::emitOMPTargetHost( emitOMPOffloadingMappings(AllocaIP, DSAValueMap, StructMappingInfoMap, OffloadingMappingArgs, /* isTargetRegion */ true); - // Push the tripcount. + // Set the tripcount, if available. 
+ Value *TripCount = nullptr; if (OMPLoopInfo) { - FunctionCallee TripcountMapper = OMPBuilder.getOrCreateRuntimeFunction( - M, - llvm::omp::RuntimeFunction::OMPRTL___kmpc_push_target_tripcount_mapper); Value *Load = OMPBuilder.Builder.CreateLoad(OMPBuilder.Int64, OMPLoopInfo->UB); - Value *Tripcount = OMPBuilder.Builder.CreateAdd( + TripCount = OMPBuilder.Builder.CreateAdd( Load, ConstantInt::get(OMPBuilder.Int64, 1)); - auto *CI = checkCreateCall( - OMPBuilder.Builder, TripcountMapper, - {Ident, ConstantInt::get(OMPBuilder.Int64, -1), Tripcount}); - assert(CI && "Expected valid call"); } Value *NumTeams = createScalarCast(TargetInfo.NumTeams, OMPBuilder.Int32); @@ -2338,25 +2277,52 @@ void CGIntrinsicsOpenMP::emitOMPTargetHost( assert(NumTeams && "Expected non-null NumTeams"); assert(ThreadLimit && "Expected non-null ThreadLimit"); - SmallVector Args = { - Ident, ConstantInt::get(OMPBuilder.Int64, -1), - ConstantExpr::getBitCast(OMPRegionId, OMPBuilder.VoidPtr), - ConstantInt::get(OMPBuilder.Int32, OffloadingMappingArgs.Size), - OffloadingMappingArgs.BasePtrs, OffloadingMappingArgs.Ptrs, - OffloadingMappingArgs.Sizes, OffloadingMappingArgs.MapTypes, - OffloadingMappingArgs.MapNames, - // TODO: offload_mappers is null for now. - Constant::getNullValue(OMPBuilder.VoidPtrPtr), NumTeams, ThreadLimit}; - - if (TargetInfo.NoWait) { - // Add extra dependency information (unused for now). 
- Args.push_back(Constant::getNullValue(OMPBuilder.Int32)); - Args.push_back(Constant::getNullValue(OMPBuilder.Int8Ptr)); - Args.push_back(Constant::getNullValue(OMPBuilder.Int32)); - Args.push_back(Constant::getNullValue(OMPBuilder.Int8Ptr)); + if (!isOpenMPDeviceRuntime()) { + FunctionCallee KmpcSetThreadLimit = OMPBuilder.getOrCreateRuntimeFunction( + M, OMPRTL___kmpc_set_thread_limit); + Value *ThreadID = OMPBuilder.getOrCreateThreadID(Ident); + + checkCreateCall(OMPBuilder.Builder, KmpcSetThreadLimit, + {Ident, ThreadID, ThreadLimit}); } - auto *OffloadResult = checkCreateCall(OMPBuilder.Builder, TargetMapper, Args); + SmallVector ArgsVector; + + auto UnqualPtrTy = PointerType::getUnqual(M.getContext()); + OpenMPIRBuilder::TargetDataRTArgs RTArgs{ + OffloadingMappingArgs.BasePtrs, + OffloadingMappingArgs.Ptrs, + OffloadingMappingArgs.Sizes, + OffloadingMappingArgs.MapTypes, + ConstantPointerNull::get(UnqualPtrTy), + ConstantPointerNull::get(UnqualPtrTy), + OffloadingMappingArgs.MapNames, + }; + // Avoid initializer-list temporaries for ArrayRef fields. Use stable + // SmallVector storage so ArrayRef in TargetKernelArgs refers to valid + // memory. + SmallVector KernelNumTeams; + KernelNumTeams.push_back(NumTeams); + SmallVector KernelNumThreads; + KernelNumThreads.push_back(ThreadLimit); + + // TODO: Implement nowait: we need to enclose the host code in a task for + // async execution. OpenMPIRBuilder may support that now. + OpenMPIRBuilder::TargetKernelArgs Args{ + static_cast(OffloadingMappingArgs.Size), + RTArgs, + (TripCount ? 
TripCount : OMPBuilder.Builder.getInt64(0)), + KernelNumTeams, + KernelNumThreads, + Constant::getNullValue(OMPBuilder.VoidPtr), + /*TargetInfo.NoWait*/ false}; + OpenMPIRBuilder::getKernelArgsVector(Args, OMPBuilder.Builder, ArgsVector); + + Value *DeviceID = ConstantInt::get(OMPBuilder.Int64, -1); + Value *OffloadResult = nullptr; + OMPBuilder.emitTargetKernel(Loc, AllocaIP, OffloadResult, Ident, DeviceID, + NumTeams, ThreadLimit, OMPRegionId, ArgsVector); + assert(OffloadResult && "Expected non-null call inst from code generation"); auto *Failed = OMPBuilder.Builder.CreateIsNotNull(OffloadResult); OMPBuilder.Builder.CreateCondBr(Failed, StartBB, EndBB); @@ -2372,6 +2338,11 @@ void CGIntrinsicsOpenMP::emitOMPTargetDevice(Function *Fn, BasicBlock *EntryBB, // Emit the Numba wrapper offloading function. SmallVector WrapperArgsTypes; SmallVector WrapperArgsNames; + + // Add the pointer argument to kernel args expected by the runtime. + WrapperArgsTypes.push_back(OMPBuilder.VoidPtr); + WrapperArgsNames.push_back("dyn_ptr"); + for (auto &It : DSAValueMap) { Value *V = It.first; DSAType DSA = It.second.Type; @@ -2399,8 +2370,9 @@ void CGIntrinsicsOpenMP::emitOMPTargetDevice(Function *Fn, BasicBlock *EntryBB, Function *NumbaWrapperFunc = Function::Create( NumbaWrapperFnTy, GlobalValue::ExternalLinkage, DevWrapperFuncName, M); - // Name the wrapper arguments for readability. - for (size_t I = 0; I < NumbaWrapperFunc->arg_size(); ++I) + // Name the wrapper arguments for readability, start from 1 to skip the + // first "args" pointer argument. + for (size_t I = 1; I < NumbaWrapperFunc->arg_size(); ++I) NumbaWrapperFunc->getArg(I)->setName(WrapperArgsNames[I]); IRBuilder<> Builder( @@ -2425,14 +2397,23 @@ void CGIntrinsicsOpenMP::emitOMPTargetDevice(Function *Fn, BasicBlock *EntryBB, ArgOffset = 1; } for (auto &Arg : NumbaWrapperFunc->args()) { + // Skip the first "args" pointer argument. 
+ if (Arg.getArgNo() == 0) + continue; // TODO: Runtime expects all scalars typed as Int64. if (!Arg.getType()->isPointerTy()) { - auto *ParamType = DevFuncCallee.getFunctionType()->getParamType( - ArgOffset + Arg.getArgNo()); + // ArgOffset accounts for the extra arguments added in the device + // function by Numba, and -1 accounts for the first "args" pointer + // argument. + size_t DevFuncArgNo = ArgOffset + (Arg.getArgNo() - 1); + + auto *ParamType = + DevFuncCallee.getFunctionType()->getParamType(DevFuncArgNo); AllocaInst *TmpInt64 = Builder.CreateAlloca(OMPBuilder.Int64, nullptr, Arg.getName() + ".casted"); Builder.CreateStore(&Arg, TmpInt64); - Value *Cast = Builder.CreateBitCast(TmpInt64, ParamType->getPointerTo()); + Value *Cast = Builder.CreateBitCast( + TmpInt64, PointerType::getUnqual(M.getContext())); Value *ConvLoad = Builder.CreateLoad(ParamType, Cast); DevFuncArgs.push_back(ConvLoad); } else @@ -2447,15 +2428,14 @@ void CGIntrinsicsOpenMP::emitOMPTargetDevice(Function *Fn, BasicBlock *EntryBB, #elif LLVM_VERSION_MAJOR <= 16 auto IP = OMPBuilder.createTargetInit(Loc, IsSPMD); #else - // TODO: Use TargetInfo launch configuration for max/min threads and - // threads. + // Note the default for MaxThreads is 0. OpenMPIRBuilder::TargetKernelDefaultAttrs Attrs{ (IsSPMD ? 
OMP_TGT_EXEC_MODE_SPMD : OMP_TGT_EXEC_MODE_GENERIC), {-1, -1, -1}, 1, - {-1, -1, -1}, + {0, -1, -1}, 1}; - auto IP = OMPBuilder.createTargetInit(Loc, Attrs); + auto IP = OMPBuilder.createTargetInit(Builder, Attrs); #endif Builder.restoreIP(IP); } @@ -2467,8 +2447,10 @@ void CGIntrinsicsOpenMP::emitOMPTargetDevice(Function *Fn, BasicBlock *EntryBB, OpenMPIRBuilder::LocationDescription Loc(Builder); #if LLVM_VERSION_MAJOR <= 15 OMPBuilder.createTargetDeinit(Loc, /* IsSPMD */ IsSPMD, true); -#else +#elif LLVM_VERSION_MAJOR <= 16 OMPBuilder.createTargetDeinit(Loc, /* IsSPMD */ IsSPMD); +#else + OMPBuilder.createTargetDeinit(Loc); #endif } @@ -2485,15 +2467,8 @@ void CGIntrinsicsOpenMP::emitOMPTargetDevice(Function *Fn, BasicBlock *EntryBB, appendToCompilerUsed(M, {ExecModeGV}); // Get "nvvm.annotations" metadata node. - // TODO: may need to adjust for AMD gpus. - NamedMDNode *MD = M.getOrInsertNamedMetadata("nvvm.annotations"); - - Metadata *MDVals[] = { - ConstantAsMetadata::get(NumbaWrapperFunc), - MDString::get(M.getContext(), "kernel"), - ConstantAsMetadata::get(ConstantInt::get(OMPBuilder.Int32, 1))}; - // Append metadata to nvvm.annotations. - MD->addOperand(MDNode::get(M.getContext(), MDVals)); + // TODO: will need to adjust for AMD gpus. + NumbaWrapperFunc->setCallingConv(CallingConv::PTX_Kernel); // Add a function attribute for the kernel. 
NumbaWrapperFunc->addFnAttr(Attribute::get(M.getContext(), "kernel")); diff --git a/src/numba/openmp/libs/pass/CGIntrinsicsOpenMP.h b/src/numba/openmp/libs/pass/CGIntrinsicsOpenMP.h index 74b15cde3798..fd6539cefe49 100644 --- a/src/numba/openmp/libs/pass/CGIntrinsicsOpenMP.h +++ b/src/numba/openmp/libs/pass/CGIntrinsicsOpenMP.h @@ -111,7 +111,7 @@ namespace helpers { Type *getPointeeType(DSAValueMapTy &DSAValueMap, Value *V); } -inline std::string toString(DSAType DSA) { +inline std::string toString(const DSAType &DSA) { switch (DSA) { case DSA_NONE: return "DSA_NONE"; diff --git a/src/numba/openmp/libs/pass/CMakeLists.txt b/src/numba/openmp/libs/pass/CMakeLists.txt index 8b6ac8e26a0e..fcc977fc4c77 100644 --- a/src/numba/openmp/libs/pass/CMakeLists.txt +++ b/src/numba/openmp/libs/pass/CMakeLists.txt @@ -46,9 +46,9 @@ target_link_libraries(IntrinsicsOpenMP ) if(APPLE) - set_property(TARGET IntrinsicsOpenMP APPEND_STRING PROPERTY LINK_FLAGS "-flto -Wl,-exported_symbol,_runIntrinsicsOpenMPPass") + set_property(TARGET IntrinsicsOpenMP APPEND_STRING PROPERTY LINK_FLAGS "-Wl,-exported_symbol,_runIntrinsicsOpenMPPass") else() - set_property(TARGET IntrinsicsOpenMP APPEND_STRING PROPERTY LINK_FLAGS "-flto -Wl,--exclude-libs,ALL") + set_property(TARGET IntrinsicsOpenMP APPEND_STRING PROPERTY LINK_FLAGS "-Wl,--exclude-libs,ALL") endif() install(TARGETS IntrinsicsOpenMP diff --git a/src/numba/openmp/libs/pass/IntrinsicsOpenMP.cpp b/src/numba/openmp/libs/pass/IntrinsicsOpenMP.cpp index 577eff6954f9..8a67ad315ccf 100644 --- a/src/numba/openmp/libs/pass/IntrinsicsOpenMP.cpp +++ b/src/numba/openmp/libs/pass/IntrinsicsOpenMP.cpp @@ -348,28 +348,28 @@ struct IntrinsicsOpenMP { // TODO: check for conflicting DSA, for example reduction variables // cannot be set private. Should be done in Numba. 
- if (Tag.startswith("DIR")) { + if (Tag.starts_with("DIR")) { auto It = StringToDir.find(Tag); assert(It != StringToDir.end() && "Directive is not supported!"); Dir = It->second; - } else if (Tag.startswith("QUAL")) { + } else if (Tag.starts_with("QUAL")) { const ArrayRef &TagInputs = O.inputs(); - if (Tag.startswith("QUAL.OMP.NORMALIZED.IV")) { + if (Tag.starts_with("QUAL.OMP.NORMALIZED.IV")) { assert(O.input_size() == 1 && "Expected single IV value"); OMPLoopInfo.IV = TagInputs[0]; - } else if (Tag.startswith("QUAL.OMP.NORMALIZED.START")) { + } else if (Tag.starts_with("QUAL.OMP.NORMALIZED.START")) { assert(O.input_size() == 1 && "Expected single START value"); OMPLoopInfo.Start = TagInputs[0]; - } else if (Tag.startswith("QUAL.OMP.NORMALIZED.LB")) { + } else if (Tag.starts_with("QUAL.OMP.NORMALIZED.LB")) { assert(O.input_size() == 1 && "Expected single LB value"); OMPLoopInfo.LB = TagInputs[0]; - } else if (Tag.startswith("QUAL.OMP.NORMALIZED.UB")) { + } else if (Tag.starts_with("QUAL.OMP.NORMALIZED.UB")) { assert(O.input_size() == 1 && "Expected single UB value"); OMPLoopInfo.UB = TagInputs[0]; - } else if (Tag.startswith("QUAL.OMP.NUM_THREADS")) { + } else if (Tag.starts_with("QUAL.OMP.NUM_THREADS")) { assert(O.input_size() == 1 && "Expected single NumThreads value"); ParRegionInfo.NumThreads = TagInputs[0]; - } else if (Tag.startswith("QUAL.OMP.SCHEDULE")) { + } else if (Tag.starts_with("QUAL.OMP.SCHEDULE")) { // TODO: Add DIST_SCHEDULE for distribute loops. 
assert(O.input_size() == 1 && "Expected single chunking scheduling value"); @@ -385,11 +385,11 @@ struct IntrinsicsOpenMP { } } else FATAL_ERROR("Unsupported scheduling type"); - } else if (Tag.startswith("QUAL.OMP.IF")) { + } else if (Tag.starts_with("QUAL.OMP.IF")) { assert(O.input_size() == 1 && "Expected single if condition value"); ParRegionInfo.IfCondition = TagInputs[0]; - } else if (Tag.startswith("QUAL.OMP.TARGET.DEV_FUNC")) { + } else if (Tag.starts_with("QUAL.OMP.TARGET.DEV_FUNC")) { assert(O.input_size() == 1 && "Expected a single device function name"); ConstantDataArray *DevFuncArray = @@ -397,16 +397,16 @@ struct IntrinsicsOpenMP { assert(DevFuncArray && "Expected constant string for the device function"); TargetInfo.DevFuncName = DevFuncArray->getAsString(); - } else if (Tag.startswith("QUAL.OMP.TARGET.ELF")) { + } else if (Tag.starts_with("QUAL.OMP.TARGET.ELF")) { assert(O.input_size() == 1 && "Expected a single elf image string"); ConstantDataArray *ELF = dyn_cast(TagInputs[0]); assert(ELF && "Expected constant string for ELF"); TargetInfo.ELF = ELF; - } else if (Tag.startswith("QUAL.OMP.DEVICE")) { + } else if (Tag.starts_with("QUAL.OMP.DEVICE")) { // TODO: Handle device selection for target regions. 
- } else if (Tag.startswith("QUAL.OMP.NUM_TEAMS")) { + } else if (Tag.starts_with("QUAL.OMP.NUM_TEAMS")) { assert(O.input_size() == 1 && "Expected single NumTeams value"); switch (Dir) { case OMPD_target: @@ -429,7 +429,7 @@ struct IntrinsicsOpenMP { default: FATAL_ERROR("Unsupported qualifier in directive"); } - } else if (Tag.startswith("QUAL.OMP.THREAD_LIMIT")) { + } else if (Tag.starts_with("QUAL.OMP.THREAD_LIMIT")) { assert(O.input_size() == 1 && "Expected single ThreadLimit value"); switch (Dir) { @@ -450,7 +450,7 @@ struct IntrinsicsOpenMP { default: FATAL_ERROR("Unsupported qualifier in directive"); } - } else if (Tag.startswith("QUAL.OMP.NOWAIT")) { + } else if (Tag.starts_with("QUAL.OMP.NOWAIT")) { switch (Dir) { case OMPD_target: case OMPD_target_teams: @@ -822,6 +822,12 @@ extern "C" int runIntrinsicsOpenMPPass(const char *BitcodePtr, MPM.addPass(IntrinsicsOpenMPPass()); MPM.run(*M, MAM); + // Verify the module before writing bitcode + if (verifyModule(*M, &errs())) { + errs() << "ERROR: Module verification failed after IntrinsicsOpenMPPass\n"; + return 1; + } + SmallVector Buf; raw_svector_ostream OS(Buf); WriteBitcodeToFile(*M, OS); diff --git a/src/numba/openmp/llvm_pass.py b/src/numba/openmp/llvm_pass.py index f1745afc0fb1..4e56061caf79 100644 --- a/src/numba/openmp/llvm_pass.py +++ b/src/numba/openmp/llvm_pass.py @@ -39,8 +39,10 @@ def _writer_cb(ptr, size): raise RuntimeError(f"Running IntrinsicsOpenMPPass failed with return code {rc}") bc_out = bytes(out) + lowered_module = ll.parse_bitcode(bc_out) if DEBUG_OPENMP_LLVM_PASS >= 1: - print(lowered_module) + with open(f"{ll_module.name}-intrinsics-omp.ll", "w") as f: + f.write(str(lowered_module)) return lowered_module diff --git a/src/numba/openmp/omp_ir.py b/src/numba/openmp/omp_ir.py index f47be4244c6b..52128bb1d945 100644 --- a/src/numba/openmp/omp_ir.py +++ b/src/numba/openmp/omp_ir.py @@ -113,7 +113,11 @@ def __init__(self): import numba.cuda.api as cudaapi import numba.cuda.cudadrv.libs as 
cudalibs from numba.cuda.codegen import CUDA_TRIPLE + from numba.cuda.cudadrv import driver, enums + # The OpenMP target runtime prefers the blocking sync flag, so we set it + # here before creating the CUDA context. + driver.driver.cuDevicePrimaryCtxSetFlags(0, enums.CU_CTX_SCHED_BLOCKING_SYNC) self.cc = cudaapi.get_current_device().compute_capability self.sm = "sm_" + str(self.cc[0]) + str(self.cc[1]) @@ -123,9 +127,7 @@ def __init__(self): self.libdevice_mod = ll.parse_bitcode(f.read()) # Read the OpenMP device RTL for the architecture to link with the module. - self.libomptarget_arch = ( - libpath / "libomp" / "lib" / f"libomptarget-nvptx-{self.sm}.bc" - ) + self.libomptarget_arch = libpath / "openmp" / "lib" / "libomptarget-nvptx.bc" try: with open(self.libomptarget_arch, "rb") as f: self.libomptarget_mod = ll.parse_bitcode(f.read()) @@ -143,7 +145,7 @@ def __init__(self): def _get_target_image(self, mod, filename_prefix, ompx_attrs, use_toolchain=False): from numba.cuda.cudadrv import driver - from numba.core.llvm_bindings import create_pass_manager_builder + from numba.core.llvm_bindings import create_pass_builder if DEBUG_OPENMP_LLVM_PASS >= 1: with open(filename_prefix + ".ll", "w") as f: @@ -173,25 +175,19 @@ def _internalize(): _internalize() # Run passes for optimization, including target-specific passes. # Run function passes. - with ll.create_function_pass_manager(mod) as pm: - self.tm.add_analysis_passes(pm) - with create_pass_manager_builder( - opt=2, slp_vectorize=True, loop_vectorize=True - ) as pmb: - pmb.populate(pm) - pm.initialize() + with create_pass_builder( + self.tm, opt=2, slp_vectorize=True, loop_vectorize=True + ) as pb: + pm = pb.getFunctionPassManager() for func in mod.functions: - pm.run(func) - pm.finalize() + pm.run(func, pb) # Run module passes. 
- with ll.create_module_pass_manager() as pm: - self.tm.add_analysis_passes(pm) - with create_pass_manager_builder( - opt=2, slp_vectorize=True, loop_vectorize=True - ) as pmb: - pmb.populate(pm) - pm.run(mod) + with create_pass_builder( + self.tm, opt=2, slp_vectorize=True, loop_vectorize=True + ) as pb: + pm = pb.getModulePassManager() + pm.run(mod, pb) if DEBUG_OPENMP_LLVM_PASS >= 1: mod.verify() @@ -205,13 +201,11 @@ def _internalize(): # Internalize non-kernel function definitions. _internalize() # Run module passes. - with ll.create_module_pass_manager() as pm: - self.tm.add_analysis_passes(pm) - with create_pass_manager_builder( - opt=1, slp_vectorize=True, loop_vectorize=True - ) as pmb: - pmb.populate(pm) - pm.run(mod) + with create_pass_builder( + self.tm, opt=1, slp_vectorize=True, loop_vectorize=True + ) as pb: + pm = pb.getModulePassManager() + pm.run(mod, pb) if DEBUG_OPENMP_LLVM_PASS >= 1: mod.verify() @@ -221,25 +215,47 @@ def _internalize(): # Generate ptx assemlby. ptx = self.tm.emit_assembly(mod) if use_toolchain: - # ptxas does file I/O, so output the assembly and ingest the generated cubin. - with open(filename_prefix + "-intr-dev-rtl.s", "w") as f: - f.write(ptx) - - subprocess.run( - [ - "ptxas", - "-m64", - "--gpu-name", - self.sm, - filename_prefix + "-intr-dev-rtl.s", - "-o", - filename_prefix + "-intr-dev-rtl.o", - ], - check=True, - ) + # ptxas normally does file I/O; prefer piping PTX to stdin to avoid + # writing the .s file unless debug is enabled. + if DEBUG_OPENMP_LLVM_PASS >= 1: + with open(filename_prefix + "-intr-dev-rtl.s", "w") as f: + f.write(ptx) + + # Invoke ptxas reading PTX from stdin ('-') and writing output to + # a temporary file so we can capture the object in-memory without + # leaving it in the working directory. 
+ with tempfile.NamedTemporaryFile(suffix=".o", delete=False) as tmpf: + outname = tmpf.name + try: + subprocess.run( + [ + "ptxas", + "-m64", + "--gpu-name", + self.sm, + "-", + "-o", + outname, + ], + input=ptx.encode(), + check=True, + ) - with open(filename_prefix + "-intr-dev-rtl.o", "rb") as f: - cubin = f.read() + with open(outname, "rb") as f: + cubin = f.read() + + # If debug is enabled, also write a named copy for inspection. + if DEBUG_OPENMP_LLVM_PASS >= 1: + with open( + filename_prefix + "-intr-dev-rtl.o", + "wb", + ) as f: + f.write(cubin) + finally: + try: + os.remove(outname) + except OSError: + pass else: if DEBUG_OPENMP_LLVM_PASS >= 1: with open( diff --git a/src/numba/openmp/omp_lower.py b/src/numba/openmp/omp_lower.py index 4fbd26d92dbe..c8598389d2ed 100644 --- a/src/numba/openmp/omp_lower.py +++ b/src/numba/openmp/omp_lower.py @@ -1937,7 +1937,8 @@ def some_target_directive(self, args, dir_tag, lexer_count, has_loop=False): else: # Neither TEAMS or PARALLEL in directive, set teams, threads to 1. start_tags.append(openmp_tag("QUAL.OMP.NUM_TEAMS", 1)) - start_tags.append(openmp_tag("QUAL.OMP.THREAD_LIMIT", 1)) + # Set thread limit to 0 to use runtime default. + start_tags.append(openmp_tag("QUAL.OMP.THREAD_LIMIT", 0)) if DEBUG_OPENMP >= 1: for clause in clauses: diff --git a/src/numba/openmp/tests/test_openmp.py b/src/numba/openmp/tests/test_openmp.py index 82e441021d86..bb2661132a0a 100644 --- a/src/numba/openmp/tests/test_openmp.py +++ b/src/numba/openmp/tests/test_openmp.py @@ -3541,14 +3541,8 @@ def test_impl(): np.testing.assert_equal(threads2, 256) def target_nest_parallel(self, device): - # TODO: map should be "from" instead of "tofrom" once this is fixed. target_pragma = f"target device({device}) map(from: a)" - # NOTE: num_threads should be a multiple of warp size, e.g. for NVIDIA - # V100 it is 32, the OpenMP runtime floors non-multiple of warp size. - # TODO: Newer LLVM versions should not have this restriction. 
- parallel_pragma = ( - "parallel num_threads(32)" # + (" shared(a)" if explicit else "") - ) + parallel_pragma = "parallel num_threads(32)" @njit def test_impl():