diff --git a/.github/workflows/build-upload-wheels.yml b/.github/workflows/build-upload-wheels.yml index 91b590afaf9c..5dd79d39eb37 100644 --- a/.github/workflows/build-upload-wheels.yml +++ b/.github/workflows/build-upload-wheels.yml @@ -35,13 +35,13 @@ jobs: - uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 - name: Install cibuildwheel - run: python -m pip install cibuildwheel==3.1.4 + run: python -m pip install cibuildwheel==3.3.1 - name: Build wheels {{ matrix.os }} # Set LLVM_VERSION for the host to forward to the cibuildwheel # environment. env: - LLVM_VERSION: "15.0.7" + LLVM_VERSION: "20.1.8" run: python -m cibuildwheel --output-dir wheelhouse - uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 @@ -58,7 +58,7 @@ jobs: - name: Build sdist env: - LLVM_VERSION: "15.0.7" + LLVM_VERSION: "20.1.8" run: pipx run build --sdist - uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 @@ -73,8 +73,14 @@ jobs: strategy: matrix: os: [ubuntu-latest, macos-latest, ubuntu-24.04-arm] - python-version: ['3.10', '3.11', '3.12', '3.13'] - numba-version: ['0.61.0', '0.61.2'] + python-version: ['3.10', '3.11', '3.12', '3.13', '3.14'] + numba-version: ['0.62.0', '0.62.1', '0.63.0', '0.63.1'] + exclude: + - python-version: '3.14' + numba-version: '0.62.0' + - python-version: '3.14' + numba-version: '0.62.1' + steps: - name: Download built wheels uses: actions/download-artifact@634f93cb2916e3fdff6788551b99b062d0335ce0 diff --git a/.gitlab/jobs/tioga.yml b/.gitlab/jobs/tioga.yml index 0568a68382db..3914eb5c310f 100644 --- a/.gitlab/jobs/tioga.yml +++ b/.gitlab/jobs/tioga.yml @@ -56,6 +56,7 @@ variables: - "3.11" - "3.12" - "3.13" + - "3.14" build-and-test-tioga: extends: [.base-job, .python-variants] diff --git a/MANIFEST.in b/MANIFEST.in index fd6c57c169fa..e3842d878410 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1 +1 @@ -recursive-include src/numba/openmp/libs * +recursive-include src/numba/openmp/libs *.py *.so *.a *.bc 
diff --git a/README.md b/README.md index f8778004cea9..4c8f064f65e6 100644 --- a/README.md +++ b/README.md @@ -45,6 +45,7 @@ conda install -c python-for-hpc -c conda-forge pyomp | PyOMP | Numba | | ----- | --------------- | +| 0.5.x | 0.62.x - 0.63.x | | 0.4.x | 0.61.x | | 0.3.x | 0.57.x - 0.60.x | diff --git a/buildscripts/cibuildwheel/setup-miniconda3.sh b/buildscripts/cibuildwheel/setup-miniconda3.sh index c69a8bfbb890..b50cae6f3647 100644 --- a/buildscripts/cibuildwheel/setup-miniconda3.sh +++ b/buildscripts/cibuildwheel/setup-miniconda3.sh @@ -23,6 +23,6 @@ echo "Miniconda installed" source "_stage/miniconda3/bin/activate" base export CONDA_PLUGINS_AUTO_ACCEPT_TOS=true -# Create clangdev ${LLVM_VERSION} -echo "Installing manylinux llvmdev ${LLVM_VERSION}..." -conda create -n llvmdev-${LLVM_VERSION} -c conda-forge -q -y clang=${LLVM_VERSION} clang-tools=${LLVM_VERSION} llvmdev=${LLVM_VERSION} +# Create conda environment with tools and libraries for the LLVM_VERSION. +echo "Installing llvmdev ${LLVM_VERSION}..." 
+conda create -n llvmdev-${LLVM_VERSION} -c conda-forge -q -y clang=${LLVM_VERSION} clangxx=${LLVM_VERSION} clang-tools=${LLVM_VERSION} llvmdev=${LLVM_VERSION} zstd diff --git a/buildscripts/conda-recipes/pyomp/meta.yaml b/buildscripts/conda-recipes/pyomp/meta.yaml index b23506b196e1..1b97fea7d36c 100644 --- a/buildscripts/conda-recipes/pyomp/meta.yaml +++ b/buildscripts/conda-recipes/pyomp/meta.yaml @@ -1,5 +1,5 @@ {% set version = environ.get('GIT_DESCRIBE_TAG', '0.0.0').lstrip('v') %} -{% set LLVM_VERSION = environ.get('LLVM_VERSION', '15.0.7') %} +{% set LLVM_VERSION = environ.get('LLVM_VERSION', '20.1.8') %} package: name: pyomp @@ -15,8 +15,11 @@ build: script: - export LLVM_VERSION={{ LLVM_VERSION }} - export LLVM_DIR=${PREFIX} + - export CC=${PREFIX}/bin/clang + - export CXX=${PREFIX}/bin/clang++ - export VERBOSE=1 - export CPPFLAGS="-mmacosx-version-min=${MACOSX_DEPLOYMENT_TARGET} -isystem ${PREFIX}/include -D_FORTIFY_SOURCE=2" # [osx] + - export ENABLE_BUNDLED_LIBOMPTARGET=1 # [linux] - rm -rf build dist src/*.egg-info - {{ PYTHON }} -m pip install -v . @@ -28,6 +31,7 @@ requirements: - sysroot_linux-64 # [linux64] - sysroot_linux-aarch64 # [aarch64] - cmake + - ninja - setuptools_scm - elfutils # [linux] - libffi # [linux] @@ -39,55 +43,32 @@ requirements: - sysroot_linux-aarch64 # [aarch64] - setuptools - setuptools_scm - - numba >=0.61, <0.62 + - numba >=0.62, <0.64 - clang {{ LLVM_VERSION }} + - clangxx {{ LLVM_VERSION }} - clang-tools {{ LLVM_VERSION }} - llvmdev {{ LLVM_VERSION }} - zlib + # require llvm-openmp for the openmp cpu runtime. + - llvm-openmp {{ LLVM_VERSION }} - elfutils # [linux] - libffi # [linux] run: - python - setuptools - - numba >=0.61, <0.62 + - numba >=0.62, <0.64 + # require llvm-openmp for the openmp cpu runtime. 
+ - llvm-openmp {{ LLVM_VERSION }} - lark - cffi test: commands: - - test -f $SP_DIR/numba/openmp/libs/libomp/lib/libomp.dylib # [osx] - - test -f $SP_DIR/numba/openmp/libs/libomp/lib/libomp.so # [linux] - - test -f $SP_DIR/numba/openmp/libs/libomp/lib/libomptarget-amdgpu-gfx1010.bc # [linux] - - test -f $SP_DIR/numba/openmp/libs/libomp/lib/libomptarget-amdgpu-gfx1030.bc # [linux] - - test -f $SP_DIR/numba/openmp/libs/libomp/lib/libomptarget-amdgpu-gfx1031.bc # [linux] - - test -f $SP_DIR/numba/openmp/libs/libomp/lib/libomptarget-amdgpu-gfx700.bc # [linux] - - test -f $SP_DIR/numba/openmp/libs/libomp/lib/libomptarget-amdgpu-gfx701.bc # [linux] - - test -f $SP_DIR/numba/openmp/libs/libomp/lib/libomptarget-amdgpu-gfx801.bc # [linux] - - test -f $SP_DIR/numba/openmp/libs/libomp/lib/libomptarget-amdgpu-gfx803.bc # [linux] - - test -f $SP_DIR/numba/openmp/libs/libomp/lib/libomptarget-amdgpu-gfx900.bc # [linux] - - test -f $SP_DIR/numba/openmp/libs/libomp/lib/libomptarget-amdgpu-gfx902.bc # [linux] - - test -f $SP_DIR/numba/openmp/libs/libomp/lib/libomptarget-amdgpu-gfx906.bc # [linux] - - test -f $SP_DIR/numba/openmp/libs/libomp/lib/libomptarget-amdgpu-gfx908.bc # [linux] - - test -f $SP_DIR/numba/openmp/libs/libomp/lib/libomptarget-amdgpu-gfx90a.bc # [linux] - - test -f $SP_DIR/numba/openmp/libs/libomp/lib/libomptarget-nvptx-sm_35.bc # [linux] - - test -f $SP_DIR/numba/openmp/libs/libomp/lib/libomptarget-nvptx-sm_37.bc # [linux] - - test -f $SP_DIR/numba/openmp/libs/libomp/lib/libomptarget-nvptx-sm_50.bc # [linux] - - test -f $SP_DIR/numba/openmp/libs/libomp/lib/libomptarget-nvptx-sm_52.bc # [linux] - - test -f $SP_DIR/numba/openmp/libs/libomp/lib/libomptarget-nvptx-sm_53.bc # [linux] - - test -f $SP_DIR/numba/openmp/libs/libomp/lib/libomptarget-nvptx-sm_60.bc # [linux] - - test -f $SP_DIR/numba/openmp/libs/libomp/lib/libomptarget-nvptx-sm_61.bc # [linux] - - test -f $SP_DIR/numba/openmp/libs/libomp/lib/libomptarget-nvptx-sm_62.bc # [linux] - - test -f 
$SP_DIR/numba/openmp/libs/libomp/lib/libomptarget-nvptx-sm_70.bc # [linux] - - test -f $SP_DIR/numba/openmp/libs/libomp/lib/libomptarget-nvptx-sm_72.bc # [linux] - - test -f $SP_DIR/numba/openmp/libs/libomp/lib/libomptarget-nvptx-sm_75.bc # [linux] - - test -f $SP_DIR/numba/openmp/libs/libomp/lib/libomptarget-nvptx-sm_80.bc # [linux] - - test -f $SP_DIR/numba/openmp/libs/libomp/lib/libomptarget-nvptx-sm_86.bc # [linux] - - test -f $SP_DIR/numba/openmp/libs/libomp/lib/libomptarget.rtl.amdgpu.so # [linux] - - test -f $SP_DIR/numba/openmp/libs/libomp/lib/libomptarget.rtl.cuda.so # [linux] - - test -f $SP_DIR/numba/openmp/libs/libomp/lib/libomptarget.rtl.ppc64.so # [linux and ppc64le] - - test -f $SP_DIR/numba/openmp/libs/libomp/lib/libomptarget.rtl.x86_64.so # [linux and x86_64] - - test -f $SP_DIR/numba/openmp/libs/libomp/lib/libomptarget.rtl.aarch64.so # [linux and aarch64] - - test -f $SP_DIR/numba/openmp/libs/libomp/lib/libomptarget.so # [linux] + - test -f $SP_DIR/numba/openmp/libs/pass/libIntrinsicsOpenMP.dylib # [osx] + - test -f $SP_DIR/numba/openmp/libs/pass/libIntrinsicsOpenMP.so # [linux] + - test -f $SP_DIR/numba/openmp/libs/openmp/lib/libomptarget-amdgpu.bc # [linux] + - test -f $SP_DIR/numba/openmp/libs/openmp/lib/libomptarget-nvptx.bc # [linux] + - test -f $SP_DIR/numba/openmp/libs/openmp/lib/libomptarget.so # [linux] about: home: https://github.com/Python-for-HPC/PyOMP diff --git a/buildscripts/gitlab/build-and-test.sh b/buildscripts/gitlab/build-and-test.sh index 4121e4402cfa..ab734f48d036 100644 --- a/buildscripts/gitlab/build-and-test.sh +++ b/buildscripts/gitlab/build-and-test.sh @@ -4,11 +4,12 @@ set -e # Create a unique temporary directory for this job. TMPDIR=/tmp/pyomp/${CI_JOB_ID} +rm -rf ${TMPDIR} mkdir -p ${TMPDIR} pushd ${TMPDIR} # Set the LLVM_VERSION to use. -export LLVM_VERSION="15.0.7" +export LLVM_VERSION="20.1.8" # Set the envs directory under the temporary directory. 
export CONDA_ENVS_DIRS="${TMPDIR}/_stage/miniconda3/envs" @@ -17,14 +18,18 @@ export CONDA_ENVS_DIRS="${TMPDIR}/_stage/miniconda3/envs" source ${CI_PROJECT_DIR}/buildscripts/cibuildwheel/setup-miniconda3.sh # Export environment variables for building and testing. +export ENABLE_BUNDLED_LIBOMP="1" +export ENABLE_BUNDLED_LIBOMPTARGET="1" export LLVM_DIR="${CONDA_ENVS_DIRS}/llvmdev-${LLVM_VERSION}" -export PATH="${CONDA_ENVS_DIRS}/llvmdev-${LLVM_VERSION}/bin:${PATH}" +export CMAKE_PREFIX_PATH="${CONDA_PREFIX}" export USE_CXX11_ABI="1" export PIP_NO_INPUT="1" # Create and activate a conda environment with the desired Python version. conda create -n py-${PYOMP_CI_PYTHON_VERSION} -c conda-forge -y python=${PYOMP_CI_PYTHON_VERSION} conda activate py-${PYOMP_CI_PYTHON_VERSION} +# Add extra packages needed to build openmp libraries. +conda install -c conda-forge -y zstd libffi # Clone and fetch the commit with history for package versioning. git clone https://github.com/${GITHUB_PROJECT_ORG}/${GITHUB_PROJECT_NAME}.git --single-branch @@ -33,7 +38,7 @@ git fetch origin ${CI_COMMIT_SHA} git checkout ${CI_COMMIT_SHA} # Install pyomp. -CC=gcc CXX=g++ python -m pip install -v . +CC=clang CXX=clang++ python -m pip install -v . # Run host OpenMP tests. 
TEST_DEVICES=0 RUN_TARGET=0 python -m numba.runtests -v -- numba.openmp.tests.test_openmp diff --git a/examples/ploop.1.py b/examples/ploop.1.py index 19f3113892a0..28a21e4a11ea 100644 --- a/examples/ploop.1.py +++ b/examples/ploop.1.py @@ -1,13 +1,14 @@ -import numba +from numba.openmp import njit from numba.openmp import openmp_context as openmp -from numba.openmp import omp_set_num_threads, omp_get_thread_num, omp_get_num_threads, omp_get_wtime import numpy as np -@numba.njit + +@njit def simple(n, a, b): with openmp("parallel for"): for i in range(1, n): - b[i] = (a[i] + a[i-1]) / 2.0 + b[i] = (a[i] + a[i - 1]) / 2.0 + a = np.ones(100) b = np.empty(len(a)) diff --git a/pyproject.toml b/pyproject.toml index 03874900678d..41f8782b8523 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,5 +1,5 @@ [build-system] -requires = ["setuptools>=75.3", "wheel", "setuptools-scm>=8", "cmake>=3.20"] +requires = ["setuptools>=75.3", "setuptools-scm>=8", "cmake>=3.20", "ninja"] build-backend = "setuptools.build_meta" [project] @@ -7,7 +7,7 @@ name = "pyomp" dynamic = ["version"] description = "Python OpenMP library based on Numba" readme = "README.md" -requires-python = ">=3.10, <3.14" +requires-python = ">=3.10, <3.15" license = "BSD-2-Clause" license-files = ["LICENSE", "LICENSE-OPENMP.txt"] classifiers = [ @@ -17,7 +17,7 @@ classifiers = [ "Intended Audience :: Developers", "Topic :: Software Development :: Compilers", ] -dependencies = ["numba>=0.61, <0.62", "lark", "cffi", "setuptools"] +dependencies = ["numba>=0.62, <0.64", "lark", "cffi", "setuptools"] maintainers = [ { name = "Giorgis Georgakoudis", email = "georgakoudis1@llnl.gov" }, ] @@ -37,7 +37,7 @@ include = ["numba.openmp*"] # Bundle the CMake-installed artifacts into the wheel. 
[tool.setuptools.package-data] -"numba.openmp.libs" = ["pass/*", "libomp/**/*"] +"numba.openmp.libs" = ["pass/*", "openmp/**/*"] # setuptools-scm config [tool.setuptools_scm] @@ -49,7 +49,6 @@ archs = ["native"] # Pass LLVM_VERSION from the host environment to cibuildwheel. environment-pass = ["LLVM_VERSION"] # We use miniconda3 to get the clang/llvm toolchain on Linux. -before-all = ["bash buildscripts/cibuildwheel/setup-miniconda3.sh"] before-build = ["rm -rf build dist src/*.egg-info"] skip = ["*-musllinux_*", "cp38-*"] test-command = [ @@ -59,14 +58,32 @@ test-command = [ "OMP_TARGET_OFFLOAD=mandatory TEST_DEVICES=1 RUN_TARGET=1 python -m numba.runtests -v -- numba.openmp.tests.test_openmp.TestOpenmpTarget", ] +[tool.cibuildwheel.environment] +USE_CXX11_ABI = "1" +PIP_NO_INPUT = "1" + [tool.cibuildwheel.linux] before-all = [ - "yum install -y elfutils-libelf-devel libffi-devel", - "bash buildscripts/cibuildwheel/setup-miniconda3.sh", + "yum install -y elfutils-libelf-devel libffi-devel clang-devel-20.1.8 llvm-devel-20.1.8", ] -[tool.cibuildwheel.environment] -LLVM_DIR = "${PWD}/_stage/miniconda3/envs/llvmdev-${LLVM_VERSION}" -PATH = "${PWD}/_stage/miniconda3/envs/llvmdev-${LLVM_VERSION}/bin:${PATH}" -USE_CXX11_ABI = "1" -PIP_NO_INPUT = "1" +[tool.cibuildwheel.linux.environment] +ENABLE_BUNDLED_LIBOMP = "1" +ENABLE_BUNDLED_LIBOMPTARGET = "1" +LLVM_DIR = "/usr/lib64/cmake/llvm" +CC = "/usr/bin/clang" +CXX = "/usr/bin/clang++" + +[tool.cibuildwheel.macos] +before-all = ["bash buildscripts/cibuildwheel/setup-miniconda3.sh"] + +[tool.cibuildwheel.macos.environment] +ENABLE_BUNDLED_LIBOMP = "1" +LLVM_DIR = "${PWD}/_stage/miniconda3/envs/llvmdev-${LLVM_VERSION}/lib/cmake/llvm" +CC = "${PWD}/_stage/miniconda3/envs/llvmdev-${LLVM_VERSION}/bin/clang" +CXX = "${PWD}/_stage/miniconda3/envs/llvmdev-${LLVM_VERSION}/bin/clang++" +# Set the deployment target to macOS 11.0.
+MACOSX_DEPLOYMENT_TARGET = "11.0" +# Set the cmake prefix path to find libraries in the conda environment which is +# compatible with the macos target. +CMAKE_PREFIX_PATH = "${PWD}/_stage/miniconda3/envs/llvmdev-${LLVM_VERSION}/" diff --git a/setup.py b/setup.py index 57f5b8fd9510..dc5404195390 100644 --- a/setup.py +++ b/setup.py @@ -8,11 +8,7 @@ from setuptools import setup, Extension from setuptools import Command from setuptools.command.build_ext import build_ext - -try: - from wheel.bdist_wheel import bdist_wheel as _bdist_wheel -except ImportError: - _bdist_wheel = None +from setuptools.command.bdist_wheel import bdist_wheel class CleanCommand(Command): @@ -33,26 +29,32 @@ def run(self): shutil.rmtree(egg_info, ignore_errors=True) -if _bdist_wheel: - - class CustomBdistWheel(_bdist_wheel): - def run(self): - # Ensure all build steps are run before bdist_wheel - self.run_command("build_ext") - super().run() -else: - CustomBdistWheel = None +class CustomBdistWheel(bdist_wheel): + def run(self): + # Ensure all build steps are run before bdist_wheel + self.run_command("build_ext") + super().run() class CMakeExtension(Extension): - def __init__(self, name, *, sourcedir=None, url=None, sha256=None, cmake_args=[]): + def __init__( + self, + name, + *, + source_dir=None, + install_dir=None, + url=None, + sha256=None, + cmake_args=[], + ): # Don't invoke the original build_ext for this special extension. super().__init__(name, sources=[]) - if sourcedir and url: + if source_dir and url: raise ValueError( - "CMakeExtension should have either a sourcedir or a url, not both." + "CMakeExtension should have either a source_dir or a url, not both." 
) - self.sourcedir = sourcedir + self.source_dir = source_dir + self.install_dir = install_dir self.url = url self.sha256 = sha256 self.cmake_args = cmake_args @@ -66,7 +68,13 @@ def run(self): else: super().run() + def finalize_options(self): + super().finalize_options() + # Create placeholder directories for package-data validation. + Path("src/numba/openmp/libs/openmp/lib").mkdir(parents=True, exist_ok=True) + def _build_cmake(self, ext: CMakeExtension): + print("Build CMake extension:", ext.name) # Delete build directory if it exists to avoid errors with stale # CMakeCache.txt leftovers. build_dir = Path(self.build_temp) / ext.name @@ -84,32 +92,41 @@ def _build_cmake(self, ext: CMakeExtension): elif sys.platform == "darwin": extra_cmake_args.append(r"-DCMAKE_INSTALL_RPATH=@loader_path") - install_dir = Path(lib_dir) / ext.name + if ext.install_dir is None: + install_dir = Path(lib_dir) / ext.name + else: + install_dir = Path(lib_dir) / ext.install_dir install_dir.mkdir(parents=True, exist_ok=True) + cfg = ( [ "cmake", "-S", - ext.sourcedir, + ext.source_dir, "-B", build_dir, + "-G", + "Ninja", "-DCMAKE_BUILD_TYPE=Release", f"-DCMAKE_INSTALL_PREFIX={install_dir}", ] + ext.cmake_args + extra_cmake_args ) + print("Configure cmake with args:", cfg) subprocess.run(cfg, check=True, stdin=subprocess.DEVNULL) + print("Build at dir ", build_dir) subprocess.run( ["cmake", "--build", build_dir, "-j"], check=True, stdin=subprocess.DEVNULL ) + print("Install at dir ", install_dir) subprocess.run( ["cmake", "--install", build_dir], check=True, stdin=subprocess.DEVNULL ) # Remove unnecessary files after installing libomp. - if ext.name == "libomp": + if ext.name.startswith("libomp"): # Remove include directory after install. include_dir = install_dir / "include" if include_dir.exists(): @@ -124,40 +141,30 @@ def _env_toolchain_args(self, ext): # Forward LLVM_DIR if provided. 
if os.environ.get("LLVM_DIR"): args.append(f"-DLLVM_DIR={os.environ['LLVM_DIR']}") + # Forward CC, CXX if provided. + if os.environ.get("CC"): + args.append(f"-DCMAKE_C_COMPILER={os.environ['CC']}") + if os.environ.get("CXX"): + args.append(f"-DCMAKE_CXX_COMPILER={os.environ['CXX']}") return args def _prepare_source_openmp(sha256=None): LLVM_VERSION = os.environ.get("LLVM_VERSION", None) assert LLVM_VERSION is not None, "LLVM_VERSION environment variable must be set." - url = f"https://github.com/llvm/llvm-project/releases/download/llvmorg-{LLVM_VERSION}/openmp-{LLVM_VERSION}.src.tar.xz" + url = f"https://github.com/llvm/llvm-project/releases/download/llvmorg-{LLVM_VERSION}/llvm-project-{LLVM_VERSION}.src.tar.xz" - tmp = Path("_downloads/libomp") / f"openmp-{LLVM_VERSION}.tar.gz" + tmp = Path("_downloads/libomp") / f"llvm-project-{LLVM_VERSION}.tar.gz" tmp.parent.mkdir(parents=True, exist_ok=True) # Download the source tarball if it does not exist. if not tmp.exists(): - print(f"download openmp version {LLVM_VERSION} url:", url) + print(f"Downloading llvm-project version {LLVM_VERSION} url:", url) with urllib.request.urlopen(url) as r: with tmp.open("wb") as f: f.write(r.read()) - - # Extract only the major version. - llvm_major_version = tuple(map(int, LLVM_VERSION.split(".")))[0] - # For LLVM versions > 14, we also need to download CMake files. 
- if llvm_major_version > 14: - cmake_url = f"https://github.com/llvm/llvm-project/releases/download/llvmorg-{LLVM_VERSION}/cmake-{LLVM_VERSION}.src.tar.xz" - cmake_file = Path("_downloads/libomp") / f"cmake-{LLVM_VERSION}.tar.gz" - if not cmake_file.exists(): - with urllib.request.urlopen(cmake_url) as r: - with cmake_file.open("wb") as f: - f.write(r.read()) - with tarfile.open(cmake_file) as tf: - tf.extractall(cmake_file.parent) - src = cmake_file.parent / tf.getnames()[0] - dst = cmake_file.parent / "cmake" - if not dst.exists(): - src.rename(dst) + else: + print(f"Using downloaded llvm-project at {tmp}") if sha256: import hashlib @@ -168,40 +175,111 @@ def _prepare_source_openmp(sha256=None): if hasher.hexdigest() != sha256: raise ValueError(f"SHA256 mismatch for {url}") + print("Extracting llvm-project...") with tarfile.open(tmp) as tf: - # We assume the tarball contains a single directory with the source files. - sourcedir = tmp.parent / tf.getnames()[0] - tf.extractall(tmp.parent) - - for patch in ( - Path(f"src/numba/openmp/libs/libomp/patches/{LLVM_VERSION}") + # The root dir llvm-project-20.1.8.src + root_name = tf.getnames()[0] + + # Extract only needed subdirectories + members = [ + m + for m in tf.getmembers() + if m.name.startswith(f"{root_name}/openmp/") + or m.name.startswith(f"{root_name}/offload/") + or m.name.startswith(f"{root_name}/runtimes/") + or m.name.startswith(f"{root_name}/cmake/") + or m.name.startswith(f"{root_name}/llvm/cmake/") + or m.name.startswith(f"{root_name}/llvm/utils/") + or m.name.startswith(f"{root_name}/libc/") + ] + + parentdir = tmp.parent + # Base arguments for extractall. + kwargs = {"path": parentdir, "members": members} + + # Check if data filter is available. 
+ if hasattr(tarfile, "data_filter"): + # If this exists, the 'filter' argument is guaranteed to work + kwargs["filter"] = "data" + + tf.extractall(**kwargs) + + source_dir = parentdir / root_name + print("Extracted llvm-project to:", source_dir) + + print("Applying patches to llvm-project...") + for patch in sorted( + Path(f"src/numba/openmp/libs/openmp/patches/{LLVM_VERSION}") .absolute() .glob("*.patch") ): print("applying patch", patch) subprocess.run( ["patch", "-p1", "-i", str(patch)], - cwd=sourcedir, + cwd=source_dir, check=True, stdin=subprocess.DEVNULL, ) - return sourcedir + return f"{source_dir}/runtimes" -setup( - ext_modules=[ - CMakeExtension("pass", sourcedir="src/numba/openmp/libs/pass"), +def _check_true(env_var): + val = os.environ.get(env_var, "0") + return val.lower() in ("1", "true", "yes", "on") + + +# Build extensions: always include 'pass', conditionally include 'openmp' +# libraries. +ext_modules = [CMakeExtension("pass", source_dir="src/numba/openmp/libs/pass")] + + +# Prepare source directory if either bundled libomp or libomptarget is enabled. +if _check_true("ENABLE_BUNDLED_LIBOMP") or _check_true("ENABLE_BUNDLED_LIBOMPTARGET"): + openmp_source_dir = _prepare_source_openmp() + +# Optionally enable bundled libomp build via ENABLE_BUNDLED_LIBOMP=1. We want +# to avoid bundling for conda builds to avoid duplicate OpenMP runtime conflicts +# (e.g., numba 0.62+ and libopenblas already require llvm-openmp). +if _check_true("ENABLE_BUNDLED_LIBOMP"): + ext_modules.append( CMakeExtension( "libomp", - sourcedir=_prepare_source_openmp(), + source_dir=openmp_source_dir, + install_dir="openmp", cmake_args=[ + "-DOPENMP_STANDALONE_BUILD=ON", + "-DLLVM_ENABLE_RUNTIMES=openmp", "-DLIBOMP_OMPD_SUPPORT=OFF", - "-DLIBOMP_OMPT_SUPPORT=OFF", - "-DCMAKE_INSTALL_LIBDIR=lib", + "-DOPENMP_ENABLE_OMPT_TOOLS=OFF", + # Avoid conflicts in manylinux builds with packaged clang/llvm + # under /usr/include and its gcc-toolset provided header files. 
+ "-DCMAKE_NO_SYSTEM_FROM_IMPORTED=ON", + ], + ) + ) + +# Optionally enable bundled libomptarget build via ENABLE_BUNDLED_LIBOMPTARGET=1. +# We avoid building and bundling for unsupported platforms. +if _check_true("ENABLE_BUNDLED_LIBOMPTARGET"): + ext_modules.append( + CMakeExtension( + "libomptarget", + source_dir=openmp_source_dir, + install_dir="openmp", + cmake_args=[ + "-DOPENMP_STANDALONE_BUILD=ON", + "-DLLVM_ENABLE_RUNTIMES=offload", + # Avoid conflicts in manylinux builds with packaged clang/llvm + # under /usr/include and its gcc-toolset provided header files. + "-DCMAKE_NO_SYSTEM_FROM_IMPORTED=ON", ], - ), - ], + ) + ) + + +setup( + ext_modules=ext_modules, cmdclass={ "clean": CleanCommand, "build_ext": BuildCMakeExt, diff --git a/src/numba/openmp/__init__.py b/src/numba/openmp/__init__.py index 031c0a637616..1396f52ae9f7 100644 --- a/src/numba/openmp/__init__.py +++ b/src/numba/openmp/__init__.py @@ -60,25 +60,45 @@ def _init(): sys_platform = sys.platform + from ctypes.util import find_library omplib = ( libpath - / "libomp" + / "openmp" / "lib" / f"libomp{'.dylib' if sys_platform == 'darwin' else '.so'}" ) - if DEBUG_OPENMP >= 1: - print("Found OpenMP runtime library at", omplib) - ll.load_library_permanently(str(omplib)) + + # Prefer bundled libomp if it exists. + if omplib.exists(): + if DEBUG_OPENMP >= 1: + print("Found bundled OpenMP runtime library at", omplib) + ll.load_library_permanently(str(omplib)) + else: + # There is no bundled libomp, try to find it in standard library paths. + system_omplib = find_library("omp") + if system_omplib: + if DEBUG_OPENMP >= 1: + print(f"Found system OpenMP runtime library: {system_omplib}") + ll.load_library_permanently(system_omplib) + else: + raise RuntimeError( + f"OpenMP runtime not found. Bundled library missing at {omplib} " + "and no system libomp found via ctypes.util.find_library('omp'). " + "Ensure libomp is available in library paths." + ) # libomptarget is unavailable on apple, windows, so return. 
if sys_platform.startswith("darwin") or sys_platform.startswith("win32"): return - omptargetlib = libpath / "libomp" / "lib" / "libomptarget.so" - if DEBUG_OPENMP >= 1: - print("Found OpenMP target runtime library at", omptargetlib) - ll.load_library_permanently(str(omptargetlib)) + omptargetlib = libpath / "openmp" / "lib" / "libomptarget.so" + if omptargetlib.exists(): + if DEBUG_OPENMP >= 1: + print("Found OpenMP target runtime library at", omptargetlib) + ll.load_library_permanently(str(omptargetlib)) + else: + raise RuntimeError(f"OpenMP target runtime not found at {omptargetlib}") _init() diff --git a/src/numba/openmp/compiler.py b/src/numba/openmp/compiler.py index 3eb39509f0b7..ed71888c226f 100644 --- a/src/numba/openmp/compiler.py +++ b/src/numba/openmp/compiler.py @@ -306,7 +306,19 @@ def add_llvm_module(self, ll_module): def _finalize_specific(self): super()._finalize_specific() - ll.ExecutionEngine.run_static_constructors(self._codegen._engine._ee) + # Run target offloading descriptor registration functions, if there are any. 
+ import ctypes + + ee = self._codegen._engine._ee + for func in self.get_defined_functions(): + if not func.name.startswith(".omp_offloading.descriptor_reg"): + continue + addr = ee.get_function_address(func.name) + reg = ctypes.CFUNCTYPE(None)(addr) + try: + reg() + except Exception: + raise RuntimeError("error registering OpenMP offloading descriptor") class CustomFunctionCompiler(_FunctionCompiler): diff --git a/src/numba/openmp/config.py b/src/numba/openmp/config.py index d4f380b5069c..84e2edc48545 100644 --- a/src/numba/openmp/config.py +++ b/src/numba/openmp/config.py @@ -23,4 +23,5 @@ def _safe_readenv(name, ctor, default): config.DEBUG_ARRAY_OPT = 1 DEBUG_OPENMP_LLVM_PASS = _safe_readenv("NUMBA_DEBUG_OPENMP_LLVM_PASS", int, 0) OPENMP_DISABLED = _safe_readenv("NUMBA_OPENMP_DISABLED", int, 0) -OPENMP_DEVICE_TOOLCHAIN = _safe_readenv("NUMBA_OPENMP_DEVICE_TOOLCHAIN", int, 0) +# Use toolchain for device code compilation by default to avoid issues with libomptarget compatibility checks. 
+OPENMP_DEVICE_TOOLCHAIN = _safe_readenv("NUMBA_OPENMP_DEVICE_TOOLCHAIN", int, 1) diff --git a/src/numba/openmp/libs/libomp/patches/14.0.6/0001-BACKPORT-Fix-for-CUDA-OpenMP-RTL.patch b/src/numba/openmp/libs/openmp/patches/14.0.6/0001-BACKPORT-Fix-for-CUDA-OpenMP-RTL.patch similarity index 100% rename from src/numba/openmp/libs/libomp/patches/14.0.6/0001-BACKPORT-Fix-for-CUDA-OpenMP-RTL.patch rename to src/numba/openmp/libs/openmp/patches/14.0.6/0001-BACKPORT-Fix-for-CUDA-OpenMP-RTL.patch diff --git a/src/numba/openmp/libs/libomp/patches/14.0.6/0002-Fix-missing-includes.patch b/src/numba/openmp/libs/openmp/patches/14.0.6/0002-Fix-missing-includes.patch similarity index 100% rename from src/numba/openmp/libs/libomp/patches/14.0.6/0002-Fix-missing-includes.patch rename to src/numba/openmp/libs/openmp/patches/14.0.6/0002-Fix-missing-includes.patch diff --git a/src/numba/openmp/libs/libomp/patches/14.0.6/0003-Link-static-LLVM-libs.patch b/src/numba/openmp/libs/openmp/patches/14.0.6/0003-Link-static-LLVM-libs.patch similarity index 100% rename from src/numba/openmp/libs/libomp/patches/14.0.6/0003-Link-static-LLVM-libs.patch rename to src/numba/openmp/libs/openmp/patches/14.0.6/0003-Link-static-LLVM-libs.patch diff --git a/src/numba/openmp/libs/libomp/patches/15.0.7/0001-Fix-missing-includes.patch b/src/numba/openmp/libs/openmp/patches/15.0.7/0001-Fix-missing-includes.patch similarity index 100% rename from src/numba/openmp/libs/libomp/patches/15.0.7/0001-Fix-missing-includes.patch rename to src/numba/openmp/libs/openmp/patches/15.0.7/0001-Fix-missing-includes.patch diff --git a/src/numba/openmp/libs/libomp/patches/15.0.7/0002-Link-LLVM-statically.patch b/src/numba/openmp/libs/openmp/patches/15.0.7/0002-Link-LLVM-statically.patch similarity index 100% rename from src/numba/openmp/libs/libomp/patches/15.0.7/0002-Link-LLVM-statically.patch rename to src/numba/openmp/libs/openmp/patches/15.0.7/0002-Link-LLVM-statically.patch diff --git 
a/src/numba/openmp/libs/libomp/patches/15.0.7/0003-Disable-opaque-pointers-DeviceRTL-bitcode.patch b/src/numba/openmp/libs/openmp/patches/15.0.7/0003-Disable-opaque-pointers-DeviceRTL-bitcode.patch similarity index 100% rename from src/numba/openmp/libs/libomp/patches/15.0.7/0003-Disable-opaque-pointers-DeviceRTL-bitcode.patch rename to src/numba/openmp/libs/openmp/patches/15.0.7/0003-Disable-opaque-pointers-DeviceRTL-bitcode.patch diff --git a/src/numba/openmp/libs/libomp/patches/16.0.6/0001-Load-plugins-from-install-directory.patch b/src/numba/openmp/libs/openmp/patches/16.0.6/0001-Load-plugins-from-install-directory.patch similarity index 100% rename from src/numba/openmp/libs/libomp/patches/16.0.6/0001-Load-plugins-from-install-directory.patch rename to src/numba/openmp/libs/openmp/patches/16.0.6/0001-Load-plugins-from-install-directory.patch diff --git a/src/numba/openmp/libs/libomp/patches/16.0.6/0002-Link-LLVM-statically.patch b/src/numba/openmp/libs/openmp/patches/16.0.6/0002-Link-LLVM-statically.patch similarity index 100% rename from src/numba/openmp/libs/libomp/patches/16.0.6/0002-Link-LLVM-statically.patch rename to src/numba/openmp/libs/openmp/patches/16.0.6/0002-Link-LLVM-statically.patch diff --git a/src/numba/openmp/libs/openmp/patches/20.1.8/0001-Enable-standalone-build.patch b/src/numba/openmp/libs/openmp/patches/20.1.8/0001-Enable-standalone-build.patch new file mode 100644 index 000000000000..9f73e9d3fb0c --- /dev/null +++ b/src/numba/openmp/libs/openmp/patches/20.1.8/0001-Enable-standalone-build.patch @@ -0,0 +1,13 @@ +diff --git a/offload/CMakeLists.txt b/offload/CMakeLists.txt +index f6e894d39..9096d4ed5 100644 +--- a/offload/CMakeLists.txt ++++ b/offload/CMakeLists.txt +@@ -4,7 +4,7 @@ + cmake_minimum_required(VERSION 3.20.0) + set(LLVM_SUBPROJECT_TITLE "liboffload") + +-if ("${CMAKE_SOURCE_DIR}" STREQUAL "${CMAKE_CURRENT_SOURCE_DIR}") ++if (OPENMP_STANDALONE_BUILD OR "${CMAKE_SOURCE_DIR}" STREQUAL "${CMAKE_CURRENT_SOURCE_DIR}") + 
set(OPENMP_STANDALONE_BUILD TRUE) + project(offload C CXX ASM) + else() diff --git a/src/numba/openmp/libs/openmp/patches/20.1.8/0002-Link-statically-LLVM.patch b/src/numba/openmp/libs/openmp/patches/20.1.8/0002-Link-statically-LLVM.patch new file mode 100644 index 000000000000..43a9aaff0d74 --- /dev/null +++ b/src/numba/openmp/libs/openmp/patches/20.1.8/0002-Link-statically-LLVM.patch @@ -0,0 +1,24 @@ +diff --git a/offload/plugins-nextgen/CMakeLists.txt b/offload/plugins-nextgen/CMakeLists.txt +index 9b5b12bea..78dde405b 100644 +--- a/offload/plugins-nextgen/CMakeLists.txt ++++ b/offload/plugins-nextgen/CMakeLists.txt +@@ -3,6 +3,7 @@ set(common_dir ${CMAKE_CURRENT_SOURCE_DIR}/common) + add_subdirectory(common) + function(add_target_library target_name lib_name) + add_llvm_library(${target_name} STATIC ++ DISABLE_LLVM_LINK_LLVM_DYLIB + LINK_COMPONENTS + AggressiveInstCombine + Analysis +diff --git a/offload/src/CMakeLists.txt b/offload/src/CMakeLists.txt +index c5f5d902f..ca5135b13 100644 +--- a/offload/src/CMakeLists.txt ++++ b/offload/src/CMakeLists.txt +@@ -8,6 +8,7 @@ endif() + + add_llvm_library(omptarget + SHARED ++ DISABLE_LLVM_LINK_LLVM_DYLIB + + device.cpp + interface.cpp diff --git a/src/numba/openmp/libs/openmp/patches/20.1.8/0003-Do-not-build-liboffload.patch b/src/numba/openmp/libs/openmp/patches/20.1.8/0003-Do-not-build-liboffload.patch new file mode 100644 index 000000000000..bc64017d94cf --- /dev/null +++ b/src/numba/openmp/libs/openmp/patches/20.1.8/0003-Do-not-build-liboffload.patch @@ -0,0 +1,12 @@ +diff --git a/offload/CMakeLists.txt b/offload/CMakeLists.txt +index 9096d4e..a0aff92 100644 +--- a/offload/CMakeLists.txt ++++ b/offload/CMakeLists.txt +@@ -380,7 +380,6 @@ add_subdirectory(tools) + add_subdirectory(src) + + add_subdirectory(tools/offload-tblgen) +-add_subdirectory(liboffload) + + # Add tests. 
+ add_subdirectory(test) diff --git a/src/numba/openmp/libs/pass/CGIntrinsicsOpenMP.cpp b/src/numba/openmp/libs/pass/CGIntrinsicsOpenMP.cpp index 309b23cb3466..553483e9581d 100644 --- a/src/numba/openmp/libs/pass/CGIntrinsicsOpenMP.cpp +++ b/src/numba/openmp/libs/pass/CGIntrinsicsOpenMP.cpp @@ -1,7 +1,7 @@ #include "CGIntrinsicsOpenMP.h" #include "DebugOpenMP.h" -#include +#include #include #include #include @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include @@ -248,10 +249,10 @@ OutlinedInfoStruct CGIntrinsicsOpenMP::createOutlinedFunction( assert(SinkingCands.empty() && "Expected empty alloca sinking candidates"); auto IsTempOrDefaultPrivate = [](Value *V) { - if (V->getName().startswith(".")) + if (V->getName().starts_with(".")) return true; - if (V->getName().startswith("excinfo")) + if (V->getName().starts_with("excinfo")) return true; if (V->getName() == "quot") @@ -292,8 +293,8 @@ OutlinedInfoStruct CGIntrinsicsOpenMP::createOutlinedFunction( DSAType DSA = DSAValueMap[V].Type; - DEBUG_ENABLE(dbgs() << "V " << *V << " from DSAValueMap Type " << DSA - << "\n"); + DEBUG_ENABLE(dbgs() << "V " << *V << " from DSAValueMap Type " + << toString(DSA) << "\n"); switch (DSA) { case DSA_PRIVATE: Privates.push_back(V); @@ -514,8 +515,10 @@ OutlinedInfoStruct CGIntrinsicsOpenMP::createOutlinedFunction( // Deterministic insertion of BBs, BlockVector needs ExitBB to move to the // outlined function. 
BlockVector.push_back(OI.ExitBB); - for (auto *BB : BlockVector) - BB->moveBefore(OutlinedExitBB); + for (auto *BB : BlockVector) { + BB->removeFromParent(); + BB->insertInto(OutlinedFn, OutlinedExitBB); + } DEBUG_ENABLE(dbgs() << "=== Dump OutlinedFn\n" << *OutlinedFn << "=== End of Dump OutlinedFn\n"); @@ -741,11 +744,8 @@ void CGIntrinsicsOpenMP::emitOMPParallelDeviceRuntime( OMPBuilder.Builder.CreateLoad(OMPBuilder.Int8PtrPtr, GlobalArgs); Value *GEP = OMPBuilder.Builder.CreateConstInBoundsGEP1_64( OMPBuilder.Int8Ptr, LoadGlobalArgs, Idx); - - Value *Bitcast = OMPBuilder.Builder.CreateBitCast( - GEP, CapturedVars[Idx]->getType()->getPointerTo()); Value *Load = - OMPBuilder.Builder.CreateLoad(CapturedVars[Idx]->getType(), Bitcast); + OMPBuilder.Builder.CreateLoad(CapturedVars[Idx]->getType(), GEP); OutlinedFnArgs.push_back(Load); } @@ -1377,11 +1377,13 @@ void CGIntrinsicsOpenMP::emitLoop(DSAValueMapTy &DSAValueMap, } OMPBuilder.Builder.SetInsertPoint(ExitBB->getTerminator()); - OMPBuilder.createBarrier(OpenMPIRBuilder::LocationDescription( - OMPBuilder.Builder.saveIP(), Loc.DL), - omp::Directive::OMPD_for, - /* ForceSimpleCall */ false, - /* CheckCancelFlag */ false); + auto IP = OMPBuilder.createBarrier(OpenMPIRBuilder::LocationDescription( + OMPBuilder.Builder.saveIP(), Loc.DL), + omp::Directive::OMPD_for, + /* ForceSimpleCall */ false, + /* CheckCancelFlag */ false); + if (auto E = IP.takeError()) + FATAL_ERROR("Failed to create barrier: " + toString(std::move(E))); } if (verifyFunction(*PreHeader->getParent(), &errs())) @@ -1425,7 +1427,7 @@ void CGIntrinsicsOpenMP::emitOMPTask(DSAValueMapTy &DSAValueMap, Function *Fn, StructType::create({OMPBuilder.VoidPtr, OMPBuilder.TaskRoutineEntryPtr, OMPBuilder.Int32, KmpCmplrdataTy, KmpCmplrdataTy}, "struct.kmp_task_t"); - Type *KmpTaskTPtrTy = KmpTaskTTy->getPointerTo(); + Type *KmpTaskTPtrTy = PointerType::getUnqual(M.getContext()); FunctionCallee KmpcOmpTaskAlloc = OMPBuilder.getOrCreateRuntimeFunction(M, 
OMPRTL___kmpc_omp_task_alloc); @@ -1452,13 +1454,13 @@ void CGIntrinsicsOpenMP::emitOMPTask(DSAValueMapTy &DSAValueMap, Function *Fn, else KmpSharedsTTy = StructType::create(SharedsTy, "struct.kmp_shareds"); assert(KmpSharedsTTy && "Expected non-null KmpSharedsTTy"); - Type *KmpSharedsTPtrTy = KmpSharedsTTy->getPointerTo(); + Type *KmpSharedsTPtrTy = PointerType::getUnqual(M.getContext()); StructType *KmpPrivatesTTy = StructType::create(PrivatesTy, "struct.kmp_privates"); - Type *KmpPrivatesTPtrTy = KmpPrivatesTTy->getPointerTo(); + Type *KmpPrivatesTPtrTy = PointerType::getUnqual(M.getContext()); StructType *KmpTaskTWithPrivatesTy = StructType::create( {KmpTaskTTy, KmpPrivatesTTy}, "struct.kmp_task_t_with_privates"); - Type *KmpTaskTWithPrivatesPtrTy = KmpTaskTWithPrivatesTy->getPointerTo(); + Type *KmpTaskTWithPrivatesPtrTy = PointerType::getUnqual(M.getContext()); // Declare the task entry function. Function *TaskEntryFn = Function::Create( @@ -1615,11 +1617,14 @@ void CGIntrinsicsOpenMP::emitOMPTask(DSAValueMapTy &DSAValueMap, Function *Fn, BasicBlock::Create(M.getContext(), "entry", TaskOutlinedFn); BasicBlock *TaskOutlinedExitBB = BasicBlock::Create(M.getContext(), "exit", TaskOutlinedFn); - for (BasicBlock *BB : OutlinedBlockVector) - BB->moveBefore(TaskOutlinedExitBB); + for (BasicBlock *BB : OutlinedBlockVector) { + BB->removeFromParent(); + BB->insertInto(TaskOutlinedFn, TaskOutlinedExitBB); + } // Explicitly move EndBB to the outlined functions, since OutlineInfo // does not contain it in the OutlinedBlockVector. 
- EndBB->moveBefore(TaskOutlinedExitBB); + EndBB->removeFromParent(); + EndBB->insertInto(TaskOutlinedFn, TaskOutlinedExitBB); EndBB->getTerminator()->setSuccessor(0, TaskOutlinedExitBB); OMPBuilder.Builder.SetInsertPoint(TaskOutlinedEntryBB); @@ -1727,20 +1732,6 @@ void CGIntrinsicsOpenMP::emitOMPOffloadingMappings( SmallVector OffloadMapTypes; SmallVector OffloadMapNames; - if (DSAValueMap.empty()) { - OffloadingMappingArgs.Size = 0; - OffloadingMappingArgs.BasePtrs = - Constant::getNullValue(OMPBuilder.VoidPtrPtr); - OffloadingMappingArgs.Ptrs = Constant::getNullValue(OMPBuilder.VoidPtrPtr); - OffloadingMappingArgs.Sizes = Constant::getNullValue(OMPBuilder.Int64Ptr); - OffloadingMappingArgs.MapTypes = - Constant::getNullValue(OMPBuilder.Int64Ptr); - OffloadingMappingArgs.MapNames = - Constant::getNullValue(OMPBuilder.VoidPtrPtr); - - return; - } - auto EmitMappingEntry = [&](Value *Size, uint64_t MapType, Value *BasePtr, Value *Ptr) { OffloadMapTypes.push_back(ConstantInt::get(OMPBuilder.SizeTy, MapType)); @@ -1749,8 +1740,8 @@ void CGIntrinsicsOpenMP::emitOMPOffloadingMappings( OffloadMapNames.push_back(OMPBuilder.getOrCreateSrcLocStr( BasePtr->getName(), "", 0, 0, SrcLocStrSize)); DEBUG_ENABLE(dbgs() << "Emit mapping entry BasePtr " << *BasePtr << " Ptr " - << *Ptr << " Size " << *Size << " MapType " << MapType - << "\n"); + << *Ptr << " Size " << *Size << " MapType 0x" + << toHex(MapType) << "\n"); MapperInfos.push_back({BasePtr, Ptr, Size}); }; @@ -1968,25 +1959,19 @@ void CGIntrinsicsOpenMP::emitOMPOffloadingMappings( auto *GEP = OMPBuilder.Builder.CreateInBoundsGEP( BasePtrsAlloca->getAllocatedType(), BasePtrsAlloca, {OMPBuilder.Builder.getInt32(0), OMPBuilder.Builder.getInt32(Idx)}); - auto *Bitcast = OMPBuilder.Builder.CreateBitCast( - GEP, MI.BasePtr->getType()->getPointerTo()); - OMPBuilder.Builder.CreateStore(MI.BasePtr, Bitcast); + OMPBuilder.Builder.CreateStore(MI.BasePtr, GEP); // Store in the pointers alloca. 
GEP = OMPBuilder.Builder.CreateInBoundsGEP( PtrsAlloca->getAllocatedType(), PtrsAlloca, {OMPBuilder.Builder.getInt32(0), OMPBuilder.Builder.getInt32(Idx)}); - Bitcast = OMPBuilder.Builder.CreateBitCast( - GEP, MI.Ptr->getType()->getPointerTo()); - OMPBuilder.Builder.CreateStore(MI.Ptr, Bitcast); + OMPBuilder.Builder.CreateStore(MI.Ptr, GEP); // Store in the sizes alloca. GEP = OMPBuilder.Builder.CreateInBoundsGEP( SizesAlloca->getAllocatedType(), SizesAlloca, {OMPBuilder.Builder.getInt32(0), OMPBuilder.Builder.getInt32(Idx)}); - Bitcast = OMPBuilder.Builder.CreateBitCast( - GEP, MI.Size->getType()->getPointerTo()); - OMPBuilder.Builder.CreateStore(MI.Size, Bitcast); + OMPBuilder.Builder.CreateStore(MI.Size, GEP); Idx++; } @@ -1994,12 +1979,9 @@ void CGIntrinsicsOpenMP::emitOMPOffloadingMappings( OffloadingMappingArgs.Size = MapperInfos.size(); // These operations could be also implemented with GEPs on the allocas, not // sure what's best, revisit. - OffloadingMappingArgs.BasePtrs = - OMPBuilder.Builder.CreateBitCast(BasePtrsAlloca, OMPBuilder.VoidPtrPtr); - OffloadingMappingArgs.Ptrs = - OMPBuilder.Builder.CreateBitCast(PtrsAlloca, OMPBuilder.VoidPtrPtr); - OffloadingMappingArgs.Sizes = OMPBuilder.Builder.CreateBitCast( - SizesAlloca, OMPBuilder.SizeTy->getPointerTo()); + OffloadingMappingArgs.BasePtrs = BasePtrsAlloca; + OffloadingMappingArgs.Ptrs = PtrsAlloca; + OffloadingMappingArgs.Sizes = SizesAlloca; } void CGIntrinsicsOpenMP::emitOMPSingle(Function *Fn, BasicBlock *BBEntry, @@ -2019,8 +2001,7 @@ void CGIntrinsicsOpenMP::emitOMPSingle(Function *Fn, BasicBlock *BBEntry, #else auto IPOrError = - OMPBuilder.createSingle(Loc, BodyGenCB, FiniCB, /* IsNoWait*/ false, - /*DidIt*/ nullptr); + OMPBuilder.createSingle(Loc, BodyGenCB, FiniCB, /* IsNoWait*/ false); if (auto E = IPOrError.takeError()) { FATAL_ERROR("Error creating OpenMP single region: " + toString(std::move(E))); @@ -2072,9 +2053,12 @@ void CGIntrinsicsOpenMP::emitOMPBarrier(Function *Fn, BasicBlock 
*BBEntry, InsertPointTy(BBEntry, BBEntry->getTerminator()->getIterator()), DL); // TODO: check ForceSimpleCall usage. - OMPBuilder.createBarrier(Loc, DK, - /*ForceSimpleCall*/ false, - /*CheckCancelFlag*/ true); + auto IP = OMPBuilder.createBarrier(Loc, DK, + /*ForceSimpleCall*/ false, + /*CheckCancelFlag*/ true); + if (auto E = IP.takeError()) { + FATAL_ERROR("Error creating OpenMP barrier: " + toString(std::move(E))); + } DEBUG_ENABLE(dbgs() << "=== Barrier Fn\n" << *Fn << "=== End of Barrier Fn\n"); } @@ -2094,8 +2078,6 @@ CGIntrinsicsOpenMP::emitOffloadingGlobals(StringRef DevWrapperFuncName, GlobalVariable *OMPRegionId = nullptr; GlobalVariable *OMPOffloadEntries = nullptr; - // TODO: assumes 1 target region, can we call tgt_register_lib - // multiple times? OMPRegionId = new GlobalVariable( M, OMPBuilder.Int8, /* isConstant */ true, GlobalValue::WeakAnyLinkage, ConstantInt::get(OMPBuilder.Int8, 0), DevWrapperFuncName + ".region_id", @@ -2127,14 +2109,16 @@ CGIntrinsicsOpenMP::emitOffloadingGlobals(StringRef DevWrapperFuncName, ".omp_offloading.device_image"); GV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global); + auto &Ctx = M.getContext(); + StructType *TgtDeviceImageTy = StructType::create( - {OMPBuilder.Int8Ptr, OMPBuilder.Int8Ptr, - TgtOffloadEntryTy->getPointerTo(), TgtOffloadEntryTy->getPointerTo()}, + {OMPBuilder.Int8Ptr, OMPBuilder.Int8Ptr, PointerType::getUnqual(Ctx), + PointerType::getUnqual(Ctx)}, "struct.__tgt_device_image"); StructType *TgtBinDescTy = StructType::create( - {OMPBuilder.Int32, TgtDeviceImageTy->getPointerTo(), - TgtOffloadEntryTy->getPointerTo(), TgtOffloadEntryTy->getPointerTo()}, + {OMPBuilder.Int32, PointerType::getUnqual(Ctx), + PointerType::getUnqual(Ctx), PointerType::getUnqual(Ctx)}, "struct.__tgt_bin_desc"); auto *ArrayTy = ArrayType::get(TgtDeviceImageTy, 1); @@ -2174,67 +2158,8 @@ CGIntrinsicsOpenMP::emitOffloadingGlobals(StringRef DevWrapperFuncName, /* isConstant */ true, GlobalValue::InternalLinkage, DescInit, 
".omp_offloading.descriptor"); - // Add tgt_register_requires, tgt_register_lib, - // tgt_unregister_lib. - { - // tgt_register_requires. - auto *FuncTy = FunctionType::get(OMPBuilder.Void, /*isVarArg*/ false); - auto *Func = Function::Create(FuncTy, GlobalValue::InternalLinkage, - ".omp_offloading.requires_reg", &M); - Func->setSection(".text.startup"); - - // Get __tgt_register_lib function declaration. - auto *RegFuncTy = FunctionType::get(OMPBuilder.Void, OMPBuilder.Int64, - /*isVarArg*/ false); - FunctionCallee RegFuncC = - M.getOrInsertFunction("__tgt_register_requires", RegFuncTy); - - // Construct function body - IRBuilder<> Builder(BasicBlock::Create(M.getContext(), "entry", Func)); - // TODO: fix to pass the requirements enum value. - Builder.CreateCall(RegFuncC, ConstantInt::get(OMPBuilder.Int64, 1)); - Builder.CreateRetVoid(); - - // Add this function to constructors. - // Set priority to 1 so that __tgt_register_lib is executed - // AFTER - // __tgt_register_requires (we want to know what requirements - // have been asked for before we load a libomptarget plugin so - // that by the time the plugin is loaded it can report how - // many devices there are which can satisfy these - // requirements). - appendToGlobalCtors(M, Func, /*Priority*/ 0); - } - { - // ctor - auto *FuncTy = FunctionType::get(OMPBuilder.Void, /*isVarArg*/ false); - auto *Func = Function::Create(FuncTy, GlobalValue::InternalLinkage, - ".omp_offloading.descriptor_reg", &M); - Func->setSection(".text.startup"); - - // Get __tgt_register_lib function declaration. - auto *RegFuncTy = - FunctionType::get(OMPBuilder.Void, TgtBinDescTy->getPointerTo(), - /*isVarArg*/ false); - FunctionCallee RegFuncC = - M.getOrInsertFunction("__tgt_register_lib", RegFuncTy); - - // Construct function body - IRBuilder<> Builder(BasicBlock::Create(M.getContext(), "entry", Func)); - Builder.CreateCall(RegFuncC, BinDesc); - Builder.CreateRetVoid(); - - // Add this function to constructors. 
- // Set priority to 1 so that __tgt_register_lib is executed - // AFTER - // __tgt_register_requires (we want to know what requirements - // have been asked for before we load a libomptarget plugin so - // that by the time the plugin is loaded it can report how - // many devices there are which can satisfy these - // requirements). - appendToGlobalCtors(M, Func, /*Priority*/ 1); - } - { + // Add tgt_register_lib in global ctors and tgt_unregister_lib in atexit. + auto CreateUnregFunction = [&]() { auto *FuncTy = FunctionType::get(OMPBuilder.Void, /*isVarArg*/ false); auto *Func = Function::Create(FuncTy, GlobalValue::InternalLinkage, ".omp_offloading.descriptor_unreg", &M); @@ -2242,7 +2167,7 @@ CGIntrinsicsOpenMP::emitOffloadingGlobals(StringRef DevWrapperFuncName, // Get __tgt_unregister_lib function declaration. auto *UnRegFuncTy = - FunctionType::get(OMPBuilder.Void, TgtBinDescTy->getPointerTo(), + FunctionType::get(OMPBuilder.Void, PointerType::getUnqual(Ctx), /*isVarArg*/ false); FunctionCallee UnRegFuncC = M.getOrInsertFunction("__tgt_unregister_lib", UnRegFuncTy); @@ -2252,10 +2177,41 @@ CGIntrinsicsOpenMP::emitOffloadingGlobals(StringRef DevWrapperFuncName, Builder.CreateCall(UnRegFuncC, BinDesc); Builder.CreateRetVoid(); - // Add this function to global destructors. - // Match priority of __tgt_register_lib - appendToGlobalDtors(M, Func, /*Priority*/ 1); - } + return Func; + }; + + // Create the registration function constructor. + auto *FuncTy = FunctionType::get(OMPBuilder.Void, /*isVarArg*/ false); + auto *Func = Function::Create(FuncTy, GlobalValue::InternalLinkage, + ".omp_offloading.descriptor_reg", &M); + Func->setSection(".text.startup"); + + // Get __tgt_register_lib function declaration. + auto *RegFuncTy = + FunctionType::get(OMPBuilder.Void, PointerType::getUnqual(Ctx), + /*isVarArg*/ false); + FunctionCallee RegFuncC = + M.getOrInsertFunction("__tgt_register_lib", RegFuncTy); + + // Get atexit function declaration. 
+ auto *AtExitTy = + FunctionType::get(OMPBuilder.Int32, PointerType::getUnqual(Ctx), + /*isVarArg=*/false); + FunctionCallee AtExit = M.getOrInsertFunction("atexit", AtExitTy); + + // Construct function body. + IRBuilder<> Builder(BasicBlock::Create(M.getContext(), "entry", Func)); + Builder.CreateCall(RegFuncC, BinDesc); + + Function *UnregFunc = CreateUnregFunction(); + Builder.CreateCall(AtExit, UnregFunc); + + Builder.CreateRetVoid(); + + // Add this function to constructors. + // Set priority to 101 so that __tgt_register_lib is executed after system + // constructors but before user constructors. + appendToGlobalCtors(M, Func, /*Priority*/ 101); }; EmitOffloadingBinaryGlobals(); @@ -2296,17 +2252,6 @@ void CGIntrinsicsOpenMP::emitOMPTargetHost( Constant *SrcLocStr = OMPBuilder.getOrCreateSrcLocStr(Loc, SrcLocStrSize); Value *Ident = OMPBuilder.getOrCreateIdent(SrcLocStr, SrcLocStrSize); - // TODO: should we use target_mapper without teams or the more general - // target_teams_mapper. Does the former buy us anything (less overhead?) - // FunctionCallee TargetMapper = - // OMPBuilder.getOrCreateRuntimeFunction(M, OMPRTL___tgt_target_mapper); - // TODO: For nowait we need to enclose the host code in a task for async - // execution. - FunctionCallee TargetMapper = - (TargetInfo.NoWait ? OMPBuilder.getOrCreateRuntimeFunction( - M, OMPRTL___tgt_target_teams_nowait_mapper) - : OMPBuilder.getOrCreateRuntimeFunction( - M, OMPRTL___tgt_target_teams_mapper)); OMPBuilder.Builder.SetInsertPoint(EntryBB->getTerminator()); // Emit mappings. @@ -2316,19 +2261,13 @@ void CGIntrinsicsOpenMP::emitOMPTargetHost( emitOMPOffloadingMappings(AllocaIP, DSAValueMap, StructMappingInfoMap, OffloadingMappingArgs, /* isTargetRegion */ true); - // Push the tripcount. + // Set the tripcount, if available. 
+ Value *TripCount = nullptr; if (OMPLoopInfo) { - FunctionCallee TripcountMapper = OMPBuilder.getOrCreateRuntimeFunction( - M, - llvm::omp::RuntimeFunction::OMPRTL___kmpc_push_target_tripcount_mapper); Value *Load = OMPBuilder.Builder.CreateLoad(OMPBuilder.Int64, OMPLoopInfo->UB); - Value *Tripcount = OMPBuilder.Builder.CreateAdd( + TripCount = OMPBuilder.Builder.CreateAdd( Load, ConstantInt::get(OMPBuilder.Int64, 1)); - auto *CI = checkCreateCall( - OMPBuilder.Builder, TripcountMapper, - {Ident, ConstantInt::get(OMPBuilder.Int64, -1), Tripcount}); - assert(CI && "Expected valid call"); } Value *NumTeams = createScalarCast(TargetInfo.NumTeams, OMPBuilder.Int32); @@ -2338,25 +2277,52 @@ void CGIntrinsicsOpenMP::emitOMPTargetHost( assert(NumTeams && "Expected non-null NumTeams"); assert(ThreadLimit && "Expected non-null ThreadLimit"); - SmallVector Args = { - Ident, ConstantInt::get(OMPBuilder.Int64, -1), - ConstantExpr::getBitCast(OMPRegionId, OMPBuilder.VoidPtr), - ConstantInt::get(OMPBuilder.Int32, OffloadingMappingArgs.Size), - OffloadingMappingArgs.BasePtrs, OffloadingMappingArgs.Ptrs, - OffloadingMappingArgs.Sizes, OffloadingMappingArgs.MapTypes, - OffloadingMappingArgs.MapNames, - // TODO: offload_mappers is null for now. - Constant::getNullValue(OMPBuilder.VoidPtrPtr), NumTeams, ThreadLimit}; - - if (TargetInfo.NoWait) { - // Add extra dependency information (unused for now). 
- Args.push_back(Constant::getNullValue(OMPBuilder.Int32)); - Args.push_back(Constant::getNullValue(OMPBuilder.Int8Ptr)); - Args.push_back(Constant::getNullValue(OMPBuilder.Int32)); - Args.push_back(Constant::getNullValue(OMPBuilder.Int8Ptr)); + if (!isOpenMPDeviceRuntime()) { + FunctionCallee KmpcSetThreadLimit = OMPBuilder.getOrCreateRuntimeFunction( + M, OMPRTL___kmpc_set_thread_limit); + Value *ThreadID = OMPBuilder.getOrCreateThreadID(Ident); + + checkCreateCall(OMPBuilder.Builder, KmpcSetThreadLimit, + {Ident, ThreadID, ThreadLimit}); } - auto *OffloadResult = checkCreateCall(OMPBuilder.Builder, TargetMapper, Args); + SmallVector ArgsVector; + + auto UnqualPtrTy = PointerType::getUnqual(M.getContext()); + OpenMPIRBuilder::TargetDataRTArgs RTArgs{ + OffloadingMappingArgs.BasePtrs, + OffloadingMappingArgs.Ptrs, + OffloadingMappingArgs.Sizes, + OffloadingMappingArgs.MapTypes, + ConstantPointerNull::get(UnqualPtrTy), + ConstantPointerNull::get(UnqualPtrTy), + OffloadingMappingArgs.MapNames, + }; + // Avoid initializer-list temporaries for ArrayRef fields. Use stable + // SmallVector storage so ArrayRef in TargetKernelArgs refers to valid + // memory. + SmallVector KernelNumTeams; + KernelNumTeams.push_back(NumTeams); + SmallVector KernelNumThreads; + KernelNumThreads.push_back(ThreadLimit); + + // TODO: Implement nowait: we need to enclose the host code in a task for + // async execution. OpenMPIRBuilder may support that now. + OpenMPIRBuilder::TargetKernelArgs Args{ + static_cast(OffloadingMappingArgs.Size), + RTArgs, + (TripCount ? 
TripCount : OMPBuilder.Builder.getInt64(0)), + KernelNumTeams, + KernelNumThreads, + Constant::getNullValue(OMPBuilder.VoidPtr), + /*TargetInfo.NoWait*/ false}; + OpenMPIRBuilder::getKernelArgsVector(Args, OMPBuilder.Builder, ArgsVector); + + Value *DeviceID = ConstantInt::get(OMPBuilder.Int64, -1); + Value *OffloadResult = nullptr; + OMPBuilder.emitTargetKernel(Loc, AllocaIP, OffloadResult, Ident, DeviceID, + NumTeams, ThreadLimit, OMPRegionId, ArgsVector); + assert(OffloadResult && "Expected non-null call inst from code generation"); auto *Failed = OMPBuilder.Builder.CreateIsNotNull(OffloadResult); OMPBuilder.Builder.CreateCondBr(Failed, StartBB, EndBB); @@ -2372,6 +2338,11 @@ void CGIntrinsicsOpenMP::emitOMPTargetDevice(Function *Fn, BasicBlock *EntryBB, // Emit the Numba wrapper offloading function. SmallVector WrapperArgsTypes; SmallVector WrapperArgsNames; + + // Add the pointer argument to kernel args expected by the runtime. + WrapperArgsTypes.push_back(OMPBuilder.VoidPtr); + WrapperArgsNames.push_back("dyn_ptr"); + for (auto &It : DSAValueMap) { Value *V = It.first; DSAType DSA = It.second.Type; @@ -2399,8 +2370,9 @@ void CGIntrinsicsOpenMP::emitOMPTargetDevice(Function *Fn, BasicBlock *EntryBB, Function *NumbaWrapperFunc = Function::Create( NumbaWrapperFnTy, GlobalValue::ExternalLinkage, DevWrapperFuncName, M); - // Name the wrapper arguments for readability. - for (size_t I = 0; I < NumbaWrapperFunc->arg_size(); ++I) + // Name the wrapper arguments for readability, start from 1 to skip the + // first "args" pointer argument. + for (size_t I = 1; I < NumbaWrapperFunc->arg_size(); ++I) NumbaWrapperFunc->getArg(I)->setName(WrapperArgsNames[I]); IRBuilder<> Builder( @@ -2425,14 +2397,23 @@ void CGIntrinsicsOpenMP::emitOMPTargetDevice(Function *Fn, BasicBlock *EntryBB, ArgOffset = 1; } for (auto &Arg : NumbaWrapperFunc->args()) { + // Skip the first "args" pointer argument. 
+ if (Arg.getArgNo() == 0) + continue; // TODO: Runtime expects all scalars typed as Int64. if (!Arg.getType()->isPointerTy()) { - auto *ParamType = DevFuncCallee.getFunctionType()->getParamType( - ArgOffset + Arg.getArgNo()); + // ArgOffset accounts for the extra arguments added in the device + // function by Numba, and -1 accounts for the first "args" pointer + // argument. + size_t DevFuncArgNo = ArgOffset + (Arg.getArgNo() - 1); + + auto *ParamType = + DevFuncCallee.getFunctionType()->getParamType(DevFuncArgNo); AllocaInst *TmpInt64 = Builder.CreateAlloca(OMPBuilder.Int64, nullptr, Arg.getName() + ".casted"); Builder.CreateStore(&Arg, TmpInt64); - Value *Cast = Builder.CreateBitCast(TmpInt64, ParamType->getPointerTo()); + Value *Cast = Builder.CreateBitCast( + TmpInt64, PointerType::getUnqual(M.getContext())); Value *ConvLoad = Builder.CreateLoad(ParamType, Cast); DevFuncArgs.push_back(ConvLoad); } else @@ -2447,15 +2428,14 @@ void CGIntrinsicsOpenMP::emitOMPTargetDevice(Function *Fn, BasicBlock *EntryBB, #elif LLVM_VERSION_MAJOR <= 16 auto IP = OMPBuilder.createTargetInit(Loc, IsSPMD); #else - // TODO: Use TargetInfo launch configuration for max/min threads and - // threads. + // Note the default for MaxThreads is 0. OpenMPIRBuilder::TargetKernelDefaultAttrs Attrs{ (IsSPMD ? 
OMP_TGT_EXEC_MODE_SPMD : OMP_TGT_EXEC_MODE_GENERIC), {-1, -1, -1}, 1, - {-1, -1, -1}, + {0, -1, -1}, 1}; - auto IP = OMPBuilder.createTargetInit(Loc, Attrs); + auto IP = OMPBuilder.createTargetInit(Builder, Attrs); #endif Builder.restoreIP(IP); } @@ -2467,8 +2447,10 @@ void CGIntrinsicsOpenMP::emitOMPTargetDevice(Function *Fn, BasicBlock *EntryBB, OpenMPIRBuilder::LocationDescription Loc(Builder); #if LLVM_VERSION_MAJOR <= 15 OMPBuilder.createTargetDeinit(Loc, /* IsSPMD */ IsSPMD, true); -#else +#elif LLVM_VERSION_MAJOR <= 16 OMPBuilder.createTargetDeinit(Loc, /* IsSPMD */ IsSPMD); +#else + OMPBuilder.createTargetDeinit(Loc); #endif } @@ -2485,15 +2467,8 @@ void CGIntrinsicsOpenMP::emitOMPTargetDevice(Function *Fn, BasicBlock *EntryBB, appendToCompilerUsed(M, {ExecModeGV}); // Get "nvvm.annotations" metadata node. - // TODO: may need to adjust for AMD gpus. - NamedMDNode *MD = M.getOrInsertNamedMetadata("nvvm.annotations"); - - Metadata *MDVals[] = { - ConstantAsMetadata::get(NumbaWrapperFunc), - MDString::get(M.getContext(), "kernel"), - ConstantAsMetadata::get(ConstantInt::get(OMPBuilder.Int32, 1))}; - // Append metadata to nvvm.annotations. - MD->addOperand(MDNode::get(M.getContext(), MDVals)); + // TODO: will need to adjust for AMD gpus. + NumbaWrapperFunc->setCallingConv(CallingConv::PTX_Kernel); // Add a function attribute for the kernel. 
NumbaWrapperFunc->addFnAttr(Attribute::get(M.getContext(), "kernel")); diff --git a/src/numba/openmp/libs/pass/CGIntrinsicsOpenMP.h b/src/numba/openmp/libs/pass/CGIntrinsicsOpenMP.h index 74b15cde3798..fd6539cefe49 100644 --- a/src/numba/openmp/libs/pass/CGIntrinsicsOpenMP.h +++ b/src/numba/openmp/libs/pass/CGIntrinsicsOpenMP.h @@ -111,7 +111,7 @@ namespace helpers { Type *getPointeeType(DSAValueMapTy &DSAValueMap, Value *V); } -inline std::string toString(DSAType DSA) { +inline std::string toString(const DSAType &DSA) { switch (DSA) { case DSA_NONE: return "DSA_NONE"; diff --git a/src/numba/openmp/libs/pass/CMakeLists.txt b/src/numba/openmp/libs/pass/CMakeLists.txt index 8b6ac8e26a0e..fcc977fc4c77 100644 --- a/src/numba/openmp/libs/pass/CMakeLists.txt +++ b/src/numba/openmp/libs/pass/CMakeLists.txt @@ -46,9 +46,9 @@ target_link_libraries(IntrinsicsOpenMP ) if(APPLE) - set_property(TARGET IntrinsicsOpenMP APPEND_STRING PROPERTY LINK_FLAGS "-flto -Wl,-exported_symbol,_runIntrinsicsOpenMPPass") + set_property(TARGET IntrinsicsOpenMP APPEND_STRING PROPERTY LINK_FLAGS "-Wl,-exported_symbol,_runIntrinsicsOpenMPPass") else() - set_property(TARGET IntrinsicsOpenMP APPEND_STRING PROPERTY LINK_FLAGS "-flto -Wl,--exclude-libs,ALL") + set_property(TARGET IntrinsicsOpenMP APPEND_STRING PROPERTY LINK_FLAGS "-Wl,--exclude-libs,ALL") endif() install(TARGETS IntrinsicsOpenMP diff --git a/src/numba/openmp/libs/pass/IntrinsicsOpenMP.cpp b/src/numba/openmp/libs/pass/IntrinsicsOpenMP.cpp index 577eff6954f9..8a67ad315ccf 100644 --- a/src/numba/openmp/libs/pass/IntrinsicsOpenMP.cpp +++ b/src/numba/openmp/libs/pass/IntrinsicsOpenMP.cpp @@ -348,28 +348,28 @@ struct IntrinsicsOpenMP { // TODO: check for conflicting DSA, for example reduction variables // cannot be set private. Should be done in Numba. 
- if (Tag.startswith("DIR")) { + if (Tag.starts_with("DIR")) { auto It = StringToDir.find(Tag); assert(It != StringToDir.end() && "Directive is not supported!"); Dir = It->second; - } else if (Tag.startswith("QUAL")) { + } else if (Tag.starts_with("QUAL")) { const ArrayRef &TagInputs = O.inputs(); - if (Tag.startswith("QUAL.OMP.NORMALIZED.IV")) { + if (Tag.starts_with("QUAL.OMP.NORMALIZED.IV")) { assert(O.input_size() == 1 && "Expected single IV value"); OMPLoopInfo.IV = TagInputs[0]; - } else if (Tag.startswith("QUAL.OMP.NORMALIZED.START")) { + } else if (Tag.starts_with("QUAL.OMP.NORMALIZED.START")) { assert(O.input_size() == 1 && "Expected single START value"); OMPLoopInfo.Start = TagInputs[0]; - } else if (Tag.startswith("QUAL.OMP.NORMALIZED.LB")) { + } else if (Tag.starts_with("QUAL.OMP.NORMALIZED.LB")) { assert(O.input_size() == 1 && "Expected single LB value"); OMPLoopInfo.LB = TagInputs[0]; - } else if (Tag.startswith("QUAL.OMP.NORMALIZED.UB")) { + } else if (Tag.starts_with("QUAL.OMP.NORMALIZED.UB")) { assert(O.input_size() == 1 && "Expected single UB value"); OMPLoopInfo.UB = TagInputs[0]; - } else if (Tag.startswith("QUAL.OMP.NUM_THREADS")) { + } else if (Tag.starts_with("QUAL.OMP.NUM_THREADS")) { assert(O.input_size() == 1 && "Expected single NumThreads value"); ParRegionInfo.NumThreads = TagInputs[0]; - } else if (Tag.startswith("QUAL.OMP.SCHEDULE")) { + } else if (Tag.starts_with("QUAL.OMP.SCHEDULE")) { // TODO: Add DIST_SCHEDULE for distribute loops. 
assert(O.input_size() == 1 && "Expected single chunking scheduling value"); @@ -385,11 +385,11 @@ struct IntrinsicsOpenMP { } } else FATAL_ERROR("Unsupported scheduling type"); - } else if (Tag.startswith("QUAL.OMP.IF")) { + } else if (Tag.starts_with("QUAL.OMP.IF")) { assert(O.input_size() == 1 && "Expected single if condition value"); ParRegionInfo.IfCondition = TagInputs[0]; - } else if (Tag.startswith("QUAL.OMP.TARGET.DEV_FUNC")) { + } else if (Tag.starts_with("QUAL.OMP.TARGET.DEV_FUNC")) { assert(O.input_size() == 1 && "Expected a single device function name"); ConstantDataArray *DevFuncArray = @@ -397,16 +397,16 @@ struct IntrinsicsOpenMP { assert(DevFuncArray && "Expected constant string for the device function"); TargetInfo.DevFuncName = DevFuncArray->getAsString(); - } else if (Tag.startswith("QUAL.OMP.TARGET.ELF")) { + } else if (Tag.starts_with("QUAL.OMP.TARGET.ELF")) { assert(O.input_size() == 1 && "Expected a single elf image string"); ConstantDataArray *ELF = dyn_cast(TagInputs[0]); assert(ELF && "Expected constant string for ELF"); TargetInfo.ELF = ELF; - } else if (Tag.startswith("QUAL.OMP.DEVICE")) { + } else if (Tag.starts_with("QUAL.OMP.DEVICE")) { // TODO: Handle device selection for target regions. 
- } else if (Tag.startswith("QUAL.OMP.NUM_TEAMS")) { + } else if (Tag.starts_with("QUAL.OMP.NUM_TEAMS")) { assert(O.input_size() == 1 && "Expected single NumTeams value"); switch (Dir) { case OMPD_target: @@ -429,7 +429,7 @@ struct IntrinsicsOpenMP { default: FATAL_ERROR("Unsupported qualifier in directive"); } - } else if (Tag.startswith("QUAL.OMP.THREAD_LIMIT")) { + } else if (Tag.starts_with("QUAL.OMP.THREAD_LIMIT")) { assert(O.input_size() == 1 && "Expected single ThreadLimit value"); switch (Dir) { @@ -450,7 +450,7 @@ struct IntrinsicsOpenMP { default: FATAL_ERROR("Unsupported qualifier in directive"); } - } else if (Tag.startswith("QUAL.OMP.NOWAIT")) { + } else if (Tag.starts_with("QUAL.OMP.NOWAIT")) { switch (Dir) { case OMPD_target: case OMPD_target_teams: @@ -822,6 +822,12 @@ extern "C" int runIntrinsicsOpenMPPass(const char *BitcodePtr, MPM.addPass(IntrinsicsOpenMPPass()); MPM.run(*M, MAM); + // Verify the module before writing bitcode + if (verifyModule(*M, &errs())) { + errs() << "ERROR: Module verification failed after IntrinsicsOpenMPPass\n"; + return 1; + } + SmallVector Buf; raw_svector_ostream OS(Buf); WriteBitcodeToFile(*M, OS); diff --git a/src/numba/openmp/llvm_pass.py b/src/numba/openmp/llvm_pass.py index f1745afc0fb1..4e56061caf79 100644 --- a/src/numba/openmp/llvm_pass.py +++ b/src/numba/openmp/llvm_pass.py @@ -39,8 +39,10 @@ def _writer_cb(ptr, size): raise RuntimeError(f"Running IntrinsicsOpenMPPass failed with return code {rc}") bc_out = bytes(out) + lowered_module = ll.parse_bitcode(bc_out) if DEBUG_OPENMP_LLVM_PASS >= 1: - print(lowered_module) + with open(f"{ll_module.name}-intrinsics-omp.ll", "w") as f: + f.write(str(lowered_module)) return lowered_module diff --git a/src/numba/openmp/omp_ir.py b/src/numba/openmp/omp_ir.py index f47be4244c6b..52128bb1d945 100644 --- a/src/numba/openmp/omp_ir.py +++ b/src/numba/openmp/omp_ir.py @@ -113,7 +113,11 @@ def __init__(self): import numba.cuda.api as cudaapi import numba.cuda.cudadrv.libs as 
cudalibs from numba.cuda.codegen import CUDA_TRIPLE + from numba.cuda.cudadrv import driver, enums + # The OpenMP target runtime prefers the blocking sync flag, so we set it + # here before creating the CUDA context. + driver.driver.cuDevicePrimaryCtxSetFlags(0, enums.CU_CTX_SCHED_BLOCKING_SYNC) self.cc = cudaapi.get_current_device().compute_capability self.sm = "sm_" + str(self.cc[0]) + str(self.cc[1]) @@ -123,9 +127,7 @@ def __init__(self): self.libdevice_mod = ll.parse_bitcode(f.read()) # Read the OpenMP device RTL for the architecture to link with the module. - self.libomptarget_arch = ( - libpath / "libomp" / "lib" / f"libomptarget-nvptx-{self.sm}.bc" - ) + self.libomptarget_arch = libpath / "openmp" / "lib" / "libomptarget-nvptx.bc" try: with open(self.libomptarget_arch, "rb") as f: self.libomptarget_mod = ll.parse_bitcode(f.read()) @@ -143,7 +145,7 @@ def __init__(self): def _get_target_image(self, mod, filename_prefix, ompx_attrs, use_toolchain=False): from numba.cuda.cudadrv import driver - from numba.core.llvm_bindings import create_pass_manager_builder + from numba.core.llvm_bindings import create_pass_builder if DEBUG_OPENMP_LLVM_PASS >= 1: with open(filename_prefix + ".ll", "w") as f: @@ -173,25 +175,19 @@ def _internalize(): _internalize() # Run passes for optimization, including target-specific passes. # Run function passes. - with ll.create_function_pass_manager(mod) as pm: - self.tm.add_analysis_passes(pm) - with create_pass_manager_builder( - opt=2, slp_vectorize=True, loop_vectorize=True - ) as pmb: - pmb.populate(pm) - pm.initialize() + with create_pass_builder( + self.tm, opt=2, slp_vectorize=True, loop_vectorize=True + ) as pb: + pm = pb.getFunctionPassManager() for func in mod.functions: - pm.run(func) - pm.finalize() + pm.run(func, pb) # Run module passes. 
- with ll.create_module_pass_manager() as pm: - self.tm.add_analysis_passes(pm) - with create_pass_manager_builder( - opt=2, slp_vectorize=True, loop_vectorize=True - ) as pmb: - pmb.populate(pm) - pm.run(mod) + with create_pass_builder( + self.tm, opt=2, slp_vectorize=True, loop_vectorize=True + ) as pb: + pm = pb.getModulePassManager() + pm.run(mod, pb) if DEBUG_OPENMP_LLVM_PASS >= 1: mod.verify() @@ -205,13 +201,11 @@ def _internalize(): # Internalize non-kernel function definitions. _internalize() # Run module passes. - with ll.create_module_pass_manager() as pm: - self.tm.add_analysis_passes(pm) - with create_pass_manager_builder( - opt=1, slp_vectorize=True, loop_vectorize=True - ) as pmb: - pmb.populate(pm) - pm.run(mod) + with create_pass_builder( + self.tm, opt=1, slp_vectorize=True, loop_vectorize=True + ) as pb: + pm = pb.getModulePassManager() + pm.run(mod, pb) if DEBUG_OPENMP_LLVM_PASS >= 1: mod.verify() @@ -221,25 +215,47 @@ def _internalize(): # Generate ptx assemlby. ptx = self.tm.emit_assembly(mod) if use_toolchain: - # ptxas does file I/O, so output the assembly and ingest the generated cubin. - with open(filename_prefix + "-intr-dev-rtl.s", "w") as f: - f.write(ptx) - - subprocess.run( - [ - "ptxas", - "-m64", - "--gpu-name", - self.sm, - filename_prefix + "-intr-dev-rtl.s", - "-o", - filename_prefix + "-intr-dev-rtl.o", - ], - check=True, - ) + # ptxas normally does file I/O; prefer piping PTX to stdin to avoid + # writing the .s file unless debug is enabled. + if DEBUG_OPENMP_LLVM_PASS >= 1: + with open(filename_prefix + "-intr-dev-rtl.s", "w") as f: + f.write(ptx) + + # Invoke ptxas reading PTX from stdin ('-') and writing output to + # a temporary file so we can capture the object in-memory without + # leaving it in the working directory. 
+ with tempfile.NamedTemporaryFile(suffix=".o", delete=False) as tmpf: + outname = tmpf.name + try: + subprocess.run( + [ + "ptxas", + "-m64", + "--gpu-name", + self.sm, + "-", + "-o", + outname, + ], + input=ptx.encode(), + check=True, + ) - with open(filename_prefix + "-intr-dev-rtl.o", "rb") as f: - cubin = f.read() + with open(outname, "rb") as f: + cubin = f.read() + + # If debug is enabled, also write a named copy for inspection. + if DEBUG_OPENMP_LLVM_PASS >= 1: + with open( + filename_prefix + "-intr-dev-rtl.o", + "wb", + ) as f: + f.write(cubin) + finally: + try: + os.remove(outname) + except OSError: + pass else: if DEBUG_OPENMP_LLVM_PASS >= 1: with open( diff --git a/src/numba/openmp/omp_lower.py b/src/numba/openmp/omp_lower.py index 4fbd26d92dbe..c8598389d2ed 100644 --- a/src/numba/openmp/omp_lower.py +++ b/src/numba/openmp/omp_lower.py @@ -1937,7 +1937,8 @@ def some_target_directive(self, args, dir_tag, lexer_count, has_loop=False): else: # Neither TEAMS or PARALLEL in directive, set teams, threads to 1. start_tags.append(openmp_tag("QUAL.OMP.NUM_TEAMS", 1)) - start_tags.append(openmp_tag("QUAL.OMP.THREAD_LIMIT", 1)) + # Set thread limit to 0 to use runtime default. + start_tags.append(openmp_tag("QUAL.OMP.THREAD_LIMIT", 0)) if DEBUG_OPENMP >= 1: for clause in clauses: diff --git a/src/numba/openmp/tests/test_openmp.py b/src/numba/openmp/tests/test_openmp.py index 82e441021d86..bb2661132a0a 100644 --- a/src/numba/openmp/tests/test_openmp.py +++ b/src/numba/openmp/tests/test_openmp.py @@ -3541,14 +3541,8 @@ def test_impl(): np.testing.assert_equal(threads2, 256) def target_nest_parallel(self, device): - # TODO: map should be "from" instead of "tofrom" once this is fixed. target_pragma = f"target device({device}) map(from: a)" - # NOTE: num_threads should be a multiple of warp size, e.g. for NVIDIA - # V100 it is 32, the OpenMP runtime floors non-multiple of warp size. - # TODO: Newer LLVM versions should not have this restriction. 
- parallel_pragma = ( - "parallel num_threads(32)" # + (" shared(a)" if explicit else "") - ) + parallel_pragma = "parallel num_threads(32)" @njit def test_impl():