diff --git a/.github/workflows/build-wheels.yml b/.github/workflows/build-wheels.yml index 734cb1d..39c0019 100644 --- a/.github/workflows/build-wheels.yml +++ b/.github/workflows/build-wheels.yml @@ -10,66 +10,87 @@ on: release: types: - published - -env: - CIBW_BUILD: cp38-* cp39-* cp310-* cp311-* - CIBW_SKIP: cp3*-musllinux_* - CIBW_ARCHS_MACOS: x86_64 - CIBW_ARCHS_LINUX: auto64 - CIBW_MANYLINUX_X86_64_IMAGE: manylinux2014 - CIBW_BEFORE_ALL_LINUX: yum -y install boost-devel zlib-devel - CIBW_BEFORE_ALL_MACOS: brew install boost zlib gsl - CIBW_BEFORE_BUILD: pip install --upgrade pip setuptools wheel ninja numpy cython - CIBW_ENVIRONMENT_MACOS: CXX="$(brew --prefix llvm@15)/bin/clang++" - MACOSX_DEPLOYMENT_TARGET: 10.15 + workflow_dispatch: jobs: - build_wheels: + build_wheels_cloud: name: Build wheels on ${{ matrix.os }} runs-on: ${{ matrix.os }} strategy: fail-fast: false matrix: - os: [ubuntu-22.04, macos-12] + include: + - os: ubuntu-24.04 + arch: x86_64 + py-vers-full: cp39-* cp310-* cp311-* cp312-* cp313-* cp314-* + py-vers-pr: cp39-* cp310-* cp311-* cp312-* cp313-* cp314-* + before-all: dnf -y install boost-devel zlib + extra-env: "" + mdt: "" + - os: ubuntu-24.04-arm + arch: aarch64 + py-vers-full: cp39-* cp310-* cp311-* cp312-* cp313-* cp314-* + py-vers-pr: cp39-* cp310-* cp311-* cp312-* cp313-* cp314-* + before-all: dnf -y install boost-devel zlib + extra-env: "" + mdt: "" + - os: macos-15-intel + arch: x86_64 + py-vers-full: cp39-* cp310-* cp311-* cp312-* cp313-* cp314-* + py-vers-pr: cp39-* cp310-* cp311-* cp312-* cp313-* cp314-* + before-all: brew install boost zlib + extra-env: CC=clang CXX=clang++ + mdt: 15 + - os: macos-14 + arch: arm64 + py-vers-full: cp39-* cp310-* cp311-* cp312-* cp313-* cp314-* + py-vers-pr: cp39-* cp310-* cp311-* cp312-* cp313-* cp314-* + before-all: brew install boost zlib + extra-env: CC=clang CXX=clang++ + mdt: 14 + + env: + CIBW_BUILD: ${{ github.event_name != 'pull_request' && matrix.py-vers-full || matrix.py-vers-pr }} 
+ CIBW_SKIP: cp3*-musllinux_* + CIBW_ARCHS: ${{ matrix.arch }} + CIBW_MANYLINUX_X86_64_IMAGE: manylinux_2_28 + CIBW_BEFORE_ALL: ${{ matrix.before-all }} + CIBW_BEFORE_BUILD: pip install --upgrade pip setuptools wheel ninja numpy cython + CIBW_ENVIRONMENT: ${{ matrix.extra-env }} + MACOSX_DEPLOYMENT_TARGET: ${{ matrix.mdt }} steps: - name: checkout repo & submodules - uses: actions/checkout@v3 + uses: actions/checkout@v5 with: + submodules: true fetch-depth: 0 - - name: Set up Python - uses: actions/setup-python@v3 - - - name: Install cibuildwheel - run: python -m pip install cibuildwheel==2.12.0 - - name: Build wheels - run: python -m cibuildwheel --output-dir wheelhouse + uses: pypa/cibuildwheel@v3.2.0 - - uses: actions/upload-artifact@v3 + - uses: actions/upload-artifact@v4 with: - name: wheels + name: wheels-cloud-${{ matrix.os }} path: ./wheelhouse/*.whl retention-days: 1 upload_all: name: Upload to PyPI - needs: build_wheels + needs: [build_wheels_cloud] runs-on: ubuntu-latest - if: github.event_name == 'release' && github.event.action == 'published' + if: ${{ github.event_name == 'release' && github.event.action == 'published' }} steps: - - uses: actions/setup-python@v3 - - - name: Download wheels - uses: actions/download-artifact@v3 + - name: Download wheels from cloud runners + uses: actions/download-artifact@v5 with: - name: wheels + pattern: wheels-cloud-* + merge-multiple: true path: wheels - - uses: pypa/gh-action-pypi-publish@v1.6.4 + - uses: pypa/gh-action-pypi-publish@v1.13.0 with: user: __token__ password: ${{ secrets.PYPI_TOKEN }} diff --git a/.github/workflows/cpp_tests.yml b/.github/workflows/cpp_tests.yml new file mode 100644 index 0000000..0e03a4b --- /dev/null +++ b/.github/workflows/cpp_tests.yml @@ -0,0 +1,64 @@ +name: C++ unit tests + +on: + push: + branches: + - main + pull_request: + branches: + - '**' + workflow_dispatch: + +jobs: + + build-and-test: + name: C++ unit tests + runs-on: ${{ matrix.os }} + timeout-minutes: 10 + + strategy: + 
fail-fast: false + matrix: + include: + - os: ubuntu-24.04 + cxx: "clang++-18" + sys_install: "sudo apt -y update && sudo apt -y install libboost-iostreams-dev" + - os: ubuntu-24.04-arm + cxx: "g++-12" + sys_install: "sudo apt -y update && sudo apt -y install libboost-iostreams-dev" + - os: macos-15 + cxx: "" + sys_install: "brew install boost" + - os: macos-15 + cxx: "$(brew --prefix llvm@18)/bin/clang" + sys_install: "brew install boost" + + steps: + - name: checkout repo & submodules + uses: actions/checkout@v5 + with: + submodules: true + fetch-depth: 0 + + - name: install deps + run: | + ${{ matrix.sys_install }} + + - name: make build dir + run: | + mkdir build + + - name: cmake configure + run: | + cmake .. + working-directory: build + + - name: cmake build + run: | + cmake --build . --target cpp_tests --parallel 4 + working-directory: build + + - name: ctest + run: | + ctest --output-on-failure + working-directory: build diff --git a/.github/workflows/pip-install-macos.yml b/.github/workflows/pip-install-macos.yml deleted file mode 100644 index 897a57b..0000000 --- a/.github/workflows/pip-install-macos.yml +++ /dev/null @@ -1,48 +0,0 @@ -name: pip install on macOS - -on: - push: - branches: - - main - pull_request: - branches: - - '**' - -jobs: - - pip-install-and-import: - name: pip install on macOS - runs-on: macos-11 - - steps: - - name: checkout arg_needle - uses: actions/checkout@v3 - - - uses: actions/setup-python@v3 - with: - python-version: 3.x - - - name: install system packages - run: | - brew install boost zlib gsl - - - name: install python pip dependencies - run: | - python --version - python -m pip install --upgrade pip setuptools wheel - python -m pip install cmake ninja - - - name: install Palamara python dependencies - run: | - python --version - python -m pip install asmc-asmc arg-needle-lib - - - name: install arg_needle - run: | - python --version - python -m pip install . 
- - - name: test import works as expected - run: | - python --version - python test/test_import.py diff --git a/.github/workflows/pip-install-ubuntu.yml b/.github/workflows/pip-install-ubuntu.yml deleted file mode 100644 index 1e2f27a..0000000 --- a/.github/workflows/pip-install-ubuntu.yml +++ /dev/null @@ -1,49 +0,0 @@ -name: pip install on Ubuntu - -on: - push: - branches: - - main - pull_request: - branches: - - '**' - -jobs: - - pip-install-and-import: - name: pip install on Ubuntu - runs-on: ubuntu-22.04 - - steps: - - name: checkout arg_needle - uses: actions/checkout@v3 - - - uses: actions/setup-python@v3 - with: - python-version: 3.x - - - name: install system packages - run: | - sudo apt -y update - sudo apt -y install libboost-iostreams-dev zlib1g-dev - - - name: install python pip dependencies - run: | - python --version - python -m pip install --upgrade pip setuptools wheel - python -m pip install cmake ninja - - - name: install Palamara python dependencies - run: | - python --version - python -m pip install asmc-asmc arg-needle-lib - - - name: install arg_needle - run: | - python --version - python -m pip install . 
- - - name: test import works as expected - run: | - python --version - python test/test_import.py diff --git a/.github/workflows/python_examples.yml b/.github/workflows/python_examples.yml new file mode 100644 index 0000000..12c0b42 --- /dev/null +++ b/.github/workflows/python_examples.yml @@ -0,0 +1,77 @@ +name: Python examples + +on: + push: + branches: + - main + pull_request: + branches: + - '**' + workflow_dispatch: + +jobs: + + build-and-test: + name: Python examples + runs-on: ${{ matrix.os }} + timeout-minutes: 10 + + strategy: + fail-fast: false + matrix: + include: + - os: ubuntu-24.04 + py: "3.10" + sys_install: "sudo apt -y update && sudo apt -y install libboost-iostreams-dev" + - os: macos-15 + py: "3.13" + sys_install: "brew install boost" + + steps: + - name: checkout repo & submodules + uses: actions/checkout@v5 + with: + submodules: true + fetch-depth: 0 + + - name: install deps + run: | + ${{ matrix.sys_install }} + + - name: Set up Python ${{ matrix.py }} + uses: actions/setup-python@v6 + with: + python-version: ${{ matrix.py }} + + - name: install python bindings + run: | + python -m pip install --upgrade pip setuptools wheel + python -m pip install . 
+ + - name: prepare example + run: | + prepare_example + + - name: regular ARG-Needle + run: | + infer_args + + - name: ARG-Needle with ASMC-clust + run: | + infer_args --asmc_clust 1 + + - name: ARG-Needle without ARG normalization + run: | + infer_args --normalize 0 + + - name: ARG-Needle (sequence mode) + run: | + prepare_example --mode sequence --length 5e5 + infer_args --mode sequence + + - name: ARG-Needle advanced mode (multistep) + run: | + prepare_example + infer_args_advanced --step 1 --num_snp_samples 200 + infer_args_advanced --step 2 + infer_args_advanced --step 3 --trim_num_snps 0,50 diff --git a/.github/workflows/python_tests.yml b/.github/workflows/python_tests.yml new file mode 100644 index 0000000..01668bb --- /dev/null +++ b/.github/workflows/python_tests.yml @@ -0,0 +1,53 @@ +name: Python tests + +on: + push: + branches: + - main + pull_request: + branches: + - '**' + workflow_dispatch: + +jobs: + + build-and-test: + name: Python tests + runs-on: ${{ matrix.os }} + timeout-minutes: 10 + + strategy: + fail-fast: false + matrix: + include: + - os: ubuntu-24.04 + py: "3.13" + sys_install: "sudo apt -y update && sudo apt -y install libboost-iostreams-dev" + - os: macos-15 + py: "3.10" + sys_install: "brew install boost" + + steps: + - name: checkout repo & submodules + uses: actions/checkout@v5 + with: + submodules: true + fetch-depth: 0 + + - name: install deps + run: | + ${{ matrix.sys_install }} + + - name: Set up Python ${{ matrix.py }} + uses: actions/setup-python@v6 + with: + python-version: ${{ matrix.py }} + + - name: install python bindings + run: | + python -m pip install --upgrade pip setuptools wheel + python -m pip install .[dev] + + - name: python unit tests + run: | + python -m pytest diff --git a/CMakeLists.txt b/CMakeLists.txt index b42203c..23651aa 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,6 +1,6 @@ # This file is part of the ARG-Needle genealogical inference and # analysis software suite. 
-# Copyright (C) 2023 ARG-Needle Developers. +# Copyright (C) 2023-2025 ARG-Needle Developers. # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by @@ -18,12 +18,17 @@ cmake_minimum_required(VERSION 3.16) message(STATUS "Using CMake version ${CMAKE_VERSION}") -project(arg_needle LANGUAGES CXX VERSION 1.0.3) +project(arg_needle LANGUAGES CXX VERSION 1.1.0) set(CMAKE_CXX_STANDARD 17) set(CMAKE_CXX_STANDARD_REQUIRED ON) set(CMAKE_CXX_EXTENSIONS OFF) +option(ARG_NEEDLE_PYTHON_BINDINGS "Whether to build the python bindings" OFF) +if (ARG_NEEDLE_PYTHON_BINDINGS) + set(CMAKE_POSITION_INDEPENDENT_CODE ON) +endif () + # Project settings including default build type include(cmake/ProjectSettings.cmake) @@ -44,26 +49,41 @@ option(BUILD_SHARED_LIBS "Enable compilation of shared libraries" OFF) set(arg_needle_testdata_dir ${CMAKE_CURRENT_SOURCE_DIR}/testdata) add_definitions(-DARG_NEEDLE_TESTDATA_DIR=\"${arg_needle_testdata_dir}\") -option(PYTHON_BINDINGS "Whether to build the python bindings" ON) -if (PYTHON_BINDINGS) - include(FetchContent) - FetchContent_Declare( - pybind11 - GIT_REPOSITORY https://github.com/pybind/pybind11 - GIT_TAG 0bd8896a4010f2d91b2340570c24fa08606ec406 # Version 2.10.3 - ) - FetchContent_GetProperties(pybind11) - if (NOT pybind11_POPULATED) - FetchContent_Populate(pybind11) - add_subdirectory(${pybind11_SOURCE_DIR} ${pybind11_BINARY_DIR}) - endif () + +# Python bindings +if (ARG_NEEDLE_PYTHON_BINDINGS) + option(ARG_NEEDLE_BUILDING_FROM_PYPROJECT "Are we building from pyproject.toml (pip install)?" 
OFF) + + if(ARG_NEEDLE_BUILDING_FROM_PYPROJECT) + message(STATUS "Using pybind11 from pyproject.toml build environment") + find_package(pybind11 REQUIRED) + else() + message(STATUS "Using FetchContent to get pybind11") + include(FetchContent) + FetchContent_Declare( + pybind11 + GIT_REPOSITORY https://github.com/pybind/pybind11.git + GIT_TAG f5fbe867d2d26e4a0a9177a51f6e568868ad3dc8 # Version 3.0.1 + ) + FetchContent_MakeAvailable(pybind11) + endif() endif () + add_subdirectory(src) -#add_subdirectory(example) -#option(ENABLE_TESTING "Enable Test Builds" ON) -#if(ENABLE_TESTING) -# enable_testing() -# add_subdirectory(test) -#endif() +option(ARG_NEEDLE_TESTING "Enable ARG Needle unit testing" ON) +if(ARG_NEEDLE_TESTING) + Include(FetchContent) + FetchContent_Declare( + Catch2 + GIT_REPOSITORY https://github.com/catchorg/Catch2.git + GIT_TAG b3fb4b9feafcd8d91c5cb510a4775143fdbef02f # Version 3.11.0 + ) + + FetchContent_MakeAvailable(Catch2) + list(APPEND CMAKE_MODULE_PATH ${catch2_SOURCE_DIR}/extras) + include(CTest) + include(Catch) + add_subdirectory(test/cpp) +endif() diff --git a/MANIFEST.in b/MANIFEST.in deleted file mode 100644 index 59dffca..0000000 --- a/MANIFEST.in +++ /dev/null @@ -1 +0,0 @@ -recursive-include src/resources * diff --git a/PyPI_README.md b/PyPI_README.md index a4266ef..a686b6b 100644 --- a/PyPI_README.md +++ b/PyPI_README.md @@ -1,6 +1,14 @@ # arg-needle This repository contains arg-needle, which implements the ARG inference algorithms ARG-Needle and ASMC-clust. +Prebuilt CPython wheels are available for Linux (compatible with glibc ≥ 2.28) and macOS (built on macOS 15 for x86_64 and macOS 14 for arm64). 
+ +| Platform \ CPython | ≤3.8 | 3.9 | 3.10 | 3.11 | 3.12 | 3.13 | 3.14 | +|-----------------------------|------|-----|------|------|------|------|------| +| Linux x86_64 | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | +| Linux aarch64 | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | +| macOS Intel (x86_64) | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | +| macOS Apple Silicon (arm64) | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ## Quickstart @@ -12,8 +20,6 @@ The Python module can be installed with: pip install arg-needle ``` -This Python module is currently available on Linux and macOS. - ### Documentation Please see the [ARG-Needle manual](https://palamaralab.github.io/software/argneedle/) for all usage instructions and documentation. diff --git a/README.md b/README.md index 8225c8a..46434d8 100644 --- a/README.md +++ b/README.md @@ -20,7 +20,7 @@ Please see the [ARG-Needle manual](https://palamaralab.github.io/software/argnee ## For developers: making a release -- Bump the version number in [setup.py](setup.py) and [CMakeLists.txt](CMakeLists.txt) +- Bump the version number in [pyproject.toml](pyproject.toml) and [CMakeLists.txt](CMakeLists.txt) - Update [RELEASE_NOTES.md](RELEASE_NOTES.md) - Push changes and check that all [GitHub workflows](https://github.com/PalamaraLab/arg_needle/actions) pass - Tag the commit in Git using syntax `vX.Y.Z` diff --git a/RELEASE_NOTES.md b/RELEASE_NOTES.md index 3d4f9a5..cfe98a9 100644 --- a/RELEASE_NOTES.md +++ b/RELEASE_NOTES.md @@ -1,5 +1,10 @@ # arg-needle Release Notes +## v1.1.0 (2025-12-18) + +- Public source code release: https://github.com/PalamaraLab/arg-needle +- Python wheels are now available for Linux and macOS on both x86_64 and arm64/AArch64 architectures, for CPython versions 3.9 to 3.14 inclusive. + ## v1.0.3 (2023-08-30) - ASMC decoders can now take a genetic map file with a specified non-default location. 
diff --git a/arg_needle b/arg_needle deleted file mode 120000 index e831038..0000000 --- a/arg_needle +++ /dev/null @@ -1 +0,0 @@ -src \ No newline at end of file diff --git a/cmake/CompilerWarnings.cmake b/cmake/CompilerWarnings.cmake index 53f2bce..16c37c7 100644 --- a/cmake/CompilerWarnings.cmake +++ b/cmake/CompilerWarnings.cmake @@ -1,6 +1,6 @@ # This file is part of the ARG-Needle genealogical inference and # analysis software suite. -# Copyright (C) 2023 ARG-Needle Developers. +# Copyright (C) 2023-2025 ARG-Needle Developers. # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by diff --git a/cmake/ProjectSettings.cmake b/cmake/ProjectSettings.cmake index b73d53e..ad5289b 100644 --- a/cmake/ProjectSettings.cmake +++ b/cmake/ProjectSettings.cmake @@ -1,6 +1,6 @@ # This file is part of the ARG-Needle genealogical inference and # analysis software suite. -# Copyright (C) 2023 ARG-Needle Developers. +# Copyright (C) 2023-2025 ARG-Needle Developers. # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by diff --git a/cmake/Sanitisers.cmake b/cmake/Sanitisers.cmake index 8334ae8..8fbf606 100644 --- a/cmake/Sanitisers.cmake +++ b/cmake/Sanitisers.cmake @@ -1,6 +1,6 @@ # This file is part of the ARG-Needle genealogical inference and # analysis software suite. -# Copyright (C) 2023 ARG-Needle Developers. +# Copyright (C) 2023-2025 ARG-Needle Developers. # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by diff --git a/cmake/StaticAnalysers.cmake b/cmake/StaticAnalysers.cmake index f8bb49f..260e6f0 100644 --- a/cmake/StaticAnalysers.cmake +++ b/cmake/StaticAnalysers.cmake @@ -1,6 +1,6 @@ # This file is part of the ARG-Needle genealogical inference and # analysis software suite. 
-# Copyright (C) 2023 ARG-Needle Developers. +# Copyright (C) 2023-2025 ARG-Needle Developers. # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..c104575 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,82 @@ +[build-system] +requires = [ + "scikit-build-core>=0.11.6", + "pybind11==3.0.1", + "setuptools" +] +build-backend = "scikit_build_core.build" + +[project] +dynamic = ["readme"] +name = "arg-needle" +version = "1.1.0" +description = "Ancestral recombination graph (ARG)" +authors = [ + { name = "ARG-Needle Developers" } +] +requires-python = ">=3.9" + +classifiers = [ + "Development Status :: 4 - Beta", + "License :: OSI Approved :: GNU General Public License v3 or later (GPLv3+)", + "Programming Language :: Python :: 3 :: Only", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", + "Programming Language :: Python :: 3.13", + "Programming Language :: Python :: 3.14", +] + +dependencies = [ + 'arg-needle-lib>=1.2.0', + 'asmc-asmc>=1.4.0', + 'fastcluster', + 'msprime>=1.3.0', + 'numpy>=1.17.0', + 'pandas', + 'psutil', + 'tskit>=1.0.0', +] + +[project.optional-dependencies] +dev = [ + "pytest", + "h5py", +] + +[project.scripts] +infer_args="arg_needle.scripts.infer_args:main" +infer_args_advanced="arg_needle.scripts.infer_args_advanced:main" +prepare_example="arg_needle.scripts.prepare_example:main" + +[tool.scikit-build] +minimum-version = "build-system.requires" +build.verbose = true +cmake.build-type = "Release" +build.targets = ["arg_needle_hashing_pybind"] +wheel.packages = ["src/arg_needle"] +metadata.readme.provider = "scikit_build_core.metadata.fancy_pypi_readme" + +[tool.scikit-build.cmake.define] +ARG_NEEDLE_TESTING = "OFF" +ARG_NEEDLE_PYTHON_BINDINGS = "ON" 
+ARG_NEEDLE_BUILDING_FROM_PYPROJECT = "ON" + +[tool.hatch.metadata.hooks.fancy-pypi-readme] +content-type = "text/markdown" + +[[tool.hatch.metadata.hooks.fancy-pypi-readme.fragments]] +path = "PyPI_README.md" + +[[tool.hatch.metadata.hooks.fancy-pypi-readme.fragments]] +path = "RELEASE_NOTES.md" + +[tool.setuptools.packages.find] +where = ["src"] + +[tool.setuptools.package-data] +"arg_needle" = ["resources/*"] + +[tool.pytest.ini_options] +testpaths = ["test"] diff --git a/setup-pre-commit.sh b/setup-pre-commit.sh deleted file mode 100755 index 95333fe..0000000 --- a/setup-pre-commit.sh +++ /dev/null @@ -1,29 +0,0 @@ -# Set up pre-commit hooks, in this case just clang-format checking -# -# Note: this overwrites existing .git/hooks/pre-commit -# -# See .clang-format for configuration file -# Code modified from https://github.com/KDAB/kdabtv/tree/master/Qt-Widgets-and-more/clang-format -# Based on this tutorial: https://www.youtube.com/watch?v=Cz36YveDI2E - -echo "#!/bin/sh - -python .git/hooks/pre-commit-clang-format.py" > .git/hooks/pre-commit - - -echo "import subprocess -try: - output = str(subprocess.check_output([\"git\", \"clang-format\", \"--diff\"])) -except subprocess.CalledProcessError as e: - print(e) - print(\"Error raised, try installing clang-format.\\n\") - exit(1) - -if \"clang-format did not modify any files\" not in output and \"no modified files to format\" not in output: - print(\"Run git clang-format, add the modified files, then commit.\\n\") - exit(1) -else: - exit(0)" > .git/hooks/pre-commit-clang-format.py - - -chmod +x .git/hooks/pre-commit .git/hooks/pre-commit-clang-format.py diff --git a/setup.py b/setup.py deleted file mode 100644 index 1ba9ccb..0000000 --- a/setup.py +++ /dev/null @@ -1,165 +0,0 @@ -# This file is part of the ARG-Needle genealogical inference and -# analysis software suite. -# Copyright (C) 2023 ARG-Needle Developers. 
- -# This program is free software: you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation, either version 3 of the License, or -# (at your option) any later version. - -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. - -# You should have received a copy of the GNU General Public License -# along with this program. If not, see . - -# Based on https://github.com/pybind/cmake_example - -import os -import sys -import subprocess - -from setuptools import setup, Extension -from setuptools.command.build_ext import build_ext - -# Convert distutils Windows platform specifiers to CMake -A arguments -PLAT_TO_CMAKE = { - "win32": "Win32", - "win-amd64": "x64", - "win-arm32": "ARM", - "win-arm64": "ARM64", -} - - -# A CMakeExtension needs a sourcedir instead of a file list. -# The name must be the _single_ output extension from the CMake build. -# If you need multiple extensions, see scikit-build. -class CMakeExtension(Extension): - def __init__(self, name, sourcedir=""): - Extension.__init__(self, name, sources=[]) - self.sourcedir = os.path.abspath(sourcedir) - - -class CMakeBuild(build_ext): - - def build_extension(self, ext): - extdir = os.path.abspath(os.path.dirname(self.get_ext_fullpath(ext.name))) - - # required for auto-detection of auxiliary "native" libs - if not extdir.endswith(os.path.sep): - extdir += os.path.sep - - cfg = "Debug" if self.debug else "Release" - - # CMake lets you override the generator - we need to check this. - # Can be set with Conda-Build, for example. - cmake_generator = os.environ.get("CMAKE_GENERATOR", "") - - # Set Python_EXECUTABLE instead if you use PYBIND11_FINDPYTHON - # EXAMPLE_VERSION_INFO shows you how to pass a value into the C++ code - # from Python. 
- cmake_args = [ - f"-DCMAKE_LIBRARY_OUTPUT_DIRECTORY={extdir}", - f"-DPYTHON_EXECUTABLE={sys.executable}", - f"-DCMAKE_BUILD_TYPE={cfg}", - f"-DWARNINGS_AS_ERRORS=OFF", - f"-DENABLE_TESTING=OFF", - f"-DMAKE_DOCS=OFF", - ] - build_args = [] - - if self.compiler.compiler_type != "msvc": - # Using Ninja-build since it a) is available as a wheel and b) - # multithreads automatically. MSVC would require all variables be - # exported for Ninja to pick it up, which is a little tricky to do. - # Users can override the generator with CMAKE_GENERATOR in CMake - # 3.15+. - if not cmake_generator: - cmake_args += ["-GNinja"] - - else: - - # Single config generators are handled "normally" - single_config = any(x in cmake_generator for x in {"NMake", "Ninja"}) - - # CMake allows an arch-in-generator style for backward compatibility - contains_arch = any(x in cmake_generator for x in {"ARM", "Win64"}) - - # Specify the arch if using MSVC generator, but only if it doesn't - # contain a backward-compatibility arch spec already in the - # generator name. - if not single_config and not contains_arch: - cmake_args += ["-A", PLAT_TO_CMAKE[self.plat_name]] - - # Multi-config generators have a different way to specify configs - if not single_config: - cmake_args += [ - "-DCMAKE_LIBRARY_OUTPUT_DIRECTORY_{}={}".format(cfg.upper(), extdir) - ] - build_args += ["--config", cfg] - - # Set CMAKE_BUILD_PARALLEL_LEVEL to control the parallel build level - # across all generators. - if "CMAKE_BUILD_PARALLEL_LEVEL" not in os.environ: - # self.parallel is a Python 3 only way to set parallel jobs by hand - # using -j in the build_ext call, not supported by pip or PyPA-build. - if hasattr(self, "parallel") and self.parallel: - # CMake 3.12+ only. 
- build_args += ["-j{}".format(self.parallel)] - - if not os.path.exists(self.build_temp): - os.makedirs(self.build_temp) - - subprocess.check_call( - ["cmake", ext.sourcedir] + cmake_args, cwd=self.build_temp - ) - subprocess.check_call( - ["cmake", "--build", "."] + build_args, cwd=self.build_temp - ) - - -with open('PyPI_README.md', encoding='utf-8') as f: - long_description = f.read() - -with open('RELEASE_NOTES.md', encoding='utf-8') as f: - release_notes = f.read() - -setup( - name='arg-needle', - version='1.0.3', - author='PalamaraLab (https://palamaralab.github.io/)', - url='https://github.com/PalamaraLab/arg_needle/', - install_requires=[ - 'arg-needle-lib>=1.0.0', - 'asmc-asmc>=1.3.1', - 'fastcluster', - 'msprime>=1.0.0', - 'numpy>=1.17.0', - 'pandas', - 'psutil', - 'tskit>=0.1.5', - ], - extras_require={ - 'dev': [ - 'pytest', - ], - }, - description='Ancestral recombination graph (ARG)', - packages=['arg_needle', 'arg_needle.scripts'], - long_description='\n'.join([long_description, release_notes]), - long_description_content_type='text/markdown', - ext_modules=[CMakeExtension('arg_needle')], - cmdclass=dict(build_ext=CMakeBuild), - entry_points = { - 'console_scripts': [ - 'arg_needle=arg_needle.scripts.infer_args:main', - 'arg_needle_multistep=arg_needle.scripts.infer_args_advanced:main', - 'arg_needle_prepare_example=arg_needle.scripts.prepare_example:main', - ], - }, - include_package_data=True, - package_data={'': ['resources/*']}, - zip_safe=False, -) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 5dc1c6c..7482dad 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -1,6 +1,6 @@ # This file is part of the ARG-Needle genealogical inference and # analysis software suite. -# Copyright (C) 2023 ARG-Needle Developers. +# Copyright (C) 2023-2025 ARG-Needle Developers. 
# This program is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by @@ -43,8 +43,8 @@ set_target_properties(arg_needle_hashing PROPERTIES PUBLIC_HEADER "${arg_needle_ target_link_libraries(arg_needle_hashing PRIVATE Boost::headers Boost::iostreams) target_link_libraries(arg_needle_hashing PRIVATE project_warnings) -if (PYTHON_BINDINGS) - set_target_properties(arg_needle_hashing PROPERTIES POSITION_INDEPENDENT_CODE TRUE) +if (ARG_NEEDLE_PYTHON_BINDINGS) pybind11_add_module(arg_needle_hashing_pybind hashing/pybind.cpp) target_link_libraries(arg_needle_hashing_pybind PRIVATE arg_needle_hashing) + install(TARGETS arg_needle_hashing_pybind LIBRARY DESTINATION arg_needle) endif () diff --git a/src/__init__.py b/src/arg_needle/__init__.py similarity index 95% rename from src/__init__.py rename to src/arg_needle/__init__.py index d857496..258d285 100644 --- a/src/__init__.py +++ b/src/arg_needle/__init__.py @@ -1,6 +1,6 @@ # This file is part of the ARG-Needle genealogical inference and # analysis software suite. -# Copyright (C) 2023 ARG-Needle Developers. +# Copyright (C) 2023-2025 ARG-Needle Developers. # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by diff --git a/src/decoders.py b/src/arg_needle/decoders.py similarity index 98% rename from src/decoders.py rename to src/arg_needle/decoders.py index 7c13680..3c1336e 100644 --- a/src/decoders.py +++ b/src/arg_needle/decoders.py @@ -1,6 +1,6 @@ # This file is part of the ARG-Needle genealogical inference and # analysis software suite. -# Copyright (C) 2023 ARG-Needle Developers. +# Copyright (C) 2023-2025 ARG-Needle Developers. 
# This program is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by @@ -30,7 +30,7 @@ # our packages from asmc.asmc import DecodingParams, ASMC -import arg_needle_hashing_pybind as arg_needle_hashing +from .arg_needle_hashing_pybind import HapData from .utils import btime logging.basicConfig( @@ -329,7 +329,7 @@ def make_asmc_decoder( if use_hashing: if verbose: logging.info("Making HapData object") - hasher = arg_needle_hashing.HapData( + hasher = HapData( mode, haps_file_root, hash_word_size, mapfile, fill_sites=False) logging.info("Hashing data is {} by {}".format(hasher.num_haps, hasher.num_sites)) @@ -337,7 +337,7 @@ def make_asmc_decoder( if use_hashing and backup_hash_word_size > 0: if verbose: logging.info("Making backup HapData object") - backup_hasher = arg_needle_hashing.HapData( + backup_hasher = HapData( mode, haps_file_root, backup_hash_word_size, map_file_path=mapfile, fill_sites=False) logging.info("Backup hashing data is {} by {}".format(hasher.num_haps, hasher.num_sites)) diff --git a/src/inference.py b/src/arg_needle/inference.py similarity index 99% rename from src/inference.py rename to src/arg_needle/inference.py index ae2f7b5..abb56fe 100644 --- a/src/inference.py +++ b/src/arg_needle/inference.py @@ -1,6 +1,6 @@ # This file is part of the ARG-Needle genealogical inference and # analysis software suite. -# Copyright (C) 2023 ARG-Needle Developers. +# Copyright (C) 2023-2025 ARG-Needle Developers. 
# This program is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by diff --git a/src/resources/30-100-2000_CEU.decodingQuantities.gz b/src/arg_needle/resources/30-100-2000_CEU.decodingQuantities.gz similarity index 100% rename from src/resources/30-100-2000_CEU.decodingQuantities.gz rename to src/arg_needle/resources/30-100-2000_CEU.decodingQuantities.gz diff --git a/src/resources/CEU.demo b/src/arg_needle/resources/CEU.demo similarity index 100% rename from src/resources/CEU.demo rename to src/arg_needle/resources/CEU.demo diff --git a/src/resources/ukb_chr2_spectrum.tsv b/src/arg_needle/resources/ukb_chr2_spectrum.tsv similarity index 100% rename from src/resources/ukb_chr2_spectrum.tsv rename to src/arg_needle/resources/ukb_chr2_spectrum.tsv diff --git a/src/scripts/README.md b/src/arg_needle/scripts/README.md similarity index 100% rename from src/scripts/README.md rename to src/arg_needle/scripts/README.md diff --git a/src/scripts/__init__.py b/src/arg_needle/scripts/__init__.py similarity index 93% rename from src/scripts/__init__.py rename to src/arg_needle/scripts/__init__.py index 4783f5a..638e42b 100644 --- a/src/scripts/__init__.py +++ b/src/arg_needle/scripts/__init__.py @@ -1,6 +1,6 @@ # This file is part of the ARG-Needle genealogical inference and # analysis software suite. -# Copyright (C) 2023 ARG-Needle Developers. +# Copyright (C) 2023-2025 ARG-Needle Developers. 
# This program is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by diff --git a/src/scripts/infer_args.py b/src/arg_needle/scripts/infer_args.py similarity index 98% rename from src/scripts/infer_args.py rename to src/arg_needle/scripts/infer_args.py index 10908b8..6b61f0a 100644 --- a/src/scripts/infer_args.py +++ b/src/arg_needle/scripts/infer_args.py @@ -1,6 +1,6 @@ # This file is part of the ARG-Needle genealogical inference and # analysis software suite. -# Copyright (C) 2023 ARG-Needle Developers. +# Copyright (C) 2023-2025 ARG-Needle Developers. # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by diff --git a/src/scripts/infer_args_advanced.py b/src/arg_needle/scripts/infer_args_advanced.py similarity index 99% rename from src/scripts/infer_args_advanced.py rename to src/arg_needle/scripts/infer_args_advanced.py index 6cb035b..5d73f20 100644 --- a/src/scripts/infer_args_advanced.py +++ b/src/arg_needle/scripts/infer_args_advanced.py @@ -1,6 +1,6 @@ # This file is part of the ARG-Needle genealogical inference and # analysis software suite. -# Copyright (C) 2023 ARG-Needle Developers. +# Copyright (C) 2023-2025 ARG-Needle Developers. # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by diff --git a/src/scripts/prepare_example.py b/src/arg_needle/scripts/prepare_example.py similarity index 99% rename from src/scripts/prepare_example.py rename to src/arg_needle/scripts/prepare_example.py index bc02370..9a0ec90 100644 --- a/src/scripts/prepare_example.py +++ b/src/arg_needle/scripts/prepare_example.py @@ -1,6 +1,6 @@ # This file is part of the ARG-Needle genealogical inference and # analysis software suite. -# Copyright (C) 2023 ARG-Needle Developers. +# Copyright (C) 2023-2025 ARG-Needle Developers. 
# This program is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by diff --git a/src/simulator.py b/src/arg_needle/simulator.py similarity index 98% rename from src/simulator.py rename to src/arg_needle/simulator.py index 5c5f47f..6ff9a38 100644 --- a/src/simulator.py +++ b/src/arg_needle/simulator.py @@ -1,6 +1,6 @@ # This file is part of the ARG-Needle genealogical inference and # analysis software suite. -# Copyright (C) 2023 ARG-Needle Developers. +# Copyright (C) 2023-2025 ARG-Needle Developers. # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by diff --git a/src/utils.py b/src/arg_needle/utils.py similarity index 97% rename from src/utils.py rename to src/arg_needle/utils.py index 102728c..30bd384 100644 --- a/src/utils.py +++ b/src/arg_needle/utils.py @@ -1,6 +1,6 @@ # This file is part of the ARG-Needle genealogical inference and # analysis software suite. -# Copyright (C) 2023 ARG-Needle Developers. +# Copyright (C) 2023-2025 ARG-Needle Developers. # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by diff --git a/src/hashing/FileUtils.cpp b/src/hashing/FileUtils.cpp index 0699994..aac8028 100644 --- a/src/hashing/FileUtils.cpp +++ b/src/hashing/FileUtils.cpp @@ -1,7 +1,7 @@ /* This file is part of the ARG-Needle genealogical inference and analysis software suite. - Copyright (C) 2023 ARG-Needle Developers. + Copyright (C) 2023-2025 ARG-Needle Developers. This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -25,63 +25,73 @@ // The license file can be found at 3rd_party/Eagle/COPYING from the // root of this repository. 
-#include -#include -#include -#include -#include -#include - #include "FileUtils.hpp" #include #include -namespace FileUtils { +#include +#include +#include +#include -using std::cerr; -using std::endl; -using std::string; -using std::vector; - -bool fileExists(const std::string& name) { - std::ifstream f(name.c_str()); - return f.good(); -} - -int AutoGzIfstream::lineCount(const std::string& file) { - AutoGzIfstream fin; - fin.openOrExit(file); - int ctr = 0; - string line; - while (getline(fin, line)) - ctr++; - return ctr; -} - -void AutoGzIfstream::openOrExit(const std::string& file, std::ios_base::openmode mode) { - fin.open(file.c_str(), mode); - if (!fin) { - cerr << "ERROR: Unable to open file: " << file << endl; - exit(1); - } - if ((int) file.length() > 3 && file.substr(file.length() - 3) == ".gz") - boost_in.push(boost::iostreams::gzip_decompressor()); - boost_in.push(fin); -} - -void AutoGzIfstream::close() { - fin.close(); - boost_in.reset(); -} - -AutoGzIfstream::operator bool() const { - return !boost_in.fail(); -} - -AutoGzIfstream& getline(AutoGzIfstream& in, std::string& s) { - std::getline(in.boost_in, s); - return in; -} +namespace FileUtils { + struct AutoGzIfstream::Impl { + boost::iostreams::filtering_istream boost_in; + std::ifstream fin; + }; + + AutoGzIfstream::AutoGzIfstream() : pimpl(std::make_unique()) { + } + + AutoGzIfstream::~AutoGzIfstream() noexcept = default; + + bool fileExists(const std::filesystem::path &file) { + std::ifstream f(file.c_str()); + return f.good(); + } + + int AutoGzIfstream::lineCount(const std::filesystem::path &file) { + AutoGzIfstream fin; + fin.openOrExit(file); + int ctr = 0; + std::string line; + while (getline(fin, line)) { + ctr++; + } + fin.close(); + return ctr; + } + + void AutoGzIfstream::openOrExit(const std::filesystem::path &file, std::ios_base::openmode mode) { + pimpl->fin.open(file.c_str(), mode); + if (!pimpl->fin) { + std::cerr << "ERROR: Unable to open file: " << file << std::endl; + 
exit(1); + } + if (file.extension() == ".gz") { + pimpl->boost_in.push(boost::iostreams::gzip_decompressor()); + } + pimpl->boost_in.push(pimpl->fin); + } + + void AutoGzIfstream::close() { + pimpl->fin.close(); + pimpl->boost_in.reset(); + } + + AutoGzIfstream::operator bool() const noexcept { + return !pimpl->boost_in.fail(); + } + + AutoGzIfstream &getline(AutoGzIfstream &in, std::string &s) { + std::getline(in.pimpl->boost_in, s); + return in; + } + + AutoGzIfstream &AutoGzIfstream::operator>>(std::string &x) { + pimpl->boost_in >> x; + return *this; + } } // namespace FileUtils diff --git a/src/hashing/FileUtils.hpp b/src/hashing/FileUtils.hpp index 3f1d262..de1f004 100644 --- a/src/hashing/FileUtils.hpp +++ b/src/hashing/FileUtils.hpp @@ -1,7 +1,7 @@ /* This file is part of the ARG-Needle genealogical inference and analysis software suite. - Copyright (C) 2023 ARG-Needle Developers. + Copyright (C) 2023-2025 ARG-Needle Developers. This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -25,39 +25,108 @@ // The license file can be found at 3rd_party/Eagle/COPYING from the // root of this repository. 
-#ifndef FILEUTILS_HPP -#define FILEUTILS_HPP +#ifndef ARG_NEEDLE_FILE_UTILS_HPP +#define ARG_NEEDLE_FILE_UTILS_HPP -#include +#include +#include #include -#include - -#include namespace FileUtils { - -bool fileExists(const std::string& name); - -class AutoGzIfstream { - boost::iostreams::filtering_istream boost_in; - std::ifstream fin; - -public: - static int lineCount(const std::string& file); - - void openOrExit(const std::string& file, std::ios_base::openmode mode = std::ios::in); - void close(); - template AutoGzIfstream& operator>>(T& x) { - boost_in >> x; - return *this; - } - - operator bool() const; - friend AutoGzIfstream& getline(AutoGzIfstream& in, std::string& s); -}; - -AutoGzIfstream& getline(AutoGzIfstream& in, std::string& s); - + /** + * @brief Check whether a given file exists on disk. + * + * @param file Path to the file to check. + * @return true if the file exists, false otherwise. + */ + bool fileExists(const std::filesystem::path &file); + + /** + * @class AutoGzIfstream + * @brief Stream wrapper that transparently reads either plain-text or gzip-compressed files. + * + * AutoGzIfstream detects whether an input file is compressed (.gz) and automatically + * opens it appropriately. It behaves similarly to std::ifstream but supports reading + * gzip-compressed streams without requiring explicit decompression by the caller. + * + * Internally uses a pimpl to hide implementation details and avoid exposing boost + * libraries at the interface level. + */ + class AutoGzIfstream { + struct Impl; + std::unique_ptr pimpl; + + public: + /** + * @brief Construct an unopened AutoGzIfstream. + */ + AutoGzIfstream(); + + /** + * @brief Destructor closes the stream if open and releases internal resources. + */ + ~AutoGzIfstream() noexcept; + + /** + * @brief Count the number of lines in a file (supports gzipped and plain files). + * + * @param file Path to the file whose line count will be computed. + * @return Number of lines in the file. 
+ */ + [[nodiscard]] static int lineCount(const std::filesystem::path &file); + + /** + * @brief Open a file for reading or exit the program if opening fails. + * + * Automatically detects gzip compression based on file contents. + * + * @param file Path to the file to open. + * @param mode Stream opening mode (defaults to std::ios::in). + */ + void openOrExit(const std::filesystem::path &file, + std::ios_base::openmode mode = std::ios::in); + + /** + * @brief Close the underlying stream. + */ + void close(); + + /** + * @brief Read whitespace-delimited input into a string via the extraction operator. + * + * @param x Output string that will receive the parsed token. + * @return Reference to this stream. + */ + AutoGzIfstream &operator>>(std::string &x); + + /** + * @brief Boolean conversion indicating whether the stream is currently valid. + * + * Allows usage in conditions such as: + * @code + * if (stream) { ... } + * @endcode + * + * @return true if the stream is open and in a good state, false otherwise. + */ + [[nodiscard]] explicit operator bool() const noexcept; + + /** + * @brief Friend declaration enabling getline(AutoGzIfstream&, ...). + */ + friend AutoGzIfstream &getline(AutoGzIfstream &in, std::string &s); + }; + + /** + * @brief Read a full line from an AutoGzIfstream into a string. + * + * Supports both compressed and uncompressed input sources. + * + * @param in Stream to read from. + * @param s Output string receiving the line (without delimiter). + * @return Reference to the stream. + */ + AutoGzIfstream &getline(AutoGzIfstream &in, std::string &s); } // namespace FileUtils -#endif +#endif // ARG_NEEDLE_FILE_UTILS_HPP diff --git a/src/hashing/HapData.cpp b/src/hashing/HapData.cpp index 4be4e90..ceb7a07 100644 --- a/src/hashing/HapData.cpp +++ b/src/hashing/HapData.cpp @@ -1,7 +1,7 @@ /* This file is part of the ARG-Needle genealogical inference and analysis software suite. - Copyright (C) 2023 ARG-Needle Developers. 
+ Copyright (C) 2023-2025 ARG-Needle Developers. This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -33,18 +33,8 @@ #include "HapData.hpp" #include "utils.hpp" -using std::cerr; -using std::cout; -using std::deque; -using std::endl; -using std::ostream; -using std::pair; -using std::string; -using std::tuple; -using std::unordered_map; -using std::unordered_set; - -HapData::HapData(string mode, string file_root_path, unsigned int _word_size, string map_file_path, + +HapData::HapData(std::string mode, std::string file_root_path, unsigned int _word_size, std::string map_file_path, bool fill_sites) : word_size(_word_size) { if (mode == "sequence") { @@ -54,21 +44,21 @@ HapData::HapData(string mode, string file_root_path, unsigned int _word_size, st data_mode = HapDataMode::array; } else { - throw std::logic_error(THROW_LINE("Mode not recognized.")); + throw std::logic_error(MAKE_ERROR("Mode not recognized.")); } if (sizeof(word_type) != 8) { - throw std::logic_error(THROW_LINE("Expected word_type to be 8 bytes (64 bits).")); + throw std::logic_error(MAKE_ERROR("Expected word_type to be 8 bytes (64 bits).")); } if (sizeof(1ull) < 8) { throw std::logic_error( - THROW_LINE("Expected unsigned long long to be at least 8 bytes (64 bits).")); + MAKE_ERROR("Expected unsigned long long to be at least 8 bytes (64 bits).")); } if (word_size > 64 || word_size <= 0) { - throw std::logic_error(THROW_LINE("Out of bounds word size.")); + throw std::logic_error(MAKE_ERROR("Out of bounds word size.")); } - string line; + std::string line; std::stringstream ss; // read in .sample[s] file @@ -80,14 +70,14 @@ HapData::HapData(string mode, string file_root_path, unsigned int _word_size, st file_samples.openOrExit(file_root_path + ".sample"); } else { - cerr << "ERROR. Could not find sample file in " + file_root_path + ".sample[s]" << endl; + std::cerr << "ERROR. 
Could not find sample file in " + file_root_path + ".sample[s]" << std::endl; exit(1); } while (getline(file_samples, line)) { - vector splitStr; + std::vector splitStr; std::istringstream iss(line); - string buf; + std::string buf; while (iss >> buf) splitStr.push_back(buf); @@ -105,14 +95,13 @@ HapData::HapData(string mode, string file_root_path, unsigned int _word_size, st // Parse .map[.gz] file FileUtils::AutoGzIfstream file_map; - if (map_file_path != "") { + if (!map_file_path.empty()) { // Attempt to read in .map[.gz] file if (FileUtils::fileExists(map_file_path)) { file_map.openOrExit(map_file_path); - // cout << "Using genetic map " << map_file_path << endl; } else { - cerr << "ERROR. Could not open map file " + map_file_path + ", no such file" << endl; + std::cerr << "ERROR. Could not open map file " + map_file_path + ", no such file" << std::endl; exit(1); } } @@ -120,20 +109,18 @@ HapData::HapData(string mode, string file_root_path, unsigned int _word_size, st // If no map file is specified, default to file_root_path.map[.gz] if (FileUtils::fileExists(file_root_path + ".map.gz")) { file_map.openOrExit(file_root_path + ".map.gz"); - // cout << "Using genetic map " << file_root_path << ".map.gz" << endl; } else if (FileUtils::fileExists(file_root_path + ".map")) { file_map.openOrExit(file_root_path + ".map"); - // cout << "Using genetic map " << file_root_path << ".map" << endl; } else { - cerr << "ERROR. Could not find map file in " + file_root_path + ".map.gz or " + + std::cerr << "ERROR. Could not find map file in " + file_root_path + ".map.gz or " + file_root_path + ".map" - << endl; + << std::endl; exit(1); } } - string map_field[4]; + std::string map_field[4]; while (getline(file_map, line)) { ss.clear(); ss.str(line); @@ -159,27 +146,28 @@ HapData::HapData(string mode, string file_root_path, unsigned int _word_size, st file_hap.openOrExit(file_root_path + ".haps"); } else { - cerr << "ERROR. 
Could not find hap file in " + file_root_path + ".hap.gz, " + file_root_path + + std::cerr << "ERROR. Could not find hap file in " + file_root_path + ".hap.gz, " + file_root_path + ".hap, " + ".haps.gz, or " + file_root_path + ".haps" - << endl; + << std::endl; exit(1); } if (fill_sites) { - sites = vector>(num_haps, vector()); + sites = std::vector>(num_haps, std::vector()); } - words = vector>(num_haps, vector()); - string marker_id; + words = std::vector>(num_haps, std::vector()); + std::string marker_id; unsigned long int marker_pos; char al[2], inp; - int site_id = 0; + unsigned int site_id = 0u; while (getline(file_hap, line)) { // read the meta data ss.clear(); ss.str(line); ss >> map_field[0] >> marker_id >> marker_pos >> al[0] >> al[1]; - if (map_field[0] == "") + if (map_field[0].empty()) { continue; + } if (site_id % word_size == 0) { for (size_t hap_id = 0; hap_id < num_haps; ++hap_id) { @@ -212,9 +200,9 @@ HapData::HapData(string mode, string file_root_path, unsigned int _word_size, st } } } - float maf = (float) maf_ctr / num_haps; - if (maf > 0.5) { - maf = 1 - maf; + float maf = static_cast(maf_ctr) / static_cast(num_haps); + if (maf > 0.5f) { + maf = 1.f - maf; } site_mafs.push_back(maf); ++site_id; @@ -222,27 +210,21 @@ HapData::HapData(string mode, string file_root_path, unsigned int _word_size, st file_hap.close(); } -HapData::~HapData() { -#ifdef _DEBUG - cout << "Deleting: " << *this << endl; -#endif // _DEBUG -} - void HapData::add_to_hash(size_t hap_id) { if (hashed_hap_ids.find(hap_id) != hashed_hap_ids.end()) { - throw std::logic_error(THROW_LINE("This haplotype has already been hashed.")); + throw std::logic_error(MAKE_ERROR("This haplotype has already been hashed.")); } if (hap_id >= num_haps) { - throw std::logic_error(THROW_LINE("Haplotype ID out of bounds.")); + throw std::logic_error(MAKE_ERROR("Haplotype ID out of bounds.")); } if (hashes.empty()) { - hashes = vector>>( - words[hap_id].size(), unordered_map>()); + hashes = 
std::vector>>( + words[hap_id].size(), std::unordered_map>()); } for (size_t i = 0; i < words[hap_id].size(); ++i) { - vector& hash_value = + std::vector& hash_value = hashes[i][words[hap_id][i]]; // creates if not present, only hashes once hash_value.push_back(hap_id); } @@ -252,23 +234,23 @@ void HapData::add_to_hash(size_t hap_id) { void HapData::print_hap(size_t hap_id) { if (hap_id >= num_haps) { - throw std::logic_error(THROW_LINE("Haplotype ID out of bounds.")); + throw std::logic_error(MAKE_ERROR("Haplotype ID out of bounds.")); } - cout << "Bits for hap_id = " << hap_id << endl; + std::cout << "Bits for hap_id = " << hap_id << std::endl; for (size_t site_id = 0; site_id < num_sites; ++site_id) { - cout << sites[hap_id][site_id]; + std::cout << sites[hap_id][site_id]; if (site_id % word_size == word_size - 1) { - cout << " "; + std::cout << " "; } } - cout << endl; + std::cout << std::endl; - cout << "Words (hex) for hap_id = " << hap_id << endl; + std::cout << "Words (hex) for hap_id = " << hap_id << std::endl; std::cout << std::hex << std::showbase; for (auto const& word : words[hap_id]) { - cout << word << " "; + std::cout << word << " "; } - cout << endl; + std::cout << std::endl; std::cout << std::dec << std::noshowbase; // cout << "Words (decimal)" << endl; @@ -280,62 +262,62 @@ void HapData::print_hap(size_t hap_id) { void HapData::print_hashes() { for (size_t i = 0; i < hashes.size(); ++i) { - cout << "Hash for word " << i << " of " << hashes.size() << endl; + std::cout << "Hash for word " << i << " of " << hashes.size() << std::endl; for (auto const& map_entry : hashes[i]) { - unsigned int num_bits = word_size; + unsigned long num_bits = word_size; if (i == hashes.size() - 1) { - num_bits = ((num_sites - 1) % word_size) + 1; + num_bits = ((num_sites - 1ul) % word_size) + 1ul; } for (size_t j = 0; j < num_bits; ++j) { - cout << ((map_entry.first >> j) & 1); + std::cout << ((map_entry.first >> j) & 1); } - cout << ":"; + std::cout << ":"; for (const 
size_t id : map_entry.second) { - cout << " " << id; + std::cout << " " << id; } - cout << endl; + std::cout << std::endl; } - cout << endl; + std::cout << std::endl; } } void HapData::print_word_match_diagram(size_t hap_id1, size_t hap_id2) { if (hap_id1 >= num_haps || hap_id2 >= num_haps) { - throw std::logic_error(THROW_LINE("Haplotype ID out of bounds.")); + throw std::logic_error(MAKE_ERROR("Haplotype ID out of bounds.")); } for (size_t i = 0; i < words[hap_id1].size(); ++i) { if (i != 0) { if (i % 100 == 0) { - cout << endl; + std::cout << std::endl; } if (i % 25 == 0) { - cout << endl; + std::cout << std::endl; } else if (i % 5 == 0) { - cout << " "; + std::cout << " "; } } if (words[hap_id1][i] == words[hap_id2][i]) { - cout << "x"; + std::cout << "x"; } else { - cout << "_"; + std::cout << "_"; } } - cout << endl; + std::cout << std::endl; } -vector>>> +std::vector>>> HapData::get_closest_cousins(size_t hap_id, unsigned int k, unsigned int tolerance, double window_size_genetic) { // find the windows - vector windows; // Window defined in HapData.hpp + std::vector windows; // Window defined in HapData.hpp size_t num_words = words[hap_id].size(); if (window_size_genetic <= 0) { // make a new window for each and every word for (size_t j = 0; j < num_words; ++j) { - Window w; + Window w{}; w.start = j; w.end = j + 1; w.index = j; @@ -353,7 +335,7 @@ HapData::get_closest_cousins(size_t hap_id, unsigned int k, unsigned int toleran (genetic_positions[last_word_site] - start_genetic >= window_size_genetic && genetic_positions[num_sites - 1] - genetic_positions[last_word_site + 1] >= window_size_genetic)) { - Window w; + Window w{}; w.start = start_word; w.end = j + 1; w.index = window_index; @@ -367,7 +349,7 @@ HapData::get_closest_cousins(size_t hap_id, unsigned int k, unsigned int toleran } } - vector words_to_windows; + std::vector words_to_windows; for (size_t i = 0; i < windows.size(); ++i) { Window w = windows[i]; for (size_t j = w.start; j < w.end; ++j) { 
@@ -377,25 +359,24 @@ HapData::get_closest_cousins(size_t hap_id, unsigned int k, unsigned int toleran // how high each sample scores in each window // we only record samples that have matched - vector> window_scores( - windows.size(), unordered_map()); + std::vector> window_scores( + windows.size(), std::unordered_map()); // stretches of matching material separated by 2*k + 1 fillers, where k is the number // of mismatches, max size defined by 2*tolerance + 1 - vector>> stretches(hap_id, deque>()); + std::vector>> stretches(hap_id, std::deque>()); // size_t num_overall_matches = 0; for (size_t i = 0; i < num_words; ++i) { // in some cases, the word does not yet exist in the hashmap if (hashes[i].find(words[hap_id][i]) != hashes[i].end()) { - const vector& hash_value = hashes[i].find(words[hap_id][i])->second; - // num_overall_matches += hash_value.size(); - for (auto v : hash_value) { + const std::vector& matches = hashes[i].find(words[hap_id][i])->second; + for (auto v : matches) { // check the end of stretches to figure out what to do - if (stretches[v].size() == 0) { + if (stretches[v].empty()) { stretches[v].emplace_back(i, i + 1); // end is exclusive } else { - pair& back_pair = stretches[v].back(); + std::pair& back_pair = stretches[v].back(); if (back_pair.second == i) { back_pair.second = i + 1; // end is exclusive } @@ -419,7 +400,7 @@ HapData::get_closest_cousins(size_t hap_id, unsigned int k, unsigned int toleran // pop_front to get to size 2*tolerance + 1 while (stretches[v].size() > 2 * tolerance + 1) { - pair& item = stretches[v].front(); + std::pair& item = stretches[v].front(); if (item.second != 0) { size_t range_start = item.first; // old version was buggy @@ -439,10 +420,10 @@ HapData::get_closest_cousins(size_t hap_id, unsigned int k, unsigned int toleran // if our range is [6, 16), we want [5, 10) to [15, 20) inclusive for (size_t window_index = words_to_windows[range_start]; window_index <= words_to_windows[range_end - 1]; ++window_index) { - 
size_t& hash_value = + size_t& best_len = window_scores[window_index][v]; // creates if not present, only hashes once - if (range_size > hash_value) { - hash_value = range_size; + if (range_size > best_len) { + best_len = range_size; } } } @@ -454,8 +435,8 @@ HapData::get_closest_cousins(size_t hap_id, unsigned int k, unsigned int toleran // go over all the stretches and pop_front for (size_t v = 0; v < hap_id; ++v) { - while (stretches[v].size() > 0) { - pair item = stretches[v].front(); + while (!stretches[v].empty()) { + std::pair item = stretches[v].front(); if (item.second != 0) { size_t range_start = item.first; // old version is buggy in general, but should work in this case @@ -487,28 +468,27 @@ HapData::get_closest_cousins(size_t hap_id, unsigned int k, unsigned int toleran } // take the values in window_scores and sort to find top k - vector>>> results; + std::vector>>> results; for (const Window& w : windows) { size_t window_start_site = w.start * word_size; size_t window_end_site = std::min(w.end * word_size - 1, num_sites - 1); - vector> stats; + std::vector> stats; for (const auto& map_entry : window_scores[w.index]) { - size_t hap_id = map_entry.first; - double score = (double) map_entry.second; - stats.emplace_back(score, hap_id); + size_t map_entry_hap_id = map_entry.first; + auto score = static_cast(map_entry.second); + stats.emplace_back(score, map_entry_hap_id); } size_t actual_k = std::min(k, stats.size()); // use this if we want sorted std::partial_sort( - stats.begin(), stats.begin() + actual_k, stats.end(), std::greater>()); + stats.begin(), stats.begin() + static_cast(actual_k), stats.end(), std::greater>()); // use this if we don't care about sorted // std::nth_element(stats.begin(), stats.begin() + actual_k, stats.end(), // std::greater>()); // append to results - results.push_back( - std::make_tuple(window_start_site, window_end_site, vector>())); + results.emplace_back(window_start_site, window_end_site, std::vector>()); for (size_t 
stats_idx = 0; stats_idx < actual_k; ++stats_idx) { std::get<2>(results[results.size() - 1]) .emplace_back(stats[stats_idx].second, stats[stats_idx].first); @@ -518,7 +498,7 @@ HapData::get_closest_cousins(size_t hap_id, unsigned int k, unsigned int toleran return results; } -ostream& operator<<(ostream& os, const HapData& data) { +std::ostream& operator<<(std::ostream& os, const HapData& data) { os << "HapData with " << data.num_haps << " haplotypes and " << data.num_sites; os << " sites, word size = " << data.word_size << " bits"; return os; diff --git a/src/hashing/HapData.hpp b/src/hashing/HapData.hpp index 17d47f4..671bc44 100644 --- a/src/hashing/HapData.hpp +++ b/src/hashing/HapData.hpp @@ -1,7 +1,7 @@ /* This file is part of the ARG-Needle genealogical inference and analysis software suite. - Copyright (C) 2023 ARG-Needle Developers. + Copyright (C) 2023-2025 ARG-Needle Developers. This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -17,11 +17,10 @@ along with this program. If not, see . 
*/ -#ifndef __HAP_DATA_HPP_ -#define __HAP_DATA_HPP_ +#ifndef ARG_NEELE_HAP_DATA_HPP +#define ARG_NEELE_HAP_DATA_HPP -#include -#include +#include #include #include #include @@ -29,14 +28,6 @@ #include #include -using std::ostream; -using std::pair; -using std::string; -using std::tuple; -using std::unordered_map; -using std::unordered_set; -using std::vector; - struct Window { size_t start, end, index; // end is inclusive friend bool operator<(const Window& a, const Window& b) { @@ -55,31 +46,31 @@ class HapData { public: typedef uint64_t word_type; - unsigned int num_haps = 0; - unsigned int num_sites = 0; + unsigned long num_haps = 0ul; + unsigned long num_sites = 0ul; unsigned int word_size; HapDataMode data_mode; - vector physical_positions; - vector genetic_positions; - vector site_mafs; - vector sample_names; - vector> sites; - vector> words; + std::vector physical_positions; + std::vector genetic_positions; + std::vector site_mafs; + std::vector sample_names; + std::vector> sites; + std::vector> words; - vector>> hashes; - unordered_set hashed_hap_ids; + std::vector>> hashes; + std::unordered_set hashed_hap_ids; - HapData(string mode, string file_root_path, unsigned int _word_size = 64, - string map_file_path = "", bool fill_sites = true); - ~HapData(); + HapData(std::string mode, std::string file_root_path, unsigned int _word_size = 64, + std::string map_file_path = "", bool fill_sites = true); + ~HapData() = default; void add_to_hash(size_t hap_id); - vector>>> + std::vector>>> get_closest_cousins(size_t hap_id, unsigned int k, unsigned int tolerance = 0, double window_size_genetic = 0); void print_hap(size_t hap_id); void print_hashes(); void print_word_match_diagram(size_t hap_id1, size_t hap_id2); - friend ostream& operator<<(ostream& os, const HapData& data); + friend std::ostream& operator<<(std::ostream& os, const HapData& data); }; -#endif // __HAP_DATA_HPP_ +#endif // ARG_NEELE_HAP_DATA_HPP diff --git a/src/hashing/pybind.cpp 
b/src/hashing/pybind.cpp index 7e77bde..b8eda37 100644 --- a/src/hashing/pybind.cpp +++ b/src/hashing/pybind.cpp @@ -1,7 +1,7 @@ /* This file is part of the ARG-Needle genealogical inference and analysis software suite. - Copyright (C) 2023 ARG-Needle Developers. + Copyright (C) 2023-2025 ARG-Needle Developers. This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -56,7 +56,7 @@ PYBIND11_MODULE(arg_needle_hashing_pybind, m) { "physical_position_at", [](const HapData& data, size_t site) { if (site >= data.num_sites) { - throw std::logic_error(THROW_LINE("Out of bounds site.")); + throw std::logic_error(MAKE_ERROR("Out of bounds site.")); } return data.physical_positions[site]; }, @@ -65,7 +65,7 @@ PYBIND11_MODULE(arg_needle_hashing_pybind, m) { "genetic_position_at", [](const HapData& data, size_t site) { if (site >= data.num_sites) { - throw std::logic_error(THROW_LINE("Out of bounds site.")); + throw std::logic_error(MAKE_ERROR("Out of bounds site.")); } return data.genetic_positions[site]; }, @@ -74,7 +74,7 @@ PYBIND11_MODULE(arg_needle_hashing_pybind, m) { "site_maf_at", [](const HapData& data, size_t site) { if (site >= data.num_sites) { - throw std::logic_error(THROW_LINE("Out of bounds site.")); + throw std::logic_error(MAKE_ERROR("Out of bounds site.")); } return data.site_mafs[site]; }, @@ -83,7 +83,7 @@ PYBIND11_MODULE(arg_needle_hashing_pybind, m) { "sample_name", [](const HapData& data, size_t hap_id) { if (hap_id >= data.num_haps) { - throw std::logic_error(THROW_LINE("Out of bounds hap_id.")); + throw std::logic_error(MAKE_ERROR("Out of bounds hap_id.")); } return data.sample_names[hap_id]; }, diff --git a/src/hashing/utils.hpp b/src/hashing/utils.hpp index 0fb16d7..85619b5 100644 --- a/src/hashing/utils.hpp +++ b/src/hashing/utils.hpp @@ -1,7 +1,7 @@ /* This file is part of the ARG-Needle genealogical inference and analysis software suite. 
- Copyright (C) 2023 ARG-Needle Developers. + Copyright (C) 2023-2025 ARG-Needle Developers. This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -17,14 +17,15 @@ along with this program. If not, see . */ -#ifndef __UTILS_HPP_ -#define __UTILS_HPP_ +#ifndef ARG_NEEDLE_UTILS_HPP +#define ARG_NEEDLE_UTILS_HPP #include -using std::string; +inline std::string make_error(const std::string &msg, const char *file, const int line) noexcept { + return std::string(file) + ":" + std::to_string(line) + ": " + msg; +} -// Utility for exceptions -#define THROW_LINE(a) (string(__FILE__) + ":" + std::to_string(__LINE__) + ": " + a) +#define MAKE_ERROR(msg) (make_error((msg), __FILE__, __LINE__)) -#endif // __UTILS_HPP_ +#endif // ARG_NEEDLE_UTILS_HPP diff --git a/test/cpp/CMakeLists.txt b/test/cpp/CMakeLists.txt new file mode 100644 index 0000000..148cf22 --- /dev/null +++ b/test/cpp/CMakeLists.txt @@ -0,0 +1,33 @@ +# This file is part of the ARG-Needle genealogical inference and +# analysis software suite. +# Copyright (C) 2023-2025 ARG-Needle Developers. + +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. + +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . 
+ +set( + test_files + test_file_utils.cpp + test_utils.cpp +) + +add_executable(cpp_tests ${test_files}) +target_link_libraries(cpp_tests PRIVATE arg_needle_hashing Catch2::Catch2WithMain) + +catch_discover_tests(cpp_tests) + +set(ARG_NEEDLE_TEST_DIR "${CMAKE_CURRENT_SOURCE_DIR}") +set(ARG_NEEDLE_RESOURCES_DIR "${CMAKE_SOURCE_DIR}/src/arg_needle/resources") + +add_definitions(-DARG_NEEDLE_TEST_DIR=\"${ARG_NEEDLE_TEST_DIR}\") +add_definitions(-DARG_NEEDLE_RESOURCES_DIR=\"${ARG_NEEDLE_RESOURCES_DIR}\") diff --git a/test/cpp/test_file_utils.cpp b/test/cpp/test_file_utils.cpp new file mode 100644 index 0000000..0a846e9 --- /dev/null +++ b/test/cpp/test_file_utils.cpp @@ -0,0 +1,59 @@ +/* + This file is part of the ARG-Needle genealogical inference and + analysis software suite. + Copyright (C) 2023-2025 ARG-Needle Developers. + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . 
+*/ + +#include + +#include "FileUtils.hpp" + + +TEST_CASE( "FileUtils::fileExists", "[test_file_utils]" ) { + REQUIRE(FileUtils::fileExists(ARG_NEEDLE_TEST_DIR "/CMakeLists.txt") == true); + REQUIRE(FileUtils::fileExists(ARG_NEEDLE_TEST_DIR "/file_that_does_not_exist") == false); +} + +TEST_CASE( "FileUtils::AutoGzIfstream", "[test_file_utils]") +{ + SECTION("open and close gz file") + { + REQUIRE(FileUtils::fileExists(ARG_NEEDLE_RESOURCES_DIR "/30-100-2000_CEU.decodingQuantities.gz") == true); + FileUtils::AutoGzIfstream gz_file; + gz_file.openOrExit(ARG_NEEDLE_RESOURCES_DIR "/30-100-2000_CEU.decodingQuantities.gz"); + gz_file.close(); + } + + SECTION("count lines in file") + { + REQUIRE(FileUtils::fileExists(ARG_NEEDLE_RESOURCES_DIR "/30-100-2000_CEU.decodingQuantities.gz") == true); + REQUIRE(FileUtils::AutoGzIfstream::lineCount(ARG_NEEDLE_RESOURCES_DIR "/30-100-2000_CEU.decodingQuantities.gz") == 35245); + } + + SECTION("extract line from a file") + { + REQUIRE(FileUtils::fileExists(ARG_NEEDLE_RESOURCES_DIR "/30-100-2000_CEU.decodingQuantities.gz") == true); + + FileUtils::AutoGzIfstream gz_file; + gz_file.openOrExit(ARG_NEEDLE_RESOURCES_DIR "/30-100-2000_CEU.decodingQuantities.gz"); + + std::string first_line; + FileUtils::getline(gz_file, first_line); + gz_file.close(); + + REQUIRE(first_line == "TransitionType"); + } +} \ No newline at end of file diff --git a/test/cpp/test_utils.cpp b/test/cpp/test_utils.cpp new file mode 100644 index 0000000..db78b30 --- /dev/null +++ b/test/cpp/test_utils.cpp @@ -0,0 +1,41 @@ +/* + This file is part of the ARG-Needle genealogical inference and + analysis software suite. + Copyright (C) 2023-2025 ARG-Needle Developers. + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. 
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program. If not, see <https://www.gnu.org/licenses/>.
+*/
+
+#include  // NOTE(review): the header names of this and the next include are missing (look stripped as <...> markup) - restore before applying
+#include
+
+#include "HapData.hpp"
+#include "utils.hpp"
+
+using Catch::Matchers::ContainsSubstring;
+
+void test_throw() {  // helper: throws a std::logic_error whose message is built by MAKE_ERROR
+    throw std::logic_error(MAKE_ERROR("Something went wrong"));
+}
+
+// Checks that exception messages contain both a source-file prefix and the message text.
+TEST_CASE("make_error", "[utils]") {
+    // An error raised through MAKE_ERROR in this file must name test_utils.cpp and keep the given text.
+    REQUIRE_THROWS_WITH(test_throw(),
+        ContainsSubstring( "test_utils.cpp:" ) && ContainsSubstring( "Something went wrong" ));
+    // HapData("banana", "") is expected to throw a message naming HapData.cpp and saying "Mode not recognized".
+    REQUIRE_THROWS_WITH(HapData("banana", ""),
+        ContainsSubstring( "HapData.cpp:" ) && ContainsSubstring( "Mode not recognized" ));
+}
+
diff --git a/test/test_import.py b/test/test_import.py
index b73d831..5c0bc44 100644
--- a/test/test_import.py
+++ b/test/test_import.py
@@ -1,6 +1,6 @@
 # This file is part of the ARG-Needle genealogical inference and
 # analysis software suite.
-# Copyright (C) 2023 ARG-Needle Developers.
+# Copyright (C) 2023-2025 ARG-Needle Developers.
 
 # This program is free software: you can redistribute it and/or modify
 # it under the terms of the GNU General Public License as published by
@@ -20,4 +20,6 @@
     add_default_arg_building_arguments, normalize_arg, trim_arg
 )
 
-print("Successfully imported all arg_needle components")
+def test_import():
+    assert True
+    print("Successfully imported all arg_needle components")
diff --git a/test/test_regression.py b/test/test_regression.py
new file mode 100644
index 0000000..3c8f2f8
--- /dev/null
+++ b/test/test_regression.py
@@ -0,0 +1,99 @@
+# This file is part of the ARG-Needle genealogical inference and
+# analysis software suite.
+# Copyright (C) 2023-2025 ARG-Needle Developers.
+
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <https://www.gnu.org/licenses/>.
+
+
+# This test assumes you have installed the dev dependencies of arg-needle.
+# In the root of this repository, run:
+#
+#   pip install .[dev]
+
+import subprocess
+import tempfile
+import pathlib
+import h5py
+import numpy as np
+import shutil
+
+DATA_DIR = pathlib.Path(__file__).resolve().parent / "data"  # the "data" directory next to this test file
+# Expected attribute names; keep this list sorted, because check_attr_keys compares it with a sorted list.
+ARGN_KEYS = ['arg_file_version', 'chromosome', 'datetime_created', 'end', 'mutations', 'node_bounds', 'num_edges',
+             'num_mutations', 'num_nodes', 'offset', 'start', 'threaded_samples']
+
+def check_attr_keys(attrs):
+    """
+    Return True if the sorted attribute names in attrs equal ARGN_KEYS; otherwise print both lists and return False.
+    """
+    keys_in_generated_file = sorted([str(x) for x in attrs.keys()])
+
+    if keys_in_generated_file != ARGN_KEYS:
+        print(f"Expected the following keys:\n{ARGN_KEYS}\n but got:\n{keys_in_generated_file}")
+        return False
+
+    return True
+
+def check_attr_vals(attrs):
+    """
+    Assert the fixed attribute values, and that num_nodes and num_edges lie within three standard
+    deviations of their recorded means. Returns True when no assertion fails.
+    """
+    # Deterministic values:
+    assert attrs["arg_file_version"] == 2
+    assert attrs["chromosome"] == 1
+    assert np.isclose(attrs["start"], 0.0, rtol=1e-8)
+    assert np.isclose(attrs["end"], 2000079.0, rtol=1e-8)
+    assert attrs["mutations"] == False
+    assert attrs["node_bounds"] == True
+    assert attrs["offset"] == 10001457
+    assert attrs["threaded_samples"] == 400
+
+    # These values were calculated by running the example about 100 times
+    nodes_mean = 17203.69792
+    nodes_std = 127.8394651
+    edges_mean = 93654.98958
+    edges_std = 591.7562353
+
+    # Accept anything within 3 standard deviations of the mean, so this should almost never fail
+    assert attrs["num_nodes"] > nodes_mean - 3.0 * nodes_std
+    assert attrs["num_nodes"] < nodes_mean + 3.0 * nodes_std
+
+    assert attrs["num_edges"] > edges_mean - 3.0 * edges_std
+    assert attrs["num_edges"] < edges_mean + 3.0 * edges_std
+
+    return True
+
+
+def test_script_output():
+    """Run prepare_example and infer_args in a temporary directory, then check the attributes of example.argn."""
+    with tempfile.TemporaryDirectory() as tmpdir:
+        tmp = pathlib.Path(tmpdir)
+
+        # Run scripts inside temporary directory; check=True raises if either command exits non-zero
+        subprocess.run(["prepare_example"], cwd=tmp, check=True)
+        subprocess.run(["infer_args", "--normalize", "0"], cwd=tmp, check=True)
+
+        # Check output file exists
+        outfile = tmp / "example.argn"
+        assert outfile.exists()
+
+        # Check the file's attribute names and values against the expectations defined above
+        with h5py.File(outfile, "r") as arg_file:
+            assert check_attr_keys(arg_file.attrs)
+            assert check_attr_vals(arg_file.attrs)
+
+
+if __name__ == '__main__':
+    test_script_output()