diff --git a/.github/workflows/build-wheels.yml b/.github/workflows/build-wheels.yml
index 734cb1d..39c0019 100644
--- a/.github/workflows/build-wheels.yml
+++ b/.github/workflows/build-wheels.yml
@@ -10,66 +10,87 @@ on:
release:
types:
- published
-
-env:
- CIBW_BUILD: cp38-* cp39-* cp310-* cp311-*
- CIBW_SKIP: cp3*-musllinux_*
- CIBW_ARCHS_MACOS: x86_64
- CIBW_ARCHS_LINUX: auto64
- CIBW_MANYLINUX_X86_64_IMAGE: manylinux2014
- CIBW_BEFORE_ALL_LINUX: yum -y install boost-devel zlib-devel
- CIBW_BEFORE_ALL_MACOS: brew install boost zlib gsl
- CIBW_BEFORE_BUILD: pip install --upgrade pip setuptools wheel ninja numpy cython
- CIBW_ENVIRONMENT_MACOS: CXX="$(brew --prefix llvm@15)/bin/clang++"
- MACOSX_DEPLOYMENT_TARGET: 10.15
+ workflow_dispatch:
jobs:
- build_wheels:
+ build_wheels_cloud:
name: Build wheels on ${{ matrix.os }}
runs-on: ${{ matrix.os }}
strategy:
fail-fast: false
matrix:
- os: [ubuntu-22.04, macos-12]
+        include:  # one entry per runner; every key below feeds the job-level env block
+          - os: ubuntu-24.04
+            arch: x86_64
+            py-vers-full: cp39-* cp310-* cp311-* cp312-* cp313-* cp314-*
+            py-vers-pr: cp39-* cp310-* cp311-* cp312-* cp313-* cp314-*
+            before-all: dnf -y install boost-devel zlib-devel  # headers, as the previous yum-based config installed
+            extra-env: ""
+            mdt: ""  # MACOSX_DEPLOYMENT_TARGET is left empty on Linux
+          - os: ubuntu-24.04-arm
+            arch: aarch64
+            py-vers-full: cp39-* cp310-* cp311-* cp312-* cp313-* cp314-*
+            py-vers-pr: cp39-* cp310-* cp311-* cp312-* cp313-* cp314-*
+            before-all: dnf -y install boost-devel zlib-devel  # headers, as the previous yum-based config installed
+            extra-env: ""
+            mdt: ""  # MACOSX_DEPLOYMENT_TARGET is left empty on Linux
+          - os: macos-15-intel
+            arch: x86_64
+            py-vers-full: cp39-* cp310-* cp311-* cp312-* cp313-* cp314-*
+            py-vers-pr: cp39-* cp310-* cp311-* cp312-* cp313-* cp314-*
+            before-all: brew install boost zlib
+            extra-env: CC=clang CXX=clang++
+            mdt: "15.0"  # quoted major.minor string so YAML cannot retype it as a number
+          - os: macos-14
+            arch: arm64
+            py-vers-full: cp39-* cp310-* cp311-* cp312-* cp313-* cp314-*
+            py-vers-pr: cp39-* cp310-* cp311-* cp312-* cp313-* cp314-*
+            before-all: brew install boost zlib
+            extra-env: CC=clang CXX=clang++
+            mdt: "14.0"  # quoted major.minor string so YAML cannot retype it as a number
+
+ env:
+ CIBW_BUILD: ${{ github.event_name != 'pull_request' && matrix.py-vers-full || matrix.py-vers-pr }}
+ CIBW_SKIP: cp3*-musllinux_*
+ CIBW_ARCHS: ${{ matrix.arch }}
+ CIBW_MANYLINUX_X86_64_IMAGE: manylinux_2_28
+ CIBW_BEFORE_ALL: ${{ matrix.before-all }}
+ CIBW_BEFORE_BUILD: pip install --upgrade pip setuptools wheel ninja numpy cython
+ CIBW_ENVIRONMENT: ${{ matrix.extra-env }}
+ MACOSX_DEPLOYMENT_TARGET: ${{ matrix.mdt }}
steps:
- name: checkout repo & submodules
- uses: actions/checkout@v3
+ uses: actions/checkout@v5
with:
+ submodules: true
fetch-depth: 0
- - name: Set up Python
- uses: actions/setup-python@v3
-
- - name: Install cibuildwheel
- run: python -m pip install cibuildwheel==2.12.0
-
- name: Build wheels
- run: python -m cibuildwheel --output-dir wheelhouse
+ uses: pypa/cibuildwheel@v3.2.0
- - uses: actions/upload-artifact@v3
+ - uses: actions/upload-artifact@v4
with:
- name: wheels
+ name: wheels-cloud-${{ matrix.os }}
path: ./wheelhouse/*.whl
retention-days: 1
upload_all:
name: Upload to PyPI
- needs: build_wheels
+ needs: [build_wheels_cloud]
runs-on: ubuntu-latest
- if: github.event_name == 'release' && github.event.action == 'published'
+ if: ${{ github.event_name == 'release' && github.event.action == 'published' }}
steps:
- - uses: actions/setup-python@v3
-
- - name: Download wheels
- uses: actions/download-artifact@v3
+ - name: Download wheels from cloud runners
+ uses: actions/download-artifact@v5
with:
- name: wheels
+ pattern: wheels-cloud-*
+ merge-multiple: true
path: wheels
- - uses: pypa/gh-action-pypi-publish@v1.6.4
+ - uses: pypa/gh-action-pypi-publish@v1.13.0
with:
user: __token__
password: ${{ secrets.PYPI_TOKEN }}
diff --git a/.github/workflows/cpp_tests.yml b/.github/workflows/cpp_tests.yml
new file mode 100644
index 0000000..0e03a4b
--- /dev/null
+++ b/.github/workflows/cpp_tests.yml
@@ -0,0 +1,71 @@
+name: C++ unit tests
+
+on:
+  push:
+    branches:
+      - main
+  pull_request:
+    branches:
+      - '**'
+  workflow_dispatch:
+
+jobs:
+
+  build-and-test:
+    name: C++ unit tests
+    runs-on: ${{ matrix.os }}
+    timeout-minutes: 10
+
+    strategy:
+      fail-fast: false
+      matrix:
+        # `cxx` is the C++ compiler exported as CXX for the configure step;
+        # an empty value keeps whatever compiler CMake finds by default.
+        include:
+          - os: ubuntu-24.04
+            cxx: "clang++-18"
+            sys_install: "sudo apt -y update && sudo apt -y install libboost-iostreams-dev"
+          - os: ubuntu-24.04-arm
+            cxx: "g++-12"
+            sys_install: "sudo apt -y update && sudo apt -y install libboost-iostreams-dev"
+          - os: macos-15
+            cxx: ""
+            sys_install: "brew install boost"
+          - os: macos-15
+            # Must be the C++ driver (clang++): the project is LANGUAGES CXX.
+            cxx: "$(brew --prefix llvm@18)/bin/clang++"
+            sys_install: "brew install boost"
+
+    steps:
+      - name: checkout repo & submodules
+        uses: actions/checkout@v5
+        with:
+          submodules: true
+          fetch-depth: 0
+
+      - name: install deps
+        run: |
+          ${{ matrix.sys_install }}
+
+      - name: make build dir
+        run: |
+          mkdir build
+
+      - name: cmake configure
+        # CXX is set inside the script so that $(brew --prefix ...) is shell-expanded.
+        run: |
+          if [ -n "${{ matrix.cxx }}" ]; then
+            export CXX="${{ matrix.cxx }}"
+          fi
+          cmake ..
+        working-directory: build
+
+      - name: cmake build
+        run: |
+          cmake --build . --target cpp_tests --parallel 4
+        working-directory: build
+
+      - name: ctest
+        run: |
+          ctest --output-on-failure
+        working-directory: build
diff --git a/.github/workflows/pip-install-macos.yml b/.github/workflows/pip-install-macos.yml
deleted file mode 100644
index 897a57b..0000000
--- a/.github/workflows/pip-install-macos.yml
+++ /dev/null
@@ -1,48 +0,0 @@
-name: pip install on macOS
-
-on:
- push:
- branches:
- - main
- pull_request:
- branches:
- - '**'
-
-jobs:
-
- pip-install-and-import:
- name: pip install on macOS
- runs-on: macos-11
-
- steps:
- - name: checkout arg_needle
- uses: actions/checkout@v3
-
- - uses: actions/setup-python@v3
- with:
- python-version: 3.x
-
- - name: install system packages
- run: |
- brew install boost zlib gsl
-
- - name: install python pip dependencies
- run: |
- python --version
- python -m pip install --upgrade pip setuptools wheel
- python -m pip install cmake ninja
-
- - name: install Palamara python dependencies
- run: |
- python --version
- python -m pip install asmc-asmc arg-needle-lib
-
- - name: install arg_needle
- run: |
- python --version
- python -m pip install .
-
- - name: test import works as expected
- run: |
- python --version
- python test/test_import.py
diff --git a/.github/workflows/pip-install-ubuntu.yml b/.github/workflows/pip-install-ubuntu.yml
deleted file mode 100644
index 1e2f27a..0000000
--- a/.github/workflows/pip-install-ubuntu.yml
+++ /dev/null
@@ -1,49 +0,0 @@
-name: pip install on Ubuntu
-
-on:
- push:
- branches:
- - main
- pull_request:
- branches:
- - '**'
-
-jobs:
-
- pip-install-and-import:
- name: pip install on Ubuntu
- runs-on: ubuntu-22.04
-
- steps:
- - name: checkout arg_needle
- uses: actions/checkout@v3
-
- - uses: actions/setup-python@v3
- with:
- python-version: 3.x
-
- - name: install system packages
- run: |
- sudo apt -y update
- sudo apt -y install libboost-iostreams-dev zlib1g-dev
-
- - name: install python pip dependencies
- run: |
- python --version
- python -m pip install --upgrade pip setuptools wheel
- python -m pip install cmake ninja
-
- - name: install Palamara python dependencies
- run: |
- python --version
- python -m pip install asmc-asmc arg-needle-lib
-
- - name: install arg_needle
- run: |
- python --version
- python -m pip install .
-
- - name: test import works as expected
- run: |
- python --version
- python test/test_import.py
diff --git a/.github/workflows/python_examples.yml b/.github/workflows/python_examples.yml
new file mode 100644
index 0000000..12c0b42
--- /dev/null
+++ b/.github/workflows/python_examples.yml
@@ -0,0 +1,77 @@
+name: Python examples
+
+on:
+ push:
+ branches:
+ - main
+ pull_request:
+ branches:
+ - '**'
+ workflow_dispatch:
+
+jobs:
+
+ build-and-test:
+ name: Python examples
+ runs-on: ${{ matrix.os }}
+ timeout-minutes: 10
+
+ strategy:
+ fail-fast: false
+ matrix:
+ include:
+ - os: ubuntu-24.04
+ py: "3.10"
+ sys_install: "sudo apt -y update && sudo apt -y install libboost-iostreams-dev"
+ - os: macos-15
+ py: "3.13"
+ sys_install: "brew install boost"
+
+ steps:
+ - name: checkout repo & submodules
+ uses: actions/checkout@v5
+ with:
+ submodules: true
+ fetch-depth: 0
+
+ - name: install deps
+ run: |
+ ${{ matrix.sys_install }}
+
+ - name: Set up Python ${{ matrix.py }}
+ uses: actions/setup-python@v6
+ with:
+ python-version: ${{ matrix.py }}
+
+ - name: install python bindings
+ run: |
+ python -m pip install --upgrade pip setuptools wheel
+ python -m pip install .
+
+ - name: prepare example
+ run: |
+ prepare_example
+
+ - name: regular ARG-Needle
+ run: |
+ infer_args
+
+ - name: ARG-Needle with ASMC-clust
+ run: |
+ infer_args --asmc_clust 1
+
+ - name: ARG-Needle without ARG normalization
+ run: |
+ infer_args --normalize 0
+
+ - name: ARG-Needle (sequence mode)
+ run: |
+ prepare_example --mode sequence --length 5e5
+ infer_args --mode sequence
+
+ - name: ARG-Needle advanced mode (multistep)
+ run: |
+ prepare_example
+ infer_args_advanced --step 1 --num_snp_samples 200
+ infer_args_advanced --step 2
+ infer_args_advanced --step 3 --trim_num_snps 0,50
diff --git a/.github/workflows/python_tests.yml b/.github/workflows/python_tests.yml
new file mode 100644
index 0000000..01668bb
--- /dev/null
+++ b/.github/workflows/python_tests.yml
@@ -0,0 +1,53 @@
+name: Python tests
+
+on:
+ push:
+ branches:
+ - main
+ pull_request:
+ branches:
+ - '**'
+ workflow_dispatch:
+
+jobs:
+
+ build-and-test:
+ name: Python tests
+ runs-on: ${{ matrix.os }}
+ timeout-minutes: 10
+
+ strategy:
+ fail-fast: false
+ matrix:
+ include:
+ - os: ubuntu-24.04
+ py: "3.13"
+ sys_install: "sudo apt -y update && sudo apt -y install libboost-iostreams-dev"
+ - os: macos-15
+ py: "3.10"
+ sys_install: "brew install boost"
+
+ steps:
+ - name: checkout repo & submodules
+ uses: actions/checkout@v5
+ with:
+ submodules: true
+ fetch-depth: 0
+
+ - name: install deps
+ run: |
+ ${{ matrix.sys_install }}
+
+ - name: Set up Python ${{ matrix.py }}
+ uses: actions/setup-python@v6
+ with:
+ python-version: ${{ matrix.py }}
+
+      - name: install python bindings
+        run: |
+          python -m pip install --upgrade pip setuptools wheel
+          python -m pip install ".[dev]"  # quoted: unquoted [dev] is a shell glob pattern
+
+ - name: python unit tests
+ run: |
+ python -m pytest
diff --git a/CMakeLists.txt b/CMakeLists.txt
index b42203c..23651aa 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,6 +1,6 @@
# This file is part of the ARG-Needle genealogical inference and
# analysis software suite.
-# Copyright (C) 2023 ARG-Needle Developers.
+# Copyright (C) 2023-2025 ARG-Needle Developers.
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
@@ -18,12 +18,17 @@
cmake_minimum_required(VERSION 3.16)
message(STATUS "Using CMake version ${CMAKE_VERSION}")
-project(arg_needle LANGUAGES CXX VERSION 1.0.3)
+project(arg_needle LANGUAGES CXX VERSION 1.1.0)
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_CXX_EXTENSIONS OFF)
+option(ARG_NEEDLE_PYTHON_BINDINGS "Whether to build the python bindings" OFF)
+if (ARG_NEEDLE_PYTHON_BINDINGS)
+ set(CMAKE_POSITION_INDEPENDENT_CODE ON)
+endif ()
+
# Project settings including default build type
include(cmake/ProjectSettings.cmake)
@@ -44,26 +49,41 @@ option(BUILD_SHARED_LIBS "Enable compilation of shared libraries" OFF)
set(arg_needle_testdata_dir ${CMAKE_CURRENT_SOURCE_DIR}/testdata)
add_definitions(-DARG_NEEDLE_TESTDATA_DIR=\"${arg_needle_testdata_dir}\")
-option(PYTHON_BINDINGS "Whether to build the python bindings" ON)
-if (PYTHON_BINDINGS)
- include(FetchContent)
- FetchContent_Declare(
- pybind11
- GIT_REPOSITORY https://github.com/pybind/pybind11
- GIT_TAG 0bd8896a4010f2d91b2340570c24fa08606ec406 # Version 2.10.3
- )
- FetchContent_GetProperties(pybind11)
- if (NOT pybind11_POPULATED)
- FetchContent_Populate(pybind11)
- add_subdirectory(${pybind11_SOURCE_DIR} ${pybind11_BINARY_DIR})
- endif ()
+
+# Python bindings
+if (ARG_NEEDLE_PYTHON_BINDINGS)
+ option(ARG_NEEDLE_BUILDING_FROM_PYPROJECT "Are we building from pyproject.toml (pip install)?" OFF)
+
+ if(ARG_NEEDLE_BUILDING_FROM_PYPROJECT)
+ message(STATUS "Using pybind11 from pyproject.toml build environment")
+ find_package(pybind11 REQUIRED)
+ else()
+ message(STATUS "Using FetchContent to get pybind11")
+ include(FetchContent)
+ FetchContent_Declare(
+ pybind11
+ GIT_REPOSITORY https://github.com/pybind/pybind11.git
+ GIT_TAG f5fbe867d2d26e4a0a9177a51f6e568868ad3dc8 # Version 3.0.1
+ )
+ FetchContent_MakeAvailable(pybind11)
+ endif()
endif ()
+
add_subdirectory(src)
-#add_subdirectory(example)
-#option(ENABLE_TESTING "Enable Test Builds" ON)
-#if(ENABLE_TESTING)
-# enable_testing()
-# add_subdirectory(test)
-#endif()
+option(ARG_NEEDLE_TESTING "Enable ARG Needle unit testing" ON)
+if(ARG_NEEDLE_TESTING)
+    include(FetchContent)  # provides FetchContent_Declare / FetchContent_MakeAvailable
+    FetchContent_Declare(
+        Catch2
+        GIT_REPOSITORY https://github.com/catchorg/Catch2.git
+        GIT_TAG b3fb4b9feafcd8d91c5cb510a4775143fdbef02f # Version 3.11.0
+    )
+
+    FetchContent_MakeAvailable(Catch2)
+    list(APPEND CMAKE_MODULE_PATH ${catch2_SOURCE_DIR}/extras)  # lets include(Catch) find the module
+    include(CTest)
+    include(Catch)
+    add_subdirectory(test/cpp)
+endif()
diff --git a/MANIFEST.in b/MANIFEST.in
deleted file mode 100644
index 59dffca..0000000
--- a/MANIFEST.in
+++ /dev/null
@@ -1 +0,0 @@
-recursive-include src/resources *
diff --git a/PyPI_README.md b/PyPI_README.md
index a4266ef..a686b6b 100644
--- a/PyPI_README.md
+++ b/PyPI_README.md
@@ -1,6 +1,14 @@
# arg-needle
This repository contains arg-needle, which implements the ARG inference algorithms ARG-Needle and ASMC-clust.
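+Prebuilt CPython wheels are available for Linux (x86_64 and aarch64, compatible with glibc ≥ 2.28) and macOS (Intel x86_64 wheels are built on and target macOS 15 or later; Apple Silicon arm64 wheels are built on and target macOS 14 or later).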
+
+| Platform \ CPython | ≤3.8 | 3.9 | 3.10 | 3.11 | 3.12 | 3.13 | 3.14 |
+|-----------------------------|------|-----|------|------|------|------|------|
+| Linux x86_64 | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
+| Linux aarch64 | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
+| macOS Intel (x86_64) | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
+| macOS Apple Silicon (arm64) | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
## Quickstart
@@ -12,8 +20,6 @@ The Python module can be installed with:
pip install arg-needle
```
-This Python module is currently available on Linux and macOS.
-
### Documentation
Please see the [ARG-Needle manual](https://palamaralab.github.io/software/argneedle/) for all usage instructions and documentation.
diff --git a/README.md b/README.md
index 8225c8a..46434d8 100644
--- a/README.md
+++ b/README.md
@@ -20,7 +20,7 @@ Please see the [ARG-Needle manual](https://palamaralab.github.io/software/argnee
## For developers: making a release
-- Bump the version number in [setup.py](setup.py) and [CMakeLists.txt](CMakeLists.txt)
+- Bump the version number in [pyproject.toml](pyproject.toml) and [CMakeLists.txt](CMakeLists.txt)
- Update [RELEASE_NOTES.md](RELEASE_NOTES.md)
- Push changes and check that all [GitHub workflows](https://github.com/PalamaraLab/arg_needle/actions) pass
- Tag the commit in Git using syntax `vX.Y.Z`
diff --git a/RELEASE_NOTES.md b/RELEASE_NOTES.md
index 3d4f9a5..cfe98a9 100644
--- a/RELEASE_NOTES.md
+++ b/RELEASE_NOTES.md
@@ -1,5 +1,10 @@
# arg-needle Release Notes
+## v1.1.0 (2025-12-18)
+
+- Public source code release: https://github.com/PalamaraLab/arg-needle
+- Python wheels are now available for Linux and macOS on both x86_64 and arm64/AArch64 architectures, for CPython versions 3.9 to 3.14 inclusive.
+
## v1.0.3 (2023-08-30)
- ASMC decoders can now take a genetic map file with a specified non-default location.
diff --git a/arg_needle b/arg_needle
deleted file mode 120000
index e831038..0000000
--- a/arg_needle
+++ /dev/null
@@ -1 +0,0 @@
-src
\ No newline at end of file
diff --git a/cmake/CompilerWarnings.cmake b/cmake/CompilerWarnings.cmake
index 53f2bce..16c37c7 100644
--- a/cmake/CompilerWarnings.cmake
+++ b/cmake/CompilerWarnings.cmake
@@ -1,6 +1,6 @@
# This file is part of the ARG-Needle genealogical inference and
# analysis software suite.
-# Copyright (C) 2023 ARG-Needle Developers.
+# Copyright (C) 2023-2025 ARG-Needle Developers.
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
diff --git a/cmake/ProjectSettings.cmake b/cmake/ProjectSettings.cmake
index b73d53e..ad5289b 100644
--- a/cmake/ProjectSettings.cmake
+++ b/cmake/ProjectSettings.cmake
@@ -1,6 +1,6 @@
# This file is part of the ARG-Needle genealogical inference and
# analysis software suite.
-# Copyright (C) 2023 ARG-Needle Developers.
+# Copyright (C) 2023-2025 ARG-Needle Developers.
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
diff --git a/cmake/Sanitisers.cmake b/cmake/Sanitisers.cmake
index 8334ae8..8fbf606 100644
--- a/cmake/Sanitisers.cmake
+++ b/cmake/Sanitisers.cmake
@@ -1,6 +1,6 @@
# This file is part of the ARG-Needle genealogical inference and
# analysis software suite.
-# Copyright (C) 2023 ARG-Needle Developers.
+# Copyright (C) 2023-2025 ARG-Needle Developers.
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
diff --git a/cmake/StaticAnalysers.cmake b/cmake/StaticAnalysers.cmake
index f8bb49f..260e6f0 100644
--- a/cmake/StaticAnalysers.cmake
+++ b/cmake/StaticAnalysers.cmake
@@ -1,6 +1,6 @@
# This file is part of the ARG-Needle genealogical inference and
# analysis software suite.
-# Copyright (C) 2023 ARG-Needle Developers.
+# Copyright (C) 2023-2025 ARG-Needle Developers.
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
diff --git a/pyproject.toml b/pyproject.toml
new file mode 100644
index 0000000..c104575
--- /dev/null
+++ b/pyproject.toml
@@ -0,0 +1,89 @@
+[build-system]
+requires = [
+    "scikit-build-core>=0.11.6",
+    # Keep in sync with the pybind11 GIT_TAG (Version 3.0.1) in CMakeLists.txt.
+    "pybind11==3.0.1",
+    "setuptools"
+]
+build-backend = "scikit_build_core.build"
+
+[project]
+dynamic = ["readme"]
+name = "arg-needle"
+# Keep in sync with project(... VERSION ...) in CMakeLists.txt.
+version = "1.1.0"
+description = "Ancestral recombination graph (ARG)"
+authors = [
+    { name = "ARG-Needle Developers" }
+]
+requires-python = ">=3.9"
+
+classifiers = [
+    "Development Status :: 4 - Beta",
+    "License :: OSI Approved :: GNU General Public License v3 or later (GPLv3+)",
+    "Programming Language :: Python :: 3 :: Only",
+    "Programming Language :: Python :: 3.9",
+    "Programming Language :: Python :: 3.10",
+    "Programming Language :: Python :: 3.11",
+    "Programming Language :: Python :: 3.12",
+    "Programming Language :: Python :: 3.13",
+    "Programming Language :: Python :: 3.14",
+]
+
+dependencies = [
+    'arg-needle-lib>=1.2.0',
+    'asmc-asmc>=1.4.0',
+    'fastcluster',
+    'msprime>=1.3.0',
+    'numpy>=1.17.0',
+    'pandas',
+    'psutil',
+    'tskit>=1.0.0',
+]
+
+[project.optional-dependencies]
+dev = [
+    "pytest",
+    "h5py",
+]
+
+[project.scripts]
+infer_args="arg_needle.scripts.infer_args:main"
+infer_args_advanced="arg_needle.scripts.infer_args_advanced:main"
+prepare_example="arg_needle.scripts.prepare_example:main"
+# Command names installed by releases up to 1.0.3 (from the former setup.py),
+# kept as aliases so that existing user scripts keep working.
+arg_needle="arg_needle.scripts.infer_args:main"
+arg_needle_multistep="arg_needle.scripts.infer_args_advanced:main"
+arg_needle_prepare_example="arg_needle.scripts.prepare_example:main"
+
+[tool.scikit-build]
+minimum-version = "build-system.requires"
+build.verbose = true
+cmake.build-type = "Release"
+build.targets = ["arg_needle_hashing_pybind"]
+wheel.packages = ["src/arg_needle"]
+metadata.readme.provider = "scikit_build_core.metadata.fancy_pypi_readme"
+
+[tool.scikit-build.cmake.define]
+ARG_NEEDLE_TESTING = "OFF"
+ARG_NEEDLE_PYTHON_BINDINGS = "ON"
+ARG_NEEDLE_BUILDING_FROM_PYPROJECT = "ON"
+
+[tool.hatch.metadata.hooks.fancy-pypi-readme]
+content-type = "text/markdown"
+
+[[tool.hatch.metadata.hooks.fancy-pypi-readme.fragments]]
+path = "PyPI_README.md"
+
+[[tool.hatch.metadata.hooks.fancy-pypi-readme.fragments]]
+path = "RELEASE_NOTES.md"
+
+[tool.setuptools.packages.find]
+where = ["src"]
+
+[tool.setuptools.package-data]
+"arg_needle" = ["resources/*"]
+
+[tool.pytest.ini_options]
+testpaths = ["test"]
diff --git a/setup-pre-commit.sh b/setup-pre-commit.sh
deleted file mode 100755
index 95333fe..0000000
--- a/setup-pre-commit.sh
+++ /dev/null
@@ -1,29 +0,0 @@
-# Set up pre-commit hooks, in this case just clang-format checking
-#
-# Note: this overwrites existing .git/hooks/pre-commit
-#
-# See .clang-format for configuration file
-# Code modified from https://github.com/KDAB/kdabtv/tree/master/Qt-Widgets-and-more/clang-format
-# Based on this tutorial: https://www.youtube.com/watch?v=Cz36YveDI2E
-
-echo "#!/bin/sh
-
-python .git/hooks/pre-commit-clang-format.py" > .git/hooks/pre-commit
-
-
-echo "import subprocess
-try:
- output = str(subprocess.check_output([\"git\", \"clang-format\", \"--diff\"]))
-except subprocess.CalledProcessError as e:
- print(e)
- print(\"Error raised, try installing clang-format.\\n\")
- exit(1)
-
-if \"clang-format did not modify any files\" not in output and \"no modified files to format\" not in output:
- print(\"Run git clang-format, add the modified files, then commit.\\n\")
- exit(1)
-else:
- exit(0)" > .git/hooks/pre-commit-clang-format.py
-
-
-chmod +x .git/hooks/pre-commit .git/hooks/pre-commit-clang-format.py
diff --git a/setup.py b/setup.py
deleted file mode 100644
index 1ba9ccb..0000000
--- a/setup.py
+++ /dev/null
@@ -1,165 +0,0 @@
-# This file is part of the ARG-Needle genealogical inference and
-# analysis software suite.
-# Copyright (C) 2023 ARG-Needle Developers.
-
-# This program is free software: you can redistribute it and/or modify
-# it under the terms of the GNU General Public License as published by
-# the Free Software Foundation, either version 3 of the License, or
-# (at your option) any later version.
-
-# This program is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-# GNU General Public License for more details.
-
-# You should have received a copy of the GNU General Public License
-# along with this program. If not, see .
-
-# Based on https://github.com/pybind/cmake_example
-
-import os
-import sys
-import subprocess
-
-from setuptools import setup, Extension
-from setuptools.command.build_ext import build_ext
-
-# Convert distutils Windows platform specifiers to CMake -A arguments
-PLAT_TO_CMAKE = {
- "win32": "Win32",
- "win-amd64": "x64",
- "win-arm32": "ARM",
- "win-arm64": "ARM64",
-}
-
-
-# A CMakeExtension needs a sourcedir instead of a file list.
-# The name must be the _single_ output extension from the CMake build.
-# If you need multiple extensions, see scikit-build.
-class CMakeExtension(Extension):
- def __init__(self, name, sourcedir=""):
- Extension.__init__(self, name, sources=[])
- self.sourcedir = os.path.abspath(sourcedir)
-
-
-class CMakeBuild(build_ext):
-
- def build_extension(self, ext):
- extdir = os.path.abspath(os.path.dirname(self.get_ext_fullpath(ext.name)))
-
- # required for auto-detection of auxiliary "native" libs
- if not extdir.endswith(os.path.sep):
- extdir += os.path.sep
-
- cfg = "Debug" if self.debug else "Release"
-
- # CMake lets you override the generator - we need to check this.
- # Can be set with Conda-Build, for example.
- cmake_generator = os.environ.get("CMAKE_GENERATOR", "")
-
- # Set Python_EXECUTABLE instead if you use PYBIND11_FINDPYTHON
- # EXAMPLE_VERSION_INFO shows you how to pass a value into the C++ code
- # from Python.
- cmake_args = [
- f"-DCMAKE_LIBRARY_OUTPUT_DIRECTORY={extdir}",
- f"-DPYTHON_EXECUTABLE={sys.executable}",
- f"-DCMAKE_BUILD_TYPE={cfg}",
- f"-DWARNINGS_AS_ERRORS=OFF",
- f"-DENABLE_TESTING=OFF",
- f"-DMAKE_DOCS=OFF",
- ]
- build_args = []
-
- if self.compiler.compiler_type != "msvc":
- # Using Ninja-build since it a) is available as a wheel and b)
- # multithreads automatically. MSVC would require all variables be
- # exported for Ninja to pick it up, which is a little tricky to do.
- # Users can override the generator with CMAKE_GENERATOR in CMake
- # 3.15+.
- if not cmake_generator:
- cmake_args += ["-GNinja"]
-
- else:
-
- # Single config generators are handled "normally"
- single_config = any(x in cmake_generator for x in {"NMake", "Ninja"})
-
- # CMake allows an arch-in-generator style for backward compatibility
- contains_arch = any(x in cmake_generator for x in {"ARM", "Win64"})
-
- # Specify the arch if using MSVC generator, but only if it doesn't
- # contain a backward-compatibility arch spec already in the
- # generator name.
- if not single_config and not contains_arch:
- cmake_args += ["-A", PLAT_TO_CMAKE[self.plat_name]]
-
- # Multi-config generators have a different way to specify configs
- if not single_config:
- cmake_args += [
- "-DCMAKE_LIBRARY_OUTPUT_DIRECTORY_{}={}".format(cfg.upper(), extdir)
- ]
- build_args += ["--config", cfg]
-
- # Set CMAKE_BUILD_PARALLEL_LEVEL to control the parallel build level
- # across all generators.
- if "CMAKE_BUILD_PARALLEL_LEVEL" not in os.environ:
- # self.parallel is a Python 3 only way to set parallel jobs by hand
- # using -j in the build_ext call, not supported by pip or PyPA-build.
- if hasattr(self, "parallel") and self.parallel:
- # CMake 3.12+ only.
- build_args += ["-j{}".format(self.parallel)]
-
- if not os.path.exists(self.build_temp):
- os.makedirs(self.build_temp)
-
- subprocess.check_call(
- ["cmake", ext.sourcedir] + cmake_args, cwd=self.build_temp
- )
- subprocess.check_call(
- ["cmake", "--build", "."] + build_args, cwd=self.build_temp
- )
-
-
-with open('PyPI_README.md', encoding='utf-8') as f:
- long_description = f.read()
-
-with open('RELEASE_NOTES.md', encoding='utf-8') as f:
- release_notes = f.read()
-
-setup(
- name='arg-needle',
- version='1.0.3',
- author='PalamaraLab (https://palamaralab.github.io/)',
- url='https://github.com/PalamaraLab/arg_needle/',
- install_requires=[
- 'arg-needle-lib>=1.0.0',
- 'asmc-asmc>=1.3.1',
- 'fastcluster',
- 'msprime>=1.0.0',
- 'numpy>=1.17.0',
- 'pandas',
- 'psutil',
- 'tskit>=0.1.5',
- ],
- extras_require={
- 'dev': [
- 'pytest',
- ],
- },
- description='Ancestral recombination graph (ARG)',
- packages=['arg_needle', 'arg_needle.scripts'],
- long_description='\n'.join([long_description, release_notes]),
- long_description_content_type='text/markdown',
- ext_modules=[CMakeExtension('arg_needle')],
- cmdclass=dict(build_ext=CMakeBuild),
- entry_points = {
- 'console_scripts': [
- 'arg_needle=arg_needle.scripts.infer_args:main',
- 'arg_needle_multistep=arg_needle.scripts.infer_args_advanced:main',
- 'arg_needle_prepare_example=arg_needle.scripts.prepare_example:main',
- ],
- },
- include_package_data=True,
- package_data={'': ['resources/*']},
- zip_safe=False,
-)
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 5dc1c6c..7482dad 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -1,6 +1,6 @@
# This file is part of the ARG-Needle genealogical inference and
# analysis software suite.
-# Copyright (C) 2023 ARG-Needle Developers.
+# Copyright (C) 2023-2025 ARG-Needle Developers.
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
@@ -43,8 +43,8 @@ set_target_properties(arg_needle_hashing PROPERTIES PUBLIC_HEADER "${arg_needle_
target_link_libraries(arg_needle_hashing PRIVATE Boost::headers Boost::iostreams)
target_link_libraries(arg_needle_hashing PRIVATE project_warnings)
-if (PYTHON_BINDINGS)
- set_target_properties(arg_needle_hashing PROPERTIES POSITION_INDEPENDENT_CODE TRUE)
+if (ARG_NEEDLE_PYTHON_BINDINGS)
pybind11_add_module(arg_needle_hashing_pybind hashing/pybind.cpp)
target_link_libraries(arg_needle_hashing_pybind PRIVATE arg_needle_hashing)
+ install(TARGETS arg_needle_hashing_pybind LIBRARY DESTINATION arg_needle)
endif ()
diff --git a/src/__init__.py b/src/arg_needle/__init__.py
similarity index 95%
rename from src/__init__.py
rename to src/arg_needle/__init__.py
index d857496..258d285 100644
--- a/src/__init__.py
+++ b/src/arg_needle/__init__.py
@@ -1,6 +1,6 @@
# This file is part of the ARG-Needle genealogical inference and
# analysis software suite.
-# Copyright (C) 2023 ARG-Needle Developers.
+# Copyright (C) 2023-2025 ARG-Needle Developers.
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
diff --git a/src/decoders.py b/src/arg_needle/decoders.py
similarity index 98%
rename from src/decoders.py
rename to src/arg_needle/decoders.py
index 7c13680..3c1336e 100644
--- a/src/decoders.py
+++ b/src/arg_needle/decoders.py
@@ -1,6 +1,6 @@
# This file is part of the ARG-Needle genealogical inference and
# analysis software suite.
-# Copyright (C) 2023 ARG-Needle Developers.
+# Copyright (C) 2023-2025 ARG-Needle Developers.
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
@@ -30,7 +30,7 @@
# our packages
from asmc.asmc import DecodingParams, ASMC
-import arg_needle_hashing_pybind as arg_needle_hashing
+from .arg_needle_hashing_pybind import HapData
from .utils import btime
logging.basicConfig(
@@ -329,7 +329,7 @@ def make_asmc_decoder(
if use_hashing:
if verbose:
logging.info("Making HapData object")
- hasher = arg_needle_hashing.HapData(
+ hasher = HapData(
mode, haps_file_root, hash_word_size, mapfile, fill_sites=False)
logging.info("Hashing data is {} by {}".format(hasher.num_haps, hasher.num_sites))
@@ -337,7 +337,8 @@ def make_asmc_decoder(
     if use_hashing and backup_hash_word_size > 0:
         if verbose:
             logging.info("Making backup HapData object")
-        backup_hasher = arg_needle_hashing.HapData(
+        backup_hasher = HapData(
             mode, haps_file_root, backup_hash_word_size,
             map_file_path=mapfile, fill_sites=False)
-        logging.info("Backup hashing data is {} by {}".format(hasher.num_haps, hasher.num_sites))
+        # Report the backup hasher's own dimensions, not the primary hasher's.
+        logging.info("Backup hashing data is {} by {}".format(backup_hasher.num_haps, backup_hasher.num_sites))
diff --git a/src/inference.py b/src/arg_needle/inference.py
similarity index 99%
rename from src/inference.py
rename to src/arg_needle/inference.py
index ae2f7b5..abb56fe 100644
--- a/src/inference.py
+++ b/src/arg_needle/inference.py
@@ -1,6 +1,6 @@
# This file is part of the ARG-Needle genealogical inference and
# analysis software suite.
-# Copyright (C) 2023 ARG-Needle Developers.
+# Copyright (C) 2023-2025 ARG-Needle Developers.
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
diff --git a/src/resources/30-100-2000_CEU.decodingQuantities.gz b/src/arg_needle/resources/30-100-2000_CEU.decodingQuantities.gz
similarity index 100%
rename from src/resources/30-100-2000_CEU.decodingQuantities.gz
rename to src/arg_needle/resources/30-100-2000_CEU.decodingQuantities.gz
diff --git a/src/resources/CEU.demo b/src/arg_needle/resources/CEU.demo
similarity index 100%
rename from src/resources/CEU.demo
rename to src/arg_needle/resources/CEU.demo
diff --git a/src/resources/ukb_chr2_spectrum.tsv b/src/arg_needle/resources/ukb_chr2_spectrum.tsv
similarity index 100%
rename from src/resources/ukb_chr2_spectrum.tsv
rename to src/arg_needle/resources/ukb_chr2_spectrum.tsv
diff --git a/src/scripts/README.md b/src/arg_needle/scripts/README.md
similarity index 100%
rename from src/scripts/README.md
rename to src/arg_needle/scripts/README.md
diff --git a/src/scripts/__init__.py b/src/arg_needle/scripts/__init__.py
similarity index 93%
rename from src/scripts/__init__.py
rename to src/arg_needle/scripts/__init__.py
index 4783f5a..638e42b 100644
--- a/src/scripts/__init__.py
+++ b/src/arg_needle/scripts/__init__.py
@@ -1,6 +1,6 @@
# This file is part of the ARG-Needle genealogical inference and
# analysis software suite.
-# Copyright (C) 2023 ARG-Needle Developers.
+# Copyright (C) 2023-2025 ARG-Needle Developers.
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
diff --git a/src/scripts/infer_args.py b/src/arg_needle/scripts/infer_args.py
similarity index 98%
rename from src/scripts/infer_args.py
rename to src/arg_needle/scripts/infer_args.py
index 10908b8..6b61f0a 100644
--- a/src/scripts/infer_args.py
+++ b/src/arg_needle/scripts/infer_args.py
@@ -1,6 +1,6 @@
# This file is part of the ARG-Needle genealogical inference and
# analysis software suite.
-# Copyright (C) 2023 ARG-Needle Developers.
+# Copyright (C) 2023-2025 ARG-Needle Developers.
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
diff --git a/src/scripts/infer_args_advanced.py b/src/arg_needle/scripts/infer_args_advanced.py
similarity index 99%
rename from src/scripts/infer_args_advanced.py
rename to src/arg_needle/scripts/infer_args_advanced.py
index 6cb035b..5d73f20 100644
--- a/src/scripts/infer_args_advanced.py
+++ b/src/arg_needle/scripts/infer_args_advanced.py
@@ -1,6 +1,6 @@
# This file is part of the ARG-Needle genealogical inference and
# analysis software suite.
-# Copyright (C) 2023 ARG-Needle Developers.
+# Copyright (C) 2023-2025 ARG-Needle Developers.
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
diff --git a/src/scripts/prepare_example.py b/src/arg_needle/scripts/prepare_example.py
similarity index 99%
rename from src/scripts/prepare_example.py
rename to src/arg_needle/scripts/prepare_example.py
index bc02370..9a0ec90 100644
--- a/src/scripts/prepare_example.py
+++ b/src/arg_needle/scripts/prepare_example.py
@@ -1,6 +1,6 @@
# This file is part of the ARG-Needle genealogical inference and
# analysis software suite.
-# Copyright (C) 2023 ARG-Needle Developers.
+# Copyright (C) 2023-2025 ARG-Needle Developers.
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
diff --git a/src/simulator.py b/src/arg_needle/simulator.py
similarity index 98%
rename from src/simulator.py
rename to src/arg_needle/simulator.py
index 5c5f47f..6ff9a38 100644
--- a/src/simulator.py
+++ b/src/arg_needle/simulator.py
@@ -1,6 +1,6 @@
# This file is part of the ARG-Needle genealogical inference and
# analysis software suite.
-# Copyright (C) 2023 ARG-Needle Developers.
+# Copyright (C) 2023-2025 ARG-Needle Developers.
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
diff --git a/src/utils.py b/src/arg_needle/utils.py
similarity index 97%
rename from src/utils.py
rename to src/arg_needle/utils.py
index 102728c..30bd384 100644
--- a/src/utils.py
+++ b/src/arg_needle/utils.py
@@ -1,6 +1,6 @@
# This file is part of the ARG-Needle genealogical inference and
# analysis software suite.
-# Copyright (C) 2023 ARG-Needle Developers.
+# Copyright (C) 2023-2025 ARG-Needle Developers.
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
diff --git a/src/hashing/FileUtils.cpp b/src/hashing/FileUtils.cpp
index 0699994..aac8028 100644
--- a/src/hashing/FileUtils.cpp
+++ b/src/hashing/FileUtils.cpp
@@ -1,7 +1,7 @@
/*
This file is part of the ARG-Needle genealogical inference and
analysis software suite.
- Copyright (C) 2023 ARG-Needle Developers.
+ Copyright (C) 2023-2025 ARG-Needle Developers.
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
@@ -25,63 +25,73 @@
// The license file can be found at 3rd_party/Eagle/COPYING from the
// root of this repository.
-#include
-#include
-#include
-#include
-#include
-#include
-
#include "FileUtils.hpp"
#include
#include
-namespace FileUtils {
+#include
+#include
+#include
+#include
-using std::cerr;
-using std::endl;
-using std::string;
-using std::vector;
-
-bool fileExists(const std::string& name) {
- std::ifstream f(name.c_str());
- return f.good();
-}
-
-int AutoGzIfstream::lineCount(const std::string& file) {
- AutoGzIfstream fin;
- fin.openOrExit(file);
- int ctr = 0;
- string line;
- while (getline(fin, line))
- ctr++;
- return ctr;
-}
-
-void AutoGzIfstream::openOrExit(const std::string& file, std::ios_base::openmode mode) {
- fin.open(file.c_str(), mode);
- if (!fin) {
- cerr << "ERROR: Unable to open file: " << file << endl;
- exit(1);
- }
- if ((int) file.length() > 3 && file.substr(file.length() - 3) == ".gz")
- boost_in.push(boost::iostreams::gzip_decompressor());
- boost_in.push(fin);
-}
-
-void AutoGzIfstream::close() {
- fin.close();
- boost_in.reset();
-}
-
-AutoGzIfstream::operator bool() const {
- return !boost_in.fail();
-}
-
-AutoGzIfstream& getline(AutoGzIfstream& in, std::string& s) {
- std::getline(in.boost_in, s);
- return in;
-}
+namespace FileUtils {
+ struct AutoGzIfstream::Impl {
+ boost::iostreams::filtering_istream boost_in;
+ std::ifstream fin;
+ };
+
+ AutoGzIfstream::AutoGzIfstream() : pimpl(std::make_unique()) {
+ }
+
+ AutoGzIfstream::~AutoGzIfstream() noexcept = default;
+
+ bool fileExists(const std::filesystem::path &file) {
+ std::ifstream f(file.c_str());
+ return f.good();
+ }
+
+ int AutoGzIfstream::lineCount(const std::filesystem::path &file) {
+ AutoGzIfstream fin;
+ fin.openOrExit(file);
+ int ctr = 0;
+ std::string line;
+ while (getline(fin, line)) {
+ ctr++;
+ }
+ fin.close();
+ return ctr;
+ }
+
+ void AutoGzIfstream::openOrExit(const std::filesystem::path &file, std::ios_base::openmode mode) {
+ pimpl->fin.open(file.c_str(), mode);
+ if (!pimpl->fin) {
+ std::cerr << "ERROR: Unable to open file: " << file << std::endl;
+ exit(1);
+ }
+ if (file.extension() == ".gz") {
+ pimpl->boost_in.push(boost::iostreams::gzip_decompressor());
+ }
+ pimpl->boost_in.push(pimpl->fin);
+ }
+
+ void AutoGzIfstream::close() {
+ pimpl->fin.close();
+ pimpl->boost_in.reset();
+ }
+
+ AutoGzIfstream::operator bool() const noexcept {
+ return !pimpl->boost_in.fail();
+ }
+
+ AutoGzIfstream &getline(AutoGzIfstream &in, std::string &s) {
+ std::getline(in.pimpl->boost_in, s);
+ return in;
+ }
+
+ AutoGzIfstream &AutoGzIfstream::operator>>(std::string &x) {
+ pimpl->boost_in >> x;
+ return *this;
+ }
} // namespace FileUtils
diff --git a/src/hashing/FileUtils.hpp b/src/hashing/FileUtils.hpp
index 3f1d262..de1f004 100644
--- a/src/hashing/FileUtils.hpp
+++ b/src/hashing/FileUtils.hpp
@@ -1,7 +1,7 @@
/*
This file is part of the ARG-Needle genealogical inference and
analysis software suite.
- Copyright (C) 2023 ARG-Needle Developers.
+ Copyright (C) 2023-2025 ARG-Needle Developers.
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
@@ -25,39 +25,108 @@
// The license file can be found at 3rd_party/Eagle/COPYING from the
// root of this repository.
-#ifndef FILEUTILS_HPP
-#define FILEUTILS_HPP
+#ifndef ARG_NEEDLE_FILE_UTILS_HPP
+#define ARG_NEEDLE_FILE_UTILS_HPP
-#include
+#include
+#include
#include
-#include
-
-#include
namespace FileUtils {
-
-bool fileExists(const std::string& name);
-
-class AutoGzIfstream {
- boost::iostreams::filtering_istream boost_in;
- std::ifstream fin;
-
-public:
- static int lineCount(const std::string& file);
-
- void openOrExit(const std::string& file, std::ios_base::openmode mode = std::ios::in);
- void close();
- template AutoGzIfstream& operator>>(T& x) {
- boost_in >> x;
- return *this;
- }
-
- operator bool() const;
- friend AutoGzIfstream& getline(AutoGzIfstream& in, std::string& s);
-};
-
-AutoGzIfstream& getline(AutoGzIfstream& in, std::string& s);
-
+ /**
+ * @brief Check whether a given file exists by attempting to open it for reading.
+ *
+ * @param file Path to the file to check.
+ * @return true if the file could be opened for reading, false otherwise.
+ */
+ bool fileExists(const std::filesystem::path &file);
+
+ /**
+ * @class AutoGzIfstream
+ * @brief Stream wrapper that transparently reads either plain-text or gzip-compressed files.
+ *
+ * AutoGzIfstream detects whether an input file is compressed (.gz) and automatically
+ * opens it appropriately. It behaves similarly to std::ifstream but supports reading
+ * gzip-compressed streams without requiring explicit decompression by the caller.
+ *
+ * Internally uses a pimpl to hide implementation details and avoid exposing boost
+ * libraries at the interface level.
+ */
+ class AutoGzIfstream {
+ struct Impl;
+ std::unique_ptr pimpl;
+
+ public:
+ /**
+ * @brief Construct an unopened AutoGzIfstream.
+ */
+ AutoGzIfstream();
+
+ /**
+ * @brief Destructor closes the stream if open and releases internal resources.
+ */
+ ~AutoGzIfstream() noexcept;
+
+ /**
+ * @brief Count the number of lines in a file (supports gzipped and plain files).
+ *
+ * @param file Path to the file whose line count will be computed.
+ * @return Number of lines in the file.
+ */
+ [[nodiscard]] static int lineCount(const std::filesystem::path &file);
+
+ /**
+ * @brief Open a file for reading or exit the program if opening fails.
+ *
+ * Treats the file as gzip-compressed when its extension is ".gz".
+ *
+ * @param file Path to the file to open.
+ * @param mode Stream opening mode (defaults to std::ios::in).
+ */
+ void openOrExit(const std::filesystem::path &file,
+ std::ios_base::openmode mode = std::ios::in);
+
+ /**
+ * @brief Close the underlying stream.
+ */
+ void close();
+
+ /**
+ * @brief Read whitespace-delimited input into a string via the extraction operator.
+ *
+ * @param x Output string that will receive the parsed token.
+ * @return Reference to this stream.
+ */
+ AutoGzIfstream &operator>>(std::string &x);
+
+ /**
+ * @brief Boolean conversion indicating whether the stream is currently valid.
+ *
+ * Allows usage in conditions such as:
+ * @code
+ * if (stream) { ... }
+ * @endcode
+ *
+ * @return true if the underlying stream has not failed (neither failbit nor badbit set), false otherwise.
+ */
+ [[nodiscard]] explicit operator bool() const noexcept;
+
+ /**
+ * @brief Friend declaration enabling getline(AutoGzIfstream&, ...).
+ */
+ friend AutoGzIfstream &getline(AutoGzIfstream &in, std::string &s);
+ };
+
+ /**
+ * @brief Read a full line from an AutoGzIfstream into a string.
+ *
+ * Supports both compressed and uncompressed input sources.
+ *
+ * @param in Stream to read from.
+ * @param s Output string receiving the line (without delimiter).
+ * @return Reference to the stream.
+ */
+ AutoGzIfstream &getline(AutoGzIfstream &in, std::string &s);
} // namespace FileUtils
-#endif
+#endif // ARG_NEEDLE_FILE_UTILS_HPP
diff --git a/src/hashing/HapData.cpp b/src/hashing/HapData.cpp
index 4be4e90..ceb7a07 100644
--- a/src/hashing/HapData.cpp
+++ b/src/hashing/HapData.cpp
@@ -1,7 +1,7 @@
/*
This file is part of the ARG-Needle genealogical inference and
analysis software suite.
- Copyright (C) 2023 ARG-Needle Developers.
+ Copyright (C) 2023-2025 ARG-Needle Developers.
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
@@ -33,18 +33,8 @@
#include "HapData.hpp"
#include "utils.hpp"
-using std::cerr;
-using std::cout;
-using std::deque;
-using std::endl;
-using std::ostream;
-using std::pair;
-using std::string;
-using std::tuple;
-using std::unordered_map;
-using std::unordered_set;
-
-HapData::HapData(string mode, string file_root_path, unsigned int _word_size, string map_file_path,
+
+HapData::HapData(std::string mode, std::string file_root_path, unsigned int _word_size, std::string map_file_path,
bool fill_sites)
: word_size(_word_size) {
if (mode == "sequence") {
@@ -54,21 +44,21 @@ HapData::HapData(string mode, string file_root_path, unsigned int _word_size, st
data_mode = HapDataMode::array;
}
else {
- throw std::logic_error(THROW_LINE("Mode not recognized."));
+ throw std::logic_error(MAKE_ERROR("Mode not recognized."));
}
if (sizeof(word_type) != 8) {
- throw std::logic_error(THROW_LINE("Expected word_type to be 8 bytes (64 bits)."));
+ throw std::logic_error(MAKE_ERROR("Expected word_type to be 8 bytes (64 bits)."));
}
if (sizeof(1ull) < 8) {
throw std::logic_error(
- THROW_LINE("Expected unsigned long long to be at least 8 bytes (64 bits)."));
+ MAKE_ERROR("Expected unsigned long long to be at least 8 bytes (64 bits)."));
}
if (word_size > 64 || word_size <= 0) {
- throw std::logic_error(THROW_LINE("Out of bounds word size."));
+ throw std::logic_error(MAKE_ERROR("Out of bounds word size."));
}
- string line;
+ std::string line;
std::stringstream ss;
// read in .sample[s] file
@@ -80,14 +70,14 @@ HapData::HapData(string mode, string file_root_path, unsigned int _word_size, st
file_samples.openOrExit(file_root_path + ".sample");
}
else {
- cerr << "ERROR. Could not find sample file in " + file_root_path + ".sample[s]" << endl;
+ std::cerr << "ERROR. Could not find sample file in " + file_root_path + ".sample[s]" << std::endl;
exit(1);
}
while (getline(file_samples, line)) {
- vector splitStr;
+ std::vector splitStr;
std::istringstream iss(line);
- string buf;
+ std::string buf;
while (iss >> buf)
splitStr.push_back(buf);
@@ -105,14 +95,13 @@ HapData::HapData(string mode, string file_root_path, unsigned int _word_size, st
// Parse .map[.gz] file
FileUtils::AutoGzIfstream file_map;
- if (map_file_path != "") {
+ if (!map_file_path.empty()) {
// Attempt to read in .map[.gz] file
if (FileUtils::fileExists(map_file_path)) {
file_map.openOrExit(map_file_path);
- // cout << "Using genetic map " << map_file_path << endl;
}
else {
- cerr << "ERROR. Could not open map file " + map_file_path + ", no such file" << endl;
+ std::cerr << "ERROR. Could not open map file " + map_file_path + ", no such file" << std::endl;
exit(1);
}
}
@@ -120,20 +109,18 @@ HapData::HapData(string mode, string file_root_path, unsigned int _word_size, st
// If no map file is specified, default to file_root_path.map[.gz]
if (FileUtils::fileExists(file_root_path + ".map.gz")) {
file_map.openOrExit(file_root_path + ".map.gz");
- // cout << "Using genetic map " << file_root_path << ".map.gz" << endl;
}
else if (FileUtils::fileExists(file_root_path + ".map")) {
file_map.openOrExit(file_root_path + ".map");
- // cout << "Using genetic map " << file_root_path << ".map" << endl;
}
else {
- cerr << "ERROR. Could not find map file in " + file_root_path + ".map.gz or " +
+ std::cerr << "ERROR. Could not find map file in " + file_root_path + ".map.gz or " +
file_root_path + ".map"
- << endl;
+ << std::endl;
exit(1);
}
}
- string map_field[4];
+ std::string map_field[4];
while (getline(file_map, line)) {
ss.clear();
ss.str(line);
@@ -159,27 +146,28 @@ HapData::HapData(string mode, string file_root_path, unsigned int _word_size, st
file_hap.openOrExit(file_root_path + ".haps");
}
else {
- cerr << "ERROR. Could not find hap file in " + file_root_path + ".hap.gz, " + file_root_path +
+ std::cerr << "ERROR. Could not find hap file in " + file_root_path + ".hap.gz, " + file_root_path +
".hap, " + ".haps.gz, or " + file_root_path + ".haps"
- << endl;
+ << std::endl;
exit(1);
}
if (fill_sites) {
- sites = vector>(num_haps, vector());
+ sites = std::vector>(num_haps, std::vector());
}
- words = vector>(num_haps, vector());
- string marker_id;
+ words = std::vector>(num_haps, std::vector());
+ std::string marker_id;
unsigned long int marker_pos;
char al[2], inp;
- int site_id = 0;
+ unsigned int site_id = 0u;
while (getline(file_hap, line)) {
// read the meta data
ss.clear();
ss.str(line);
ss >> map_field[0] >> marker_id >> marker_pos >> al[0] >> al[1];
- if (map_field[0] == "")
+ if (map_field[0].empty()) {
continue;
+ }
if (site_id % word_size == 0) {
for (size_t hap_id = 0; hap_id < num_haps; ++hap_id) {
@@ -212,9 +200,9 @@ HapData::HapData(string mode, string file_root_path, unsigned int _word_size, st
}
}
}
- float maf = (float) maf_ctr / num_haps;
- if (maf > 0.5) {
- maf = 1 - maf;
+ float maf = static_cast(maf_ctr) / static_cast(num_haps);
+ if (maf > 0.5f) {
+ maf = 1.f - maf;
}
site_mafs.push_back(maf);
++site_id;
@@ -222,27 +210,21 @@ HapData::HapData(string mode, string file_root_path, unsigned int _word_size, st
file_hap.close();
}
-HapData::~HapData() {
-#ifdef _DEBUG
- cout << "Deleting: " << *this << endl;
-#endif // _DEBUG
-}
-
void HapData::add_to_hash(size_t hap_id) {
if (hashed_hap_ids.find(hap_id) != hashed_hap_ids.end()) {
- throw std::logic_error(THROW_LINE("This haplotype has already been hashed."));
+ throw std::logic_error(MAKE_ERROR("This haplotype has already been hashed."));
}
if (hap_id >= num_haps) {
- throw std::logic_error(THROW_LINE("Haplotype ID out of bounds."));
+ throw std::logic_error(MAKE_ERROR("Haplotype ID out of bounds."));
}
if (hashes.empty()) {
- hashes = vector>>(
- words[hap_id].size(), unordered_map>());
+ hashes = std::vector>>(
+ words[hap_id].size(), std::unordered_map>());
}
for (size_t i = 0; i < words[hap_id].size(); ++i) {
- vector& hash_value =
+ std::vector& hash_value =
hashes[i][words[hap_id][i]]; // creates if not present, only hashes once
hash_value.push_back(hap_id);
}
@@ -252,23 +234,23 @@ void HapData::add_to_hash(size_t hap_id) {
void HapData::print_hap(size_t hap_id) {
if (hap_id >= num_haps) {
- throw std::logic_error(THROW_LINE("Haplotype ID out of bounds."));
+ throw std::logic_error(MAKE_ERROR("Haplotype ID out of bounds."));
}
- cout << "Bits for hap_id = " << hap_id << endl;
+ std::cout << "Bits for hap_id = " << hap_id << std::endl;
for (size_t site_id = 0; site_id < num_sites; ++site_id) {
- cout << sites[hap_id][site_id];
+ std::cout << sites[hap_id][site_id];
if (site_id % word_size == word_size - 1) {
- cout << " ";
+ std::cout << " ";
}
}
- cout << endl;
+ std::cout << std::endl;
- cout << "Words (hex) for hap_id = " << hap_id << endl;
+ std::cout << "Words (hex) for hap_id = " << hap_id << std::endl;
std::cout << std::hex << std::showbase;
for (auto const& word : words[hap_id]) {
- cout << word << " ";
+ std::cout << word << " ";
}
- cout << endl;
+ std::cout << std::endl;
std::cout << std::dec << std::noshowbase;
// cout << "Words (decimal)" << endl;
@@ -280,62 +262,62 @@ void HapData::print_hap(size_t hap_id) {
void HapData::print_hashes() {
for (size_t i = 0; i < hashes.size(); ++i) {
- cout << "Hash for word " << i << " of " << hashes.size() << endl;
+ std::cout << "Hash for word " << i << " of " << hashes.size() << std::endl;
for (auto const& map_entry : hashes[i]) {
- unsigned int num_bits = word_size;
+ unsigned long num_bits = word_size;
if (i == hashes.size() - 1) {
- num_bits = ((num_sites - 1) % word_size) + 1;
+ num_bits = ((num_sites - 1ul) % word_size) + 1ul;
}
for (size_t j = 0; j < num_bits; ++j) {
- cout << ((map_entry.first >> j) & 1);
+ std::cout << ((map_entry.first >> j) & 1);
}
- cout << ":";
+ std::cout << ":";
for (const size_t id : map_entry.second) {
- cout << " " << id;
+ std::cout << " " << id;
}
- cout << endl;
+ std::cout << std::endl;
}
- cout << endl;
+ std::cout << std::endl;
}
}
void HapData::print_word_match_diagram(size_t hap_id1, size_t hap_id2) {
if (hap_id1 >= num_haps || hap_id2 >= num_haps) {
- throw std::logic_error(THROW_LINE("Haplotype ID out of bounds."));
+ throw std::logic_error(MAKE_ERROR("Haplotype ID out of bounds."));
}
for (size_t i = 0; i < words[hap_id1].size(); ++i) {
if (i != 0) {
if (i % 100 == 0) {
- cout << endl;
+ std::cout << std::endl;
}
if (i % 25 == 0) {
- cout << endl;
+ std::cout << std::endl;
}
else if (i % 5 == 0) {
- cout << " ";
+ std::cout << " ";
}
}
if (words[hap_id1][i] == words[hap_id2][i]) {
- cout << "x";
+ std::cout << "x";
}
else {
- cout << "_";
+ std::cout << "_";
}
}
- cout << endl;
+ std::cout << std::endl;
}
-vector>>>
+std::vector>>>
HapData::get_closest_cousins(size_t hap_id, unsigned int k, unsigned int tolerance,
double window_size_genetic) {
// find the windows
- vector windows; // Window defined in HapData.hpp
+ std::vector windows; // Window defined in HapData.hpp
size_t num_words = words[hap_id].size();
if (window_size_genetic <= 0) {
// make a new window for each and every word
for (size_t j = 0; j < num_words; ++j) {
- Window w;
+ Window w{};
w.start = j;
w.end = j + 1;
w.index = j;
@@ -353,7 +335,7 @@ HapData::get_closest_cousins(size_t hap_id, unsigned int k, unsigned int toleran
(genetic_positions[last_word_site] - start_genetic >= window_size_genetic &&
genetic_positions[num_sites - 1] - genetic_positions[last_word_site + 1] >=
window_size_genetic)) {
- Window w;
+ Window w{};
w.start = start_word;
w.end = j + 1;
w.index = window_index;
@@ -367,7 +349,7 @@ HapData::get_closest_cousins(size_t hap_id, unsigned int k, unsigned int toleran
}
}
- vector words_to_windows;
+ std::vector words_to_windows;
for (size_t i = 0; i < windows.size(); ++i) {
Window w = windows[i];
for (size_t j = w.start; j < w.end; ++j) {
@@ -377,25 +359,24 @@ HapData::get_closest_cousins(size_t hap_id, unsigned int k, unsigned int toleran
// how high each sample scores in each window
// we only record samples that have matched
- vector> window_scores(
- windows.size(), unordered_map());
+ std::vector> window_scores(
+ windows.size(), std::unordered_map());
// stretches of matching material separated by 2*k + 1 fillers, where k is the number
// of mismatches, max size defined by 2*tolerance + 1
- vector>> stretches(hap_id, deque>());
+ std::vector>> stretches(hap_id, std::deque>());
// size_t num_overall_matches = 0;
for (size_t i = 0; i < num_words; ++i) {
// in some cases, the word does not yet exist in the hashmap
if (hashes[i].find(words[hap_id][i]) != hashes[i].end()) {
- const vector& hash_value = hashes[i].find(words[hap_id][i])->second;
- // num_overall_matches += hash_value.size();
- for (auto v : hash_value) {
+ const std::vector& matches = hashes[i].find(words[hap_id][i])->second;
+ for (auto v : matches) {
// check the end of stretches to figure out what to do
- if (stretches[v].size() == 0) {
+ if (stretches[v].empty()) {
stretches[v].emplace_back(i, i + 1); // end is exclusive
}
else {
- pair& back_pair = stretches[v].back();
+ std::pair& back_pair = stretches[v].back();
if (back_pair.second == i) {
back_pair.second = i + 1; // end is exclusive
}
@@ -419,7 +400,7 @@ HapData::get_closest_cousins(size_t hap_id, unsigned int k, unsigned int toleran
// pop_front to get to size 2*tolerance + 1
while (stretches[v].size() > 2 * tolerance + 1) {
- pair& item = stretches[v].front();
+ std::pair& item = stretches[v].front();
if (item.second != 0) {
size_t range_start = item.first;
// old version was buggy
@@ -439,10 +420,10 @@ HapData::get_closest_cousins(size_t hap_id, unsigned int k, unsigned int toleran
// if our range is [6, 16), we want [5, 10) to [15, 20) inclusive
for (size_t window_index = words_to_windows[range_start];
window_index <= words_to_windows[range_end - 1]; ++window_index) {
- size_t& hash_value =
+ size_t& best_len =
window_scores[window_index][v]; // creates if not present, only hashes once
- if (range_size > hash_value) {
- hash_value = range_size;
+ if (range_size > best_len) {
+ best_len = range_size;
}
}
}
@@ -454,8 +435,8 @@ HapData::get_closest_cousins(size_t hap_id, unsigned int k, unsigned int toleran
// go over all the stretches and pop_front
for (size_t v = 0; v < hap_id; ++v) {
- while (stretches[v].size() > 0) {
- pair item = stretches[v].front();
+ while (!stretches[v].empty()) {
+ std::pair item = stretches[v].front();
if (item.second != 0) {
size_t range_start = item.first;
// old version is buggy in general, but should work in this case
@@ -487,28 +468,27 @@ HapData::get_closest_cousins(size_t hap_id, unsigned int k, unsigned int toleran
}
// take the values in window_scores and sort to find top k
- vector>>> results;
+ std::vector>>> results;
for (const Window& w : windows) {
size_t window_start_site = w.start * word_size;
size_t window_end_site = std::min(w.end * word_size - 1, num_sites - 1);
- vector> stats;
+ std::vector> stats;
for (const auto& map_entry : window_scores[w.index]) {
- size_t hap_id = map_entry.first;
- double score = (double) map_entry.second;
- stats.emplace_back(score, hap_id);
+ size_t map_entry_hap_id = map_entry.first;
+ auto score = static_cast(map_entry.second);
+ stats.emplace_back(score, map_entry_hap_id);
}
size_t actual_k = std::min(k, stats.size());
// use this if we want sorted
std::partial_sort(
- stats.begin(), stats.begin() + actual_k, stats.end(), std::greater>());
+ stats.begin(), stats.begin() + static_cast(actual_k), stats.end(), std::greater>());
// use this if we don't care about sorted
// std::nth_element(stats.begin(), stats.begin() + actual_k, stats.end(),
// std::greater>());
// append to results
- results.push_back(
- std::make_tuple(window_start_site, window_end_site, vector>()));
+ results.emplace_back(window_start_site, window_end_site, std::vector>());
for (size_t stats_idx = 0; stats_idx < actual_k; ++stats_idx) {
std::get<2>(results[results.size() - 1])
.emplace_back(stats[stats_idx].second, stats[stats_idx].first);
@@ -518,7 +498,7 @@ HapData::get_closest_cousins(size_t hap_id, unsigned int k, unsigned int toleran
return results;
}
-ostream& operator<<(ostream& os, const HapData& data) {
+std::ostream& operator<<(std::ostream& os, const HapData& data) {
os << "HapData with " << data.num_haps << " haplotypes and " << data.num_sites;
os << " sites, word size = " << data.word_size << " bits";
return os;
diff --git a/src/hashing/HapData.hpp b/src/hashing/HapData.hpp
index 17d47f4..671bc44 100644
--- a/src/hashing/HapData.hpp
+++ b/src/hashing/HapData.hpp
@@ -1,7 +1,7 @@
/*
This file is part of the ARG-Needle genealogical inference and
analysis software suite.
- Copyright (C) 2023 ARG-Needle Developers.
+ Copyright (C) 2023-2025 ARG-Needle Developers.
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
@@ -17,11 +17,10 @@
along with this program. If not, see <https://www.gnu.org/licenses/>.
*/
-#ifndef __HAP_DATA_HPP_
-#define __HAP_DATA_HPP_
+#ifndef ARG_NEELE_HAP_DATA_HPP
+#define ARG_NEELE_HAP_DATA_HPP
-#include
-#include
+#include
#include
#include
#include
@@ -29,14 +28,6 @@
#include
#include
-using std::ostream;
-using std::pair;
-using std::string;
-using std::tuple;
-using std::unordered_map;
-using std::unordered_set;
-using std::vector;
-
struct Window {
size_t start, end, index; // end is inclusive
friend bool operator<(const Window& a, const Window& b) {
@@ -55,31 +46,31 @@ class HapData {
public:
typedef uint64_t word_type;
- unsigned int num_haps = 0;
- unsigned int num_sites = 0;
+ unsigned long num_haps = 0ul;
+ unsigned long num_sites = 0ul;
unsigned int word_size;
HapDataMode data_mode;
- vector physical_positions;
- vector genetic_positions;
- vector site_mafs;
- vector sample_names;
- vector> sites;
- vector> words;
+ std::vector physical_positions;
+ std::vector genetic_positions;
+ std::vector site_mafs;
+ std::vector sample_names;
+ std::vector> sites;
+ std::vector> words;
- vector>> hashes;
- unordered_set hashed_hap_ids;
+ std::vector>> hashes;
+ std::unordered_set hashed_hap_ids;
- HapData(string mode, string file_root_path, unsigned int _word_size = 64,
- string map_file_path = "", bool fill_sites = true);
- ~HapData();
+ HapData(std::string mode, std::string file_root_path, unsigned int _word_size = 64,
+ std::string map_file_path = "", bool fill_sites = true);
+ ~HapData() = default;
void add_to_hash(size_t hap_id);
- vector>>>
+ std::vector>>>
get_closest_cousins(size_t hap_id, unsigned int k, unsigned int tolerance = 0,
double window_size_genetic = 0);
void print_hap(size_t hap_id);
void print_hashes();
void print_word_match_diagram(size_t hap_id1, size_t hap_id2);
- friend ostream& operator<<(ostream& os, const HapData& data);
+ friend std::ostream& operator<<(std::ostream& os, const HapData& data);
};
-#endif // __HAP_DATA_HPP_
+#endif // ARG_NEELE_HAP_DATA_HPP
diff --git a/src/hashing/pybind.cpp b/src/hashing/pybind.cpp
index 7e77bde..b8eda37 100644
--- a/src/hashing/pybind.cpp
+++ b/src/hashing/pybind.cpp
@@ -1,7 +1,7 @@
/*
This file is part of the ARG-Needle genealogical inference and
analysis software suite.
- Copyright (C) 2023 ARG-Needle Developers.
+ Copyright (C) 2023-2025 ARG-Needle Developers.
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
@@ -56,7 +56,7 @@ PYBIND11_MODULE(arg_needle_hashing_pybind, m) {
"physical_position_at",
[](const HapData& data, size_t site) {
if (site >= data.num_sites) {
- throw std::logic_error(THROW_LINE("Out of bounds site."));
+ throw std::logic_error(MAKE_ERROR("Out of bounds site."));
}
return data.physical_positions[site];
},
@@ -65,7 +65,7 @@ PYBIND11_MODULE(arg_needle_hashing_pybind, m) {
"genetic_position_at",
[](const HapData& data, size_t site) {
if (site >= data.num_sites) {
- throw std::logic_error(THROW_LINE("Out of bounds site."));
+ throw std::logic_error(MAKE_ERROR("Out of bounds site."));
}
return data.genetic_positions[site];
},
@@ -74,7 +74,7 @@ PYBIND11_MODULE(arg_needle_hashing_pybind, m) {
"site_maf_at",
[](const HapData& data, size_t site) {
if (site >= data.num_sites) {
- throw std::logic_error(THROW_LINE("Out of bounds site."));
+ throw std::logic_error(MAKE_ERROR("Out of bounds site."));
}
return data.site_mafs[site];
},
@@ -83,7 +83,7 @@ PYBIND11_MODULE(arg_needle_hashing_pybind, m) {
"sample_name",
[](const HapData& data, size_t hap_id) {
if (hap_id >= data.num_haps) {
- throw std::logic_error(THROW_LINE("Out of bounds hap_id."));
+ throw std::logic_error(MAKE_ERROR("Out of bounds hap_id."));
}
return data.sample_names[hap_id];
},
diff --git a/src/hashing/utils.hpp b/src/hashing/utils.hpp
index 0fb16d7..85619b5 100644
--- a/src/hashing/utils.hpp
+++ b/src/hashing/utils.hpp
@@ -1,7 +1,7 @@
/*
This file is part of the ARG-Needle genealogical inference and
analysis software suite.
- Copyright (C) 2023 ARG-Needle Developers.
+ Copyright (C) 2023-2025 ARG-Needle Developers.
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
@@ -17,14 +17,15 @@
along with this program. If not, see <https://www.gnu.org/licenses/>.
*/
-#ifndef __UTILS_HPP_
-#define __UTILS_HPP_
+#ifndef ARG_NEEDLE_UTILS_HPP
+#define ARG_NEEDLE_UTILS_HPP
#include
-using std::string;
+inline std::string make_error(const std::string &msg, const char *file, const int line) noexcept {
+ return std::string(file) + ":" + std::to_string(line) + ": " + msg;
+}
-// Utility for exceptions
-#define THROW_LINE(a) (string(__FILE__) + ":" + std::to_string(__LINE__) + ": " + a)
+#define MAKE_ERROR(msg) (make_error((msg), __FILE__, __LINE__))
-#endif // __UTILS_HPP_
+#endif // ARG_NEEDLE_UTILS_HPP
diff --git a/test/cpp/CMakeLists.txt b/test/cpp/CMakeLists.txt
new file mode 100644
index 0000000..148cf22
--- /dev/null
+++ b/test/cpp/CMakeLists.txt
@@ -0,0 +1,33 @@
+# This file is part of the ARG-Needle genealogical inference and
+# analysis software suite.
+# Copyright (C) 2023-2025 ARG-Needle Developers.
+
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <https://www.gnu.org/licenses/>.
+
+set(
+ test_files
+ test_file_utils.cpp
+ test_utils.cpp
+)
+
+add_executable(cpp_tests ${test_files})
+target_link_libraries(cpp_tests PRIVATE arg_needle_hashing Catch2::Catch2WithMain)
+
+catch_discover_tests(cpp_tests)
+
+set(ARG_NEEDLE_TEST_DIR "${CMAKE_CURRENT_SOURCE_DIR}")
+set(ARG_NEEDLE_RESOURCES_DIR "${CMAKE_SOURCE_DIR}/src/arg_needle/resources")
+
+add_definitions(-DARG_NEEDLE_TEST_DIR=\"${ARG_NEEDLE_TEST_DIR}\")
+add_definitions(-DARG_NEEDLE_RESOURCES_DIR=\"${ARG_NEEDLE_RESOURCES_DIR}\")
diff --git a/test/cpp/test_file_utils.cpp b/test/cpp/test_file_utils.cpp
new file mode 100644
index 0000000..0a846e9
--- /dev/null
+++ b/test/cpp/test_file_utils.cpp
@@ -0,0 +1,59 @@
+/*
+ This file is part of the ARG-Needle genealogical inference and
+ analysis software suite.
+ Copyright (C) 2023-2025 ARG-Needle Developers.
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <https://www.gnu.org/licenses/>.
+*/
+
+#include
+
+#include "FileUtils.hpp"
+
+
+TEST_CASE("FileUtils::fileExists", "[test_file_utils]")
+{
+  // One path expected to be present in the test directory and one that is not.
+  const char *const present_path = ARG_NEEDLE_TEST_DIR "/CMakeLists.txt";
+  const char *const absent_path = ARG_NEEDLE_TEST_DIR "/file_that_does_not_exist";
+
+  REQUIRE(FileUtils::fileExists(present_path) == true);
+  REQUIRE(FileUtils::fileExists(absent_path) == false);
+}
+
+TEST_CASE("FileUtils::AutoGzIfstream", "[test_file_utils]")
+{
+  // Gzipped resource file used by every section of this test case.
+  const char *const decoding_quantities =
+      ARG_NEEDLE_RESOURCES_DIR "/30-100-2000_CEU.decodingQuantities.gz";
+
+  SECTION("open and close gz file")
+  {
+    REQUIRE(FileUtils::fileExists(decoding_quantities) == true);
+
+    FileUtils::AutoGzIfstream stream;
+    stream.openOrExit(decoding_quantities);
+    stream.close();
+  }
+
+  SECTION("count lines in file")
+  {
+    REQUIRE(FileUtils::fileExists(decoding_quantities) == true);
+    REQUIRE(FileUtils::AutoGzIfstream::lineCount(decoding_quantities) == 35245);
+  }
+
+  SECTION("extract line from a file")
+  {
+    REQUIRE(FileUtils::fileExists(decoding_quantities) == true);
+
+    FileUtils::AutoGzIfstream stream;
+    stream.openOrExit(decoding_quantities);
+
+    // Read only the first line, then release the file before asserting on it.
+    std::string header;
+    FileUtils::getline(stream, header);
+    stream.close();
+
+    REQUIRE(header == "TransitionType");
+  }
+}
\ No newline at end of file
diff --git a/test/cpp/test_utils.cpp b/test/cpp/test_utils.cpp
new file mode 100644
index 0000000..db78b30
--- /dev/null
+++ b/test/cpp/test_utils.cpp
@@ -0,0 +1,41 @@
+/*
+ This file is part of the ARG-Needle genealogical inference and
+ analysis software suite.
+ Copyright (C) 2023-2025 ARG-Needle Developers.
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <https://www.gnu.org/licenses/>.
+*/
+
+#include
+#include
+
+#include "HapData.hpp"
+#include "utils.hpp"
+
+using Catch::Matchers::ContainsSubstring;
+
+// Helper that always throws a std::logic_error whose message is produced by
+// MAKE_ERROR from inside this translation unit.
+void test_throw() {
+  const std::string message = MAKE_ERROR("Something went wrong");
+  throw std::logic_error(message);
+}
+
+
+TEST_CASE("make_error", "[utils]")
+{
+  // Each matcher is kept as a named object and combined only inside the
+  // assertion, so the combined matcher never outlives its operands.
+  const auto from_test_file = ContainsSubstring("test_utils.cpp:");
+  const auto has_test_message = ContainsSubstring("Something went wrong");
+  REQUIRE_THROWS_WITH(test_throw(), from_test_file && has_test_message);
+
+  // Constructing HapData with these arguments is expected to throw an error
+  // that names HapData.cpp and reports an unrecognized mode.
+  const auto from_hap_data = ContainsSubstring("HapData.cpp:");
+  const auto has_mode_message = ContainsSubstring("Mode not recognized");
+  REQUIRE_THROWS_WITH(HapData("banana", ""), from_hap_data && has_mode_message);
+}
+
diff --git a/test/test_import.py b/test/test_import.py
index b73d831..5c0bc44 100644
--- a/test/test_import.py
+++ b/test/test_import.py
@@ -1,6 +1,6 @@
# This file is part of the ARG-Needle genealogical inference and
# analysis software suite.
-# Copyright (C) 2023 ARG-Needle Developers.
+# Copyright (C) 2023-2025 ARG-Needle Developers.
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
@@ -20,4 +20,6 @@
add_default_arg_building_arguments, normalize_arg, trim_arg
)
-print("Successfully imported all arg_needle components")
+def test_import():
+    """Trivially passing test; reaching it means the module-level imports worked."""
+    print("Successfully imported all arg_needle components")
+    assert True
diff --git a/test/test_regression.py b/test/test_regression.py
new file mode 100644
index 0000000..3c8f2f8
--- /dev/null
+++ b/test/test_regression.py
@@ -0,0 +1,99 @@
+# This file is part of the ARG-Needle genealogical inference and
+# analysis software suite.
+# Copyright (C) 2023-2025 ARG-Needle Developers.
+
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <https://www.gnu.org/licenses/>.
+
+
+# This test assumes you have installed the dev dependencies of arg-needle.
+# In the root of this repository, run:
+#
+# pip install .[dev]
+
+import subprocess
+import tempfile
+import pathlib
+import h5py
+import numpy as np
+import shutil
+
+# The "data" directory that sits next to this test file.
+DATA_DIR = pathlib.Path(__file__).resolve().parent.joinpath("data")
+
+# Attribute names expected on the generated file, listed in sorted order.
+ARGN_KEYS = [
+    'arg_file_version',
+    'chromosome',
+    'datetime_created',
+    'end',
+    'mutations',
+    'node_bounds',
+    'num_edges',
+    'num_mutations',
+    'num_nodes',
+    'offset',
+    'start',
+    'threaded_samples',
+]
+
+def check_attr_keys(attrs):
+    """
+    Check if the keys are as expected in generated HDF5 file.
+
+    Both the actual and the expected keys are sorted before they are compared,
+    so the result does not depend on the order in which ARGN_KEYS is written.
+    On a mismatch the expected and actual key lists are printed.
+
+    :param attrs: attribute collection exposing ``keys()``
+    :return: True when the key lists match, False otherwise
+    """
+    keys_in_generated_file = sorted(str(x) for x in attrs.keys())
+
+    if keys_in_generated_file != sorted(ARGN_KEYS):
+        print(f"Expected the following keys:\n{ARGN_KEYS}\n but got:\n{keys_in_generated_file}")
+        return False
+
+    return True
+
+def check_attr_vals(attrs):
+    """
+    Check attr values are correct.
+
+    Deterministic attributes must equal fixed values; the node and edge counts
+    only need to lie strictly within three standard deviations of their
+    empirically measured means.
+
+    :param attrs: attribute collection indexable by attribute name
+    :return: True if every check passes
+    :raises AssertionError: if any attribute is outside its expected value/range
+    """
+
+    # Deterministic values, checked in a fixed order.
+    for key, expected in (("arg_file_version", 2), ("chromosome", 1)):
+        assert attrs[key] == expected
+
+    for key, expected in (("start", 0.0), ("end", 2000079.0)):
+        assert np.isclose(attrs[key], expected, rtol=1e-8)
+
+    for key, expected in (
+        ("mutations", False),
+        ("node_bounds", True),
+        ("offset", 10001457),
+        ("threaded_samples", 400),
+    ):
+        assert attrs[key] == expected
+
+    # These values were calculated by running the example about 100 times
+    nodes_mean = 17203.69792
+    nodes_std = 127.8394651
+    edges_mean = 93654.98958
+    edges_std = 591.7562353
+
+    # This should almost never fail
+    assert nodes_mean - 3.0 * nodes_std < attrs["num_nodes"] < nodes_mean + 3.0 * nodes_std
+    assert edges_mean - 3.0 * edges_std < attrs["num_edges"] < edges_mean + 3.0 * edges_std
+
+    return True
+
+
+def test_script_output():
+    """
+    Run the example scripts in a scratch directory and validate the attributes
+    of the resulting ``example.argn`` file.
+    """
+    commands = (
+        ["prepare_example"],
+        ["infer_args", "--normalize", "0"],
+    )
+
+    with tempfile.TemporaryDirectory() as scratch:
+        workdir = pathlib.Path(scratch)
+
+        # Run scripts inside temporary directory; a non-zero exit status raises.
+        for command in commands:
+            subprocess.run(command, cwd=workdir, check=True)
+
+        # Check output file exists
+        argn_path = workdir / "example.argn"
+        assert argn_path.exists()
+
+        # Compare to frozen output
+        with h5py.File(argn_path, "r") as arg_file:
+            file_attrs = arg_file.attrs
+            assert check_attr_keys(file_attrs)
+            assert check_attr_vals(file_attrs)
+
+
+if __name__ == '__main__':
+    test_script_output()