diff --git a/.github/ISSUE_TEMPLATE/bug_report.yml b/.github/ISSUE_TEMPLATE/bug_report.yml new file mode 100644 index 0000000..a1bbad5 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/bug_report.yml @@ -0,0 +1,62 @@ +name: Bug Report +description: File a bug report +title: "[Bug]: " +labels: ["bug"] +assignees: + - octocat +body: + - type: markdown + attributes: + value: | + Thanks for taking the time to fill out this bug report! + - type: input + id: contact + attributes: + label: Contact Details + description: How can we get in touch with you if we need more info? + placeholder: ex. email@example.com + validations: + required: false + - type: textarea + id: what-happened + attributes: + label: What happened? + description: Also tell us, what did you expect to happen? + placeholder: Tell us what you see! + value: "A bug happened!" + validations: + required: true + + - type: textarea + id: version + attributes: + label: Version + description: What version of our software are you running? + validations: + required: true + + - type: dropdown + id: platform + attributes: + label: What platform are you working on? + multiple: true + options: + - Mac Intel + - Linux + - Windows + - Mac M + - type: textarea + id: logs + attributes: + label: Relevant log output + description: Please copy and paste any relevant log output in trace log mode. This will be automatically formatted into code, so no need for backticks. 
+ render: bash + + - type: checkboxes + id: terms + attributes: + label: Code of Conduct + description: By submitting this issue, you agree to follow our [Code of Conduct](https://github.com/ylab-hi/ScanNLS/blob/main/CODE_OF_CONDUCT.md) + options: + - label: I agree to follow this project's Code of Conduct + required: true diff --git a/.github/ISSUE_TEMPLATE/feature_request.md b/.github/ISSUE_TEMPLATE/feature_request.md new file mode 100644 index 0000000..2bc5d5f --- /dev/null +++ b/.github/ISSUE_TEMPLATE/feature_request.md @@ -0,0 +1,19 @@ +--- +name: Feature request +about: Suggest an idea for this project +title: "" +labels: "" +assignees: "" +--- + +**Is your feature request related to a problem? Please describe.** +A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] + +**Describe the solution you'd like** +A clear and concise description of what you want to happen. + +**Describe alternatives you've considered** +A clear and concise description of any alternative solutions or features you've considered. + +**Additional context** +Add any other context or screenshots about the feature request here. diff --git a/.github/dependabot.yml b/.github/dependabot.yml index 32e2534..5a848ff 100644 --- a/.github/dependabot.yml +++ b/.github/dependabot.yml @@ -8,3 +8,8 @@ updates: # Prefix all commit messages with "deps: ", which should be # accepted as a conventional commit and trigger release-please prefix: "deps" + + - package-ecosystem: github-actions + directory: "/" + schedule: + interval: daily diff --git a/.github/labels.yml b/.github/labels.yml new file mode 100644 index 0000000..f7f83aa --- /dev/null +++ b/.github/labels.yml @@ -0,0 +1,66 @@ +--- +# Label names are important, as Release Drafter uses them to decide +# where to record changes in the changelog or whether to skip them. 
+# +# The repository labels will be automatically configured using this file and +# the GitHub Action https://github.com/marketplace/actions/github-labeler. +- name: breaking + description: Breaking Changes + color: bfd4f2 +- name: bug + description: Something isn't working + color: d73a4a +- name: build + description: Build System and Dependencies + color: bfdadc +- name: ci + description: Continuous Integration + color: 4a97d6 +- name: dependencies + description: Pull requests that update a dependency file + color: 0366d6 +- name: documentation + description: Improvements or additions to documentation + color: 0075ca +- name: duplicate + description: This issue or pull request already exists + color: cfd3d7 +- name: enhancement + description: New feature or request + color: a2eeef +- name: github_actions + description: Pull requests that update GitHub Actions code + color: "000000" +- name: good first issue + description: Good for newcomers + color: 7057ff +- name: help wanted + description: Extra attention is needed + color: 008672 +- name: invalid + description: This doesn't seem right + color: e4e669 +- name: performance + description: Performance + color: "016175" +- name: python + description: Pull requests that update Python code + color: 2b67c6 +- name: question + description: Further information is requested + color: d876e3 +- name: refactoring + description: Refactoring + color: ef67c4 +- name: removal + description: Removals and Deprecations + color: 9ae7ea +- name: style + description: Style + color: c120e5 +- name: testing + description: Testing + color: b1fc6f +- name: wontfix + description: This will not be worked on + color: ffffff diff --git a/.github/workflows/labeler.yml b/.github/workflows/labeler.yml new file mode 100644 index 0000000..9054eec --- /dev/null +++ b/.github/workflows/labeler.yml @@ -0,0 +1,20 @@ +name: Labeler + +on: + push: + branches: + - main + +jobs: + labeler: + runs-on: ubuntu-latest + steps: + - name: Check out the 
repository + uses: actions/checkout@v4 + with: + fetch-depth: 1 + + - name: Run Labeler + uses: crazy-max/ghaction-github-labeler@v5.0.0 + with: + skip-delete: true diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 3995e60..0644462 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -1,10 +1,31 @@ -name: CI +name: CI-rust on: push: - branches: [master] + branches: + - main + paths: + - src/** + - examples/** + - Cargo.toml + - .github/workflows/test.yml + pull_request: - branches: [master] + branches: + - main + paths: + - src/** + - examples/** + - Cargo.toml + - .github/workflows/test.yml + +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + +#ref https://github.com/pola-rs/polars/blob/main/.github/workflows/test-rust.yml +env: + RUSTFLAGS: -C debuginfo=0 # Do not produce debug symbols to keep memory usage down jobs: Formatting: @@ -12,6 +33,8 @@ jobs: steps: - name: Checkout repository uses: actions/checkout@v2 + with: + fetch-depth: 1 - name: Install stable toolchain uses: actions-rs/toolchain@v1 @@ -63,7 +86,9 @@ jobs: rust: stable steps: - name: Checkout repository - uses: actions/checkout@v2 + uses: actions/checkout@v4 + with: + fetch-depth: 1 - uses: actions-rs/toolchain@v1 with: @@ -86,10 +111,12 @@ jobs: needs: Formatting runs-on: ubuntu-latest env: - MSRV_VERSION: 1.62.0 + MSRV_VERSION: 1.70.0 steps: - name: Checkout repository - uses: actions/checkout@v2 + uses: actions/checkout@v4 + with: + fetch-depth: 1 - name: Install MSRV toolchain uses: actions-rs/toolchain@v1 @@ -100,37 +127,8 @@ jobs: - name: check if README matches MSRV defined here run: grep $MSRV_VERSION README.md - - name: pin dependency versions for MSRV - run: | - cargo update -p indexmap --precise 1.8.2 - - name: Run tests uses: actions-rs/cargo@v1 with: command: test args: --all --no-fail-fast - - Coverage: - needs: Formatting - runs-on: ubuntu-latest - steps: - - name: Checkout repository - uses: 
actions/checkout@v2 - - - name: Install nightly toolchain - uses: actions-rs/toolchain@v1 - with: - toolchain: nightly - override: true - - - name: Install and run cargo-tarpaulin - uses: actions-rs/tarpaulin@v0.1 - with: - version: "0.26.0" - args: "--workspace --all-features --run-types Tests,Doctests --out Lcov --timeout 300" - - - name: Upload coverage - uses: coverallsapp/github-action@v1 - with: - github-token: ${{ secrets.GITHUB_TOKEN }} - path-to-lcov: ./lcov.info diff --git a/.gitignore b/.gitignore index 3038700..ff14de3 100644 --- a/.gitignore +++ b/.gitignore @@ -8,3 +8,76 @@ bench/random-bench.csv bench/random-cov-bench.csv bench/*.bed data/ + + +# Python binding ignore +# Byte-compiled / optimized / DLL files +__pycache__/ +.pytest_cache/ +*.py[cod] + +# C extensions +*.so + +# Distribution / packaging +.Python +.venv/ +env/ +bin/ +build/ +develop-eggs/ +dist/ +eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +include/ +man/ +venv/ +*.egg-info/ +.installed.cfg +*.egg + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt +pip-selfcheck.json + +# Unit test / coverage reports +htmlcov/ +.tox/ +.coverage +.cache +nosetests.xml +coverage.xml + +# Translations +*.mo + +# Mr Developer +.mr.developer.cfg +.project +.pydevproject + +# Rope +.ropeproject + +# Django stuff: +*.log +*.pot + +.DS_Store + +# Sphinx documentation +docs/_build/ + +# PyCharm +.idea/ + +# VSCode +.vscode/ + +# Pyenv +.python-version diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index bd60306..40b8c4b 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,6 +1,6 @@ repos: - repo: https://github.com/pre-commit/pre-commit-hooks - rev: v4.4.0 + rev: v4.5.0 hooks: - id: check-yaml stages: [commit] @@ -14,15 +14,19 @@ repos: stages: [commit] - id: detect-private-key stages: [commit] - - - repo: https://github.com/crate-ci/committed - rev: v1.0.18 - hooks: - - id: committed - stages: [commit-msg] + - id: check-added-large-files + args: ["--maxkb=150000"] 
+ - id: end-of-file-fixer + - id: trailing-whitespace - repo: https://github.com/doublify/pre-commit-rust rev: v1.0 hooks: - id: fmt - id: cargo-check + + - repo: https://github.com/crate-ci/typos + rev: typos-dict-v0.11.2 + hooks: + - id: typos + exclude: "extc|svg|psl|CHANGELOG" diff --git a/README.md b/README.md index 11205a0..8126a37 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,3 @@ - # COITrees: Cache Oblivious Interval Trees COITrees implements a data structure for very fast overlap queries of a @@ -24,15 +23,17 @@ variants are implemented to exploit AVX2 instructions on x86-64 cpus type is oppurtunistically defined to one of these types if the right instruction set is detected. Typically it's necessary to compile with the environment variable `RUSTFLAGS="-Ctarget-cpu=native"` set for this to work. The fallback -implemntation (`BasicCOITree`) supports any platform rust compiles to and +implementation (`BasicCOITree`) supports any platform rust compiles to and remains highly efficient. # Trying Out +The Minimum Supported Rust Version (MSRV) is 1.70.0. This is primary a library for use in other programs, but for benchmarking purposes it includes a program for intersecting BED files. To try out, just clone this repo and run: + ```shell cargo run --release --example bed-intersect -- test1.bed test2.bed > intersections.bed ``` @@ -45,51 +46,50 @@ million lines of `B`. 
## Intervals in sorted order -| | A vs B | B vs A | A vs A | B' vs B' | -| ----------------------------------- | ---------: | ---------: | -------: | ---------: | -| coitrees AVX | 11.8s | **3.7s** | 0.7 | 5.3s | -| coitrees AVX (`--sorted`) | 6.4s | 4.2s | **0.6s** | **0.5s** | -| coitrees | 11.4s | 5.2s | 0.8s | 8.3s | -| coitrees (`--sorted`) | **5.8s** | 5.4s | **0.6s** | **0.5s** | -| cgranges (`bedcov-cr -c`) | 35.4s | 6.6s | 2.0s | 17.6s | -| AIList | 13.8s | 10.1s | 1.1s | 18.4s | -| CITree | 20.1s | 13.5s | 1.6s | 45.7s | -| NCList | 22.5s | 16.8s | 1.9s | 39.8s | -| AITree | 23.8s | 26.3s | 2.1s | 63.4s | -| `bedtools coverage -counts -sorted` | 257.5s | 295.6s | 71.6s | 2130.9s | -| `bedtools coverage -counts` | 322.4s | 378.5s | 75.0s | 3595.9s | +| | A vs B | B vs A | A vs A | B' vs B' | +| ----------------------------------- | -------: | -------: | -------: | -------: | +| coitrees AVX | 11.8s | **3.7s** | 0.7 | 5.3s | +| coitrees AVX (`--sorted`) | 6.4s | 4.2s | **0.6s** | **0.5s** | +| coitrees | 11.4s | 5.2s | 0.8s | 8.3s | +| coitrees (`--sorted`) | **5.8s** | 5.4s | **0.6s** | **0.5s** | +| cgranges (`bedcov-cr -c`) | 35.4s | 6.6s | 2.0s | 17.6s | +| AIList | 13.8s | 10.1s | 1.1s | 18.4s | +| CITree | 20.1s | 13.5s | 1.6s | 45.7s | +| NCList | 22.5s | 16.8s | 1.9s | 39.8s | +| AITree | 23.8s | 26.3s | 2.1s | 63.4s | +| `bedtools coverage -counts -sorted` | 257.5s | 295.6s | 71.6s | 2130.9s | +| `bedtools coverage -counts` | 322.4s | 378.5s | 75.0s | 3595.9s | ### With coverage -| | A vs B | B vs A | A vs A | B' vs B' | -| ----------------------------------- | ---------: | ---------: | -------: | ---------: | -| coitrees AVX | 18.2s | **4.8s** | 1.1s | 16.0s | -| coitrees | **14.6s** | 5.7s | **1.0s** | **12.0s** | -| cgranges | 38.4s | 8.1s | 2.2s | 31.0s | -| CITree | 23.2s | 25.6s | 2.0s | 160.4s | +| | A vs B | B vs A | A vs A | B' vs B' | +| ------------ | --------: | -------: | -------: | --------: | +| coitrees AVX | 18.2s | **4.8s** | 1.1s 
| 16.0s | +| coitrees | **14.6s** | 5.7s | **1.0s** | **12.0s** | +| cgranges | 38.4s | 8.1s | 2.2s | 31.0s | +| CITree | 23.2s | 25.6s | 2.0s | 160.4s | ## Intervals in randomized order -| | A vs B | B vs A | A vs A | B' vs B' | -| ----------------------------------- | ---------: | ---------: | -------: | --------: | -| coitrees AVX | **23.9s** | **7.2s** | **1.6s** | **6.1s** | -| coitrees | 24.2s | 8.9s | 1.9s | 9.4s | -| cgranges (`bedcov-cr -c`) | 55.7s | 11.1s | 3.3s | 19.6s | -| AIList | 31.2s | 18.2s | 2.3s | 19.3s | -| CITree | 39.4s | 19.0s | 2.9s | 47.1s | -| NCList | 42.7s | 23.8s | 3.4s | 44.0s | -| AITree | 225.3s | 134.8s | 14.7s | 921.6s | -| `bedtools coverage -counts` | 1160.4s | 849.6s | 104.5s | 9254.6s | +| | A vs B | B vs A | A vs A | B' vs B' | +| --------------------------- | --------: | -------: | -------: | -------: | +| coitrees AVX | **23.9s** | **7.2s** | **1.6s** | **6.1s** | +| coitrees | 24.2s | 8.9s | 1.9s | 9.4s | +| cgranges (`bedcov-cr -c`) | 55.7s | 11.1s | 3.3s | 19.6s | +| AIList | 31.2s | 18.2s | 2.3s | 19.3s | +| CITree | 39.4s | 19.0s | 2.9s | 47.1s | +| NCList | 42.7s | 23.8s | 3.4s | 44.0s | +| AITree | 225.3s | 134.8s | 14.7s | 921.6s | +| `bedtools coverage -counts` | 1160.4s | 849.6s | 104.5s | 9254.6s | ### With coverage -| | A vs B | B vs A | A vs A | B' vs B' | -| ----------------------------------- | ---------: | ---------: | -------: | ---------: | -| coitrees AVX | 34.3s | **8.8s** | **2.2s** | 16.3s | -| coitrees | **29.6s** | 9.7s | 2.3s | **13.1s** | -| cgranges | 57.6s | 12.5s | 3.6s | 32.6s | -| CITree | 50.0s | 32.5s | 3.8s | 170.4s | - +| | A vs B | B vs A | A vs A | B' vs B' | +| ------------ | --------: | -------: | -------: | --------: | +| coitrees AVX | 34.3s | **8.8s** | **2.2s** | 16.3s | +| coitrees | **29.6s** | 9.7s | 2.3s | **13.1s** | +| cgranges | 57.6s | 12.5s | 3.6s | 32.6s | +| CITree | 50.0s | 32.5s | 3.8s | 170.4s | All benchmarks run on a ryzen 5950x. 
diff --git a/bench/CITree b/bench/CITree index 27171b1..e19ce28 100755 --- a/bench/CITree +++ b/bench/CITree @@ -1,4 +1,3 @@ #!/bin/sh ~/src/cgranges/test/bedcov-itree-nocov $1 $2 - diff --git a/bench/CITree-cov b/bench/CITree-cov index 56587ee..69bd6dd 100755 --- a/bench/CITree-cov +++ b/bench/CITree-cov @@ -1,4 +1,3 @@ #!/bin/sh ~/src/cgranges/test/bedcov-itree $1 $2 - diff --git a/bench/bedtools b/bench/bedtools index 8f525ab..d48b2ca 100755 --- a/bench/bedtools +++ b/bench/bedtools @@ -1,4 +1,3 @@ #!/bin/sh bedtools coverage -counts -a $1 -b $2 - diff --git a/bench/bedtools-sorted b/bench/bedtools-sorted index f8abfc4..d3e31b4 100755 --- a/bench/bedtools-sorted +++ b/bench/bedtools-sorted @@ -1,4 +1,3 @@ #!/bin/sh bedtools coverage -counts -sorted -a $1 -b $2 - diff --git a/bench/cgranges b/bench/cgranges index 3200914..4134f76 100755 --- a/bench/cgranges +++ b/bench/cgranges @@ -1,4 +1,3 @@ #!/bin/sh ~/src/cgranges/test/bedcov-cr -c $1 $2 - diff --git a/bench/cgranges-cov b/bench/cgranges-cov index 36d8a66..d78d2f0 100755 --- a/bench/cgranges-cov +++ b/bench/cgranges-cov @@ -1,4 +1,3 @@ #!/bin/sh ~/src/cgranges/test/bedcov-cr $1 $2 - diff --git a/bench/coitrees b/bench/coitrees index b2a4ee6..d20870c 100755 --- a/bench/coitrees +++ b/bench/coitrees @@ -1,4 +1,3 @@ #!/bin/sh ../target/release/examples/bed-intersect $1 $2 - diff --git a/bench/coitrees-cov b/bench/coitrees-cov index 640661f..36dfd53 100755 --- a/bench/coitrees-cov +++ b/bench/coitrees-cov @@ -1,4 +1,3 @@ #!/bin/sh ../target/release/examples/bed-intersect --coverage $1 $2 - diff --git a/bench/coitrees-sorted b/bench/coitrees-sorted index 01937ab..77e2628 100755 --- a/bench/coitrees-sorted +++ b/bench/coitrees-sorted @@ -1,4 +1,3 @@ #!/bin/sh ../target/release/examples/bed-intersect --sorted $1 $2 - diff --git a/bench/coitrees-tvt b/bench/coitrees-tvt index 24105f4..c736e27 100755 --- a/bench/coitrees-tvt +++ b/bench/coitrees-tvt @@ -1,4 +1,3 @@ #!/bin/sh ../target/release/examples/bed-intersect 
--tree-vs-tree $1 $2 - diff --git a/src/avx.rs b/src/avx.rs index 07eb7ae..62e23ac 100644 --- a/src/avx.rs +++ b/src/avx.rs @@ -485,7 +485,7 @@ where self.nodes.is_empty() } - // /// Find intervals in the set overlaping the query `[first, last]` and call `visit` on every overlapping node + // /// Find intervals in the set overlapping the query `[first, last]` and call `visit` on every overlapping node fn query(&'a self, first: i32, last: i32, mut visit: F) where F: FnMut(&Interval<&'a T>), @@ -612,8 +612,8 @@ where let node = &self.nodes[self.i]; if self.j < 8 { let ret = Some(Interval { - first: node.first(self.j), - last: node.last(self.j), + first: node.first(self.j) + 1, + last: node.last(self.j) - 1, metadata: &node.metadata[self.j], }); self.count += 1; @@ -652,8 +652,8 @@ where let node = &self.nodes[self.i]; self.count += 1; Some(Interval { - first: node.first(self.j), - last: node.last(self.j), + first: node.first(self.j) + 1, + last: node.last(self.j) - 1, metadata: &node.metadata[self.j], }) } @@ -902,7 +902,7 @@ where return; } - // not overlaping or preceding + // not overlapping or preceding if first < self.prev_first || first > self.prev_last { // no overlap with previous query. have to resort to regular query strategy self.overlapping_intervals.clear(); diff --git a/src/lib.rs b/src/lib.rs index 14d3c59..867254d 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -11,7 +11,7 @@ //! structs which store integer, end-inclusive intervals along with associated //! metadata. The tree can be queried directly for coverage or overlaps, or //! through the intermediary `SortedQuerenty` which keeps track of some state -//! to accelerate overlaping queries. +//! to accelerate overlapping queries. 
mod interval; pub use interval::*; diff --git a/src/neon.rs b/src/neon.rs index 304ee58..b6edae0 100644 --- a/src/neon.rs +++ b/src/neon.rs @@ -382,7 +382,7 @@ where self.nodes.is_empty() } - // /// Find intervals in the set overlaping the query `[first, last]` and call `visit` on every overlapping node + // /// Find intervals in the set overlapping the query `[first, last]` and call `visit` on every overlapping node fn query(&'a self, first: i32, last: i32, mut visit: F) where F: FnMut(&Interval<&'a T>), @@ -509,8 +509,8 @@ where let node = &self.nodes[self.i]; if self.j < LANE_SIZE { let ret = Some(Interval { - first: node.first(self.j), - last: node.last(self.j), + first: node.first(self.j) + 1, + last: node.last(self.j) - 1, metadata: &node.metadata[self.j], }); self.count += 1; @@ -549,8 +549,8 @@ where let node = &self.nodes[self.i]; self.count += 1; Some(Interval { - first: node.first(self.j), - last: node.last(self.j), + first: node.first(self.j) + 1, + last: node.last(self.j) - 1, metadata: &node.metadata[self.j], }) } @@ -799,7 +799,7 @@ where return; } - // not overlaping or preceding + // not overlapping or preceding if first < self.prev_first || first > self.prev_last { // no overlap with previous query. have to resort to regular query strategy self.overlapping_intervals.clear(); diff --git a/src/nosimd.rs b/src/nosimd.rs index 48099d2..170e1c5 100644 --- a/src/nosimd.rs +++ b/src/nosimd.rs @@ -11,7 +11,7 @@ //! structs which store integer, end-inclusive intervals along with associated //! metadata. The tree can be queried directly for coverage or overlaps, or //! through the intermediary `SortedQuerent` which keeps track of some state -//! to accelerate overlaping queries. +//! to accelerate overlapping queries. 
use super::interval::{GenericInterval, IntWithMax, Interval, IntervalTree, SortedQuerent}; use std::cmp::Ordering; @@ -191,7 +191,7 @@ where self.nodes.is_empty() } - /// Find intervals in the set overlaping the query `[first, last]` and call `visit` on every overlapping node + /// Find intervals in the set overlapping the query `[first, last]` and call `visit` on every overlapping node fn query(&'a self, first: i32, last: i32, mut visit: F) where F: FnMut(&IntervalNode), @@ -548,7 +548,7 @@ where return; } - // not overlaping or preceding + // not overlapping or preceding if first < self.prev_first || first > self.prev_last { // no overlap with previous query. have to resort to regular query strategy self.overlapping_intervals.clear();