From 7bde62db51aff955b0ed3e75c9c3124dae683c00 Mon Sep 17 00:00:00 2001 From: Will Dampier Date: Fri, 6 Mar 2026 15:51:51 -0500 Subject: [PATCH 01/11] modernizing --- .github/workflows/ci.yml | 50 +++++++++++++++++++++ .readthedocs.yaml | 17 ++++++++ LICENSE | 21 +++++++++ cigarmath/__init__.py | 4 +- docs/api.rst | 94 ++++++++++++++++++++++++++++++++++++++++ docs/conf.py | 14 +++++- docs/index.rst | 2 +- docs/installation.rst | 30 ++++++++----- docs/requirements.txt | 3 ++ pyproject.toml | 77 ++++++++++++++++++++++++++++++++ setup.py | 42 ------------------ 11 files changed, 296 insertions(+), 58 deletions(-) create mode 100644 .github/workflows/ci.yml create mode 100644 .readthedocs.yaml create mode 100644 LICENSE create mode 100644 docs/api.rst create mode 100644 docs/requirements.txt create mode 100644 pyproject.toml delete mode 100644 setup.py diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml new file mode 100644 index 0000000..3b41e17 --- /dev/null +++ b/.github/workflows/ci.yml @@ -0,0 +1,50 @@ +name: CI + +on: + push: + branches: ["main", "master", "publish-prep"] + pull_request: + branches: ["main", "master"] + +jobs: + test: + name: Python ${{ matrix.python-version }} + runs-on: ubuntu-latest + strategy: + fail-fast: false + matrix: + python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"] + + steps: + - uses: actions/checkout@v4 + + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python-version }} + + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install ".[io]" + pip install pytest + + - name: Run tests + run: pytest + + lint: + name: Lint + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: "3.11" + + - name: Install flake8 + run: pip install flake8 + + - name: Run flake8 + run: flake8 cigarmath tests diff --git a/.readthedocs.yaml b/.readthedocs.yaml new file mode 100644 index 0000000..1524e07 --- /dev/null +++ b/.readthedocs.yaml @@ -0,0 +1,17 @@ +version: 2 + +build: + os: ubuntu-22.04 + tools: + python: "3.11" + +sphinx: + configuration: docs/conf.py + +python: + install: + - method: pip + path: . + extra_requirements: + - io + - requirements: docs/requirements.txt diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..e34b613 --- /dev/null +++ b/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2022-present Will Dampier and DV Klopfenstein + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/cigarmath/__init__.py b/cigarmath/__init__.py index 854d0f0..e020611 100644 --- a/cigarmath/__init__.py +++ b/cigarmath/__init__.py @@ -2,14 +2,13 @@ __author__ = """Will Dampier""" __email__ = "wnd22@drexel.edu" -__version__ = "0.1.0" +__version__ = "0.2.0" from .clipping import left_clipping from .clipping import right_clipping from .clipping import declip from .clipping import is_hard_clipped -from .clipping import left_clipping from .clipping import softclipify @@ -19,7 +18,6 @@ from .block import query_offset from .block import query_block from .block import block_overlap_length -from .block import reference_offset from .block import reference_mapping_blocks from .block import reference_deletion_blocks diff --git a/docs/api.rst b/docs/api.rst new file mode 100644 index 0000000..7efb31e --- /dev/null +++ b/docs/api.rst @@ -0,0 +1,94 @@ +API Reference +============= + +.. contents:: Modules + :local: + :depth: 1 + +Definitions +----------- + +.. automodule:: cigarmath.defn + :members: + :undoc-members: + :show-inheritance: + +Blocks +------ + +.. automodule:: cigarmath.block + :members: + :undoc-members: + :show-inheritance: + +Clipping +-------- + +.. automodule:: cigarmath.clipping + :members: + :undoc-members: + :show-inheritance: + +Mapping +------- + +.. automodule:: cigarmath.mapping + :members: + :undoc-members: + :show-inheritance: + +Inference +--------- + +.. automodule:: cigarmath.inference + :members: + :undoc-members: + :show-inheritance: + +Iterators +--------- + +.. automodule:: cigarmath.iterators + :members: + :undoc-members: + :show-inheritance: + +Conversions +----------- + +.. automodule:: cigarmath.conversions + :members: + :undoc-members: + :show-inheritance: + +Combine +------- + +.. automodule:: cigarmath.combine + :members: + :undoc-members: + :show-inheritance: + +Pileup +------ + +.. automodule:: cigarmath.pileup + :members: + :undoc-members: + :show-inheritance: + +Core Operations +--------------- + +.. automodule:: cigarmath.cigarmath + :members: + :undoc-members: + :show-inheritance: + +I/O +--- + +.. automodule:: cigarmath.io + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/conf.py b/docs/conf.py index febc005..a53f535 100755 --- a/docs/conf.py +++ b/docs/conf.py @@ -31,7 +31,17 @@ # Add any Sphinx extension module names here, as strings. They can be # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom ones. -extensions = ['sphinx.ext.autodoc', 'sphinx.ext.viewcode'] +extensions = [ + 'sphinx.ext.autodoc', + 'sphinx.ext.autosummary', + 'sphinx.ext.viewcode', + 'sphinx.ext.napoleon', + 'sphinx_autodoc_typehints', +] + +autosummary_generate = True +napoleon_google_docstring = True +napoleon_numpy_docstring = True # Add any paths that contain templates here, relative to this directory. templates_path = ['_templates'] @@ -83,7 +93,7 @@ # The theme to use for HTML and HTML Help pages. See the documentation for # a list of builtin themes. # -html_theme = 'alabaster' +html_theme = 'sphinx_rtd_theme' # Theme options are theme-specific and customize the look and feel of a # theme further. For a list of options available for each theme, see the diff --git a/docs/index.rst b/docs/index.rst index 333971e..05f07a1 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -9,7 +9,7 @@ Welcome to Cigar Math's documentation! installation usage rearrangements - modules + api contributing authors history diff --git a/docs/installation.rst b/docs/installation.rst index 4fdf03c..1b0fa49 100644 --- a/docs/installation.rst +++ b/docs/installation.rst @@ -23,29 +23,39 @@ you through the process. .. _Python installation guide: http://docs.python-guide.org/en/latest/starting/installation/ +Optional dependencies +--------------------- + +To use SAM/BAM file I/O via ``cigarmath.io``, install with the ``io`` extra: + +.. code-block:: console + + $ pip install cigarmath[io] + +This installs `pysam`_, which requires a Unix-like system (Linux or macOS). + +.. _pysam: https://pysam.readthedocs.io + + From sources ------------ The sources for Cigar Math can be downloaded from the `Github repo`_. -You can either clone the public repository: - .. code-block:: console - $ git clone git://github.com/judowill/cigarmath + $ git clone https://github.com/DamLabResources/cigarmath.git -Or download the `tarball`_: +Once you have a copy of the source, you can install it with: .. code-block:: console - $ curl -OJL https://github.com/judowill/cigarmath/tarball/master + $ pip install . -Once you have a copy of the source, you can install it with: +Or with optional dependencies: .. code-block:: console - $ python setup.py install - + $ pip install ".[io]" -.. _Github repo: https://github.com/judowill/cigarmath -.. _tarball: https://github.com/judowill/cigarmath/tarball/master +.. _Github repo: https://github.com/DamLabResources/cigarmath diff --git a/docs/requirements.txt b/docs/requirements.txt new file mode 100644 index 0000000..f852e84 --- /dev/null +++ b/docs/requirements.txt @@ -0,0 +1,3 @@ +sphinx +sphinx-rtd-theme +sphinx-autodoc-typehints diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..89b0187 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,77 @@ +[build-system] +requires = ["setuptools>=61", "wheel"] +build-backend = "setuptools.build_meta" + +[project] +name = "cigarmath" +version = "0.2.3" +description = "Performs common operations on cigar strings" +readme = "README.md" +license = { file = "LICENSE" } +authors = [ + { name = "Will Dampier", email = "wnd22@drexel.edu" }, + { name = "DV Klopfenstein" }, +] +keywords = ["cigarmath", "bioinformatics", "CIGAR", "SAM", "BAM", "genomics"] +classifiers = [ + "Development Status :: 3 - Alpha", + "Intended Audience :: Developers", + "Intended Audience :: Science/Research", + "Natural Language :: English", + "License :: OSI Approved :: MIT License", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.8", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", + "Topic :: Scientific/Engineering :: Bio-Informatics", +] +requires-python = ">=3.8" +dependencies = [] + +[project.optional-dependencies] +io = ["pysam"] +dev = [ + "bump2version", + "wheel", + "flake8", + "coverage", + "sphinx", + "sphinx-rtd-theme", + "sphinx-autodoc-typehints", + "twine", + "pytest", + "black", + "pysam", +] + +[project.urls] +Homepage = "https://github.com/DamLabResources/cigarmath" +Documentation = "https://cigarmath.readthedocs.io" +Repository = "https://github.com/DamLabResources/cigarmath" +"Bug Tracker" = "https://github.com/DamLabResources/cigarmath/issues" + +[tool.setuptools.packages.find] +include = ["cigarmath", "cigarmath.*"] + +[tool.pytest.ini_options] +addopts = "--ignore=setup.py" + +[tool.flake8] +exclude = ["docs"] + +[tool.bumpversion] +current_version = "0.2.3" +commit = true +tag = true + +[[tool.bumpversion.files]] +filename = "pyproject.toml" +search = 'version = "{current_version}"' +replace = 'version = "{new_version}"' + +[[tool.bumpversion.files]] +filename = "cigarmath/__init__.py" +search = '__version__ = "{current_version}"' +replace = '__version__ = "{new_version}"' diff --git a/setup.py b/setup.py deleted file mode 100644 index 8f17131..0000000 --- a/setup.py +++ /dev/null @@ -1,42 +0,0 @@ -#!/usr/bin/env python - -"""The setup script.""" - -from setuptools import setup, find_packages - -with open('README.md') as readme_file: - readme = readme_file.read() - -with open('HISTORY.md') as history_file: - history = history_file.read() - -requirements = [ ] - -test_requirements = ['pytest>=3', ] - -setup( - author="Will Dampier", - author_email='wnd22@drexel.edu', - python_requires='>=3.6', - classifiers=[ - 'Development Status :: 2 - Pre-Alpha', - 'Intended Audience :: Developers', - 'Natural Language :: English', - 'Programming Language :: Python :: 3', - 'Programming Language :: Python :: 3.6', - 'Programming Language :: Python :: 3.7', - 'Programming Language :: Python :: 3.8', - ], - description="Performs common operations on cigar strings", - install_requires=requirements, - long_description=readme + '\n\n' + history, - include_package_data=True, - keywords='cigarmath', - name='cigarmath', - packages=find_packages(include=['cigarmath', 'cigarmath.*']), - test_suite='tests', - tests_require=test_requirements, - url='https://github.com/DamLabResources/cigarmath', - version='0.2.3', - zip_safe=False, -) From 4655d0d9e09aa45d7483058016ff5b2b89b04f83 Mon Sep 17 00:00:00 2001 From: Will Dampier Date: Thu, 23 Apr 2026 12:05:26 -0400 Subject: [PATCH 02/11] fixing docs for fixed width setup --- cigarmath/block.py | 132 ++++++++++++++++++++++----------------- cigarmath/cigarmath.py | 14 +++-- cigarmath/clipping.py | 104 ++++++++++++++++-------------- cigarmath/combine.py | 52 ++++++++------- cigarmath/conversions.py | 44 +++++++------ cigarmath/inference.py | 34 +++++----- cigarmath/iterators.py | 27 ++++---- cigarmath/mapping.py | 82 +++++++++++++----------- cigarmath/pileup.py | 31 ++++----- docs/conf.py | 3 + 10 files changed, 289 insertions(+), 234 deletions(-) diff --git a/cigarmath/block.py b/cigarmath/block.py index 90cc091..f9c1b7b 100644 --- a/cigarmath/block.py +++ b/cigarmath/block.py @@ -20,13 +20,15 @@ def reference_offset(cigartuples: CigarTuples) -> int: """Calculate the length of the reference mapping block based on cigartuples. - REF AAAAGACC--CCC - QRY AAAA-ACCGGCCC - CGS HHHMMMMDMMMIIMMMHHHH - CGT 3H 4M 1D3M 2I 3M 4H + Example:: - >>>> reference_offset(cigartuples) - 10 + REF AAAAGACC--CCC + QRY AAAA-ACCGGCCC + CGS HHHMMMMDMMMIIMMMHHHH + CGT 3H 4M 1D3M 2I 3M 4H + + reference_offset(cigartuples) + 10 """ # add up reference-consuming blocks @@ -40,14 +42,16 @@ def reference_offset(cigartuples: CigarTuples) -> int: def reference_block(cigartuples: CigarTuples, reference_start: int = 0) -> Tuple[int, int]: """Returns a tuple of the reference (start, end) positions of the aligned segment - POS 01234567890 12345 - REF AAAAGACC--CCC - QRY AAAA-ACCGGCCC - CGS HHHMMMMDMMMIIMMMHHHH - CGT 3H 4M 1D3M 2I 3M 4H + Example:: + + POS 01234567890 12345 + REF AAAAGACC--CCC + QRY AAAA-ACCGGCCC + CGS HHHMMMMDMMMIIMMMHHHH + CGT 3H 4M 1D3M 2I 3M 4H - >>>> reference_block(cigartuples, reference_start=3) - (3, 13) + reference_block(cigartuples, reference_start=3) + (3, 13) """ offset = reference_offset(cigartuples) @@ -57,13 +61,15 @@ def reference_block(cigartuples: CigarTuples, reference_start: int = 0) -> Tuple def query_offset(cigartuples: CigarTuples) -> int: """Calculate the length of the query mapping block based on cigartuples. - REF AAAAGACC--CCC - QRY AAAA-ACCGGCCC - CGS HHHMMMMDMMMIIMMMHHHH - CGT 3H 4M 1D3M 2I 3M 4H + Example:: + + REF AAAAGACC--CCC + QRY AAAA-ACCGGCCC + CGS HHHMMMMDMMMIIMMMHHHH + CGT 3H 4M 1D3M 2I 3M 4H - >>>> query_offset(cigartuples) - 11 + query_offset(cigartuples) + 11 """ consumes_query_offset = set(CONSUMES_QUERY) @@ -83,13 +89,15 @@ def query_offset(cigartuples: CigarTuples) -> int: def query_start(cigartuples: CigarTuples) -> int: """Return the start position on the query of this alignment. - REF AAAAGACC--CCC - QRY AAAA-ACCGGCCC - CGS HHHMMMMDMMMIIMMMHHHH - CGT 3H 4M 1D3M 2I 3M 4H + Example:: - >>>> query_start(cigartuples) - 3 + REF AAAAGACC--CCC + QRY AAAA-ACCGGCCC + CGS HHHMMMMDMMMIIMMMHHHH + CGT 3H 4M 1D3M 2I 3M 4H + + query_start(cigartuples) + 3 """ return left_clipping(cigartuples) @@ -98,14 +106,16 @@ def query_start(cigartuples: CigarTuples) -> int: def query_block(cigartuples: CigarTuples) -> Tuple[int, int]: """Returns a tuple of the query (start, end) positions of the aligned segment - POS 01234567890 12345 - REF AAAAGACC--CCC - QRY AAAA-ACCGGCCC - CGS HHHMMMMDMMMIIMMMHHHH - CGT 3H 4M 1D3M 2I 3M 4H + Example:: + + POS 01234567890 12345 + REF AAAAGACC--CCC + QRY AAAA-ACCGGCCC + CGS HHHMMMMDMMMIIMMMHHHH + CGT 3H 4M 1D3M 2I 3M 4H - >>>> query_block(cigartuples) - (3, 12) + query_block(cigartuples) + (3, 12) """ # check clipping @@ -121,23 +131,25 @@ def block_overlap_length(block_a: Tuple[int, int], block_b: Tuple[int, int]) -> """Given two (start,stop) tuples, return their overlap. negative values indicate distance to overlap. - POS0 000000000011111111112222222222 - POS1 012345678901234567890123456789 + Examples:: + + POS0 000000000011111111112222222222 + POS1 012345678901234567890123456789 - BLKA ---------- - BLKB ---------- + BLKA ---------- + BLKB ---------- - >>> block_overlap_length((4, 13), (10, 19)) - 4 + block_overlap_length((4, 13), (10, 19)) + 4 - POS0 000000000011111111112222222222 - POS1 012345678901234567890123456789 + POS0 000000000011111111112222222222 + POS1 012345678901234567890123456789 - BLKA ----- - BLKB ---------- + BLKA ----- + BLKB ---------- - >>> block_overlap_length((4, 8), (16, 25)) - -7 + block_overlap_length((4, 8), (16, 25)) + -7 """ return min(block_a[1], block_b[1]) - max(block_a[0], block_b[0]) @@ -149,15 +161,17 @@ def reference_mapping_blocks( deletion_split: int = 10 ) -> Iterator[Tuple[int, int]]: """Yield (reference_start, reference_stop) blocks of mapped sites split by deletions. - - POS0 000000000011111111112222222222 - POS1 012345678901234567890123456789 - CGS MMMMMMMDDDMMMMDDDDDDMMMM - - >> reference_mapping_blocks(cigartuples, reference_start=3, deletion_split=5) - (3, 16) - (22, 26) + Example:: + + POS0 000000000011111111112222222222 + POS1 012345678901234567890123456789 + + CGS MMMMMMMDDDMMMMDDDDDDMMMM + + reference_mapping_blocks(cigartuples, reference_start=3, deletion_split=5) + (3, 16) + (22, 26) """ del_ops = {BAM_CDEL, BAM_CREF_SKIP} @@ -179,14 +193,16 @@ def reference_deletion_blocks( ) -> Iterator[Tuple[int, int]]: """Yield (reference_start, reference_stop) blocks of deletions larger than minimum size - POS0 000000000011111111112222222222 - POS1 012345678901234567890123456789 + Example:: + + POS0 000000000011111111112222222222 + POS1 012345678901234567890123456789 - CGS MMMMDDDDDDMMMMDDDDDDMMMM + CGS MMMMDDDDDDMMMMDDDDDDMMMM - >>> reference_deletion_blocks(cigartuples, reference_start=3) - (7, 12) - (17, 22) + reference_deletion_blocks(cigartuples, reference_start=3) + (7, 12) + (17, 22) """ del_ops = {BAM_CDEL, BAM_CREF_SKIP} diff --git a/cigarmath/cigarmath.py b/cigarmath/cigarmath.py index 59e91dc..5a70cbc 100644 --- a/cigarmath/cigarmath.py +++ b/cigarmath/cigarmath.py @@ -24,13 +24,15 @@ def simplify_blocks(cigartuples, collapse=True): """Replace extended cigars (= and X) with M. - POS 0123456789012345 - REF AAAAGACCCCC - QRY AAAAACCGGCC - CGS ====xx=xx== + Example:: - >>>> simplify_blocks(cigartuples) - [(0, 11)] + POS 0123456789012345 + REF AAAAGACCCCC + QRY AAAAACCGGCC + CGS ====xx=xx== + + simplify_blocks(cigartuples) + [(0, 11)] """ if collapse: diff --git a/cigarmath/clipping.py b/cigarmath/clipping.py index 90dd0af..e395ed0 100644 --- a/cigarmath/clipping.py +++ b/cigarmath/clipping.py @@ -22,13 +22,15 @@ def left_clipping(cigartuples: CigarTuples, with_hard: bool = True) -> int: """Returns the length of clipped bases (hard or soft) on the left side of the alignment - REF AAAAACCCCC - QRY TTTAAAAACCCCCGGGG - CGS SSSMMMMMMMMMMSSSS - CGT 3S 10M 4S + Example:: - >>> left_clipping(cigartuples) - 3 + REF AAAAACCCCC + QRY TTTAAAAACCCCCGGGG + CGS SSSMMMMMMMMMMSSSS + CGT 3S 10M 4S + + left_clipping(cigartuples) + 3 """ soft = cigartuples[0][0] == BAM_CSOFT_CLIP hard = cigartuples[0][0] == BAM_CHARD_CLIP @@ -40,13 +42,15 @@ def left_clipping(cigartuples: CigarTuples, with_hard: bool = True) -> int: def right_clipping(cigartuples: CigarTuples, with_hard: bool = True) -> int: """Returns the length of clipped bases (hard or soft) on the right side of the alignment - REF AAAAACCCCC - QRY TTTAAAAACCCCCGGGG - CGS SSSMMMMMMMMMMSSSS - CGT 3S 10M 4S + Example:: + + REF AAAAACCCCC + QRY TTTAAAAACCCCCGGGG + CGS SSSMMMMMMMMMMSSSS + CGT 3S 10M 4S - >>> right_clipping(cigartuples) - 4 + right_clipping(cigartuples) + 4 """ soft = cigartuples[-1][0] == BAM_CSOFT_CLIP hard = cigartuples[-1][0] == BAM_CHARD_CLIP @@ -58,21 +62,23 @@ def right_clipping(cigartuples: CigarTuples, with_hard: bool = True) -> int: def declip(cigartuples: CigarTuples, *args: T) -> CigarTuples: """Return a set of cigartuples with clipping removed, if any. - REF AAAAACCCCC - QRY TTTAAAAACCCCCGGGG - CGS SSSMMMMMMMMMMSSSS - CGT 3S 10M 4S + Examples:: - >>>> declip(cigartuples) - (0, 10) - - You can also provide sequences or lists as extra *args - which will be clipped based on the tuples. - - >>>> seq = 'TTTAAAAACCCCCGGGG' - >>>> tups, clipped_seq = declip(cigartuples, seq) - (0, 10) - AAAAACCCCC + REF AAAAACCCCC + QRY TTTAAAAACCCCCGGGG + CGS SSSMMMMMMMMMMSSSS + CGT 3S 10M 4S + + declip(cigartuples) + (0, 10) + + seq = "TTTAAAAACCCCCGGGG" + tups, clipped_seq = declip(cigartuples, seq) + (0, 10) + AAAAACCCCC + + You can also provide sequences or lists as extra ``*args``, which will be + clipped based on the tuples. """ left_clip = (cigartuples[0][0] == BAM_CSOFT_CLIP) | ( cigartuples[0][0] == BAM_CHARD_CLIP @@ -97,13 +103,15 @@ def declip(cigartuples: CigarTuples, *args: T) -> CigarTuples: def is_hard_clipped(cigartuples: CigarTuples) -> bool: """Return True if the cigar indicates HARD clipping - REF AAAAACCCCC - QRY AAAAACCCCC - CGS HHHMMMMMMMMMMHHHH - CGT 3H 10M 4H + Example:: - >>>> is_hard_clipped(cigartuples) - True + REF AAAAACCCCC + QRY AAAAACCCCC + CGS HHHMMMMMMMMMMHHHH + CGT 3H 10M 4H + + is_hard_clipped(cigartuples) + True """ return (cigartuples[0][0] == BAM_CHARD_CLIP) or ( cigartuples[-1][0] == BAM_CHARD_CLIP @@ -112,13 +120,15 @@ def is_hard_clipped(cigartuples: CigarTuples) -> bool: def softclipify(cigartuples: CigarTuples, required_mapping: int = 1) -> Tuple[CigarTuples, int]: """Converts initial and final cigars into softclips - - REF --AAAAGACCCCCGACTCGTTA--- - QUE TT----AACCCCCGAC----TAGCA - CIG IIDDDDMMMMMMMMMMDDDDMMIII - - OUT SS MMMMMMMMMMDDDDMMSSS required_mapping = 1 - OUT SS MMMMMMMMMM SSSSS required_mapping = 4 + + Example:: + + REF --AAAAGACCCCCGACTCGTTA--- + QUE TT----AACCCCCGAC----TAGCA + CIG IIDDDDMMMMMMMMMMDDDDMMIII + + OUT SS MMMMMMMMMMDDDDMMSSS required_mapping = 1 + OUT SS MMMMMMMMMM SSSSS required_mapping = 4 """ left_ind, left_soft_sz = _decide_softclip_end(cigartuples, required_mapping) right_ind, right_soft_sz = _decide_softclip_end(cigartuples[::-1], required_mapping) @@ -139,13 +149,15 @@ def softclipify(cigartuples: CigarTuples, required_mapping: int = 1) -> Tuple[Ci def _decide_softclip_end(cigartuples: CigarTuples, required_mapping: int) -> Tuple[Union[int, None], Union[int, None]]: """Helper function to search cigartuples until you find a MAPPING block of a particular size. - - REF --AAAAGACCCCCGACTCGTTA--- - QUE TT----AACCCCCGAC----TAGCA - CIG IIDDDDMMMMMMMMMMDDDDMMIII - - cigartuples = [(BAM_CINS, 2), (BAM_CDEL, 4), (BAM_CMATCH, 10), - (BAM_CDEL, 4), (BAM_CMATCH, 2), (BAM_CINS, 3)] + + Example:: + + REF --AAAAGACCCCCGACTCGTTA--- + QUE TT----AACCCCCGAC----TAGCA + CIG IIDDDDMMMMMMMMMMDDDDMMIII + + cigartuples = [(BAM_CINS, 2), (BAM_CDEL, 4), (BAM_CMATCH, 10), + (BAM_CDEL, 4), (BAM_CMATCH, 2), (BAM_CINS, 3)] """ MAPPING_OP = set([BAM_CMATCH, BAM_CEQUAL, BAM_CDIFF]) diff --git a/cigarmath/combine.py b/cigarmath/combine.py index d1fe99d..c7a020a 100644 --- a/cigarmath/combine.py +++ b/cigarmath/combine.py @@ -20,18 +20,22 @@ def combine_adjacent_alignments( Returns: Tuple of (reference_start, combined_cigartuples) - Example: - First alignment: REF: AAAAGATC--CCC (ref_start=4) - QRY: AAAA-ATCGGCCC - CGT: 4M1D3M2I3M - - Second alignment:REF: CCCTAG (ref_start=12) - QRY: CCTAG - CGT: 3M2M - - Combined: REF: AAAAGATC--CCCCCTAG - QRY: AAAA-ATCGGCCCCCTAG - CGT: 4M1D3M2I6M2M + Example:: + + First alignment: + REF: AAAAGATC--CCC (ref_start=4) + QRY: AAAA-ATCGGCCC + CGT: 4M1D3M2I3M + + Second alignment: + REF: CCCTAG (ref_start=12) + QRY: CCTAG + CGT: 3M2M + + Combined: + REF: AAAAGATC--CCCCCTAG + QRY: AAAA-ATCGGCCCCCTAG + CGT: 4M1D3M2I6M2M """ first_start, first_cigars = first[0], declip(first[1]) second_start, second_cigars = second[0], declip(second[1]) @@ -156,14 +160,15 @@ def trim_alignment( Returns: Tuple of (new_ref_start, trimmed_cigartuples) - Example: + Example:: + ref_start = 10 - cigartuples = [(0,5), (2,1), (0,3)] # 5M1D3M + cigartuples = [(0, 5), (2, 1), (0, 3)] # 5M1D3M left = 2 right = 1 - add_clipping = 'soft' - - Returns: (12, [(4,2), (0,3), (2,1), (0,2), (4,1)]) # 2S3M1D2M1S + add_clipping = "soft" + + Returns: (12, [(4, 2), (0, 3), (2, 1), (0, 2), (4, 1)]) # 2S3M1D2M1S """ if (left == 0) and (right == 0): return ref_start, cigartuples @@ -209,15 +214,16 @@ def combine_multiple_alignments( Raises: ValueError: If alignments overlap more than allowed or are not sequential - Example: + Example:: + alignments = [ - (10, [(0,5), (2,2), (0,3)]), # 5M2D3M at ref:10 - (25, [(0,4), (1,2), (0,2)]), # 4M2I2M at ref:25 - (35, [(0,5)]) # 5M at ref:35 + (10, [(0, 5), (2, 2), (0, 3)]), # 5M2D3M at ref:10 + (25, [(0, 4), (1, 2), (0, 2)]), # 4M2I2M at ref:25 + (35, [(0, 5)]), # 5M at ref:35 ] allowed_overlap = 2 - - Returns: (10, [(0,5), (2,2), (0,7), (1,2), (0,7)]) + + Returns: (10, [(0, 5), (2, 2), (0, 7), (1, 2), (0, 7)]) """ if not alignments: raise ValueError("No alignments provided") diff --git a/cigarmath/conversions.py b/cigarmath/conversions.py index 59438bc..2b07e04 100644 --- a/cigarmath/conversions.py +++ b/cigarmath/conversions.py @@ -37,18 +37,20 @@ def segments_to_binary( mapping: Optional[List[bool]] = None ) -> List[bool]: """Given a list of segments create a binary array of all covered regions. - - POS0 000000000011111111112222222222333333333 - POS1 012345678901234567890123456789012345678 - REF AAAAGACCCCCGACTAGCTAGCATGCTATCTAGCTAGCA - QRY1 AAAAGA-----GAC - QRY2 TGCTA---AGCTAG - RES 111111000001110000000001111111111111100 - - alns = [(0, '6M5D3M'), - (23, '5M3D6M')] - - >> segments_to_binary(alns, max_genome_size=38, deletion_size=4) + + Example:: + + POS0 000000000011111111112222222222333333333 + POS1 012345678901234567890123456789012345678 + REF AAAAGACCCCCGACTAGCTAGCATGCTATCTAGCTAGCA + QRY1 AAAAGA-----GAC + QRY2 TGCTA---AGCTAG + RES 111111000001110000000001111111111111100 + + alns = [(0, "6M5D3M"), + (23, "5M3D6M")] + + segments_to_binary(alns, max_genome_size=38, deletion_size=4) """ if mapping is None: @@ -113,14 +115,16 @@ def cigartuples2pairs( def msa2cigartuples(ref_msa: MSA, query_msa: MSA) -> Tuple[int, CigarTuples]: """Given a pair of multiple-sequence alignments return reference_start and cigartuples. - - REF AAAAGACCCCCGACTAGCTAGCATGCT----ATCTAGCTAGCA - QRY ----AACCCCCGAC----TAGCATGCTTTTTATCTAGCT---- - CIGAR MMMMMMMMMMDDDDMMMMMMMMIIIIIMMMMMMMM - - >> ref_start, cigartuples = msa2cigartuples(ref_msa, query_msa) - ref_start = 4 - cigartuples = [(0, 10), (2, 4), (0, 9), (1, 4), (0, 8)] + + Example:: + + REF AAAAGACCCCCGACTAGCTAGCATGCT----ATCTAGCTAGCA + QRY ----AACCCCCGAC----TAGCATGCTTTTTATCTAGCT---- + CIGAR MMMMMMMMMMDDDDMMMMMMMMIIIIIMMMMMMMM + + ref_start, cigartuples = msa2cigartuples(ref_msa, query_msa) + ref_start = 4 + cigartuples = [(0, 10), (2, 4), (0, 9), (1, 4), (0, 8)] """ assert len(ref_msa) == len(query_msa) diff --git a/cigarmath/inference.py b/cigarmath/inference.py index 093fd11..2426943 100644 --- a/cigarmath/inference.py +++ b/cigarmath/inference.py @@ -14,14 +14,16 @@ def inferred_query_sequence_length(cigartuples: CigarTuples) -> int: """Returns the expected length of query_sequence based on cigartuples - POS 01234567890 12345 - REF AAAAGACC--CCC - QRY AAAA-ACCGGCCC - CGS HHHMMMMDMMMIIMMMHHHH - CGT 3H 4M 1D3M 2I 3M 4H - - >>>> inferred_query_sequence_length(cigartuples) - 19 + Example:: + + POS 01234567890 12345 + REF AAAAGACC--CCC + QRY AAAA-ACCGGCCC + CGS HHHMMMMDMMMIIMMMHHHH + CGT 3H 4M 1D3M 2I 3M 4H + + inferred_query_sequence_length(cigartuples) + 19 """ return sum( block_size for bam_num, block_size in cigartuples if bam_num in CONSUMES_QUERY @@ -31,14 +33,16 @@ def inferred_query_sequence_length(cigartuples: CigarTuples) -> int: def inferred_reference_length(cigartuples: CigarTuples) -> int: """Returns the expected length of query_sequence based on cigartuples - POS 01234567890 12345 - REF AAAAGACC--CCC - QRY AAAA-ACCGGCCC - CGS HHHMMMMDMMMIIMMMHHHH - CGT 3H 4M 1D3M 2I 3M 4H + Example:: + + POS 01234567890 12345 + REF AAAAGACC--CCC + QRY AAAA-ACCGGCCC + CGS HHHMMMMDMMMIIMMMHHHH + CGT 3H 4M 1D3M 2I 3M 4H - >>>> inferred_reference_length(cigartuples) - 11 + inferred_reference_length(cigartuples) + 11 """ return sum( block_size diff --git a/cigarmath/iterators.py b/cigarmath/iterators.py index 74a31f2..a26ce5d 100644 --- a/cigarmath/iterators.py +++ b/cigarmath/iterators.py @@ -35,19 +35,20 @@ class CigarIndex: def cigar_iterator(cigartuples: CigarTuples, reference_start: int = 0) -> Iterator[CigarIndex]: """Yields alignment, query, reference, and cigar indexes in alignment order. - - ALNPOS 01234567890 # Index of the entire alignment - RPOS 0123 456789 # Index within the reference - REF AAGA--CTTCGG - CIGAR SMMIIMDDMSS - CIGIND 01122344566 # Index of the cigar block - CBLKIND 001010010 # Index within the cigar block - QRY -xAAGGC--Cxx - QPOS 012345 678 # Index within the query - - - for cigar_index in cigar_iterator(cigartuples, reference_start = 2): - print(cigar_index) + + Example:: + + ALNPOS 01234567890 # Index of the entire alignment + RPOS 0123 456789 # Index within the reference + REF AAGA--CTTCGG + CIGAR SMMIIMDDMSS + CIGIND 01122344566 # Index of the cigar block + CBLKIND 001010010 # Index within the cigar block + QRY -xAAGGC--Cxx + QPOS 012345 678 # Index within the query + + for cigar_index in cigar_iterator(cigartuples, reference_start=2): + print(cigar_index) """ if cigartuples[0][0] in CLIPPING: diff --git a/cigarmath/mapping.py b/cigarmath/mapping.py index 90f49a7..879541a 100644 --- a/cigarmath/mapping.py +++ b/cigarmath/mapping.py @@ -12,19 +12,21 @@ def reference2query(cigartuples: CigarTuples, reference_start: int = 0) -> Iterator[Optional[int]]: """Create a generator the same size as the reference alignment that maps positions in the reference to positions in the query. - - RPOS 0123 456789 # Index within the reference - REF AAGA--CTTCGG - CIGAR SMMIIMDDMSS - QRY -xAAGGC--Cxx - QPOS 012345 678 # Index within the query - - r2q = 12 5NN6 - - cigarstring = 1S2M2I1M2D1M2S - reference_start = 2 - >> r2q = reference2query(cigartuples, reference_start=2) - (1, 2, 5, None, None, 6) + + Example:: + + RPOS 0123 456789 # Index within the reference + REF AAGA--CTTCGG + CIGAR SMMIIMDDMSS + QRY -xAAGGC--Cxx + QPOS 012345 678 # Index within the query + + r2q = 12 5NN6 + + cigarstring = 1S2M2I1M2D1M2S + reference_start = 2 + r2q = reference2query(cigartuples, reference_start=2) + (1, 2, 5, None, None, 6) """ for cig_index in cigar_iterator(cigartuples, reference_start=reference_start): if cig_index.reference_index is not None: @@ -34,18 +36,20 @@ def reference2query(cigartuples: CigarTuples, reference_start: int = 0) -> Itera def query2reference(cigartuples: CigarTuples, reference_start: int = 0) -> Iterator[Optional[int]]: """Create a generator the same size as the query that maps positions in the query to positions in the reference - - RPOS 0123 456789 # Index within the reference - REF AAGA--CTTCGG - CIGAR SMMIIMDDMSS - QRY -xAAGGC--Cxx - QPOS 012345 678 # Index within the query - q2r = N23NN4 7NN - - cigarstring = 1S2M2I1M2D1M2S - reference_start = 2 - >> q2r = query2reference(cigartuples, reference_start=2) - [None, 2, 3, None, None, 4, 7, None, None] + + Example:: + + RPOS 0123 456789 # Index within the reference + REF AAGA--CTTCGG + CIGAR SMMIIMDDMSS + QRY -xAAGGC--Cxx + QPOS 012345 678 # Index within the query + q2r = N23NN4 7NN + + cigarstring = 1S2M2I1M2D1M2S + reference_start = 2 + q2r = query2reference(cigartuples, reference_start=2) + [None, 2, 3, None, None, 4, 7, None, None] """ for cig_index in cigar_iterator(cigartuples, reference_start=reference_start): if cig_index.query_index is not None: @@ -55,19 +59,21 @@ def query2reference(cigartuples: CigarTuples, reference_start: int = 0) -> Itera def query2cigar(cigartuples: CigarTuples, reference_start: int = 0) -> Iterator[Tuple[int, int]]: """Create a generator the same size as the query that maps positions in the query to CIGAR positions. - - REF AAGA--CTTCGG - CIGAR SMMIIMDDMSS - QRY -xAAGGC--Cxx - QPOS 012345 678 # Index within the query - - CIGIND 011223 566 # Index of the cigar block - CBLKIND 001010 001 # Index within the cigar block - - cigarstring = 1S2M2I1M2D1M2S - reference_start = 2 - >> q2c = query2cigar(cigartuples, reference_start=2) - [(0,0), (1,0), (1,1), (2,0), (2,1), (3,0), (5,0), (6,0), (6,1)] + + Example:: + + REF AAGA--CTTCGG + CIGAR SMMIIMDDMSS + QRY -xAAGGC--Cxx + QPOS 012345 678 # Index within the query + + CIGIND 011223 566 # Index of the cigar block + CBLKIND 001010 001 # Index within the cigar block + + cigarstring = 1S2M2I1M2D1M2S + reference_start = 2 + q2c = query2cigar(cigartuples, reference_start=2) + [(0, 0), (1, 0), (1, 1), (2, 0), (2, 1), (3, 0), (5, 0), (6, 0), (6, 1)] """ for cig_index in cigar_iterator(cigartuples, reference_start=reference_start): if cig_index.query_index is not None: diff --git a/cigarmath/pileup.py b/cigarmath/pileup.py index 444202c..6664e7c 100644 --- a/cigarmath/pileup.py +++ b/cigarmath/pileup.py @@ -27,23 +27,24 @@ def depth( Returns: Dict mapping reference positions to Counter of bases at that position - Example: - REF: AAAAGACC--CCC - QRY: AAAA-ACCGGCCC + Example:: + + REF: AAAAGACC--CCC + QRY: AAAA-ACCGGCCC CIGAR: 4M1D3M2I3M - - >>> depth([(0,4), (2,1), (0,3), (1,2), (0,3)], query_sequence="AAAAACCGGCCC") + + depth([(0, 4), (2, 1), (0, 3), (1, 2), (0, 3)], query_sequence="AAAAACCGGCCC") { - 0: Counter({'A': 1}), - 1: Counter({'A': 1}), - 2: Counter({'A': 1}), - 3: Counter({'A': 1}), - 4: Counter({'-': 1}), # Deletion - 5: Counter({'A': 1}), - 6: Counter({'C': 1}), - 7: Counter({'C': 1}), - 8: Counter({'C': 1}), - 9: Counter({'C': 1}), + 0: Counter({"A": 1}), + 1: Counter({"A": 1}), + 2: Counter({"A": 1}), + 3: Counter({"A": 1}), + 4: Counter({"-": 1}), # Deletion + 5: Counter({"A": 1}), + 6: Counter({"C": 1}), + 7: Counter({"C": 1}), + 8: Counter({"C": 1}), + 9: Counter({"C": 1}), } """ # Initialize counts or use previous diff --git a/docs/conf.py b/docs/conf.py index a53f535..54584c4 100755 --- a/docs/conf.py +++ b/docs/conf.py @@ -42,6 +42,9 @@ autosummary_generate = True napoleon_google_docstring = True napoleon_numpy_docstring = True +napoleon_use_admonition_for_examples = False +autodoc_member_order = 'bysource' +autodoc_inherit_docstrings = False # Add any paths that contain templates here, relative to this directory. templates_path = ['_templates'] From 58b00cdc267e1b2297d0390f20a79e1baf9fe359 Mon Sep 17 00:00:00 2001 From: Will Dampier Date: Thu, 23 Apr 2026 12:11:04 -0400 Subject: [PATCH 03/11] fixes for sphinx warnings --- docs/conf.py | 4 ++-- docs/history.rst | 6 +++++- docs/readme.rst | 11 ++++++++++- 3 files changed, 17 insertions(+), 4 deletions(-) diff --git a/docs/conf.py b/docs/conf.py index 54584c4..8410838 100755 --- a/docs/conf.py +++ b/docs/conf.py @@ -77,7 +77,7 @@ # # This is also used if you do content translation via gettext catalogs. # Usually you set "language" from the command line for these cases. -language = None +language = "en" # List of patterns, relative to source directory, that match files and # directories to ignore when looking for source files. @@ -107,7 +107,7 @@ # Add any paths that contain custom static files (such as style sheets) here, # relative to this directory. They are copied after the builtin static files, # so a file named "default.css" will overwrite the builtin "default.css". -html_static_path = ['_static'] +html_static_path = ['_static'] if os.path.isdir('_static') else [] # -- Options for HTMLHelp output --------------------------------------- diff --git a/docs/history.rst b/docs/history.rst index 2506499..d698006 100644 --- a/docs/history.rst +++ b/docs/history.rst @@ -1 +1,5 @@ -.. include:: ../HISTORY.rst +History +======= + +The changelog is maintained in Markdown at the repository root: +`HISTORY.md `_. diff --git a/docs/readme.rst b/docs/readme.rst index 72a3355..2427f5e 100644 --- a/docs/readme.rst +++ b/docs/readme.rst @@ -1 +1,10 @@ -.. include:: ../README.rst +README +====== + +The project README is maintained in Markdown at the repository root: +`README.md `_. + +For installation and usage details in this documentation site, see: + +- :doc:`installation` +- :doc:`usage` From 0f75594c4b0a34d06d390f31a715dbc9058632f9 Mon Sep 17 00:00:00 2001 From: Will Dampier Date: Thu, 23 Apr 2026 12:13:27 -0400 Subject: [PATCH 04/11] unpinning --- requirements_dev.txt | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/requirements_dev.txt b/requirements_dev.txt index ebb3f1d..c1b31d3 100644 --- a/requirements_dev.txt +++ b/requirements_dev.txt @@ -1,12 +1,12 @@ -pip==19.2.3 -bump2version==0.5.11 -wheel==0.33.6 -watchdog==0.9.0 -flake8==3.7.8 -tox==3.14.0 -coverage==4.5.4 -Sphinx==1.8.5 -twine==1.14.0 +pip +bump2version +wheel +watchdog +flake8 +tox +coverage +Sphinx +twine -pytest==6.2.4 -black==21.7b0 +pytest +black \ No newline at end of file From 141913aba99bc0abf0548c3e3485f5ff87c8475b Mon Sep 17 00:00:00 2001 From: Will Dampier Date: Thu, 23 Apr 2026 12:15:21 -0400 Subject: [PATCH 05/11] pointing at correct repo --- CONTRIBUTING.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CONTRIBUTING.rst b/CONTRIBUTING.rst index e382444..aba889b 100644 --- a/CONTRIBUTING.rst +++ b/CONTRIBUTING.rst @@ -15,7 +15,7 @@ Types of Contributions Report Bugs ~~~~~~~~~~~ -Report bugs at https://github.com/judowill/cigarmath/issues. +Report bugs at https://github.com/DamLabResources/cigarmath/issues. If you are reporting a bug, please include: From 33f20805668c3b48e29c029c8f4598996fcf646d Mon Sep 17 00:00:00 2001 From: Will Dampier Date: Thu, 23 Apr 2026 14:14:04 -0400 Subject: [PATCH 06/11] adding cigar explanation and basic usage --- docs/cigar.rst | 87 ++++++++++++++++++++ docs/index.rst | 1 + docs/usage.rst | 217 ++++++++++++++++++++++++++++++++++++++++++++++++- 3 files changed, 303 insertions(+), 2 deletions(-) create mode 100644 docs/cigar.rst diff --git a/docs/cigar.rst b/docs/cigar.rst new file mode 100644 index 0000000..cf85fb6 --- /dev/null +++ b/docs/cigar.rst @@ -0,0 +1,87 @@ +CIGAR Format +============ + +CIGAR stands for Compact Idiosyncratic Gapped Alignment Report and is the +alignment edit string used in SAM/BAM files. A CIGAR encodes how a query/read +maps to a reference as a sequence of ```` blocks. + +Examples: + +- ``10M``: 10 aligned positions (match/mismatch in SAM ``M`` semantics) +- ``4M1D3M2I3M``: aligned block, deletion, aligned block, insertion, aligned block +- ``3H4M1D3M2I3M4H``: hard clipping at both ends around the mapped region + + +Where CIGAR Is Used +------------------- + +- SAM/BAM ``CIGAR`` field in read alignments. +- Coordinate conversion tools (query <-> reference position mapping). +- Coverage/pileup calculations and deletion/insertion-aware interval logic. +- Post-processing split alignments and trimming/merging alignments. + +In this project, CIGAR operations are primarily represented as ``cigartuples``: + +.. code-block:: python + + import cigarmath as cm + + cigartuples = cm.cigarstr2tup("3H4M1D3M2I3M4H") + # [(5, 3), (0, 4), (2, 1), (0, 3), (1, 2), (0, 3), (5, 4)] + + cm.cigartup2str(cigartuples) + # "3H4M1D3M2I3M4H" + + +Operation Semantics +------------------- + +The core operations used by SAM/BAM and ``cigarmath``: + ++-----------+------------------------------+----------------+---------------------+ +| Op letter | Meaning | Consumes query | Consumes reference | ++===========+==============================+================+=====================+ +| ``M`` | Alignment match/mismatch | Yes | Yes | ++-----------+------------------------------+----------------+---------------------+ +| ``I`` | Insertion to reference | Yes | No | ++-----------+------------------------------+----------------+---------------------+ +| ``D`` | Deletion from reference | No | Yes | ++-----------+------------------------------+----------------+---------------------+ +| ``N`` | Reference skip | No | Yes | ++-----------+------------------------------+----------------+---------------------+ +| ``S`` | Soft clipping | Yes | No | ++-----------+------------------------------+----------------+---------------------+ +| ``H`` | Hard clipping | No | No | ++-----------+------------------------------+----------------+---------------------+ +| ``P`` | Padding | No | No | ++-----------+------------------------------+----------------+---------------------+ +| ``=`` | Sequence match | Yes | Yes | ++-----------+------------------------------+----------------+---------------------+ +| ``X`` | Sequence mismatch | Yes | Yes | ++-----------+------------------------------+----------------+---------------------+ + +In ``cigarmath``, operation codes are integer indices into this operation list +(``M=0, I=1, D=2, N=3, S=4, H=5, P=6, ==7, X=8, B=9``), exposed in +``cigarmath.defn`` via constants like ``BAM_CMATCH``, ``BAM_CDEL``, and +``BAM_CSOFT_CLIP``. + + +Conventions in cigarmath +------------------------ + +- Coordinates are 0-based. +- Many coordinate mapping methods may return ``None`` at deleted or skipped positions. +- Clipping-aware operations are available (``left_clipping``, ``right_clipping``, + ``declip``) to control whether clipped sequence contributes to downstream logic. +- ``M`` is treated as generic aligned positions (matches/mismatches), while + extended semantics can use ``=`` and ``X``. + + +Canonical References +-------------------- + +- SAM format specification: https://samtools.github.io/hts-specs/SAMv1.pdf +- pysam ``cigartuples`` notes: + https://pysam.readthedocs.io/en/latest/api.html#pysam.AlignedSegment.cigartuples + +For practical examples using these semantics, continue to :doc:`usage`. diff --git a/docs/index.rst b/docs/index.rst index 05f07a1..382d9c6 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -7,6 +7,7 @@ Welcome to Cigar Math's documentation! readme installation + cigar usage rearrangements api diff --git a/docs/usage.rst b/docs/usage.rst index 305b5ad..3014178 100644 --- a/docs/usage.rst +++ b/docs/usage.rst @@ -2,6 +2,219 @@ Usage ===== -To use Cigar Math in a project:: +This page focuses on practical workflows for working with CIGAR strings and +``cigartuples`` in Python. - import cigarmath +A CIGAR string compactly describes how a query aligns to a reference (for +example ``3H4M1D3M2I3M4H``). In ``cigarmath``, most operations use +``cigartuples`` represented as ``(operation_code, length)`` pairs such as +``[(5, 3), (0, 4), (2, 1), ...]``. For a detailed explanation of the CIGAR +format and operation semantics, see :doc:`cigar`. + +The examples below use top-level imports: + +.. code-block:: python + + import cigarmath as cm + + +Quick Start +----------- + +Parse a CIGAR string and get the mapped reference interval. + +.. code-block:: python + + cigartuples = cm.cigarstr2tup("3H4M1D3M2I3M4H") + reference_start = 3 + cm.reference_block(cigartuples, reference_start) + +.. code-block:: text + + (3, 14) + +Use this when you need a fast way to move from SAM-style CIGAR text to reference coordinates. + + +Coordinate Basics +----------------- + +Get both reference and query intervals from the same alignment. + +.. code-block:: python + + cigartuples = cm.cigarstr2tup("3H4M1D3M2I3M4H") + ref_block = cm.reference_block(cigartuples, reference_start=3) + qry_block = cm.query_block(cigartuples) + qry_start = cm.query_start(cigartuples) + ref_block, qry_block, qry_start + +.. code-block:: text + + ((3, 14), (3, 15), 3) + +Use this for coordinate bookkeeping before overlap, pileup, or interval conversion. + + +Clipping and Cleanup +-------------------- + +Inspect clipping and remove it before downstream operations. + +.. code-block:: python + + cigartuples = cm.cigarstr2tup("3H4M1D3M2I3M4H") + left = cm.left_clipping(cigartuples) + right = cm.right_clipping(cigartuples) + unclipped = cm.declip(cigartuples) + hard = cm.is_hard_clipped(cigartuples) + left, right, unclipped, hard + +.. code-block:: text + + (3, 4, [(0, 4), (2, 1), (0, 3), (1, 2), (0, 3)], True) + +Use this when clipping would otherwise shift query coordinates unexpectedly. + + +Mapping and Liftover +-------------------- + +Map positions between reference and query spaces. + +.. code-block:: python + + cigartuples = cm.cigarstr2tup("3H4M1D3M2I3M4H") + r2q = list(cm.reference2query(cigartuples, reference_start=3)) + q2r = list(cm.query2reference(cigartuples, reference_start=3)) + lift = list(cm.liftover(cigartuples, 3, 7, 8, 12, reference_start=3)) + r2q, q2r[:10], lift + +.. code-block:: text + + ([3, 4, 5, 6, None, 7, 8, 9, 12, 13, 14], + [None, None, None, 3, 4, 5, 6, 8, 9, 10], + [3, 6, 7, 13]) + +`None` indicates unmapped positions (for example, deletions in one coordinate system). + + +Block Splitting and Overlap +--------------------------- + +Split alignments into mapped blocks around deletions and detect deletion intervals. + +.. code-block:: python + + cigartuples = cm.cigarstr2tup("3H4M1D3M2I3M4H") + mapped = list(cm.reference_mapping_blocks(cigartuples, reference_start=3, deletion_split=1)) + deletions = list(cm.reference_deletion_blocks(cigartuples, reference_start=3, min_size=1)) + overlap = cm.block_overlap_length((3, 14), (10, 20)) + mapped, deletions, overlap + +.. code-block:: text + + ([(3, 7), (8, 14)], [(7, 8)], 4) + +Use this to create interval-style representations from per-read alignments. + + +Combining and Trimming Alignments +--------------------------------- + +Trim query bases or merge neighboring alignments. + +.. code-block:: python + + trimmed = cm.trim_alignment( + 10, + ((0, 5), (2, 1), (0, 3)), + left=2, + right=1, + add_clipping="soft", + ) + + combined = cm.combine_adjacent_alignments( + (4, ((0, 4), (2, 1), (0, 3), (1, 2), (0, 3))), + (12, ((0, 3), (0, 2))), + ) + + trimmed, combined + +.. code-block:: text + + ((12, ((4, 2), (0, 3), (2, 1), (0, 2), (4, 1))), + (4, ((0, 4), (2, 1), (0, 3), (1, 2), (0, 5)))) + +Use this when reconciling split or partially overlapping alignments. + + +Pileup and Base Counts +---------------------- + +Count per-position bases directly from CIGARs. + +.. code-block:: python + + counts = cm.depth( + [(0, 4), (2, 1), (0, 3), (1, 2), (0, 3)], + query_sequence="AAAAACCGGCCC", + ) + counts[0], counts[4], counts[9] + +.. code-block:: text + + (Counter({'A': 1}), Counter({'-': 1}), Counter({'C': 1})) + +Deletions are represented by `'-'` at reference positions. + + +Conversion Utilities +-------------------- + +Convert between CIGAR text, tuples, pair maps, and MSA-style strings. + +.. code-block:: python + + cigar = cm.cigarstr2tup("1S2M2I1M2D1M2S") + cigar_text = cm.cigartup2str(cigar) + pairs = cm.cigartuples2pairs(cigar, reference_start=2)[:8] + + ref_msa = "AAAAGACCCCCGACTAGCTAGCATGCT----ATCTAGCTAGCA" + qry_msa = "----AACCCCCGAC----TAGCATGCTTTTTATCTAGCT----" + msa_start, msa_cigars = cm.msa2cigartuples(ref_msa, qry_msa) + + cigar_text, pairs, (msa_start, msa_cigars) + +.. code-block:: text + + ('1S2M2I1M2D1M2S', + [(0, None), (1, 2), (2, 3), (3, 3), (4, 3), (5, 4), (6, 5), (6, 6)], + (4, [(0, 10), (2, 4), (0, 9), (1, 4), (0, 8)])) + +Use these helpers when moving between aligner outputs and downstream analysis formats. + + +I/O Integration (Optional) +-------------------------- + +The `cigarmath.io` module can stream and combine alignments from SAM/BAM via `pysam`. +Install the optional dependency first; see :doc:`installation`. + +Common entry points: + +.. code-block:: python + + from cigarmath import io + # io.segment_stream_pysam(...) + # io.combined_segment_stream(...) + + +Common Pitfalls and Tips +------------------------ + +- Coordinates are 0-based throughout this library. +- `reference2query` and `query2reference` can return `None` at gaps/deletions. +- Run `declip(...)` when clipped ends are not part of your downstream coordinate model. +- For long deletions, tune `deletion_split` in `reference_mapping_blocks`. +- `combine_multiple_alignments` expects compatible query ordering and overlap behavior. From 491f8f31858e6f9703a6db1f38f5960cd2a6eb6d Mon Sep 17 00:00:00 2001 From: Will Dampier Date: Thu, 23 Apr 2026 14:22:43 -0400 Subject: [PATCH 07/11] updating old tox --- tox.ini | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/tox.ini b/tox.ini index ac01bb0..c94377a 100644 --- a/tox.ini +++ b/tox.ini @@ -1,11 +1,13 @@ [tox] -envlist = py36, py37, py38, flake8 +envlist = py38, py39, py310, py311, py312, flake8 [travis] python = + 3.12: py312 + 3.11: py311 + 3.10: py310 + 3.9: py39 3.8: py38 - 3.7: py37 - 3.6: py36 [testenv:flake8] basepython = python From 032dd866564ebe82f2b1ae89b1ba42787b884e90 Mon Sep 17 00:00:00 2001 From: Will Dampier Date: Thu, 23 Apr 2026 14:56:22 -0400 Subject: [PATCH 08/11] flake8 massive fix --- cigarmath/__init__.py | 43 +++++++- cigarmath/block.py | 17 +-- cigarmath/cigarmath.py | 15 --- cigarmath/clipping.py | 26 +++-- cigarmath/combine.py | 84 +++++++------- cigarmath/conversions.py | 119 ++++++++++---------- cigarmath/defn.py | 19 ++-- cigarmath/inference.py | 1 + cigarmath/io.py | 20 ++-- cigarmath/iterators.py | 74 ++++++------- cigarmath/mapping.py | 4 +- cigarmath/pileup.py | 15 +-- pyproject.toml | 3 + setup.cfg | 4 +- tests/test_aligned_pairs.py | 24 ++-- tests/test_binary_map.py | 51 ++++----- tests/test_block.py | 24 ++-- tests/test_cigarmath.py | 3 - tests/test_clipping.py | 12 +- tests/test_combine.py | 106 +++++++++--------- tests/test_inference.py | 4 - tests/test_io.py | 33 +++--- tests/test_iterators.py | 211 +++++++++++++++++------------------- tests/test_mapping.py | 22 ++-- tests/test_msa.py | 87 +++++++-------- tests/test_pileup.py | 54 +++++---- 26 files changed, 558 insertions(+), 517 deletions(-) diff --git a/cigarmath/__init__.py b/cigarmath/__init__.py index e020611..69a44f2 100644 --- a/cigarmath/__init__.py +++ b/cigarmath/__init__.py @@ -33,7 +33,6 @@ from .conversions import cigartuples2pairs from .conversions import msa2cigartuples -from .conversions import softclipify from .cigarmath import collapse_adjacent_blocks @@ -55,4 +54,44 @@ from .rearrangement import format_read_rearrangement_summary from .rearrangement import rearrangement_segment_stream from .rearrangement import reference_lengths_from_pysam_header -from .rearrangement import RearrangementEvent \ No newline at end of file +from .rearrangement import RearrangementEvent + +__all__ = [ + "left_clipping", + "right_clipping", + "declip", + "is_hard_clipped", + "softclipify", + "reference_offset", + "reference_block", + "query_start", + "query_offset", + "query_block", + "block_overlap_length", + "reference_mapping_blocks", + "reference_deletion_blocks", + "inferred_query_sequence_length", + "inferred_reference_length", + "cigarstr2tup", + "cigartup2str", + "io", + "segments_to_binary", + "cigartuples2pairs", + "msa2cigartuples", + "collapse_adjacent_blocks", + "reference2query", + "query2reference", + "cigar_iterator", + "cigar_iterator_reference_slice", + "liftover", + "iterator_attach", + "combine_multiple_alignments", + "combine_adjacent_alignments", + "trim_alignment", + "depth", + "infer_rearrangements", + "format_read_rearrangement_summary", + "rearrangement_segment_stream", + "reference_lengths_from_pysam_header", + "RearrangementEvent", +] diff --git a/cigarmath/block.py b/cigarmath/block.py index f9c1b7b..a3a0486 100644 --- a/cigarmath/block.py +++ b/cigarmath/block.py @@ -17,6 +17,7 @@ ) from cigarmath.clipping import left_clipping + def reference_offset(cigartuples: CigarTuples) -> int: """Calculate the length of the reference mapping block based on cigartuples. @@ -99,7 +100,7 @@ def query_start(cigartuples: CigarTuples) -> int: query_start(cigartuples) 3 """ - + return left_clipping(cigartuples) @@ -156,8 +157,8 @@ def block_overlap_length(block_a: Tuple[int, int], block_b: Tuple[int, int]) -> def reference_mapping_blocks( - cigartuples: CigarTuples, - reference_start: int = 0, + cigartuples: CigarTuples, + reference_start: int = 0, deletion_split: int = 10 ) -> Iterator[Tuple[int, int]]: """Yield (reference_start, reference_stop) blocks of mapped sites split by deletions. @@ -173,22 +174,22 @@ def reference_mapping_blocks( (3, 16) (22, 26) """ - + del_ops = {BAM_CDEL, BAM_CREF_SKIP} - + left, right = reference_start, reference_start for op, sz in cigartuples: if (op in del_ops) and (sz >= deletion_split): yield (left, right) left = right + sz right += sz * (op in CONSUMES_REFERENCE) - + yield left, right def reference_deletion_blocks( - cigartuples: CigarTuples, - reference_start: int = 0, + cigartuples: CigarTuples, + reference_start: int = 0, min_size: int = 1 ) -> Iterator[Tuple[int, int]]: """Yield (reference_start, reference_stop) blocks of deletions larger than minimum size diff --git a/cigarmath/cigarmath.py b/cigarmath/cigarmath.py index 5a70cbc..1d7dabe 100644 --- a/cigarmath/cigarmath.py +++ b/cigarmath/cigarmath.py @@ -5,21 +5,6 @@ All rights reserved""" __author__ = "Will Dampier, PhD" -from cigarmath.defn import ( - BAM_CSOFT_CLIP, - BAM_CHARD_CLIP, - NTS, - BAM_CDEL, - BAM_CREF_SKIP, - CIGAR_HDRS, - CIGAR2BAM, - CONSUMES_REFERENCE, - CONSUMES_QUERY, -) - - - - def simplify_blocks(cigartuples, collapse=True): """Replace extended cigars (= and X) with M. diff --git a/cigarmath/clipping.py b/cigarmath/clipping.py index e395ed0..3b48c0e 100644 --- a/cigarmath/clipping.py +++ b/cigarmath/clipping.py @@ -5,7 +5,7 @@ All rights reserved""" __author__ = "Will Dampier, PhD" -from typing import Tuple, List, Union, TypeVar, Any +from typing import Tuple, Union, TypeVar from cigarmath.defn import ( CigarTuples, BAM_CSOFT_CLIP, @@ -19,6 +19,7 @@ T = TypeVar('T') # For generic types in declip function + def left_clipping(cigartuples: CigarTuples, with_hard: bool = True) -> int: """Returns the length of clipped bases (hard or soft) on the left side of the alignment @@ -86,17 +87,17 @@ def declip(cigartuples: CigarTuples, *args: T) -> CigarTuples: right_clip = (cigartuples[-1][0] == BAM_CSOFT_CLIP) | ( cigartuples[-1][0] == BAM_CHARD_CLIP ) - + cigarstart = 1 if left_clip else 0 cigarend = -1 if right_clip else None - + if args: clipstart = cigartuples[0][1] if left_clip else 0 clipend = -cigartuples[-1][1] if right_clip else None print(clipstart, clipend) clipped_args = [items[clipstart:clipend] for items in args] return cigartuples[cigarstart:cigarend], *clipped_args - + return cigartuples[cigarstart:cigarend] @@ -130,20 +131,22 @@ def softclipify(cigartuples: CigarTuples, required_mapping: int = 1) -> Tuple[Ci OUT SS MMMMMMMMMMDDDDMMSSS required_mapping = 1 OUT SS MMMMMMMMMM SSSSS required_mapping = 4 """ - left_ind, left_soft_sz = _decide_softclip_end(cigartuples, required_mapping) - right_ind, right_soft_sz = _decide_softclip_end(cigartuples[::-1], required_mapping) + left_ind, left_soft_sz = _decide_softclip_end( + cigartuples, required_mapping) + right_ind, right_soft_sz = _decide_softclip_end( + cigartuples[::-1], required_mapping) offset = 0 if left_soft_sz: for op, sz in cigartuples[:left_ind]: if op in CONSUMES_REFERENCE: offset += sz - + left = [(BAM_CSOFT_CLIP, left_soft_sz)] if left_soft_sz else [] right = [(BAM_CSOFT_CLIP, right_soft_sz)] if right_soft_sz else [] - + middle_slc = slice(left_ind, -right_ind or None) - + return left+cigartuples[middle_slc]+right, offset @@ -160,9 +163,9 @@ def _decide_softclip_end(cigartuples: CigarTuples, required_mapping: int) -> Tup (BAM_CDEL, 4), (BAM_CMATCH, 2), (BAM_CINS, 3)] """ MAPPING_OP = set([BAM_CMATCH, BAM_CEQUAL, BAM_CDIFF]) - + soft_sz = 0 - + for num, (op, sz) in enumerate(cigartuples): if (op in MAPPING_OP) and (sz >= required_mapping): return num, soft_sz @@ -172,5 +175,4 @@ def _decide_softclip_end(cigartuples: CigarTuples, required_mapping: int) -> Tup return None, None - # Copyright (C) 2022-present, Dampier & DV Klopfenstein, PhD. All rights reserved diff --git a/cigarmath/combine.py b/cigarmath/combine.py index c7a020a..86e4fdc 100644 --- a/cigarmath/combine.py +++ b/cigarmath/combine.py @@ -7,19 +7,20 @@ from cigarmath.cigarmath import collapse_adjacent_blocks from cigarmath.clipping import declip + def combine_adjacent_alignments( first: Tuple[int, CigarTuples], second: Tuple[int, CigarTuples] ) -> Tuple[int, CigarTuples]: """Combine two adjacent alignments, adding deletion if there's a gap or trimming if there's overlap. - + Args: first: Tuple of (reference_start, cigartuples) for first alignment second: Tuple of (reference_start, cigartuples) for second alignment - + Returns: Tuple of (reference_start, combined_cigartuples) - + Example:: First alignment: @@ -39,32 +40,32 @@ def combine_adjacent_alignments( """ first_start, first_cigars = first[0], declip(first[1]) second_start, second_cigars = second[0], declip(second[1]) - + assert first_start < second_start, 'Can only combine adjacent alignments' # Get the reference blocks to check for overlap/gap first_ref_start, first_ref_end = reference_block(first_cigars, first_start) - + # Calculate gap/overlap size gap_size = second_start - first_ref_end - + if gap_size > 0: # There's a gap - add deletion operation between alignments combined_cigars = list(first_cigars) combined_cigars.append((BAM_CDEL, gap_size)) combined_cigars.extend(second_cigars) return first_start, tuple(collapse_adjacent_blocks(combined_cigars)) - + elif gap_size < 0: # There's an overlap - need to trim second alignment # Get query coordinates to determine where to trim first_q_start, first_q_end = query_block(first_cigars) second_q_start, second_q_end = query_block(second_cigars) - + # Calculate how many query bases to trim from second alignment # This is based on how many reference bases overlap overlap = -gap_size - + # Trim the overlapping portion from the start of second alignment trimmed_start, trimmed_cigars = trim_alignment( second_start, @@ -73,46 +74,48 @@ def combine_adjacent_alignments( ) print('second cigars', second_cigars, 'trimming', overlap) print('trimmed cigars', trimmed_cigars) - + # Combine the alignments combined_cigars = list(first_cigars) combined_cigars.extend(trimmed_cigars) return first_start, tuple(collapse_adjacent_blocks(combined_cigars)) - + else: # Alignments are perfectly adjacent combined_cigars = list(first_cigars) combined_cigars.extend(second_cigars) return first_start, tuple(collapse_adjacent_blocks(combined_cigars)) + def _trim( cigartuples: CigarTuples, trim_amount: int, from_start: bool = True ) -> Tuple[CigarTuples, int]: """Helper function to trim query bases from either end of alignment. - + Args: cigartuples: List of (operation, length) tuples trim_amount: Number of query bases to trim from_start: If True, trim from start, if False trim from end - + Returns: Tuple of (trimmed_cigartuples, reference_position_delta) """ if trim_amount == 0: return cigartuples, 0 - + new_tuples = [] ref_pos_delta = 0 to_trim = trim_amount - + # Reverse tuples if trimming from end tuples = reversed(cigartuples) if not from_start else cigartuples - + # Get the appropriate method for appending or inserting - extend_func = partial(new_tuples.append) if from_start else partial(new_tuples.insert, 0) - + extend_func = partial(new_tuples.append) if from_start else partial( + new_tuples.insert, 0) + for op, length in tuples: if to_trim > 0: if op in CONSUMES_QUERY: @@ -126,7 +129,7 @@ def _trim( # Partial operation new_length = length - to_trim extend_func((op, new_length)) - + if op in CONSUMES_REFERENCE: ref_pos_delta += to_trim to_trim = 0 @@ -137,7 +140,7 @@ def _trim( else: # Past trimming extend_func((op, length)) - + return tuple(new_tuples), ref_pos_delta @@ -149,17 +152,17 @@ def trim_alignment( add_clipping: Optional[str] = None ) -> Tuple[int, CigarTuples]: """Trim query bases from left and/or right of alignment. - + Args: ref_start: Starting position on reference cigartuples: List of (operation, length) tuples left: Number of query bases to trim from left right: Number of query bases to trim from right add_clipping: If 'soft' or 'hard', add clipping operations to maintain query length - + Returns: Tuple of (new_ref_start, trimmed_cigartuples) - + Example:: ref_start = 10 @@ -172,22 +175,22 @@ def trim_alignment( """ if (left == 0) and (right == 0): return ref_start, cigartuples - + if add_clipping not in (None, 'soft', 'hard'): raise ValueError("add_clipping must be None, 'soft', or 'hard'") - + # Handle left trimming first trimmed_cigars, ref_delta = _trim(cigartuples, left, from_start=True) - + # Then right trimming if needed if right > 0: trimmed_cigars, _ = _trim(trimmed_cigars, right, from_start=False) - + # Add clipping if requested if add_clipping: from cigarmath.defn import BAM_CSOFT_CLIP, BAM_CHARD_CLIP clip_op = BAM_CSOFT_CLIP if add_clipping == 'soft' else BAM_CHARD_CLIP - + result = [] if left > 0: result.append((clip_op, left)) @@ -195,25 +198,26 @@ def trim_alignment( if right > 0: result.append((clip_op, right)) trimmed_cigars = tuple(result) - + return ref_start + ref_delta, trimmed_cigars + def combine_multiple_alignments( alignments: List[Tuple[int, CigarTuples]], allowed_overlap: int = 0 ) -> Tuple[int, CigarTuples]: """Combine multiple alignments after sorting by query position and validating. - + Args: alignments: List of (reference_start, cigartuples) tuples allowed_overlap: Maximum number of query bases that can overlap between adjacent alignments - + Returns: Tuple of (reference_start, combined_cigartuples) - + Raises: ValueError: If alignments overlap more than allowed or are not sequential - + Example:: alignments = [ @@ -229,22 +233,22 @@ def combine_multiple_alignments( raise ValueError("No alignments provided") if len(alignments) == 1: return alignments[0] - + # Sort alignments by query start position sorted_alns = sorted( alignments, key=lambda x: query_block(x[1])[0] ) - + # Validate that alignments don't overlap more than allowed # and that they are sequential in reference space prev_query_end = None prev_ref_end = None - + for ref_start, cigars in sorted_alns: q_start, q_end = query_block(cigars) r_start, r_end = reference_block(cigars, ref_start) - + if prev_query_end is not None: overlap = prev_query_end - q_start if (prev_query_end - q_start) > allowed_overlap: @@ -260,13 +264,13 @@ def combine_multiple_alignments( f"Previous alignment ends at {prev_ref_end}, " f"but next alignment starts at {r_start}" ) - + prev_query_end = q_end prev_ref_end = r_end - + # Combine alignments sequentially result = sorted_alns[0] for next_aln in sorted_alns[1:]: result = combine_adjacent_alignments(result, next_aln) - + return result diff --git a/cigarmath/conversions.py b/cigarmath/conversions.py index 2b07e04..99a9bac 100644 --- a/cigarmath/conversions.py +++ b/cigarmath/conversions.py @@ -5,14 +5,12 @@ All rights reserved""" __author__ = "Will Dampier, PhD" -from typing import List, Tuple, Iterator, Optional, Union -from cigarmath.block import reference_block +from typing import List, Tuple, Optional from cigarmath.block import reference_mapping_blocks from cigarmath.clipping import left_clipping from cigarmath.clipping import right_clipping from cigarmath.clipping import declip from cigarmath.clipping import softclipify -from cigarmath.cigarmath import collapse_adjacent_blocks from cigarmath.defn import ( CigarTuples, CONSUMES_REFERENCE, @@ -22,7 +20,6 @@ BAM_CDEL, BAM_CEQUAL, BAM_CDIFF, - BAM_CSOFT_CLIP, ) # Type aliases @@ -30,10 +27,11 @@ AlignmentPair = Tuple[int, Optional[int]] MSA = str # Multiple Sequence Alignment string + def segments_to_binary( - alns: List[AlignmentSegment], - max_genome_size: int = 10_000, - deletion_size: int = 50, + alns: List[AlignmentSegment], + max_genome_size: int = 10_000, + deletion_size: int = 50, mapping: Optional[List[bool]] = None ) -> List[bool]: """Given a list of segments create a binary array of all covered regions. @@ -52,64 +50,77 @@ def segments_to_binary( segments_to_binary(alns, max_genome_size=38, deletion_size=4) """ - + if mapping is None: mapping = [False] * max_genome_size - + for start, cigar in alns: block_iter = reference_mapping_blocks(cigar, - reference_start=start, - deletion_split=deletion_size) + reference_start=start, + deletion_split=deletion_size) for ref_start, ref_stop in block_iter: mapping[ref_start:ref_stop] = [True] * (ref_stop-ref_start) - + return mapping def cigartuples2pairs( - cigartuples: CigarTuples, - reference_start: int = 0, - verbose: bool = False, + cigartuples: CigarTuples, + reference_start: int = 0, + verbose: bool = False, clipping_fill: Optional[int] = None ) -> List[AlignmentPair]: - + reference: List[Optional[int]] = [] query: List[int] = [] - + left_clip = left_clipping(cigartuples) if left_clip: - if verbose: print('Adding left clipping', left_clip) + if verbose: + print('Adding left clipping', left_clip) query += list(range(left_clip+1)) reference = [clipping_fill] * left_clip - if verbose: print('Currently', list(zip(query, reference))) - + if verbose: + print('Currently', list(zip(query, reference))) + for op, sz in declip(cigartuples): qstart = query[-1] if query else -1 - if verbose: print('Considering', op, sz) + if verbose: + print('Considering', op, sz) if op in CONSUMES_QUERY: - if verbose: print('Consumes query, adding', list(range(qstart+1, qstart+sz+1))) + if verbose: + print('Consumes query, adding', list( + range(qstart+1, qstart+sz+1))) query += list(range(qstart+1, qstart+sz+1)) else: - if verbose: print('Skips query, adding', [qstart]*sz) + if verbose: + print('Skips query, adding', [qstart]*sz) query += [query[-1]]*sz - - rstart = reference[-1] if (reference and reference[-1]) else reference_start-1 + + rstart = reference[-1] if (reference and reference[-1] + ) else reference_start-1 if op in CONSUMES_REFERENCE: - if verbose: print('Consumes Ref, adding', list(range(rstart+1, rstart+sz+1))) + if verbose: + print('Consumes Ref, adding', list( + range(rstart+1, rstart+sz+1))) reference += list(range(rstart+1, rstart+sz+1)) else: - if verbose: print('Skips Ref, adding', [rstart]*sz) + if verbose: + print('Skips Ref, adding', [rstart]*sz) reference += [rstart]*sz - - if verbose: print('Currently', list(zip(query, reference))) - + + if verbose: + print('Currently', list(zip(query, reference))) + right_clip = right_clipping(cigartuples) if right_clip: - if verbose: print('Adding right clipping:', right_clip) + if verbose: + print('Adding right clipping:', right_clip) query += list(range(query[-1]+1, query[-1]+right_clip+1)) reference += [clipping_fill]*right_clip - if verbose: print('Currently', list(zip(query, reference))) - + if verbose: + print('Currently', list(zip(query, reference))) + return list(zip(query, reference)) @@ -126,18 +137,19 @@ def msa2cigartuples(ref_msa: MSA, query_msa: MSA) -> Tuple[int, CigarTuples]: ref_start = 4 cigartuples = [(0, 10), (2, 4), (0, 9), (1, 4), (0, 8)] """ - + assert len(ref_msa) == len(query_msa) - + cigartuples: CigarTuples = [] cur_op: Optional[int] = None cur_sz = 0 reference_start: Optional[int] = None offset = 0 query_started = False - - non_double = ((r, q) for r, q in zip(ref_msa, query_msa) if (r!='-') | (q!='-')) - + + non_double = ((r, q) for r, q in zip( + ref_msa, query_msa) if (r != '-') | (q != '-')) + for n, (r, q) in enumerate(non_double): query_started |= q != '-' # Don't start until the first non-gap query character @@ -157,37 +169,30 @@ def msa2cigartuples(ref_msa: MSA, query_msa: MSA) -> Tuple[int, CigarTuples]: cur_op, cur_sz = this_op, 1 else: offset += 1 - + if cur_op is not None: cigartuples.append((cur_op, cur_sz)) - - collapsed_tuples = collapse_adjacent_blocks(cigartuples) - softclipped_tuples, clip_offset = softclipify(cigartuples, required_mapping=1) + + softclipped_tuples, clip_offset = softclipify( + cigartuples, required_mapping=1) assert reference_start is not None return reference_start + clip_offset, softclipped_tuples - - - def cigartuples2msa( - reference: str, - query: str, - reference_start: int, + reference: str, + query: str, + reference_start: int, cigartuples: CigarTuples ) -> Tuple[MSA, MSA]: """Converts a cigartuple and reference start into a Multiple Sequence Alignment""" raise NotImplementedError - - - - - - + + def _decide_op(ref_letter: str, query_letter: str, extended: bool = False) -> int: """Determines the cigarop from reference and query MSA positions""" - + if (ref_letter == '-') & (query_letter == '-'): # Both are gaps return None @@ -203,5 +208,5 @@ def _decide_op(ref_letter: str, query_letter: str, extended: bool = False) -> in elif ref_letter != query_letter: # mismatch return BAM_CDIFF if extended else BAM_CMATCH - - raise AssertionError('Did not expect to get here') \ No newline at end of file + + raise AssertionError('Did not expect to get here') diff --git a/cigarmath/defn.py b/cigarmath/defn.py index d582c50..6176dcd 100644 --- a/cigarmath/defn.py +++ b/cigarmath/defn.py @@ -16,17 +16,21 @@ # --- ----------------------------------------------------- ------ ----- NTS = [ NTO._make( - ["M", "alignment match (can be a sequence match or mismatch)", True, True] + ["M", + "alignment match (can be a sequence match or mismatch)", True, True] ), NTO._make(["I", "insertion to the reference", True, False]), NTO._make(["D", "deletion form the reference", False, True]), NTO._make(["N", "skipped region from the reference", False, True]), - NTO._make(["S", "soft clipping (clipped sequences present in SEQ)", True, False]), NTO._make( - ["H", "hard clipping (clipped sequences NOT present in SEQ)", False, False] + ["S", "soft clipping (clipped sequences present in SEQ)", True, False]), + NTO._make( + ["H", + "hard clipping (clipped sequences NOT present in SEQ)", False, False] ), NTO._make( - ["P", "padding (silent deletion from the padded reference)", False, False] + ["P", "padding (silent deletion from the padded reference)", + False, False] ), NTO._make(["=", "sequnce match", True, True]), NTO._make(["X", "sequence mismatch", True, True]), @@ -41,7 +45,8 @@ ] NtoRQC = namedtuple("CigRQC", "Op consumes_ref consumes_query") -NTS_RQC = [NtoRQC._make([nt.Op, nt.consumes_ref, nt.consumes_query]) for nt in NTS] +NTS_RQC = [NtoRQC._make([nt.Op, nt.consumes_ref, nt.consumes_query]) + for nt in NTS] def get_ntrqc_qty(cigartuples): @@ -73,10 +78,10 @@ def cigarstr2tup(cigarstring): return None cigartuples = [] pta = 0 - ##print(f'CIGAR({cigarstring})') + # print(f'CIGAR({cigarstring})') for idx, letter in enumerate(cigarstring): if letter in CIGAR_SET: - ##print(f'LETTER({letter}) VAL({cigarstring[pta:idx]})') + # print(f'LETTER({letter}) VAL({cigarstring[pta:idx]})') cigartuples.append((CIGAR2BAM[letter], int(cigarstring[pta:idx]))) pta = idx + 1 return cigartuples diff --git a/cigarmath/inference.py b/cigarmath/inference.py index 2426943..06c6551 100644 --- a/cigarmath/inference.py +++ b/cigarmath/inference.py @@ -11,6 +11,7 @@ CONSUMES_REFERENCE ) + def inferred_query_sequence_length(cigartuples: CigarTuples) -> int: """Returns the expected length of query_sequence based on cigartuples diff --git a/cigarmath/io.py b/cigarmath/io.py index 99f5e29..5ab4db9 100644 --- a/cigarmath/io.py +++ b/cigarmath/io.py @@ -17,6 +17,7 @@ except ImportError: pass + def segment_stream_pysam( path: str, mode: str = 'rt', @@ -33,13 +34,13 @@ def segment_stream_pysam( with pysam.AlignmentFile(path, mode, check_sq=False) as samfile: iterable = samfile - + if fetch: iterable = iterable.fetch(fetch) - + if downsample: iterable = _downsample(iterable, downsample) - + for segment in iterable: if segment.mapping_quality > min_mapq: if as_tuples: @@ -56,12 +57,13 @@ def _downsample(stream: Iterator, frac: float) -> Iterator: yield item -def _combine_aligned_segments(segments: Iterator) ->Tuple[int, CigarTuples, List]: +def _combine_aligned_segments(segments: Iterator) -> Tuple[int, CigarTuples, List]: """Combine aligned segments into a single alignment.""" - + segments = list(segments) try: - packed = [(segment.reference_start, segment.cigartuples) for segment in segments] + packed = [(segment.reference_start, segment.cigartuples) + for segment in segments] new_start, new_cigars = combine_multiple_alignments(packed) except ValueError: return None, None, segments @@ -75,6 +77,7 @@ def _get_primary_segment(segments: List): return segment return segments[0] + def combined_segment_stream(segments: Iterator) -> Iterator[Tuple[int, CigarTuples, List]]: """Combine aligned segments into a single alignment.""" @@ -84,12 +87,11 @@ def combined_segment_stream(segments: Iterator) -> Iterator[Tuple[int, CigarTupl segments = list(segments) if len(segments) > 1: try: - new_start, new_cigars, segments = _combine_aligned_segments(segments) + new_start, new_cigars, segments = _combine_aligned_segments( + segments) yield (new_start, new_cigars, segments) except ValueError: primary = _get_primary_segment(segments) yield (primary.reference_start, primary.cigartuples, segments) else: yield (segments[0].reference_start, segments[0].cigartuples, segments) - - \ No newline at end of file diff --git a/cigarmath/iterators.py b/cigarmath/iterators.py index a26ce5d..30280b9 100644 --- a/cigarmath/iterators.py +++ b/cigarmath/iterators.py @@ -6,9 +6,7 @@ __author__ = "Will Dampier, PhD" from dataclasses import dataclass -from itertools import dropwhile -from functools import partial -from typing import Optional, Iterator, List, Union, Sequence, Iterable +from typing import Optional, Iterator, List, Sequence from cigarmath.defn import ( CigarTuples, CONSUMES_REFERENCE, @@ -16,7 +14,6 @@ CLIPPING, BAM_CSOFT_CLIP ) -from cigarmath.block import reference_offset @dataclass @@ -50,11 +47,11 @@ def cigar_iterator(cigartuples: CigarTuples, reference_start: int = 0) -> Iterat for cigar_index in cigar_iterator(cigartuples, reference_start=2): print(cigar_index) """ - + if cigartuples[0][0] in CLIPPING: op, sz = cigartuples[0] yield from _left_clip_iterator(sz, cigar_op=op) - + alignment_index = sz-1 query_index = sz-1 cigartuples = cigartuples[1:] @@ -63,21 +60,21 @@ def cigar_iterator(cigartuples: CigarTuples, reference_start: int = 0) -> Iterat query_index = -1 alignment_index = -1 cigar_op_start = 0 - + reference_index = reference_start-1 - + for cigar_index, (cigar_op, size) in enumerate(cigartuples, cigar_op_start): - + # Precalc the changes for the block reference_delta = int(cigar_op in CONSUMES_REFERENCE) query_delta = int(cigar_op in CONSUMES_QUERY) - + for cigar_block_index in range(size): - + alignment_index += 1 query_index += query_delta reference_index += reference_delta - + yield CigarIndex( alignment_index=alignment_index, reference_index=reference_index if cigar_op in CONSUMES_REFERENCE else None, @@ -86,11 +83,11 @@ def cigar_iterator(cigartuples: CigarTuples, reference_start: int = 0) -> Iterat cigar_block_index=cigar_block_index, cigar_op=cigar_op ) - - + + def _left_clip_iterator(size: int, cigar_op: int = BAM_CSOFT_CLIP) -> Iterator[CigarIndex]: "Handle left-clipping special case" - + for index in range(size): yield CigarIndex( alignment_index=index, @@ -102,8 +99,6 @@ def _left_clip_iterator(size: int, cigar_op: int = BAM_CSOFT_CLIP) -> Iterator[C ) - - def cigar_iterator_reference_slice( cigartuples: CigarTuples, reference_start: int = 0, @@ -111,11 +106,11 @@ def cigar_iterator_reference_slice( region_reference_end: Optional[int] = None ) -> Iterator[CigarIndex]: "Return only a slice of the cigar_iterator based on a reference region" - + started = False - + for cigar_index in cigar_iterator(cigartuples, reference_start=reference_start): - + if (cigar_index.reference_index is None) and (not started): # Clipped or insertion BEFORE starting # so skip @@ -124,10 +119,10 @@ def cigar_iterator_reference_slice( # Insertion within the region, so yielding yield cigar_index elif (region_reference_start is not None) and (cigar_index.reference_index < region_reference_start): - # Has a lower-bound, this is below it + # Has a lower-bound, this is below it pass elif (region_reference_end is not None) and (cigar_index.reference_index >= region_reference_end): - # Has an upper-bound, this is above it + # Has an upper-bound, this is above it break else: started = True @@ -141,7 +136,7 @@ def iterator_attach( query_qualities: Optional[Sequence[int]] = None ) -> Iterator[CigarIndex]: "Attach reference and query information to a cigar_index_iterator" - + for cigar_index in cigar_index_iterator: if reference_sequence and (cigar_index.reference_index is not None): cigar_index.reference_letter = reference_sequence[cigar_index.reference_index] @@ -150,22 +145,20 @@ def iterator_attach( if query_qualities: cigar_index.query_quality = query_qualities[cigar_index.query_index] yield cigar_index - - - - - + + def is_sorted_ascending(lst: Sequence) -> bool: return all(lst[i] <= lst[i+1] for i in range(len(lst) - 1)) - + + def _before_site_on_ref(site: int, index: CigarIndex) -> bool: "return True if this index if before this site on the reference" - + if index.reference_index is None: return True return index.reference_index < site-1 - - + + def liftover( cigartuples: CigarTuples, *reference_sites: int, @@ -174,24 +167,25 @@ def liftover( small_indel_limit: int = 10 ) -> Iterator[Optional[int]]: "Create an iterator which yields the query index for each reference site" - + assert edge in {'left', 'right', None} - - cigar_indexes = list(cigar_iterator(cigartuples, reference_start=reference_start)) - + + cigar_indexes = list(cigar_iterator( + cigartuples, reference_start=reference_start)) + for site in reference_sites: yield _liftover_index(cigar_indexes, site, edge, small_indel_limit) - - + + def _liftover_index( cigar_indexes: List[CigarIndex], site: int, edge: Optional[str], limit: int ) -> Optional[int]: - + for n in range(len(cigar_indexes)): - + if cigar_indexes[n].reference_index == site: if (cigar_indexes[n].query_index is not None) or (edge is None): # Mapped or don't care diff --git a/cigarmath/mapping.py b/cigarmath/mapping.py index 879541a..9e9a059 100644 --- a/cigarmath/mapping.py +++ b/cigarmath/mapping.py @@ -9,6 +9,7 @@ from cigarmath.defn import CigarTuples from cigarmath.iterators import cigar_iterator + def reference2query(cigartuples: CigarTuples, reference_start: int = 0) -> Iterator[Optional[int]]: """Create a generator the same size as the reference alignment that maps positions in the reference to positions in the query. @@ -79,6 +80,5 @@ def query2cigar(cigartuples: CigarTuples, reference_start: int = 0) -> Iterator[ if cig_index.query_index is not None: yield (cig_index.cigar_index, cig_index.cigar_block_index) - - + # Copyright (C) 2022-present, Dampier & DV Klopfenstein, PhD. All rights reserved diff --git a/cigarmath/pileup.py b/cigarmath/pileup.py index 6664e7c..3bcd1c1 100644 --- a/cigarmath/pileup.py +++ b/cigarmath/pileup.py @@ -10,6 +10,7 @@ BAM_CREF_SKIP, ) + def depth( cigartuples: CigarTuples, reference_start: int = 0, @@ -17,16 +18,16 @@ def depth( previous_count: Optional[Dict[int, Counter]] = None, ) -> Dict[int, Counter]: """Calculate the pileup depth at each reference position. - + Args: cigartuples: List of CIGAR operations and their lengths reference_start: Starting position on reference (default: 0) query_sequence: Query sequence string (default: None, will use '.' for bases) previous_count: Previous pileup counts to update (default: None) - + Returns: Dict mapping reference positions to Counter of bases at that position - + Example:: REF: AAAAGACC--CCC @@ -51,11 +52,11 @@ def depth( counts = defaultdict(Counter) if previous_count is not None: counts.update(previous_count) - + # Track current positions ref_pos = reference_start query_pos = 0 - + for op, length in cigartuples: # Handle operations that consume reference if op in CONSUMES_REFERENCE: @@ -75,5 +76,5 @@ def depth( # Handle insertions (don't advance ref_pos) elif op in CONSUMES_QUERY: query_pos += length - - return counts \ No newline at end of file + + return counts diff --git a/pyproject.toml b/pyproject.toml index 89b0187..2192c35 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -60,6 +60,9 @@ addopts = "--ignore=setup.py" [tool.flake8] exclude = ["docs"] +extend-exclude = [".ipynb_checkpoints"] +max-line-length = 120 +extend-ignore = ["W291"] [tool.bumpversion] current_version = "0.2.3" diff --git a/setup.cfg b/setup.cfg index e6c8bad..e926291 100644 --- a/setup.cfg +++ b/setup.cfg @@ -15,6 +15,8 @@ replace = __version__ = '{new_version}' universal = 1 [flake8] -exclude = docs +exclude = docs,.ipynb_checkpoints +max-line-length = 120 +extend-ignore = W291 [tool:pytest] addopts = --ignore=setup.py diff --git a/tests/test_aligned_pairs.py b/tests/test_aligned_pairs.py index 88fb504..cea0d92 100644 --- a/tests/test_aligned_pairs.py +++ b/tests/test_aligned_pairs.py @@ -1,7 +1,8 @@ import cigarmath as cm + def test_aligned_pairs(): - + cigar = '5M' pairs = cm.cigartuples2pairs(cm.cigarstr2tup(cigar)) cor_pairs = [(0, 0), @@ -10,7 +11,7 @@ def test_aligned_pairs(): (3, 3), (4, 4),] assert list(pairs) == cor_pairs - + cigar = '5M' pairs = cm.cigartuples2pairs(cm.cigarstr2tup(cigar), reference_start=5) cor_pairs = [(0, 5), @@ -19,10 +20,10 @@ def test_aligned_pairs(): (3, 8), (4, 9),] assert list(pairs) == cor_pairs - + # REF 0123456--789 # QRY 0123456 - + cigar = '2M2I3M' pairs = cm.cigartuples2pairs(cm.cigarstr2tup(cigar), reference_start=5) cor_pairs = [(0, 5), @@ -32,12 +33,12 @@ def test_aligned_pairs(): (4, 7), (5, 8), (6, 9), - ] + ] assert list(pairs) == cor_pairs - + # REF 0123456789 # QRY 01--234 - + cigar = '2M2D3M' pairs = cm.cigartuples2pairs(cm.cigarstr2tup(cigar), reference_start=1) cor_pairs = [(0, 1), @@ -47,11 +48,12 @@ def test_aligned_pairs(): (2, 5), (3, 6), (4, 7), - ] + ] assert list(pairs) == cor_pairs cigar = '2S5M3H' - pairs = cm.cigartuples2pairs(cm.cigarstr2tup(cigar), reference_start=5, verbose=True) + pairs = cm.cigartuples2pairs(cm.cigarstr2tup( + cigar), reference_start=5, verbose=True) cor_pairs = [(0, None), (1, None), (2, 5), @@ -62,5 +64,5 @@ def test_aligned_pairs(): (7, None), (8, None), (9, None), - ] - assert list(pairs) == cor_pairs \ No newline at end of file + ] + assert list(pairs) == cor_pairs diff --git a/tests/test_binary_map.py b/tests/test_binary_map.py index fa1f15d..d6f98f7 100644 --- a/tests/test_binary_map.py +++ b/tests/test_binary_map.py @@ -1,59 +1,60 @@ import cigarmath as cm + def test_segments_to_binary(): - + alns = [(30, cm.cigarstr2tup("30M"))] - guess = cm.segments_to_binary(alns, max_genome_size = 100) + guess = cm.segments_to_binary(alns, max_genome_size=100) assert sum(guess[:30]) == 0 assert sum(guess[30:60]) == 30 assert sum(guess[60:]) == 0 - + alns = [(10, cm.cigarstr2tup("20M")), (50, cm.cigarstr2tup("20M")) - ] - - guess = cm.segments_to_binary(alns, max_genome_size = 100) - + ] + + guess = cm.segments_to_binary(alns, max_genome_size=100) + assert sum(guess[:10]) == 0 assert sum(guess[10:30]) == 20 - + assert sum(guess[30:50]) == 0 assert sum(guess[50:70]) == 20 - + assert sum(guess[70:]) == 0 - + alns = [(10, cm.cigarstr2tup("20M10D5M")), (50, cm.cigarstr2tup("20M10D5M"))] - guess = cm.segments_to_binary(alns, max_genome_size = 100, deletion_size=1) - + guess = cm.segments_to_binary(alns, max_genome_size=100, deletion_size=1) + assert sum(guess[:10]) == 0 assert sum(guess[10:30]) == 20 assert sum(guess[30:40]) == 0 assert sum(guess[40:45]) == 5 - + assert sum(guess[45:50]) == 0 assert sum(guess[50:70]) == 20 assert sum(guess[70:80]) == 0 assert sum(guess[80:85]) == 5 - + assert sum(guess[85:]) == 0 - - + + def test_segments_to_binary_multi(): - + alns1 = [(10, cm.cigarstr2tup("20M"))] alns2 = [(50, cm.cigarstr2tup("20M"))] - + # Process one alignment stream - guess = cm.segments_to_binary(alns1, max_genome_size = 100) - + guess = cm.segments_to_binary(alns1, max_genome_size=100) + # Process a second alignment stream starting with the previous guess - guess = cm.segments_to_binary(alns2, mapping = guess) - + guess = cm.segments_to_binary(alns2, mapping=guess) + assert sum(guess[:10]) == 0 assert sum(guess[10:30]) == 20 - + assert sum(guess[30:50]) == 0 assert sum(guess[50:70]) == 20 - - assert sum(guess[70:]) == 0 \ No newline at end of file + + assert sum(guess[70:]) == 0 diff --git a/tests/test_block.py b/tests/test_block.py index f978154..c9d5775 100644 --- a/tests/test_block.py +++ b/tests/test_block.py @@ -11,7 +11,6 @@ def test_reference_block(): guess = cm.reference_block(cigarstr2tup(cigar), reference_start=ref_start) assert (ref_start, ref_start + 30) == guess - cigar = "20S30M10S" guess = cm.reference_block(cigarstr2tup(cigar), reference_start=ref_start) assert ( @@ -86,7 +85,7 @@ def test_block_overlap(): correct == guess ), f"Expected {correct} overlap but got {guess} for {block_b}, {block_a}" - + def test_reference_deletion_blocks(): "Test detecting large deletions in cigartuples" @@ -120,19 +119,20 @@ def test_reference_deletion_blocks(): guess = list(cm.reference_deletion_blocks(cigartups)) correct = [] assert guess == correct - - + + def test_reference_mapping_blocks(): - + cigar = '6M3D4M6D4M' cigartups = cigarstr2tup(cigar) - blocks = list(cm.reference_mapping_blocks(cigartups, reference_start=3, deletion_split=5)) - + blocks = list(cm.reference_mapping_blocks( + cigartups, reference_start=3, deletion_split=5)) + assert blocks == [(3, 16), (22, 26)] - - + cigar = '6M3D4M6D4M' cigartups = cigarstr2tup(cigar) - blocks = list(cm.reference_mapping_blocks(cigartups, reference_start=3, deletion_split=10)) - - assert blocks == [(3, 26)] \ No newline at end of file + blocks = list(cm.reference_mapping_blocks( + cigartups, reference_start=3, deletion_split=10)) + + assert blocks == [(3, 26)] diff --git a/tests/test_cigarmath.py b/tests/test_cigarmath.py index 8a6db14..a9d4f4d 100644 --- a/tests/test_cigarmath.py +++ b/tests/test_cigarmath.py @@ -37,9 +37,6 @@ def test_cigarstr2tuples(): check_cigartuples(guess, correct) - - - def test_simplify_blocks(): "Test collapsing adjacent cigar blocks of the same type" diff --git a/tests/test_clipping.py b/tests/test_clipping.py index 4169b8d..c17468f 100644 --- a/tests/test_clipping.py +++ b/tests/test_clipping.py @@ -108,17 +108,17 @@ def test_declip(): correct = [(defn.BAM_CMATCH, 30)] check_cigartuples(guess, correct) - + def test_declip_args(): - + cigar = "5S10M2S" cigartups = cigarstr2tup(cigar) seq = 'sssssMMMMMMMMMMss' cigar, clipped_seq = cm.declip(cigartups, seq) - + assert clipped_seq == 'MMMMMMMMMM' - + # Can accept many things cigar, _, _, clipped_seq = cm.declip(cigartups, seq, seq, seq) - - assert clipped_seq == 'MMMMMMMMMM' \ No newline at end of file + + assert clipped_seq == 'MMMMMMMMMM' diff --git a/tests/test_combine.py b/tests/test_combine.py index c117c5e..4e343e2 100644 --- a/tests/test_combine.py +++ b/tests/test_combine.py @@ -7,36 +7,38 @@ combine_multiple_alignments ) + def test_trim_alignment(): """Test trimming query bases from alignments""" - + # Test left trimming cigar = "5M1D3M" # AAAAADCCC cigartups = cigarstr2tup(cigar) ref_start = 10 - + new_start, new_cigars = trim_alignment(ref_start, cigartups, left=2) assert new_start == 12 - assert tuple(new_cigars) == ((0,3), (2,1), (0,3)) - + assert tuple(new_cigars) == ((0, 3), (2, 1), (0, 3)) + # Test right trimming new_start, new_cigars = trim_alignment(ref_start, cigartups, right=2) assert new_start == 10 - assert tuple(new_cigars) == ((0,5), (2,1), (0,1)) - + assert tuple(new_cigars) == ((0, 5), (2, 1), (0, 1)) + # Test both ends - new_start, new_cigars = trim_alignment(ref_start, cigartups, left=2, right=2) + new_start, new_cigars = trim_alignment( + ref_start, cigartups, left=2, right=2) assert new_start == 12 - assert tuple(new_cigars) == ((0,3), (2,1), (0,1)) - + assert tuple(new_cigars) == ((0, 3), (2, 1), (0, 1)) + # Test with insertions cigar = "5M2I3M" # AAAAAIICCC cigartups = cigarstr2tup(cigar) - + new_start, new_cigars = trim_alignment(ref_start, cigartups, left=6) assert new_start == 15 - assert tuple(new_cigars) == ((1,1),(0, 3)) - + assert tuple(new_cigars) == ((1, 1), (0, 3)) + # Test no trimming new_start, new_cigars = trim_alignment(ref_start, cigartups) assert new_start == ref_start @@ -53,26 +55,26 @@ def test_trim_alignment(): new_start, new_cigars = trim_alignment(ref_start, cigartups, left=12) correct_cigars = [ - (BAM_CINS, 1), - (BAM_CMATCH, 3), - (BAM_CDEL, 2), - (BAM_CMATCH, 6), + (BAM_CINS, 1), + (BAM_CMATCH, 3), + (BAM_CDEL, 2), + (BAM_CMATCH, 6), ] assert new_start == 10 assert tuple(new_cigars) == tuple(correct_cigars) - + new_start, new_cigars = trim_alignment(ref_start, cigartups, left=15) correct_cigars = [ - (BAM_CMATCH, 1), - (BAM_CDEL, 2), - (BAM_CMATCH, 6), + (BAM_CMATCH, 1), + (BAM_CDEL, 2), + (BAM_CMATCH, 6), ] assert new_start == 12 assert tuple(new_cigars) == tuple(correct_cigars) new_start, new_cigars = trim_alignment(ref_start, cigartups, left=18) correct_cigars = [ - (BAM_CMATCH, 4), + (BAM_CMATCH, 4), ] print(new_cigars, correct_cigars) assert new_start == 17 @@ -81,19 +83,23 @@ def test_trim_alignment(): def test_trim_alignment_with_clipping(): """Test trimming query bases from alignments with clipping""" - + # Test adding soft clipping ref_start = 10 cigartups = cigarstr2tup("5M1D3M") - new_start, new_cigars = trim_alignment(ref_start, cigartups, left=2, right=2, add_clipping='soft') + new_start, new_cigars = trim_alignment( + ref_start, cigartups, left=2, right=2, add_clipping='soft') assert new_start == 12 - assert tuple(new_cigars) == ((BAM_CSOFT_CLIP, 2), (0,3), (2,1), (0,1), (BAM_CSOFT_CLIP, 2)) - + assert tuple(new_cigars) == ((BAM_CSOFT_CLIP, 2), + (0, 3), (2, 1), (0, 1), (BAM_CSOFT_CLIP, 2)) + # Test adding hard clipping - new_start, new_cigars = trim_alignment(ref_start, cigartups, left=2, right=2, add_clipping='hard') + new_start, new_cigars = trim_alignment( + ref_start, cigartups, left=2, right=2, add_clipping='hard') assert new_start == 12 - assert tuple(new_cigars) == ((BAM_CHARD_CLIP, 2), (0,3), (2,1), (0,1), (BAM_CHARD_CLIP, 2)) - + assert tuple(new_cigars) == ((BAM_CHARD_CLIP, 2), + (0, 3), (2, 1), (0, 1), (BAM_CHARD_CLIP, 2)) + # Test invalid clipping type try: trim_alignment(ref_start, cigartups, left=2, add_clipping='invalid') @@ -104,13 +110,12 @@ def test_trim_alignment_with_clipping(): def test_combine_adjacent_alignments(): """Test combining two adjacent alignments""" - + # REF: 0123456789012345678901234567890 # Q1 : AAAAASSS # Q2 : SSSSSCCC # COR: AAAAADDDDDCCC - # Test with gap (deletion) first = (5, cigarstr2tup("5M3S")) second = (15, cigarstr2tup("5S3M")) @@ -120,11 +125,10 @@ def test_combine_adjacent_alignments(): (BAM_CDEL, 5), (BAM_CMATCH, 3), ] - + new_start, new_cigars = combine_adjacent_alignments(first, second) assert new_start == 5 assert tuple(new_cigars) == tuple(correct_cigars) - # Test with overlap # REF: 0123456789012345678|--|901234567890 @@ -132,7 +136,6 @@ def test_combine_adjacent_alignments(): # Q2 : CCC--CCCC|CC|CCCC # COR: AAAAAAAAAAAACC|CC|CCCC - # Test with gap (deletion) first = (5, cigarstr2tup("12M 17S")) second = (10, cigarstr2tup("12S 3M 2D 8M 2I 4M")) @@ -141,8 +144,8 @@ def test_combine_adjacent_alignments(): (BAM_CMATCH, 16), (BAM_CINS, 2), (BAM_CMATCH, 4), - ] - + ] + new_start, new_cigars = combine_adjacent_alignments(first, second) assert new_start == 5 assert tuple(new_cigars) == tuple(correct_cigars) @@ -150,20 +153,19 @@ def test_combine_adjacent_alignments(): # Test perfectly adjacent first = (10, cigarstr2tup("5M")) # AAAAA second = (15, cigarstr2tup("3M")) # CCC - + new_start, new_cigars = combine_adjacent_alignments(first, second) assert new_start == 10 - assert tuple(new_cigars) == ((0,8),) - + assert tuple(new_cigars) == ((0, 8),) def test_combine_multiple_alignments(): """Test combining multiple alignments""" - + # Test basic combination alignments = [ - (5, cigarstr2tup("5M 2D 3M 18S")), + (5, cigarstr2tup("5M 2D 3M 18S")), (20, cigarstr2tup("8S 4M 2I 2M 5S")), (29, cigarstr2tup("16S 5M 2I 3M")) ] @@ -187,7 +189,7 @@ def test_combine_multiple_alignments(): (BAM_CMATCH, 4), (BAM_CINS, 2), (BAM_CMATCH, 2), - + (BAM_CDEL, 3), (BAM_CMATCH, 5), @@ -196,9 +198,9 @@ def test_combine_multiple_alignments(): ] assert tuple(new_cigars) == tuple(correct_cigars) - + overlapping = [ - (5, cigarstr2tup("5M 2D 3M 18S")), + (5, cigarstr2tup("5M 2D 3M 18S")), (9, cigarstr2tup("8S 4M 2I 2M 10S")), (29, cigarstr2tup("16S 5M 2I 3M")) ] @@ -215,13 +217,12 @@ def test_combine_multiple_alignments(): assert False, "Should raise ValueError for overlap" except ValueError: pass - correct_cigars = [ (BAM_CMATCH, 5), (BAM_CDEL, 2), (BAM_CMATCH, 5), - + (BAM_CDEL, 12), (BAM_CMATCH, 5), @@ -230,32 +231,33 @@ def test_combine_multiple_alignments(): ] # Should succeed with overlap allowed - new_start, new_cigars = combine_multiple_alignments(overlapping, allowed_overlap=10) + new_start, new_cigars = combine_multiple_alignments( + overlapping, allowed_overlap=10) print(correct_cigars) print(new_cigars) - + assert new_start == 5 assert tuple(new_cigars) == tuple(correct_cigars) - + # Test empty input try: combine_multiple_alignments([]) assert False, "Should raise ValueError for empty input" except ValueError: pass - + # Test single alignment single = [(10, cigarstr2tup("5M"))] new_start, new_cigars = combine_multiple_alignments(single) assert new_start == 10 - assert tuple(new_cigars) == ((0,5),) - + assert tuple(new_cigars) == ((0, 5),) + # Test non-sequential reference positions non_sequential = [ (20, cigarstr2tup("5M")), (10, cigarstr2tup("5M")), ] - + try: combine_multiple_alignments(non_sequential) assert False, "Should raise ValueError for non-sequential alignments" diff --git a/tests/test_inference.py b/tests/test_inference.py index f1028af..c7813e1 100644 --- a/tests/test_inference.py +++ b/tests/test_inference.py @@ -5,10 +5,6 @@ All rights reserved""" __author__ = "Will Dampier, PhD" -from cigarmath.defn import cigarstr2tup - -import cigarmath as cm - def check_cigartuples(guess, correct): "Compare two lists of cigartuples" diff --git a/tests/test_io.py b/tests/test_io.py index bc359a5..c384cdc 100644 --- a/tests/test_io.py +++ b/tests/test_io.py @@ -2,34 +2,37 @@ import cigarmath as cm + def test_pysam_stream(): - - stream = cm.io.segment_stream_pysam('tests/test_data/test.sam', + + stream = cm.io.segment_stream_pysam('tests/test_data/test.sam', mode='r') - + for num, aln in enumerate(stream): assert isinstance(aln, pysam.AlignedSegment) - - assert num == 247 # Correct number of records - - + + assert num == 247 # Correct number of records + + def test_pysam_stream_mapq(): - - stream = cm.io.segment_stream_pysam('tests/test_data/test.sam', + + stream = cm.io.segment_stream_pysam('tests/test_data/test.sam', mode='r', min_mapq=30) - + for num, aln in enumerate(stream): assert isinstance(aln, pysam.AlignedSegment) - - assert num == 198 # Correct number of records + + assert num == 198 # Correct number of records + def test_combined_segment_stream(): - stream = sorted(cm.io.segment_stream_pysam('tests/test_data/test.sam', mode='r'), key=lambda x: x.query_name) + stream = sorted(cm.io.segment_stream_pysam( + 'tests/test_data/test.sam', mode='r'), key=lambda x: x.query_name) stream = cm.io.combined_segment_stream(stream) - + for num, (start, cigars, segments) in enumerate(stream): assert isinstance(segments[0], pysam.AlignedSegment) - assert num == 192 # Correct number of records \ No newline at end of file + assert num == 192 # Correct number of records diff --git a/tests/test_iterators.py b/tests/test_iterators.py index 2450b79..a8154a5 100644 --- a/tests/test_iterators.py +++ b/tests/test_iterators.py @@ -5,87 +5,87 @@ import cigarmath as cm -from cigarmath.defn import cigarstr2tup from cigarmath.iterators import CigarIndex def make_example(): - + cigar = '1S2M2I1M2D1M2S' reference_start = 2 - - correct = [CigarIndex(alignment_index = 0, - reference_index = None, - query_index = 0, - cigar_index = 0, + + correct = [CigarIndex(alignment_index=0, + reference_index=None, + query_index=0, + cigar_index=0, cigar_block_index=0, - cigar_op = 4), - CigarIndex(alignment_index = 1, - reference_index = 2, - query_index = 1, - cigar_index = 1, + cigar_op=4), + CigarIndex(alignment_index=1, + reference_index=2, + query_index=1, + cigar_index=1, cigar_block_index=0, - cigar_op = 0), - CigarIndex(alignment_index = 2, - reference_index = 3, - query_index = 2, - cigar_index = 1, + cigar_op=0), + CigarIndex(alignment_index=2, + reference_index=3, + query_index=2, + cigar_index=1, cigar_block_index=1, - cigar_op = 0), - CigarIndex(alignment_index = 3, - reference_index = None, - query_index = 3, - cigar_index = 2, + cigar_op=0), + CigarIndex(alignment_index=3, + reference_index=None, + query_index=3, + cigar_index=2, cigar_block_index=0, - cigar_op = 1), - CigarIndex(alignment_index = 4, - reference_index = None, - query_index = 4, - cigar_index = 2, + cigar_op=1), + CigarIndex(alignment_index=4, + reference_index=None, + query_index=4, + cigar_index=2, cigar_block_index=1, - cigar_op = 1), - CigarIndex(alignment_index = 5, - reference_index = 4, - query_index = 5, - cigar_index = 3, + cigar_op=1), + CigarIndex(alignment_index=5, + reference_index=4, + query_index=5, + cigar_index=3, cigar_block_index=0, - cigar_op = 0), - CigarIndex(alignment_index = 6, - reference_index = 5, - query_index = None, - cigar_index = 4, + cigar_op=0), + CigarIndex(alignment_index=6, + reference_index=5, + query_index=None, + cigar_index=4, cigar_block_index=0, - cigar_op = 2), - CigarIndex(alignment_index = 7, - reference_index = 6, - query_index = None, - cigar_index = 4, + cigar_op=2), + CigarIndex(alignment_index=7, + reference_index=6, + query_index=None, + cigar_index=4, cigar_block_index=1, - cigar_op = 2), - CigarIndex(alignment_index = 8, - reference_index = 7, - query_index = 6, - cigar_index = 5, + cigar_op=2), + CigarIndex(alignment_index=8, + reference_index=7, + query_index=6, + cigar_index=5, cigar_block_index=0, - cigar_op = 0), - - CigarIndex(alignment_index = 9, - reference_index = None, - query_index = 7, - cigar_index = 6, + cigar_op=0), + + CigarIndex(alignment_index=9, + reference_index=None, + query_index=7, + cigar_index=6, cigar_block_index=0, - cigar_op = 4), - - CigarIndex(alignment_index = 10, - reference_index = None, - query_index = 8, - cigar_index = 6, + cigar_op=4), + + CigarIndex(alignment_index=10, + reference_index=None, + query_index=8, + cigar_index=6, cigar_block_index=1, - cigar_op = 4), + cigar_op=4), ] - + return cm.cigarstr2tup(cigar), reference_start, correct + def check_index_list(guess_items, correct_items): for guess, cor in zip(guess_items, correct_items): @@ -93,14 +93,13 @@ def check_index_list(guess_items, correct_items): print('guess', guess) print('\n\n') assert guess == cor - + assert len(guess_items) == len(correct_items) - def test_cigar_iterator(): """ - + ALNPOS 01234567890 # Index of the entire alignment RPOS 0123 456789 # Index within the reference REF AAGA--CTTCGG @@ -110,43 +109,42 @@ def test_cigar_iterator(): QRY -xAAGGC--Cxx QPOS 012345 678 # Index within the query """ - + cigartups, ref_start, correct_indexes = make_example() cigiters = list(cm.cigar_iterator(cigartups, reference_start=ref_start)) - + check_index_list(cigiters, correct_indexes) - - + + def test_cigar_iterator_reference_slice(): - + cigartups, ref_start, correct_indexes = make_example() - + cigiters = list(cm.cigar_iterator_reference_slice(cigartups, - reference_start=ref_start)) + reference_start=ref_start)) print('Checking no boundary') check_index_list(cigiters, correct_indexes[1:]) - + cigiters = list(cm.cigar_iterator_reference_slice(cigartups, - reference_start=ref_start, - region_reference_start = 5)) + reference_start=ref_start, + region_reference_start=5)) print('Checking lower bound') check_index_list(cigiters, correct_indexes[6:]) - + cigiters = list(cm.cigar_iterator_reference_slice(cigartups, - reference_start=ref_start, - region_reference_end = 5)) + reference_start=ref_start, + region_reference_end=5)) print('Checking upper bound') check_index_list(cigiters, correct_indexes[1:6]) - - + cigiters = list(cm.cigar_iterator_reference_slice(cigartups, - reference_start=ref_start, - region_reference_start = 4, - region_reference_end = 7)) + reference_start=ref_start, + region_reference_start=4, + region_reference_end=7)) print('Checking both bound') check_index_list(cigiters, correct_indexes[5:-3]) - - + + def test_iterator_attach(): """ ALNPOS 01234567890 # Index of the entire alignment @@ -158,13 +156,13 @@ def test_iterator_attach(): QRY -xAAGGC--Cxx QPOS 012345 678 # Index within the query """ - + cigartups, ref_start, correct_indexes = make_example() cigiters = list(cm.cigar_iterator(cigartups, reference_start=ref_start)) reference = 'AAGACTTCGG' query = 'xAAGGCCxx' quals = [0, 1, 2, 3, 4, 5, 6, 7, 8] - + new_iters = cm.iterator_attach(cigiters, reference_sequence=reference, query_sequence=query, @@ -172,22 +170,15 @@ def test_iterator_attach(): ) new_iters = list(new_iters) assert len(new_iters) == len(cigiters) - + for it in new_iters: if it.reference_index is not None: assert it.reference_letter == reference[it.reference_index] if it.query_index is not None: assert it.query_letter == query[it.query_index] assert it.query_quality == quals[it.query_index] - - - - - - - - - + + def test_liftover(): """ ALNPOS 01234567890 # Index of the entire alignment @@ -199,26 +190,28 @@ def test_liftover(): QRY -xAAGGC--Cxx QPOS 012345 678 # Index within the query """ - + cigartups, ref_start, _ = make_example() - + lift_ref_pos = [3, 4, 5, 6, 7] - + # right lift_que_pos = [2, 5, 6, 6, 6] - lift_pos = list(cm.liftover(cigartups, *lift_ref_pos, reference_start=ref_start, edge='right')) - + lift_pos = list(cm.liftover(cigartups, *lift_ref_pos, + reference_start=ref_start, edge='right')) + assert lift_pos == lift_que_pos - - + # left lift_que_pos = [2, 5, 5, 5, 6] - lift_pos = list(cm.liftover(cigartups, *lift_ref_pos, reference_start=ref_start, edge='left')) - + lift_pos = list(cm.liftover(cigartups, *lift_ref_pos, + reference_start=ref_start, edge='left')) + assert lift_pos == lift_que_pos - + # None lift_que_pos = [2, 5, None, None, 6] - lift_pos = list(cm.liftover(cigartups, *lift_ref_pos, reference_start=ref_start, edge=None)) - - assert lift_pos == lift_que_pos \ No newline at end of file + lift_pos = list(cm.liftover(cigartups, *lift_ref_pos, + reference_start=ref_start, edge=None)) + + assert lift_pos == lift_que_pos diff --git a/tests/test_mapping.py b/tests/test_mapping.py index 696b805..365f9b5 100644 --- a/tests/test_mapping.py +++ b/tests/test_mapping.py @@ -1,28 +1,29 @@ import cigarmath as cm - + + def test_reference2query(): """RPOS 0123 456789 # Index within the reference REF AAGA--CTTCGG CIGAR SMMIIMDDMSS QRY -xAAGGC--Cxx QPOS 012345 678 # Index within the query - + r2q = 12 5NN6 - + cigarstring = 1S2M2I1M2D1M2S reference_start = 2 >> r2q = reference2query(cigartuples, reference_start=2) [1, 2, 5, None, None, 6] """ - + cigar = '1S2M2I1M2D1M2S' cigartuples = cm.cigarstr2tup(cigar) r2q = list(cm.reference2query(cigartuples, reference_start=2)) correct = [1, 2, 5, None, None, 6] - + assert r2q == correct - - + + def test_query2reference(): """ RPOS 0123 456789 # Index within the reference @@ -31,17 +32,16 @@ def test_query2reference(): QRY -xAAGGC--Cxx QPOS 012345 678 # Index within the query q2r = N23NN4 7NN - + cigarstring = 1S2M2I1M2D1M2S reference_start = 2 >> q2r = query2reference(cigartuples, reference_start=2) [None, 2, 3, None, None, 4, 7, None, None] """ - + cigar = '1S2M2I1M2D1M2S' cigartuples = cm.cigarstr2tup(cigar) r2q = list(cm.query2reference(cigartuples, reference_start=2)) correct = [None, 2, 3, None, None, 4, 7, None, None] - + assert r2q == correct - diff --git a/tests/test_msa.py b/tests/test_msa.py index 6f0e82e..d70fe9d 100644 --- a/tests/test_msa.py +++ b/tests/test_msa.py @@ -5,96 +5,93 @@ All rights reserved""" __author__ = "Will Dampier, PhD" -from cigarmath.defn import cigarstr2tup - import cigarmath as cm -from cigarmath import defn def test_msa2cigartuples_basic(): - - ref = 'AAAAGACCCCCGACTAGCTAGCATGCT----ATCTAGCTAGCA' + + ref = 'AAAAGACCCCCGACTAGCTAGCATGCT----ATCTAGCTAGCA' query = '----AACCCCCGAC----TAGCATGCTTTTTATCTAGCT----' # CIG MMMMMMMMMMDDDDMMMMMMMMIIIIIMMMMMMMM - + cor_ref_start = 4 cor_cigartuples = [(0, 10), (2, 4), (0, 9), (1, 4), (0, 8)] - + ref_start, cigartuples = cm.msa2cigartuples(ref, query) - + assert ref_start == cor_ref_start assert cigartuples == cor_cigartuples - - + + def test_msa2cigartuples_extra_items(): - - ref = '--AAAAGACCCCCGACTA--GCTAGCATGCT----ATCTAGCTAGCA' + + ref = '--AAAAGACCCCCGACTA--GCTAGCATGCT----ATCTAGCTAGCA' query = '------AACCCCCGAC------TAGCATGCTTTTTATCTAGCT----' # CIG MMMMMMMMMMDDDDMMMMMMMMIIIIIMMMMMMMM - + cor_ref_start = 4 cor_cigartuples = [(0, 10), (2, 4), (0, 9), (1, 4), (0, 8)] - + ref_start, cigartuples = cm.msa2cigartuples(ref, query) - + assert ref_start == cor_ref_start assert cigartuples == cor_cigartuples - - - + + def test_msa2cigartuples_pre_items(): - - ref = '----AAAAGACCCCCGACTA--GCTAGCATGCT----ATCTAGCTAGCA---' + + ref = '----AAAAGACCCCCGACTA--GCTAGCATGCT----ATCTAGCTAGCA---' query = 'TT------AACCCCCGAC------TAGCATGCTTTTTATCTAGCT----TTT' # CIG MMMMMMMMMMDDDDMMMMMMMMIIIIIMMMMMMMM - + cor_ref_start = 4 cor_cigartuples = [(4, 2), (0, 10), (2, 4), (0, 9), (1, 4), (0, 8), (4, 3)] - + ref_start, cigartuples = cm.msa2cigartuples(ref, query) - + assert ref_start == cor_ref_start assert cigartuples == cor_cigartuples - - + + def test_softclipify(): - + # REF --AAAAGACCCCCGACTCGTTA--- # QUE TT----AACCCCCGAC----TAGCA - # CIG IIDDDDMMMMMMMMMMDDDDMMIII + # CIG IIDDDDMMMMMMMMMMDDDDMMIII # OUT SS MMMMMMMMMMDDDDMMSSS required_mapping = 1 - - cigartuples = [(1, 2), (2,4), (0, 10), (2, 4), (0, 2), (1, 3)] - + + cigartuples = [(1, 2), (2, 4), (0, 10), (2, 4), (0, 2), (1, 3)] + cortuples = [(4, 2), (0, 10), (2, 4), (0, 2), (4, 3)] coroffset = 4 - + newtuples, newoffset = cm.softclipify(cigartuples) - + assert newtuples == cortuples assert newoffset == coroffset - - + + def test_decide_softclip_end(): - - cigartuples = [(1, 2), (2,4), (0, 10), (2, 4), (0, 2), (1, 3)] - + + cigartuples = [(1, 2), (2, 4), (0, 10), (2, 4), (0, 2), (1, 3)] + left_cor_soft_size = 2 left_cor_ind = 2 - + right_cor_soft_size = 3 right_cor_ind = 1 - + left_ind, left_soft_sz = cm.clipping._decide_softclip_end(cigartuples, 1) - + assert left_cor_soft_size == left_soft_sz assert left_cor_ind == left_ind - - right_ind, right_soft_sz = cm.clipping._decide_softclip_end(cigartuples[::-1], 1) + + right_ind, right_soft_sz = cm.clipping._decide_softclip_end( + cigartuples[::-1], 1) assert right_cor_soft_size == right_soft_sz assert right_cor_ind == right_ind - + slc = slice(left_ind, -right_ind or None) - - assert cigartuples[slc] == [(0, 10), (2, 4), (0, 2)] \ No newline at end of file + + assert cigartuples[slc] == [(0, 10), (2, 4), (0, 2)] diff --git a/tests/test_pileup.py b/tests/test_pileup.py index b7344fe..96e6b44 100644 --- a/tests/test_pileup.py +++ b/tests/test_pileup.py @@ -4,41 +4,44 @@ from cigarmath.defn import cigarstr2tup from collections import Counter + def test_basic_depth(): """Test basic pileup depth calculation""" # Example from docstring - - cigarstring = '4M 2I 2M 2D 3M' + + cigarstring = '4M 2I 2M 2D 3M' # MMMMIIMMDDMMMM query_sequence = "AAAAACCGGCC" - + cigartuples = cigarstr2tup(cigarstring) - + result = depth(cigartuples, query_sequence=query_sequence) - + assert result[0] == Counter({'A': 1}) assert result[1] == Counter({'A': 1}) assert result[2] == Counter({'A': 1}) assert result[3] == Counter({'A': 1}) - assert result[4] == Counter({'C': 1}) + assert result[4] == Counter({'C': 1}) assert result[5] == Counter({'G': 1}) - + # Deletion # Deletion - + # GCC assert result[8] == Counter({'G': 1}) assert result[9] == Counter({'C': 1}) assert result[10] == Counter({'C': 1}) + def test_reference_start(): """Test pileup with non-zero reference start""" - cigartuples = [(0,3), (2,2), (0,2)] # 3M2D2M + cigartuples = [(0, 3), (2, 2), (0, 2)] # 3M2D2M query_sequence = "AAATT" - - result = depth(cigartuples, reference_start=10, query_sequence=query_sequence) - + + result = depth(cigartuples, reference_start=10, + query_sequence=query_sequence) + assert result[10] == Counter({'A': 1}) assert result[11] == Counter({'A': 1}) assert result[12] == Counter({'A': 1}) @@ -47,12 +50,13 @@ def test_reference_start(): assert result[15] == Counter({'T': 1}) assert result[16] == Counter({'T': 1}) + def test_no_query_sequence(): """Test pileup when query sequence is not provided""" - cigartuples = [(0,3), (2,1), (0,2)] # 3M1D2M - + cigartuples = [(0, 3), (2, 1), (0, 2)] # 3M1D2M + result = depth(cigartuples) - + assert result[0] == Counter({'.': 1}) assert result[1] == Counter({'.': 1}) assert result[2] == Counter({'.': 1}) @@ -60,18 +64,20 @@ def test_no_query_sequence(): assert result[4] == Counter({'.': 1}) assert result[5] == Counter({'.': 1}) + def test_previous_count(): """Test updating previous pileup counts""" # First alignment - cigartuples1 = [(0,3)] # 3M + cigartuples1 = [(0, 3)] # 3M query_sequence1 = "AAA" result1 = depth(cigartuples1, query_sequence=query_sequence1) - + # Second alignment - cigartuples2 = [(0,3)] # 3M + cigartuples2 = [(0, 3)] # 3M query_sequence2 = "TTT" - result2 = depth(cigartuples2, query_sequence=query_sequence2, previous_count=result1) - + result2 = depth(cigartuples2, query_sequence=query_sequence2, + previous_count=result1) + # Check combined counts assert result2[0] == Counter({'A': 1, 'T': 1}) assert result2[1] == Counter({'A': 1, 'T': 1}) @@ -80,15 +86,15 @@ def test_previous_count(): def test_skipped_regions(): """Test pileup with skipped regions (N in CIGAR)""" - cigartuples = [(0,2), (3,3), (0,2)] # 2M3N2M + cigartuples = [(0, 2), (3, 3), (0, 2)] # 2M3N2M query_sequence = "AATT" - + result = depth(cigartuples, query_sequence=query_sequence) - + assert result[0] == Counter({'A': 1}) assert result[1] == Counter({'A': 1}) assert result[2] == Counter({'-': 1}) assert result[3] == Counter({'-': 1}) assert result[4] == Counter({'-': 1}) assert result[5] == Counter({'T': 1}) - assert result[6] == Counter({'T': 1}) \ No newline at end of file + assert result[6] == Counter({'T': 1}) From ca3b05919c6895e03ef875051edfea26e4854b76 Mon Sep 17 00:00:00 2001 From: Will Dampier Date: Thu, 23 Apr 2026 16:32:37 -0400 Subject: [PATCH 09/11] configuration updates --- .github/workflows/ci.yml | 9 +++-- .gitignore | 7 ++++ .readthedocs.yaml | 1 + .travis.yml | 15 -------- CONTRIBUTING.rst | 19 +++++----- PR.md | 82 ++++++++++++++++++++++++++++++++++++++++ cigarmath/__init__.py | 18 ++++++++- docs/installation.rst | 14 ++++++- pyproject.toml | 23 +++-------- requirements_dev.txt | 12 ------ setup.cfg | 22 ----------- tox.ini | 31 +++++++-------- 12 files changed, 151 insertions(+), 102 deletions(-) delete mode 100644 .travis.yml create mode 100644 PR.md delete mode 100644 requirements_dev.txt delete mode 100644 setup.cfg diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 3b41e17..b2d3e56 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -26,8 +26,7 @@ jobs: - name: Install dependencies run: | python -m pip install --upgrade pip - pip install ".[io]" - pip install pytest + python -m pip install ".[dev,io]" - name: Run tests run: pytest @@ -43,8 +42,10 @@ jobs: with: python-version: "3.11" - - name: Install flake8 - run: pip install flake8 + - name: Install project (dev+io) + run: | + python -m pip install --upgrade pip + python -m pip install ".[dev,io]" - name: Run flake8 run: flake8 cigarmath tests diff --git a/.gitignore b/.gitignore index f877339..8b97d2f 100644 --- a/.gitignore +++ b/.gitignore @@ -106,6 +106,13 @@ ENV/ # mypy .mypy_cache/ +# ruff +.ruff_cache/ +ruff_cache/ + +# uv (if you start using uv lockfiles locally) +uv.lock + # IDE settings .vscode/ .idea/ diff --git a/.readthedocs.yaml b/.readthedocs.yaml index 1524e07..e339119 100644 --- a/.readthedocs.yaml +++ b/.readthedocs.yaml @@ -13,5 +13,6 @@ python: - method: pip path: . extra_requirements: + - dev - io - requirements: docs/requirements.txt diff --git a/.travis.yml b/.travis.yml deleted file mode 100644 index aaba487..0000000 --- a/.travis.yml +++ /dev/null @@ -1,15 +0,0 @@ -# Config file for automatic testing at travis-ci.com - -language: python -python: - - 3.8 - - 3.7 - - 3.6 - -# Command to install dependencies, e.g. pip install -r requirements.txt --use-mirrors -install: pip install -U tox-travis - -# Command to run tests, e.g. python setup.py test -script: tox - - diff --git a/CONTRIBUTING.rst b/CONTRIBUTING.rst index aba889b..fdcd74e 100644 --- a/CONTRIBUTING.rst +++ b/CONTRIBUTING.rst @@ -102,9 +102,9 @@ Before you submit a pull request, check that it meets these guidelines: 2. If the pull request adds functionality, the docs should be updated. Put your new functionality into a function with a docstring, and add the feature to the list in README.rst. -3. The pull request should work for Python 3.5, 3.6, 3.7 and 3.8, and for PyPy. Check - https://travis-ci.com/judowill/cigarmath/pull_requests - and make sure that the tests pass for all supported Python versions. +3. The pull request should work for the Python versions supported by + `pyproject.toml` (currently 3.8+). Check GitHub Actions on your PR and + make sure the tests pass for the full CI matrix. Tips ---- @@ -117,12 +117,11 @@ $ pytest tests.test_cigarmath Deploying --------- -A reminder for the maintainers on how to deploy. -Make sure all your changes are committed (including an entry in HISTORY.rst). -Then run:: +A reminder for the maintainers on how to deploy a release. -$ bump2version patch # possible: major / minor / patch -$ git push -$ git push --tags +1. Update `version` in `pyproject.toml`. +2. Update `HISTORY.md` (if you are maintaining a changelog for the release). +3. Build the distribution and upload to PyPI (example):: -Travis will then deploy to PyPI if tests pass. +$ python -m build +$ python -m twine upload dist/* diff --git a/PR.md b/PR.md new file mode 100644 index 0000000..69563a1 --- /dev/null +++ b/PR.md @@ -0,0 +1,82 @@ +## Title +Modernize packaging/CI and overhaul docs for RTD rendering and CIGAR guidance + +## Summary +- Modernizes project packaging and metadata by moving to `pyproject.toml`, removing legacy `setup.py`, and updating development requirements. +- Adds/updates CI and documentation infrastructure (`.github/workflows/ci.yml`, `.readthedocs.yaml`, Sphinx config/requirements) for current Python and Read the Docs workflows. +- Expands user-facing documentation substantially, including a practical `usage` guide and a new dedicated CIGAR format explainer page. +- Normalizes API docstring examples across core modules so fixed-width alignment examples render correctly and consistently on RTD. +- Updates tox matrix to current supported Python versions (`3.8`-`3.12`) to match project classifiers. +- Applies a full repository flake8 cleanup across package and tests, addressing style debt and unused-symbol issues. + +## Commits Included (main...publish-prep) +1. `864c0f2` modernizing +2. `4418b42` fixing docs for fixed width setup +3. `ed10800` fixes for sphinx warnings +4. `60fe528` unpinning +5. `a6b88ea` pointing at correct repo +6. `dba9d6f` adding cigar explanation and basic usage +7. `023158b` updating old tox +8. `858c865` flake8 massive fix + +## Notable Changes by Area + +### Packaging and metadata +- Added `pyproject.toml` with project metadata and dependency groups. +- Removed legacy `setup.py`. +- Consolidated dev tooling into the `dev` extra in `pyproject.toml` (retired `requirements_dev.txt`). +- Updated package version metadata reference points in `cigarmath/__init__.py`. + +### CI and release/doc infrastructure +- Added GitHub Actions workflow at `.github/workflows/ci.yml`. +- Added Read the Docs config at `.readthedocs.yaml`. +- Updated `tox.ini` env matrix to `py38, py39, py310, py311, py312, flake8`. + +### Documentation structure and quality +- Expanded docs pages: `docs/api.rst`, `docs/installation.rst`, `docs/readme.rst`, `docs/history.rst`, `docs/index.rst`. +- Added new CIGAR reference page: `docs/cigar.rst`. +- Rewrote `docs/usage.rst` into a workflow-based guide with runnable examples and expected outputs. +- Updated docs build requirements: `docs/requirements.txt`. +- Tuned Sphinx config in `docs/conf.py` for stable rendering and warning cleanup. + +### API docstring rendering fixes +- Updated fixed-width examples to explicit literal-block style in: + - `cigarmath/block.py` + - `cigarmath/cigarmath.py` + - `cigarmath/clipping.py` + - `cigarmath/inference.py` + - `cigarmath/mapping.py` + - `cigarmath/iterators.py` + - `cigarmath/combine.py` + - `cigarmath/pileup.py` + - `cigarmath/conversions.py` + +### Lint and style sweep +- Ran `flake8` cleanup across `cigarmath/` and `tests/`. +- Resolved unused-import/redefinition issues in public exports (`cigarmath/__init__.py`) and module imports. +- Removed checkpoint artifact from lint surface (`.ipynb_checkpoints` excluded / cleaned). +- Standardized formatting and whitespace across tests and core modules. + +## Diff Scope +- **42 files changed** +- **1476 insertions**, **831 deletions** + +## Test Plan +- [x] Build docs locally: + - `python -m sphinx -b html docs docs/_build/html` +- [x] Validate no Sphinx build warnings in latest pass. +- [x] Run style checks: + - `flake8 cigarmath tests` +- [ ] Run tox across available local interpreters: + - `tox -e py38,py39,py310,py311,py312,flake8` +- [ ] Run unit tests directly: + - `pytest` + +## Risk / Reviewer Notes +- Largest review surface is docs, packaging modernization, and the broad lint-driven formatting sweep. +- Core behavioral intent should be unchanged; most additional churn is style/format and import hygiene. +- Recommend reviewers focus on: + - Packaging metadata correctness (`pyproject.toml`) + - CI/tox Python version alignment + - RTD/Sphinx rendering behavior for fixed-width examples + - Spot-checking representative tests/modules touched by flake8 normalization diff --git a/cigarmath/__init__.py b/cigarmath/__init__.py index 69a44f2..213e70d 100644 --- a/cigarmath/__init__.py +++ b/cigarmath/__init__.py @@ -1,9 +1,10 @@ """Top-level package for Cigar Math.""" +import importlib.metadata +import importlib.util + __author__ = """Will Dampier""" __email__ = "wnd22@drexel.edu" -__version__ = "0.2.0" - from .clipping import left_clipping from .clipping import right_clipping @@ -56,6 +57,18 @@ from .rearrangement import reference_lengths_from_pysam_header from .rearrangement import RearrangementEvent + +def _read_runtime_version() -> str: + try: + return importlib.metadata.version("cigarmath") + except importlib.metadata.PackageNotFoundError: + if importlib.util.find_spec("cigarmath") is not None: + return "0.0.0+uninstalled" + raise + + +__version__ = _read_runtime_version() + __all__ = [ "left_clipping", "right_clipping", @@ -94,4 +107,5 @@ "rearrangement_segment_stream", "reference_lengths_from_pysam_header", "RearrangementEvent", + "__version__", ] diff --git a/docs/installation.rst b/docs/installation.rst index 1b0fa49..2f1563c 100644 --- a/docs/installation.rst +++ b/docs/installation.rst @@ -37,6 +37,18 @@ This installs `pysam`_, which requires a Unix-like system (Linux or macOS). .. _pysam: https://pysam.readthedocs.io +Development install +------------------- + +To work on Cigar Math locally (tests, linting, docs tooling), install the +``dev`` extra. This is the supported way to get a consistent developer +environment (CI, tox, and Read the Docs use the same dependency groups). + +.. code-block:: console + + $ pip install -e ".[dev,io]" + + From sources ------------ @@ -56,6 +68,6 @@ Or with optional dependencies: .. code-block:: console - $ pip install ".[io]" + $ pip install ".[dev,io]" .. _Github repo: https://github.com/DamLabResources/cigarmath diff --git a/pyproject.toml b/pyproject.toml index 2192c35..41246ee 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -33,11 +33,14 @@ dependencies = [] [project.optional-dependencies] io = ["pysam"] dev = [ - "bump2version", + "pip", "wheel", + "build", + "tox", + "watchdog", "flake8", "coverage", - "sphinx", + "Sphinx", "sphinx-rtd-theme", "sphinx-autodoc-typehints", "twine", @@ -56,25 +59,9 @@ Repository = "https://github.com/DamLabResources/cigarmath" include = ["cigarmath", "cigarmath.*"] [tool.pytest.ini_options] -addopts = "--ignore=setup.py" [tool.flake8] exclude = ["docs"] extend-exclude = [".ipynb_checkpoints"] max-line-length = 120 extend-ignore = ["W291"] - -[tool.bumpversion] -current_version = "0.2.3" -commit = true -tag = true - -[[tool.bumpversion.files]] -filename = "pyproject.toml" -search = 'version = "{current_version}"' -replace = 'version = "{new_version}"' - -[[tool.bumpversion.files]] -filename = "cigarmath/__init__.py" -search = '__version__ = "{current_version}"' -replace = '__version__ = "{new_version}"' diff --git a/requirements_dev.txt b/requirements_dev.txt deleted file mode 100644 index c1b31d3..0000000 --- a/requirements_dev.txt +++ /dev/null @@ -1,12 +0,0 @@ -pip -bump2version -wheel -watchdog -flake8 -tox -coverage -Sphinx -twine - -pytest -black \ No newline at end of file diff --git a/setup.cfg b/setup.cfg deleted file mode 100644 index e926291..0000000 --- a/setup.cfg +++ /dev/null @@ -1,22 +0,0 @@ -[bumpversion] -current_version = 0.1.0 -commit = True -tag = True - -[bumpversion:file:setup.py] -search = version='{current_version}' -replace = version='{new_version}' - -[bumpversion:file:cigarmath/__init__.py] -search = __version__ = '{current_version}' -replace = __version__ = '{new_version}' - -[bdist_wheel] -universal = 1 - -[flake8] -exclude = docs,.ipynb_checkpoints -max-line-length = 120 -extend-ignore = W291 -[tool:pytest] -addopts = --ignore=setup.py diff --git a/tox.ini b/tox.ini index c94377a..e93029e 100644 --- a/tox.ini +++ b/tox.ini @@ -1,28 +1,23 @@ [tox] envlist = py38, py39, py310, py311, py312, flake8 - -[travis] -python = - 3.12: py312 - 3.11: py311 - 3.10: py310 - 3.9: py39 - 3.8: py38 +isolated_build = true [testenv:flake8] -basepython = python -deps = flake8 -commands = flake8 cigarmath tests +basepython = python3 +usedevelop = true +extras = + dev + io +commands = + flake8 cigarmath tests [testenv] setenv = PYTHONPATH = {toxinidir} -deps = - -r{toxinidir}/requirements_dev.txt -; If you want to make tox run the tests with the same versions, create a -; requirements.txt with the pinned versions and uncomment the following line: -; -r{toxinidir}/requirements.txt +usedevelop = true +extras = + dev + io commands = - pip install -U pip + python -m pip install -U pip pytest --basetemp={envtmpdir} - From ddbd9eaf025c4b2cf3386bc43ffb19b47d31f372 Mon Sep 17 00:00:00 2001 From: Will Dampier Date: Thu, 23 Apr 2026 16:38:25 -0400 Subject: [PATCH 10/11] late flake8s --- .flake8 | 4 ++++ pyproject.toml | 1 - tests/test_iterators.py | 6 +++--- tests/test_mapping.py | 8 ++++---- 4 files changed, 11 insertions(+), 8 deletions(-) create mode 100644 .flake8 diff --git a/.flake8 b/.flake8 new file mode 100644 index 0000000..da4497d --- /dev/null +++ b/.flake8 @@ -0,0 +1,4 @@ +[flake8] +exclude = docs +extend-exclude = .ipynb_checkpoints +max-line-length = 120 diff --git a/pyproject.toml b/pyproject.toml index 41246ee..a5233f2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -64,4 +64,3 @@ include = ["cigarmath", "cigarmath.*"] exclude = ["docs"] extend-exclude = [".ipynb_checkpoints"] max-line-length = 120 -extend-ignore = ["W291"] diff --git a/tests/test_iterators.py b/tests/test_iterators.py index a8154a5..c9e8795 100644 --- a/tests/test_iterators.py +++ b/tests/test_iterators.py @@ -107,7 +107,7 @@ def test_cigar_iterator(): CIGIND 01122344566 # Index of the cigar block CBLKIND 001010010 # Index within the cigar block QRY -xAAGGC--Cxx - QPOS 012345 678 # Index within the query + QPOS 012345 678 # Index within the query """ cigartups, ref_start, correct_indexes = make_example() @@ -154,7 +154,7 @@ def test_iterator_attach(): CIGIND 01122344566 # Index of the cigar block CBLKIND 001010010 # Index within the cigar block QRY -xAAGGC--Cxx - QPOS 012345 678 # Index within the query + QPOS 012345 678 # Index within the query """ cigartups, ref_start, correct_indexes = make_example() @@ -188,7 +188,7 @@ def test_liftover(): CIGIND 01122344566 # Index of the cigar block CBLKIND 001010010 # Index within the cigar block QRY -xAAGGC--Cxx - QPOS 012345 678 # Index within the query + QPOS 012345 678 # Index within the query """ cigartups, ref_start, _ = make_example() diff --git a/tests/test_mapping.py b/tests/test_mapping.py index 365f9b5..98223d1 100644 --- a/tests/test_mapping.py +++ b/tests/test_mapping.py @@ -6,9 +6,9 @@ def test_reference2query(): REF AAGA--CTTCGG CIGAR SMMIIMDDMSS QRY -xAAGGC--Cxx - QPOS 012345 678 # Index within the query + QPOS 012345 678 # Index within the query - r2q = 12 5NN6 + r2q = 12 5NN6 cigarstring = 1S2M2I1M2D1M2S reference_start = 2 @@ -30,8 +30,8 @@ def test_query2reference(): REF AAGA--CTTCGG CIGAR SMMIIMDDMSS QRY -xAAGGC--Cxx - QPOS 012345 678 # Index within the query - q2r = N23NN4 7NN + QPOS 012345 678 # Index within the query + q2r = N23NN4 7NN cigarstring = 1S2M2I1M2D1M2S reference_start = 2 From 7794f9253a84aeccb0f58e175392a5919968142d Mon Sep 17 00:00:00 2001 From: Will Dampier Date: Thu, 21 May 2026 17:19:25 -0400 Subject: [PATCH 11/11] resolved rearrangement docs --- docs/api.rst | 10 ++++++++++ docs/rearrangements.rst | 1 + tests/test_rearrangement.py | 2 -- 3 files changed, 11 insertions(+), 2 deletions(-) diff --git a/docs/api.rst b/docs/api.rst index 7efb31e..21f1445 100644 --- a/docs/api.rst +++ b/docs/api.rst @@ -69,6 +69,16 @@ Combine :undoc-members: :show-inheritance: +.. _rearrangements-api: + +Rearrangements +-------------- + +.. automodule:: cigarmath.rearrangement + :members: + :undoc-members: + :show-inheritance: + Pileup ------ diff --git a/docs/rearrangements.rst b/docs/rearrangements.rst index 781cfc3..2e752b9 100644 --- a/docs/rearrangements.rst +++ b/docs/rearrangements.rst @@ -13,6 +13,7 @@ See :func:`cigarmath.rearrangement.infer_rearrangements`, :func:`cigarmath.rearrangement.rearrangement_segment_stream`, :func:`cigarmath.rearrangement.reference_lengths_from_pysam_header`, and :func:`cigarmath.rearrangement.format_read_rearrangement_summary`. +For full autodoc listings, see :ref:`rearrangements-api`. When ``reference_lengths`` is supplied (e.g. from the SAM/BAM ``@SQ`` header), genome-scale negative reference overlap is classified as **REF_WRAP** (linear diff --git a/tests/test_rearrangement.py b/tests/test_rearrangement.py index b1a457f..0e6829c 100644 --- a/tests/test_rearrangement.py +++ b/tests/test_rearrangement.py @@ -12,7 +12,6 @@ infer_rearrangements, rearrangement_segment_stream, reference_lengths_from_pysam_header, - RearrangementEvent, ) HXB2F_LEN = 9086 @@ -487,7 +486,6 @@ def test_rearrangement_segment_stream_test_sam(): assert segments[event.segment_indices[1]].query_name == query_name - def test_format_summary_inversion_two_segment(): read_len = 1000 mappings = [