diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index c63df7e..714ffec 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -2,9 +2,9 @@ name: Test on: push: - branches: [main, master] + branches: [main, master, dev] pull_request: - branches: [main, master] + branches: [main, master, dev] jobs: test: diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..9544fa6 --- /dev/null +++ b/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2026 Louison Lesage + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. \ No newline at end of file diff --git a/README.md b/README.md index 07e8190..c089a36 100755 --- a/README.md +++ b/README.md @@ -1,8 +1,9 @@ # WizardEye -![Python](https://img.shields.io/badge/Python-green.svg) +![Python](https://img.shields.io/badge/Python->=3.8-green.svg) ![Beta](https://img.shields.io/badge/beta-red.svg) -[![Version](https://img.shields.io/badge/version-0.1.0b1-orange.svg)](https://github.com/TheLokj/WizardEye/releases) +[![Version](https://img.shields.io/badge/version-0.1.2-yellow.svg)](https://github.com/TheLokj/WizardEye/releases) +[![Install WizardEye using bioconda](https://img.shields.io/badge/Install%20WizardEye%20using-bioconda-brightblue.svg?style=flat)](http://bioconda.github.io/recipes/wizardeye/README.html) [![Python CI](https://github.com/TheLokj/WizardEye/actions/workflows/test.yml/badge.svg)](https://github.com/TheLokj/WizardEye/actions/workflows/test.yml) ![GitHub bugs](https://img.shields.io/github/issues/TheLokj/WizardEye/bug) @@ -47,7 +48,13 @@ Please also note that WizardEye is only designed to identify ambiguous regions * ## Installation -You can install WizardEye by cloning this repository and by running the following commands from the main folder: +The last WizardEye version can be installed using conda: + +``` +conda install -c bioconda wizardeye +``` + +You can also install WizardEye by cloning this repository and by running the following commands from the main folder: ``` python3 -m venv .wizardeye @@ -134,9 +141,9 @@ wizardeye database --clean -d /path/to/database You can compute ambiguous regions of `reference.fa` that can be targeted, using specific BWA parameters, by reads from `risky.fa` with: ``` -wizardeye align -i /path/to/risky.fa -r /path/to/reference.fa -d /path/to/database \ - -k k_mer_length -w sliding_windows \ - -bn bwa_missing_prob_err_rate -bo bwa_max_gap_opens -bl bwa_seed_length +wizardeye align -i /path/to/risky.fa -r /path/to/reference.fa -d /path/to/database \ + -k 35 -w 1 \ + -bn 0.01 -bo 2 -bl 16500 ``` Note that you can provide tags here to describe `risky.fa`. This can be used to describe the sequence, for example by specifying phylogeny and/or a particular environment: `-t Mammalia,Carnivora,Felis,Cave`. This is useful to filter a `.bam` file directly according to specific tags. You can manually set a track identifier with `--track_ID`, in order to save it in database metadata. Note also that, to prevent misuse, it is not possible to add a new track to the database if the reference alignment file is not exactly the same, including sequence names. This is due to an MD5-based control to avoid silently corrupted analyses. @@ -170,12 +177,12 @@ This will filter out all reads that can be ambiguously aligned to your target if For `filter`, `-r` must be the reference identifier already present in your WizardEye database (for example `hg19`), and `-d/--db-root` is mandatory. -By default, WizardEye only considers reads that can be aligned with a single position in the reference. However, if you want to exclude reads that can be aligned with multiple positions, and then be even more restrictive, specify the additional parameter `--considere-all`. +By default, WizardEye considers all reads that can be aligned to the reference. However, if you want to only consider reads that can be aligned with a single position in the reference, specify the `--only-unique` parameter to be more restrictive. You can also filter out reads based on specific tracks: ``` -wizardeye filter -i alignment.bam -r hg19 --exclude-tracks myotis_alcathoe,ursus_arctos -d /path/to/database +wizardeye filter -i alignment.bam -r hg19 --exclude-tracks myotis_alcathoe,ursus_arctos -k 35 -w 1 -bn 0.01 -bo 2 -bl 16500 -d /path/to/database ``` > [!WARNING] @@ -187,21 +194,18 @@ If sequence naming differs between BAM and tracks (for example `chr1` vs `1`), f To handle the trade-off between sensitivity and specificity, you can specify a stringency value during filtering. This criterion is defined as described below. -For a position $P$ in the target, there are exactly `-k`/`-w` different possible k-mers that overlap that position perfectly. After mapping, among the `n` k-mers overlapping the position (maximum `-k`/`-w` if no mismatches are allowed, otherwise more), `u` is computed as the number of these k-mers that are unique in the target. The position is then highlighted as ambiguous if `u`/(`-k`/`-w`) >= `-rc`, i.e., if the proportion of unique k-mers overlapping the position compared with the total number of possible k-mers overlapping the position is higher than the stringency. +For a position $P$ in the target, there are exactly `-k`/`-w` different possible k-mers that overlap that position perfectly. After mapping, `n` k-mers can overlap the position (maximum `-k`/`-w` if no mismatches are allowed, otherwise more). The position is then highlighted as ambiguous if `n`/(`-k`/`-w`) >= `-rc`, i.e., if the proportion of k-mers overlapping the position compared with the total number of possible k-mers overlapping the position is higher than the stringency. -For example, in a situation with one mismatch allowed, with `-k=40` and `-rc=0.25`, if a position is overlapped by 10 exact k-mers and 10 k-mers with one mismatch, the position is retained only if 10 of these k-mers are unique, regardless of exactness. In a similar case with `-rc=0.50`, the region is highlighted only if all 20 k-mers map uniquely there. +If `--only-unique` is used, the same behavior and formulae are still used but only uniquely aligned k-mers are considered, i.e. k-mers without BWA `XA` tag and with `MAPQ>0`. For example, in a situation with one mismatch allowed, with `-k=40` and `-rc=0.25`, if a position is overlapped by 10 exact k-mers and 10 k-mers with one mismatch, the position is retained only if 10 of these k-mers are unique, regardless of exactness. In a similar case with `-rc=0.50`, the region is highlighted only if all 20 k-mers map uniquely there. ``` -wizardeye filter -i alignment.bam -r hg19 --exclude-tracks myotis_alcathoe,ursus_arctos -r 0.25 -d /path/to/database +wizardeye filter -i alignment.bam -r hg19 --exclude-tracks myotis_alcathoe,ursus_arctos -k 35 -w 1 -bn 0.01 -bo 2 -bl 16500 -p 0.25 -d /path/to/database --only-unique ``` -Note that the ratio is not weighted by depth. In another case with a repetitive region, where a position is overlapped by 3000 k-mers, the ratio is still the same: if 40 of these are unique, the position is highlighted. - -Unique k-mers are defined as k-mers without BWA `XA` tag and with `MAPQ>0`. +Note that the ratio is not weighted by depth. In another case with a repetitive region, where a position is overlapped by 3000 k-mers, the ratio is still the same. This behavior aims to reproduce the [Heng Li's seqbility](https://github.com/lh3/misc/tree/cc0f36a9a19f35765efb9387389d9f3a6756f08f/seq/seqbility) logic, which is not directly usable in a cross-mappability context. -If `--considere-all` is used, the same behavior and formulae are still used but uniqueness is simply no longer required. ##### Frequency @@ -221,31 +225,31 @@ wizardeye filter -i alignment.bam -r hg19 --exclude-tracks myotis_alcathoe,ursus #### Output -WizardEye produces a tabulation-separated report containing, for each read, the decision and the overlapping tracks and tags, such as follows: +WizardEye produces a tabulation-separated report containing, for each read, the decision and the overlapping tracks, such as follows: -| read_id | excluded | overlapped | tags | -|------------------------------------------|----------|------------|------| -| read_1 | true | sus_scrofa,bos_taurus | Suina,Ruminantia | -| read_2 | false | | | -| read_3 | true | felis_catus | Feliformia | -| read_4 | true | bos_taurus | Ruminantia | -| read_5 | true | bos_taurus,ovis_aries | Ruminantia | -| read_6 | false | | | +| read_id | filtered_out | associated_tracks | +|------------------------------------------|----------|------------| +| read_1 | true | sus_scrofa,bos_taurus | +| read_2 | false | | +| read_3 | true | felis_catus | +| read_4 | true | bos_taurus | +| read_5 | true | bos_taurus,ovis_aries | +| read_6 | false | | With `--export-bam`, WizardEye produces two more files: - a BAM `excluded` with the reads excluded by the filtration, -- a BAM `filtered` with the reads kept by the filtration, +- a BAM `filtered` with the reads kept by the filtration. ### Count and compute statistics If you prefer to use your own-made filter, you can export a per-read report which summarizes the number of k-mers which can overlap the reference per read-interval using the `count` command. -This command take the same parameters as `filter`, plus a parameter `--mode` to specify the type of statistical summary you want betweemn `sum`, `max`, `min`, `cov`, `mean` and `std`. Statistics are computed on interval. For example, if `max` is specified, for each track, the maximum number of overlapping k-mers from this track on a position in the read associated interval is reported. +This command takes the same parameters as `filter`, plus a parameter `--mode` to specify the type of statistical summary you want between `sum`, `max`, `min`, `cov`, `mean` and `std`. Statistics are computed on interval. For example, if `max` is specified, for each track, the maximum number of overlapping k-mers from this track on a position in the read associated interval is reported. ``` -wizardeye count -i alignment.bam -r hg19 --exclude-tags Farm -k 35 -r 1 -bn 0.01 -bo 2 -bl 16500 -d /path/to/database -m max +wizardeye count -i alignment.bam -r hg19 --exclude-tags Farm -k 35 -w 1 -bn 0.01 -bo 2 -bl 16500 -d /path/to/database -m max ``` In this example, track whose are not reported using `filter`+`-r=0.01` should have a `0` in their column, as it means that the maximum number of overlapping k-mers in the interval is 0. @@ -291,4 +295,4 @@ This command creates the target/track directory, copies the two BigWig files as It is recommanded to complete your filtration using an evolutionnary-aware method such as Kraken2. This combination is useful to remove both reads that belong to completely different organisms and reads that can be ambiguous between closely related organisms. -*Last update of this documentation: beta-0.1.0.* \ No newline at end of file +*Last update of this documentation: 0.1.2.* \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml index 17214c8..4dd4aaa 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -5,7 +5,7 @@ build-backend = "setuptools.build_meta" [project] name = "WizardEye" dynamic = ["version"] -description = "Cross-mappability helper CLI" +description = "A Python tool to create, manage, and filter by cross-mappability tracks." readme = "./README.md" requires-python = ">=3.8,<3.13" dependencies = [ diff --git a/src/wizardeye/cli.py b/src/wizardeye/cli.py index 7488ffb..100cabb 100644 --- a/src/wizardeye/cli.py +++ b/src/wizardeye/cli.py @@ -12,7 +12,7 @@ from typing import List, Optional from .mappability import create_mappability_track -from .filter import generate_global_mask, count_k_mers_on_bam, filter_bam_alternative +from .filter import generate_global_mask, count_k_mers_on_bam, filter_bam from .utils import from_charlist_to_list, log, BWAParameters from .version import DISPLAY_VERSION, PACKAGE_VERSION, print_version_message from .db import ( @@ -30,7 +30,10 @@ ) app = typer.Typer( - help="A Python tool to create, manage, and filter by cross-mappability tracks." + help="WizardEye: A Python tool to create, manage, and filter by cross-mappability tracks.", + no_args_is_help=True, + add_completion=False, + suggest_commands=False, ) # -- Helper functions -- @@ -125,7 +128,7 @@ def _request_tracks_from_args( exclude_tracks: str, kmer_length: int, offset_step: int, - considere_all: bool, + only_unique: bool, no_cache: bool, cross_stringency: float, bwa_params: Optional[BWAParameters] = None, @@ -170,7 +173,6 @@ def _request_tracks_from_args( # Log requested parameters colorful = sys.stdout.isatty() and sys.stderr.isatty() default = "\033[0;90m(default)\033[0m" if colorful else "(default)" - hard = "\033[0;33m(hard)\033[0m" if colorful else "(hard)" log(f"Reference used: {ref}", "I") log(f"Input alignment file to process: {input_bam}", "I") @@ -184,7 +186,7 @@ def _request_tracks_from_args( "I", ) log( - f"Mask source mode: {f'all k-mers {hard}' if considere_all else f'uniquely aligned k-mers {default}'}", + f"Mask source mode: {'uniquely aligned k-mers' if only_unique else f'all k-mers {default}'}", "I", ) log( @@ -250,7 +252,10 @@ def common_options( return -@app.command(help="Initialize, validate, or inspect tracks in a WizardEye database.") +@app.command( + help="Initialize, validate, or inspect tracks in a WizardEye database.", + no_args_is_help=True, +) def database( db_root: str = typer.Option( ..., "-d", "--db-root", help="Path to the database root directory." @@ -431,7 +436,8 @@ def database( # Main alignment command with all parameters for track generation # Same behavior that generate_cross_mappability_filter_bwa.sh @app.command( - help="Generate cross-mappability tracks from input FASTA files using specific bwa aln parameters." + help="Generate cross-mappability tracks from input FASTA files using specific bwa aln parameters.", + no_args_is_help=True, ) def align( # Input parameters @@ -447,7 +453,7 @@ def align( None, "--track_ID", "--track-id", - help="Manual identifier used to name/reference the generated track (defaults to input FASTA stem).", + help="Manual identifier to store in track metadata.", ), tag: Optional[List[str]] = typer.Option( None, @@ -531,9 +537,6 @@ def align( if not manual_track_id: log("--track_ID cannot be empty.", "E") raise typer.Exit(code=1) - manual_track_id = ( - manual_track_id.replace("/", "_").replace("\\", "_").replace(" ", "_") - ) bwa_params = BWAParameters( missing_prob_err_rate=bwa_missing_prob_err_rate, @@ -546,11 +549,7 @@ def align( total_inputs = len(input_fastas) for idx, one_input_fasta in enumerate(input_fastas, start=1): base_query_name = Path(one_input_fasta).stem - query_track_id = ( - base_query_name - if not manual_track_id - else f"{base_query_name}__{manual_track_id}" - ) + query_track_id = base_query_name all_aln_str = "_N" if bwa_all_aln else "" track_name = f"{query_track_id}_k{kmer_length}_w{offset_step}_n{float(bwa_missing_prob_err_rate):g}_o{bwa_max_gap_opens}_l{bwa_seed_length}{all_aln_str}" @@ -613,7 +612,8 @@ def align( @app.command( - help="Filter an input BAM using selected cross-mappability tracks and stringency." + help="Filter an input BAM using selected cross-mappability tracks and stringency.", + no_args_is_help=True, ) def filter( # Alignment parameters used to construct the BAM file @@ -635,9 +635,6 @@ def filter( "-bN", help="BWA aln -N used for alignment (compute every alternative mapping).", ), - bwa_threads: Optional[int] = typer.Option( - 1, "-bj", "--bwa-threads", help="BWA aln -t used for alignment." - ), db_root: str = typer.Option( ..., "-d", "--db-root", help="Path to the database root directory." ), @@ -660,16 +657,11 @@ def filter( "--cross-stringency", help="Stringency threshold in [0.0, 1.0].", ), - considere_all: bool = typer.Option( + only_unique: bool = typer.Option( False, - "--considere_all", - "--considere-all", - help="Use map_all.bw instead of map_uniq.bw to build the exclusion mask.", - ), - no_cache: bool = typer.Option( - False, - "--no-cache", - help="Disable mask caching: recompute track masks and avoid writing cache files.", + "--only-unique", + "--only_unique", + help="Consider only unique k-mers (no XA tag & MAPQ>0) area.", ), output_filtered_bam: Optional[str] = typer.Option( None, @@ -687,21 +679,11 @@ def filter( "--report-output", help="Output TSV report with columns: read_id, excluded, overlapped, tags.", ), - count_only: bool = typer.Option( - False, - "--count-only", - help="Generate only a per-read/per-track overlap-count report from one track source (map_uniq.bw by default, map_all.bw with --considere-all).", - ), export_bam: bool = typer.Option( False, "--export-bam", help="Write filtered/excluded BAM outputs. By default, only the TSV report is generated.", ), - n_threads: int = typer.Option( - 1, - "-j", - help="Number of threads for parallel overlap extraction from BigWig tracks.", - ), kmer_length: Optional[int] = typer.Option( None, "-k", @@ -712,7 +694,7 @@ def filter( "-w", "--offset-step", "--sliding-window", - help="Offset/sliding window to filter on. Smaller, more sensitive.", + help="Offset/sliding window to filter on. Must match track generation parameter.", ), ): start_time = time.perf_counter() @@ -727,7 +709,7 @@ def filter( max_gap_opens=bwa_max_gap_opens, seed_length=bwa_seed_length, all_aln=bwa_all_aln, - threads=bwa_threads, + threads=None, ) valid_tracks = _request_tracks_from_args( ref, @@ -737,8 +719,8 @@ def filter( exclude_tracks, kmer_length, offset_step, - considere_all, - no_cache, + only_unique, + None, cross_stringency, bwa_params, ) @@ -750,7 +732,7 @@ def filter( raise typer.Exit(code=1) try: - filter_result = filter_bam_alternative( + filter_result = filter_bam( input_bam=input_bam, ref=ref, db_root=db_root, @@ -759,9 +741,7 @@ def filter( offset_step=offset_step, bwa_params=bwa_params, stringency=cross_stringency, - consider_all=considere_all, - no_cache=no_cache, - n_threads=n_threads, + consider_all=not (only_unique), output_filtered_bam=output_filtered_bam, output_excluded_bam=output_excluded_bam, output_report_tsv=output_report_tsv, @@ -805,7 +785,8 @@ def filter( @app.command( - help="Export a merged BED mask using the same track-selection logic as filter." + help="Export a merged BED mask using the same track-selection logic as filter.", + no_args_is_help=True, ) def export( ref: str = typer.Option(..., "-r", "--ref", help="Reference species name."), @@ -831,21 +812,18 @@ def export( help="Offset/sliding window to target.", ), bwa_missing_prob_err_rate: Optional[float] = typer.Option( - None, "-bn", help="BWA aln -n used for selected tracks." + None, "-bn", help="BWA aln -n used to generate selected tracks." ), bwa_max_gap_opens: Optional[int] = typer.Option( - None, "-bo", help="BWA aln -o used for selected tracks." + None, "-bo", help="BWA aln -o used to generate selected tracks." ), bwa_seed_length: Optional[int] = typer.Option( - None, "-bl", help="BWA aln -l used for selected tracks." + None, "-bl", help="BWA aln -l used to generate selected tracks." ), bwa_all_aln: Optional[bool] = typer.Option( None, "-bN", - help="BWA aln -N used for selected tracks (compute every alternative mapping).", - ), - bwa_threads: Optional[int] = typer.Option( - None, "-bj", "--bwa-threads", help="BWA aln -t used for selected tracks." + help="BWA aln -N used to generate selected tracks (compute every alternative mapping).", ), cross_stringency: float = typer.Option( 0.99, @@ -855,11 +833,11 @@ def export( "--cross-stringency", help="Stringency threshold in [0.0, 1.0].", ), - considere_all: bool = typer.Option( + only_unique: bool = typer.Option( False, - "--considere_all", - "--considere-all", - help="Use map_all.bw instead of map_uniq.bw to build the exported mask.", + "--only-unique", + "--only_unique", + help="Consider only unique k-mers (no XA tag & MAPQ>0) area.", ), db_root: str = typer.Option( ..., "-d", "--db-root", help="Path to the database root directory." @@ -884,7 +862,7 @@ def export( max_gap_opens=bwa_max_gap_opens, seed_length=bwa_seed_length, all_aln=bwa_all_aln, - threads=bwa_threads, + threads=None, ) valid_tracks = _request_tracks_from_args( @@ -895,7 +873,7 @@ def export( exclude_tracks, kmer_length, offset_step, - considere_all, + only_unique, None, cross_stringency, bwa_params, @@ -908,7 +886,7 @@ def export( kmer_length=kmer_length, offset_step=offset_step, cross_stringency=cross_stringency, - consider_all=considere_all, + consider_all=not (only_unique), n_threads=n_threads, db_root=db_root, output_file=output_bed, @@ -930,6 +908,7 @@ def export( @app.command( "import", help="Import a track manually by providing BigWig files and generation parameters.", + no_args_is_help=True, ) def import_tracks( ref: str = typer.Option( @@ -1064,7 +1043,8 @@ def import_tracks( @app.command( - help="Count for each read the number of overlapping k-mers fron required tracks." + help="Count for each read the number of overlapping k-mers fron required tracks.", + no_args_is_help=True, ) def count( # Alignment parameters used to construct the BAM file @@ -1089,9 +1069,6 @@ def count( "-bN", help="BWA aln -N used for alignment (compute every alternative mapping).", ), - bwa_threads: Optional[int] = typer.Option( - None, "-bj", "--bwa-threads", help="BWA aln -t used for alignment." - ), # Track generation parameters db_root: str = typer.Option( ..., "-d", "--db-root", help="Path to the database root directory." @@ -1106,7 +1083,7 @@ def count( "-w", "--offset-step", "--sliding-window", - help="Offset/sliding window to filter on. Smaller, more sensitive.", + help="Offset/sliding window to filter on. Must match track generation parameter.", ), # Filtration parameters exclude_tags: Optional[List[str]] = typer.Option( @@ -1119,11 +1096,11 @@ def count( "--exclude-tracks", help="Track identifier(s) to filter out. Comma-separated (e.g. genius_species1, genius_species2).", ), - considere_all: bool = typer.Option( + only_unique: bool = typer.Option( False, "--only-unique", "--only_unique", - help="Use map_all.bw instead of map_uniq.bw to build the exclusion mask.", + help="Consider only unique k-mers (no XA tag & MAPQ>0) area.", ), no_cache: bool = typer.Option( False, @@ -1161,7 +1138,7 @@ def count( max_gap_opens=bwa_max_gap_opens, seed_length=bwa_seed_length, all_aln=bwa_all_aln, - threads=bwa_threads, + threads=None, ) valid_tracks = _request_tracks_from_args( @@ -1172,7 +1149,7 @@ def count( exclude_tracks, kmer_length, offset_step, - considere_all, + only_unique, no_cache, None, bwa_params, @@ -1191,7 +1168,7 @@ def count( offset_step=offset_step, count_mode=count_mode, bwa_params=bwa_params, - consider_all=considere_all, + consider_all=not (only_unique), n_threads=n_threads, output_report_tsv=output_report_tsv, ) diff --git a/src/wizardeye/filter.py b/src/wizardeye/filter.py index a35254e..14297b1 100644 --- a/src/wizardeye/filter.py +++ b/src/wizardeye/filter.py @@ -9,7 +9,6 @@ """ from __future__ import annotations -from collections import defaultdict import hashlib import pyBigWig import shutil @@ -348,343 +347,6 @@ def compute_stringency_on_bedGraph( def filter_bam( - input_bam: str, - ref: str, - db_root: str, - exclude_tracks: List[Track], - kmer_length: int, - offset_step: int, - bwa_params: Optional[BWAParameters] = None, - stringency: float = 0.99, - consider_all: bool = False, - output_report_tsv: Optional[str] = None, - export_bam: bool = False, - output_filtered_bam: Optional[str] = None, - output_excluded_bam: Optional[str] = None, - no_cache: bool = False, - n_threads: int = 1, -) -> Dict[str, object]: - """Main function to filter a BAM file using several tracks and generate requested outputs. - - Args: - input_bam (str): Path to the input BAM file to filter. - ref (str): Name of the reference species. - exclude_tracks (List[str]): List of input track names to consider. - kmer_length (int): Length of the k-mers used during track generations. - offset_step (int): Step size for the offset used during track generations. - bwa (Optional[BWAParameters]): BWA alignment parameters for track selection. - stringency (float): Cross-stringency threshold, must be between 0.0 and 1.0. - consider_all (bool): If True, all k-mers are considered in the cross-stringency - calculation. If False (default), only uniquely aligned k-mers are considered. - output_report_tsv (Optional[str]): Path to the output TSV report file. - export_bam: If True, splits the input BAM into filtered and excluded BAM files. - output_filtered_bam (Optional[str]): Path to the output BAM file containing filtered reads if export_bam is True. - output_excluded_bam (Optional[str]): Path to the output BAM file containing excluded reads if export_bam is True. - db_root (str): Root directory of the database. - no_cache (bool): If True, do not use and generate cached tracks. - n_threads (int): Number of threads to use for parallel processing. - """ - if pysam is None: - raise RuntimeError("pysam is required for report generation and BAM filtering") - - # Parameters validation - input_bam_path = Path(input_bam) - if not input_bam_path.exists() or not input_bam_path.is_file(): - raise FileNotFoundError(f"Input BAM not found: {input_bam_path}") - - if kmer_length < 1: - raise ValueError("kmer_length must be a positive integer") - if offset_step < 1: - raise ValueError("offset_step must be a positive integer") - if not (0.0 <= stringency <= 1.0): - raise ValueError("stringency must be between 0.0 and 1.0") - - normalized_tracks = from_charlist_to_list(exclude_tracks) - if not normalized_tracks: - raise ValueError("exclude_tracks cannot be empty") - - # Fail fast on initial-file incompatibility before any mask computation. - ref_dir = Path(db_root) / ref - reference_seq_sizes = ref_dir / f"{ref}.sizes" - validate_bam_compatibility( - input_bam_path, - reference_seq_sizes, - bwa_params=bwa_params, - ) - - sorted_tracks = sorted(set(normalized_tracks)) - - tracks_for_params = get_tracks( - ref_species=ref, - kmer_length=kmer_length, - offset_step=offset_step, - db_root=db_root, - bwa_params=bwa_params, - ) - normalized_requested_tracks = set(sorted_tracks) - track_to_tags: Dict[str, Set[str]] = {} - for track in tracks_for_params: - if track.track_name not in normalized_requested_tracks: - continue - tags = set(from_charlist_to_list(track.info.get("tags", []), lowercase=True)) - for key in { - track.track_name, - track.identity.query_species, - track.query_name, - track.track_name.split("_k")[0], - }: - track_to_tags.setdefault(key, set()).update(tags) - - report_tsv = ( - Path(output_report_tsv) - if output_report_tsv - else _default_output_table(input_bam_path) - ) - report_tsv.parent.mkdir(parents=True, exist_ok=True) - - merged_mask = generate_global_mask( - ref_species=ref, - inputs=sorted_tracks, - kmer_length=kmer_length, - offset_step=offset_step, - cross_stringency=stringency, - consider_all=consider_all, - n_threads=n_threads, - db_root=db_root, - output_file=None, - bwa_params=bwa_params, - no_cache=no_cache, - ) - - filtered_bam: Optional[Path] = None - excluded_bam: Optional[Path] = None - try: - ( - read_tracks, - read_tags, - excluded_read_ids, - n_total, - n_excluded, - n_total_records, - ) = _filter_bam_with_mask( - input_bam=input_bam_path, - mask_path=merged_mask, - track_to_tags=track_to_tags, - ) - report_path = write_filtration_report( - output_report_tsv=report_tsv, - read_tracks=read_tracks, - read_tags=read_tags, - ) - n_filtered = max(0, n_total - n_excluded) - - if export_bam: - filtered_bam = ( - Path(output_filtered_bam) - if output_filtered_bam - else _default_output_bam(input_bam_path, "filtered") - ) - excluded_bam = ( - Path(output_excluded_bam) - if output_excluded_bam - else _default_output_bam(input_bam_path, "excluded") - ) - log("Filtering BAM from excluded read IDs...", "I") - filter_bam_from_reads_id( - input_bam=input_bam_path, - excluded_read_ids=excluded_read_ids, - output_filtered_bam=filtered_bam, - output_excluded_bam=excluded_bam, - ) - - for bam_path in (filtered_bam, excluded_bam): - try: - pysam.index(str(bam_path)) - except Exception: - log(f"Could not index BAM (possibly unsorted): {bam_path}", "W") - finally: - # Clean up any temporary mask generated when cache is disabled. - if no_cache: - try: - merged_mask.unlink(missing_ok=True) - except TypeError: - if merged_mask.exists(): - merged_mask.unlink() - - return { - "mask": merged_mask, - "filtered_bam": filtered_bam, - "excluded_bam": excluded_bam, - "report_tsv": report_path, - "n_total": n_total, - "n_filtered": n_filtered, - "n_excluded": n_excluded, - "n_total_records": n_total_records, - } - - -def _filter_bam_with_mask( - input_bam: Path, - mask_path: Path, - track_to_tags: Optional[Dict[str, Set[str]]] = None, -) -> Tuple[Dict[str, Set[str]], Dict[str, Set[str]], Set[str], int, int, int]: - """Filter a BAM file using a mask file and identify overlapping reads. - - Args: - input_bam (Path): Path to the input BAM file. - mask_path (Path): Path to the mask file. - track_to_tags (Optional[Dict[str, Set[str]]]): Dictionary mapping track names to sets of tags. - - Returns: - - Dict[str, Set[str]]: read_tracks mapping read IDs to overlapping track names - - Dict[str, Set[str]]: read_tags mapping read IDs to overlapping track tags - - Set[str]: excluded_read_ids set of read IDs that overlap the mask - - int: n_total total number of distinct read IDs - - int: n_excluded number of excluded read IDs - - int: n_total_records total number of input BAM records processed - - Notes: - This function currently use subprocesses to avoid to parse every .bw in memory. - This may evolve in future version to use pyBigWig instead. However, it currently - excludes reads using IDs, which can lead to the exclusion of different reads using - same IDs. - """ - - log("Identifying overlapping reads...", "I") - - # Initialize data structures - track_to_tags = track_to_tags or {} - read_tracks = defaultdict(set) - read_tags = defaultdict(set) - excluded_read_ids: Set[str] = set() - - all_read_ids = {} - n_total_records = 0 - - # Get reads count - log(f"samtools view {input_bam}", "C") - view_cmd = ["samtools", "view", str(input_bam)] - view_proc = subprocess.Popen(view_cmd, stdout=subprocess.PIPE, text=True) - for line in view_proc.stdout: - n_total_records += 1 - read_id = line.split("\t", 1)[0] - all_read_ids[read_id] = None - view_proc.wait() - - for rid in all_read_ids.keys(): - read_tracks[rid] = set() - read_tags[rid] = set() - - # Get the intersection between the BAM file and the mask file - log(f"bedtools intersect -abam {input_bam} -b {mask_path} -wa -wb -bed", "C") - intersect_cmd = [ - "bedtools", - "intersect", - "-abam", - str(input_bam), - "-b", - str(mask_path), - "-wa", - "-wb", - "-bed", - ] - - process = subprocess.Popen( - intersect_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True - ) - - # Process reads - try: - for line in process.stdout: - parts = line.strip().split("\t") - if len(parts) >= 16: - read_id = parts[3] - excluded_read_ids.add(read_id) - - tracks_str = parts[-1] - tracks = tracks_str.split(",") if tracks_str else [] - - for track in tracks: - if track: - read_tracks[read_id].add(track) - read_tags[read_id].update(track_to_tags.get(track, set())) - else: - raise RuntimeError(f"Unexpected ouput from bedtools: {line}") - finally: - process.stdout.close() - - if process.wait() != 0: - raise subprocess.CalledProcessError(process.returncode, intersect_cmd) - - if len(all_read_ids) != n_total_records: - log( - f"Number of unique IDs in BAM ({len(all_read_ids)}) does not match number of records in BED ({n_total_records}): this could be" - " due to duplicates. Currently, WizardEye filter using read-id and will then exclude duplicates if at least one is filtered.", - "W", - ) - - return ( - dict(read_tracks), - dict(read_tags), - excluded_read_ids, - len(all_read_ids), - len(excluded_read_ids), - n_total_records, - ) - - -def filter_bam_from_reads_id( - input_bam: Path, - excluded_read_ids: Set[str], - output_filtered_bam: Path, - output_excluded_bam: Path, -) -> Tuple[int, int, int]: - """Split BAM in one pysam pass using excluded read IDs. - - Args: - input_bam: Path to input BAM file. - excluded_read_ids: Set of read IDs to exclude. - output_filtered_bam: Path to output BAM file with filtered reads. - output_excluded_bam: Path to output BAM file with excluded reads. - - Returns: - Tuple[int, int, int]: Number of total records, number of filtered records and number of excluded records.""" - - log("Splitting BAM into excluded/filtered files based on filtration...", "I") - - if pysam is None: - raise RuntimeError("pysam is required to filter BAM from read IDs") - - output_filtered_bam.parent.mkdir(parents=True, exist_ok=True) - output_excluded_bam.parent.mkdir(parents=True, exist_ok=True) - - n_total_records = 0 - n_excluded_records = 0 - - with pysam.AlignmentFile(str(input_bam), "rb") as bam: - with pysam.AlignmentFile( - str(output_filtered_bam), "wb", template=bam - ) as filtered_handle: - with pysam.AlignmentFile( - str(output_excluded_bam), "wb", template=bam - ) as excluded_handle: - for read in bam.fetch(until_eof=True): - n_total_records += 1 - read_id = read.query_name - if read_id and read_id in excluded_read_ids: - excluded_handle.write(read) - n_excluded_records += 1 - else: - filtered_handle.write(read) - - n_filtered_records = n_total_records - n_excluded_records - return n_total_records, n_filtered_records, n_excluded_records - - -# -- Alternative filtration using pyBigWig -- - - -def filter_bam_alternative( input_bam: str, ref: str, db_root: str, @@ -698,8 +360,6 @@ def filter_bam_alternative( export_bam: bool = False, output_filtered_bam: Optional[str] = None, output_excluded_bam: Optional[str] = None, - no_cache: bool = False, - n_threads: int = 1, ) -> Dict[str, object]: """Alternative BAM filtering function using pyBigWig directly instead of bedtools mask intersection. @@ -720,7 +380,6 @@ def filter_bam_alternative( output_filtered_bam (Optional[str]): Path to the output BAM file containing filtered reads if export_bam is True. output_excluded_bam (Optional[str]): Path to the output BAM file containing excluded reads if export_bam is True. db_root (str): Root directory of the database. - no_cache (bool): If True, do not use and generate cached tracks. Returns: Dict[str, object]: Dictionary containing mask path (None for this alternative), filtered/excluded BAM paths, report path, and counts. @@ -851,9 +510,7 @@ def filter_bam_alternative( read_tags[read_id].update(track_to_tags.get(track_name, set())) report_path = write_filtration_report( - output_report_tsv=report_tsv, - read_tracks=read_tracks, - read_tags=read_tags, + output_report_tsv=report_tsv, read_tracks=read_tracks ) n_filtered = max(0, n_mapped_records - len(excluded_read_ids)) @@ -901,6 +558,54 @@ def filter_bam_alternative( bw.close() +def filter_bam_from_reads_id( + input_bam: Path, + excluded_read_ids: Set[str], + output_filtered_bam: Path, + output_excluded_bam: Path, +) -> Tuple[int, int, int]: + """Split BAM in one pysam pass using excluded read IDs. + + Args: + input_bam: Path to input BAM file. + excluded_read_ids: Set of read IDs to exclude. + output_filtered_bam: Path to output BAM file with filtered reads. + output_excluded_bam: Path to output BAM file with excluded reads. + + Returns: + Tuple[int, int, int]: Number of total records, number of filtered records and number of excluded records.""" + + log("Splitting BAM into excluded/filtered files based on filtration...", "I") + + if pysam is None: + raise RuntimeError("pysam is required to filter BAM from read IDs") + + output_filtered_bam.parent.mkdir(parents=True, exist_ok=True) + output_excluded_bam.parent.mkdir(parents=True, exist_ok=True) + + n_total_records = 0 + n_excluded_records = 0 + + with pysam.AlignmentFile(str(input_bam), "rb") as bam: + with pysam.AlignmentFile( + str(output_filtered_bam), "wb", template=bam + ) as filtered_handle: + with pysam.AlignmentFile( + str(output_excluded_bam), "wb", template=bam + ) as excluded_handle: + for read in bam.fetch(until_eof=True): + n_total_records += 1 + read_id = read.query_name + if read_id and read_id in excluded_read_ids: + excluded_handle.write(read) + n_excluded_records += 1 + else: + filtered_handle.write(read) + + n_filtered_records = n_total_records - n_excluded_records + return n_total_records, n_filtered_records, n_excluded_records + + # --- Count-report related function --- @@ -1111,9 +816,7 @@ def _default_output_count_table(input_bam: Path) -> Path: def write_filtration_report( - output_report_tsv: Path, - read_tracks: Dict[str, Set[str]], - read_tags: Dict[str, Set[str]], + output_report_tsv: Path, read_tracks: Dict[str, Set[str]] ) -> Path: """Write one line per read_id with exclusion flag, overlapping tracks and tags. @@ -1126,12 +829,10 @@ def write_filtration_report( Path: The path to the generated report.""" output_report_tsv.parent.mkdir(parents=True, exist_ok=True) with output_report_tsv.open("w", encoding="utf-8") as handle: - handle.write("read_id\texcluded\toverlapped\ttags\n") + handle.write("read_id\tfiltered_out\tassociated_tracks\n") for read_id, track_set in read_tracks.items(): tracks = sorted(track_set) - tags = sorted(read_tags.get(read_id, set())) excluded = "true" if tracks else "false" overlapped = ",".join(tracks) - joined_tags = ",".join(tags) - handle.write(f"{read_id}\t{excluded}\t{overlapped}\t{joined_tags}\n") + handle.write(f"{read_id}\t{excluded}\t{overlapped}\n") return output_report_tsv diff --git a/src/wizardeye/mappability.py b/src/wizardeye/mappability.py index ca4f93b..c64c4d0 100644 --- a/src/wizardeye/mappability.py +++ b/src/wizardeye/mappability.py @@ -45,7 +45,8 @@ # This control the maximum number of best hits to refers # Despite bwa manual, it seems to also alterate bwa results in single end condition # This is only useful without -N -BWA_R_BEST_HITS = 10000000 +# 2147483647 (0x7FFFFFFF) is the maximum possible number based on bwaaln.c +BWA_R_BEST_HITS = 2147483647 # -- WizardEye mappability utilities -- diff --git a/src/wizardeye/version.py b/src/wizardeye/version.py index 24ed3a1..dd1243b 100644 --- a/src/wizardeye/version.py +++ b/src/wizardeye/version.py @@ -10,8 +10,8 @@ import sys from typing import Optional -DISPLAY_VERSION = "beta-0.1.0" -PACKAGE_VERSION = "0.1.0b1" +DISPLAY_VERSION = "0.1.2" +PACKAGE_VERSION = "0.1.2" def _get_git_commit_hash() -> Optional[str]: diff --git a/tests/generate_cross_mappability_filter_bwa.sh b/tests/generate_cross_mappability_filter_bwa.sh index db3ec28..577335c 100755 --- a/tests/generate_cross_mappability_filter_bwa.sh +++ b/tests/generate_cross_mappability_filter_bwa.sh @@ -224,9 +224,9 @@ for other_path in "$input_fasta_directory"/*.{fa,fasta}; do # Use GNU parallel to run BWA alignments in parallel, then convert to BAM and merge into a single BAM file for counting. find "$tmp_local/" -name "*.fasta" | parallel -j "$n_threads" \ - "bwa aln -t 1 -R 10000000 -n $bwa_missing_prob_err_rate -o $bwa_max_gap_opens \ + "bwa aln -t 1 -R 2147483647 -n $bwa_missing_prob_err_rate -o $bwa_max_gap_opens \ -l $bwa_seed_length '$input_target' {} > {.}.sai && \ - bwa samse -n 10000000 '$input_target' {.}.sai {} | \ + bwa samse -n 2147483647 '$input_target' {.}.sai {} | \ samtools view -b -F 4 - > {.}.bam && \ rm -f {} {.}.sai" diff --git a/tests/integration/consistency.py b/tests/integration/consistency.py index 96eb19a..f8ecab1 100644 --- a/tests/integration/consistency.py +++ b/tests/integration/consistency.py @@ -559,6 +559,7 @@ def test_export_same_behavior_as_original_script(): str(wizardeye_mask), "-d", str(wizardeye_db), + "--only-unique", ] result = subprocess.run( @@ -1010,124 +1011,6 @@ def test_consistency_align_parallelisation(): ) -# def test_consistency_align_chunk_size(): -# """Test if the results are the same with different chunk sizes for the align function. - -# This should be the case: chunks are computed on k-mers, not complete sequence. -# """ - -# # Different chunk sizes to test -# chunk_sizes = [50, 100, 1000, 2000, 5000, 7500, 10000, 25000, 50000, 100000] - -# # List of query FASTAs to test against HG19_FA as target - use single species for speed -# query_fastas = [SUS_SCROFA_FA, CANIS_LUPUS_FA, RATTUS_NORVEGICUS_FA] - -# for query_fa in query_fastas: -# # Use a single persistent temp directory for all chunk sizes -# with tempfile.TemporaryDirectory(prefix=f"wizardeye_consistency_chunk_{query_fa.stem}_") as base_tmpdir: -# base_path = Path(base_tmpdir) - -# # Dictionary to store BigWig paths per chunk size -# bw_files = {} - -# for chunk_size in chunk_sizes: -# with tempfile.TemporaryDirectory(prefix=f"cs{chunk_size}_") as tmpdir: -# wizardeye_db = Path(tmpdir) / "database" -# wizardeye_db.mkdir(parents=True) - -# # Initialize WizardEye database -# subprocess.run( -# ["python3", "-m", "wizardeye", "database", "--init", "-d", str(tmpdir)], -# check=True, -# capture_output=True, -# text=True, -# encoding='utf-8', -# errors='replace', -# env={**subprocess.os.environ, "PYTHONPATH": str(SRC_DIR)} -# ) - -# # Run WizardEye align with this chunk size -# wizardeye_cmd = [ -# "python3", "-m", "wizardeye", "align", -# "-i", str(query_fa), -# "-r", str(HG19_FA), -# "-k", str(STANDARD_KMER_LENGTH), -# "-w", str(STANDARD_OFFSET_STEP), -# "-bn", str(STANDARD_BWA_MISSING_PROB_ERR_RATE), -# "-bo", str(STANDARD_BWA_MAX_GAP_OPENINGS), -# "-bl", str(STANDARD_BWA_SEED_LENGTH), -# "-j", str(STANDARD_N_THREADS), -# "-cs", str(chunk_size), -# "-d", str(wizardeye_db), -# ] - -# result = subprocess.run( -# wizardeye_cmd, -# capture_output=True, -# encoding='utf-8', -# errors='replace', -# text=True, -# env={**subprocess.os.environ, "PYTHONPATH": str(SRC_DIR)} -# ) -# if result.returncode != 0: -# print(f"wizardeye align stderr for {query_fa.name} chunk_size {chunk_size}: {result.stderr}") -# print(f"wizardeye align stdout for {query_fa.name} chunk_size {chunk_size}: {result.stdout}") -# raise RuntimeError(f"wizardeye align execution failed with return code {result.returncode}") - -# # Locate WizardEye output -# hg19_stem = HG19_FA.stem -# query_stem = query_fa.stem -# bn_str = f"{float(STANDARD_BWA_MISSING_PROB_ERR_RATE):g}" - -# track_pattern = f"{query_stem}_k{STANDARD_KMER_LENGTH}_w{STANDARD_OFFSET_STEP}_n{bn_str}_o{STANDARD_BWA_MAX_GAP_OPENINGS}_l{STANDARD_BWA_SEED_LENGTH}" -# track_dir = wizardeye_db / hg19_stem / track_pattern - -# assert track_dir.exists(), f"wizardeye track directory not found: {track_dir}" - -# wizardeye_map_all_bw = track_dir / "map_all.bw" -# wizardeye_map_uniq_bw = track_dir / "map_uniq.bw" - -# assert wizardeye_map_all_bw.exists(), f"wizardeye did not create {wizardeye_map_all_bw}" -# assert wizardeye_map_uniq_bw.exists(), f"wizardeye did not create {wizardeye_map_uniq_bw}" - -# # Copy BigWig files to persistent base directory before tmpdir is cleaned up -# dest_dir = base_path / f"cs{chunk_size}" -# dest_dir.mkdir(parents=True, exist_ok=True) -# shutil.copy(wizardeye_map_all_bw, dest_dir / "map_all.bw") -# shutil.copy(wizardeye_map_uniq_bw, dest_dir / "map_uniq.bw") - -# bw_files[chunk_size] = (dest_dir / "map_all.bw", dest_dir / "map_uniq.bw") - -# # Compare all outputs with the first one (smallest chunk size) -# first_cs = chunk_sizes[0] -# first_map_all, first_map_uniq = bw_files[first_cs] - -# for chunk_size in chunk_sizes[1:]: -# current_map_all, current_map_uniq = bw_files[chunk_size] - -# with tempfile.TemporaryDirectory(prefix=f"wizardeye_compare_chunk_cs{chunk_size}_") as compare_dir: -# compare_path = Path(compare_dir) - -# # Convert to bedGraph for comparison -# first_all_bg = compare_path / "first_map_all.bg" -# first_uniq_bg = compare_path / "first_map_uniq.bg" -# current_all_bg = compare_path / "current_map_all.bg" -# current_uniq_bg = compare_path / "current_map_uniq.bg" - -# subprocess.run(["bigWigToBedGraph", str(first_map_all), str(first_all_bg)], check=True) -# subprocess.run(["bigWigToBedGraph", str(first_map_uniq), str(first_uniq_bg)], check=True) -# subprocess.run(["bigWigToBedGraph", str(current_map_all), str(current_all_bg)], check=True) -# subprocess.run(["bigWigToBedGraph", str(current_map_uniq), str(current_uniq_bg)], check=True) - -# # Compare map_all -# assert compare_bedgraph_files(first_all_bg, current_all_bg, f"map_all cs{first_cs} vs cs{chunk_size} ({query_fa.name})"), \ -# f"map_all.bw differs between chunk_size {first_cs} and {chunk_size} for {query_fa.name}" - -# # Compare map_uniq -# assert compare_bedgraph_files(first_uniq_bg, current_uniq_bg, f"map_uniq cs{first_cs} vs cs{chunk_size} ({query_fa.name})"), \ -# f"map_uniq.bw differs between chunk_size {first_cs} and {chunk_size} for {query_fa.name}" - - def test_consistency_count_same_launch(): """Test if the results are the same across 10 count rounds with the same parameters.""" @@ -1194,8 +1077,6 @@ def test_consistency_count_same_launch(): str(STANDARD_BWA_SEED_LENGTH), "-m", "mean", - "-j", - str(STANDARD_N_THREADS), "--exclude-tracks", exclude_tracks_str, "--report-output", @@ -1327,8 +1208,6 @@ def test_consistency_count_parallelisation(): str(STANDARD_BWA_SEED_LENGTH), "-m", "mean", - "-j", - str(n_threads), "--exclude-tracks", exclude_tracks_str, "--report-output", @@ -1461,13 +1340,10 @@ def test_consistency_filter_same_launch(): str(STANDARD_BWA_SEED_LENGTH), "-p", str(STANDARD_CROSS_STRINGENCY), - "-j", - str(STANDARD_N_THREADS), "--exclude-tracks", exclude_tracks_str, "--report-output", str(report_path), - "--count-only", "-d", str(wizardeye_db), ] @@ -1598,13 +1474,10 @@ def test_consistency_filter_parallelisation(): str(STANDARD_BWA_SEED_LENGTH), "-p", str(STANDARD_CROSS_STRINGENCY), - "-j", - str(n_threads), "--exclude-tracks", exclude_tracks_str, "--report-output", str(report_path), - "--count-only", "-d", str(wizardeye_db), ] @@ -1732,8 +1605,6 @@ def test_export_and_filter_same_results(): str(STANDARD_BWA_SEED_LENGTH), "-p", str(cross_stringency), - "-j", - str(STANDARD_N_THREADS), "--exclude-tracks", exclude_tracks_str, "--report-output",