From 928b48f38a7b622e27a9d0d94415c21c56f04240 Mon Sep 17 00:00:00 2001
From: bbimber
Date: Wed, 14 Jan 2026 22:12:21 -0800
Subject: [PATCH 1/4] Add check for TSO and infer UMI length

---
 nimble/fastq_barcode_processor.py | 41 +++++++++++++++++++++++++++----
 1 file changed, 36 insertions(+), 5 deletions(-)

diff --git a/nimble/fastq_barcode_processor.py b/nimble/fastq_barcode_processor.py
index 5412133..40b9dfb 100644
--- a/nimble/fastq_barcode_processor.py
+++ b/nimble/fastq_barcode_processor.py
@@ -14,6 +14,30 @@ def hamming_distance(s1, s2):
     return sum(c1 != c2 for c1, c2 in zip(s1, s2))
 
 
+def infer_umi_length(r1_iter, search_string = "TTTCTTATATGGG", max_records = 100, min_records = 10, cb_length = 16):
+    tso_starts = []
+    for idx, (r1_record) in enumerate(zip(r1_iter), start=1):
+        pos = r1_record.find(search_string)
+        if (pos > -1):
+            tso_starts.append(pos)
+            if (len(tso_starts) >= min_records):
+                break
+
+        if (idx > max_records):
+            break
+
+    if (len(tso_starts) < min_records):
+        raise ValueError("Unable to infer UMI length. Insufficent R1 sequences contained " + search_string)
+
+    tso_starts = set(tso_starts)
+    if (len(tso_starts) > 1):
+        raise ValueError("Unable to infer UMI length. TSO sequence found at inconsistent positions of R1: " + ",".join(tso_starts))
+
+    umi_length = tso_starts[1] - cb_length
+    print("Inferred UMI length of: " + umi_length)
+
+    return(umi_length)
+
 def build_hamming_index(whitelist):
     """
     Build an index mapping each valid CB to all possible 1-edit variants.
@@ -128,7 +152,7 @@ def correct_cell_barcode(raw_cb, quality_scores, whitelist, hamming_index, corre
     return best_candidate
 
 
-def parse_10x_barcode_from_r1(sequence, cb_length=16, umi_length=12):
+def parse_10x_barcode_from_r1(sequence, cb_length=16, umi_length=10, tso_length=13):
     """
     Parse cell barcode and UMI from 10x R1 read sequence.
     Returns (cell_barcode, umi, remaining_sequence)
@@ -137,11 +161,11 @@
         return None, None, ""
     cell_barcode = sequence[:cb_length]
     umi = sequence[cb_length:cb_length + umi_length]
-    remaining_sequence = sequence[cb_length + umi_length:]
+    remaining_sequence = sequence[cb_length + umi_length + tso_length:]
     return cell_barcode, umi, remaining_sequence
 
 
-def process_pair(r1_record, r2_record, whitelist, hamming_index, correction_cache, stats, cb_length=16, umi_length=12):
+def process_pair(r1_record, r2_record, whitelist, hamming_index, correction_cache, stats, cb_length=16, umi_length=10, tso_length = 13):
     """
     Process a single FASTQ pair and return (r1_bam, r2_bam), or None if skipped.
     Correction logic:
@@ -156,7 +180,7 @@
         return None
 
     r1_seq = str(r1_record.seq)
-    raw_cb, umi, remaining_r1_seq = parse_10x_barcode_from_r1(r1_seq, cb_length, umi_length)
+    raw_cb, umi, remaining_r1_seq = parse_10x_barcode_from_r1(r1_seq, cb_length, umi_length, tso_length)
     if raw_cb is None or umi is None:
         stats['too_short'] += 1
         return None
@@ -251,6 +275,8 @@
     r1_iter = SeqIO.parse(r1_handle, "fastq")
     r2_iter = SeqIO.parse(r2_handle, "fastq")
 
+    umi_length = infer_umi_length(r1_iter)
+
     with ThreadPoolExecutor(max_workers=num_cores) as executor:
         futures = {}
 
@@ -308,7 +334,12 @@
         corrected_pct = 100.0 * stats.get('cb_corrected', 0) / total_cb_processed
         dropped_pct = 100.0 * stats.get('cb_no_correction', 0) / total_cb_processed
         print(f" Correction rate: {perfect_pct:.2f}% perfect, {corrected_pct:.2f}% corrected, {dropped_pct:.2f}% dropped")
-
+
+        if (dropped_pct == 100.0):
+            raise ValueError("There were no passing cellbarcodes. This likely indicates an error with the inclusionlist")
+        else:
+            raise ValueError("No cellbarcodes were processed. There is likely a problem with the input files")
+
     print(f"\nOther filters:")
     print(f" Name mismatch: {stats.get('name_mismatch', 0)}")
     print(f" Too short: {stats.get('too_short', 0)}")
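[Editor's note — illustrative sketch, not part of the patch series]
Patch 1 introduces the idea the rest of the series builds on: R1 is assumed to be laid out as
[CB][UMI][TSO...][cDNA...] (as patch 2's docstring later states), so locating the fixed TSO motif
(default TTTCTTATATGGG) in a sample of R1 reads yields the UMI length as (motif start - CB length).
The patch-1 implementation above has rough edges that patch 2 below rewrites; as a reference, the
core technique over plain R1 sequence strings looks roughly like this (function and variable names
here are hypothetical, not taken from the patch):

    from collections import Counter

    def sketch_infer_umi_length(r1_sequences, tso="TTTCTTATATGGG", cb_length=16, min_hits=10):
        """UMI length = bases between the end of the CB and the start of the TSO motif."""
        starts = Counter()  # motif start position -> observation count
        for seq in r1_sequences:
            pos = seq.find(tso)
            if pos != -1:
                starts[pos] += 1
            if sum(starts.values()) >= min_hits:
                break
        if sum(starts.values()) < min_hits:
            raise ValueError(f"fewer than {min_hits} reads contained the TSO motif")
        if len(starts) != 1:
            raise ValueError(f"TSO motif found at inconsistent positions: {sorted(starts)}")
        (tso_start,) = starts  # the single agreed-upon motif start
        return tso_start - cb_length

    # Ten synthetic reads: 16bp CB, 10bp UMI, TSO motif, then cDNA.
    reads = ["A" * 16 + "C" * 10 + "TTTCTTATATGGG" + "G" * 40 for _ in range(10)]
    print(sketch_infer_umi_length(reads))  # -> 10

Requiring the motif at a single consistent position is what makes the inference safe: chimeric or
adapter-contaminated reads that contain the motif elsewhere cause a hard failure rather than a
silently wrong UMI length.
[End note]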
From dd9c4c6db4c3da9343bc8e92acc1e97c6c8fd273 Mon Sep 17 00:00:00 2001
From: Sebastian Benjamin
Date: Fri, 16 Jan 2026 10:33:17 -0800
Subject: [PATCH 2/4] UMI length detection and TSO stripping + CLI args

---
 nimble/__main__.py                |  67 +++++-
 nimble/fastq_barcode_processor.py | 363 ++++++++++++++++++------------
 2 files changed, 271 insertions(+), 159 deletions(-)

diff --git a/nimble/__main__.py b/nimble/__main__.py
index 119ba15..2b95302 100755
--- a/nimble/__main__.py
+++ b/nimble/__main__.py
@@ -424,11 +424,50 @@ def sort_input_bam(bam, cores, tmp_dir):
     fastq_to_bam_parser = subparsers.add_parser('fastq-to-bam')
     fastq_to_bam_parser.add_argument('--r1-fastq', help='Path to R1 FASTQ file.', type=str, required=True)
     fastq_to_bam_parser.add_argument('--r2-fastq', help='Path to R2 FASTQ file.', type=str, required=True)
-    fastq_to_bam_parser.add_argument("--map", required=True, help="Cell barcode whitelist file (one CB per line, .gz or plain text)")
+    fastq_to_bam_parser.add_argument(
+        "--map",
+        required=True,
+        help="Cell barcode whitelist file (one CB per line, .gz or plain text)"
+    )
     fastq_to_bam_parser.add_argument('--output', help='Path for output BAM file.', type=str, required=True)
     fastq_to_bam_parser.add_argument('-c', '--num_cores', help='The number of cores to use for processing.', type=int, default=1)
     fastq_to_bam_parser.add_argument('--cb-length', help='Length of cell barcode (default: 16).', type=int, default=16)
-    fastq_to_bam_parser.add_argument('--umi-length', help='Length of UMI (default: 12).', type=int, default=12)
+    fastq_to_bam_parser.add_argument(
+        '--infer-umi',
+        help='Infer UMI length from R1 by locating the TSO motif and strip the motif from R1 (default: on).',
+        action='store_true',
+        default=True
+    )
+    fastq_to_bam_parser.add_argument(
+        '--no-infer-umi',
+        help='Disable UMI inference. Requires --umi-length. Does not strip TSO from R1.',
+        action='store_true',
+        default=False
+    )
+    fastq_to_bam_parser.add_argument(
+        '--umi-length',
+        help='UMI length (required if --no-infer-umi is set). Ignored when inference is enabled.',
+        type=int,
+        default=None
+    )
+    fastq_to_bam_parser.add_argument(
+        '--tso-search-string',
+        help='TSO motif to locate in R1 for UMI inference (default: TTTCTTATATGGG).',
+        type=str,
+        default="TTTCTTATATGGG"
+    )
+    fastq_to_bam_parser.add_argument(
+        '--infer-prefix-pairs',
+        help='Number of read pairs to buffer for UMI inference (default: 2000).',
+        type=int,
+        default=2000
+    )
+    fastq_to_bam_parser.add_argument(
+        '--min-records-with-tso',
+        help='Minimum number of R1 reads containing the TSO motif required to accept inference (default: 10).',
+        type=int,
+        default=10
+    )
 
     args = parser.parse_args()
 
@@ -442,14 +481,24 @@ def sort_input_bam(bam, cores, tmp_dir):
         summarize_columns_list = args.summarize.split(',') if args.summarize else None
         report(args.input, args.output, summarize_columns_list, args.threshold, args.disable_thresholding)
     elif args.subcommand == 'fastq-to-bam':
+        infer_umi = (not args.no_infer_umi) and args.infer_umi
+
+        if not infer_umi and args.umi_length is None:
+            print("Error: --umi-length is required when --no-infer-umi is set.", file=sys.stderr)
+            sys.exit(2)
+
         fastq_to_bam_with_barcodes(
-            args.r1_fastq,
-            args.r2_fastq,
-            args.map,
-            args.output,
-            args.num_cores,
-            args.cb_length,
-            args.umi_length
+            r1_fastq=args.r1_fastq,
+            r2_fastq=args.r2_fastq,
+            cb_whitelist_file=args.map,
+            output_bam=args.output,
+            num_cores=args.num_cores,
+            cb_length=args.cb_length,
+            umi_length=args.umi_length,
+            infer_umi=infer_umi,
+            tso_search_string=args.tso_search_string,
+            infer_prefix_pairs=args.infer_prefix_pairs,
+            min_records_with_tso=args.min_records_with_tso,
         )
     elif args.subcommand == 'plot':
         if os.path.getsize(args.input_file) > 0:

diff --git a/nimble/fastq_barcode_processor.py b/nimble/fastq_barcode_processor.py
index 40b9dfb..a711b38 100644
--- a/nimble/fastq_barcode_processor.py
+++ b/nimble/fastq_barcode_processor.py
@@ -3,6 +3,7 @@
 import sys
 from collections import defaultdict
 from concurrent.futures import ThreadPoolExecutor, as_completed
+from itertools import islice, chain
 import pysam
 from Bio import SeqIO
 
@@ -10,69 +11,37 @@
 def hamming_distance(s1, s2):
     """Calculate Hamming distance between two strings of equal length."""
     if len(s1) != len(s2):
-        return float('inf')
+        return float("inf")
     return sum(c1 != c2 for c1, c2 in zip(s1, s2))
 
 
-def infer_umi_length(r1_iter, search_string = "TTTCTTATATGGG", max_records = 100, min_records = 10, cb_length = 16):
-    tso_starts = []
-    for idx, (r1_record) in enumerate(zip(r1_iter), start=1):
-        pos = r1_record.find(search_string)
-        if (pos > -1):
-            tso_starts.append(pos)
-            if (len(tso_starts) >= min_records):
-                break
-
-        if (idx > max_records):
-            break
-
-    if (len(tso_starts) < min_records):
-        raise ValueError("Unable to infer UMI length. Insufficent R1 sequences contained " + search_string)
-
-    tso_starts = set(tso_starts)
-    if (len(tso_starts) > 1):
-        raise ValueError("Unable to infer UMI length. TSO sequence found at inconsistent positions of R1: " + ",".join(tso_starts))
-
-    umi_length = tso_starts[1] - cb_length
-    print("Inferred UMI length of: " + umi_length)
-
-    return(umi_length)
-
 def build_hamming_index(whitelist):
     """
     Build an index mapping each valid CB to all possible 1-edit variants.
-    This allows efficient lookup of correction candidates.
-
-    Returns:
-        dict[str variant] = set[str valid_cb]
+    Returns: dict[str variant] = set[str valid_cb]
     """
-    bases = ['A', 'C', 'G', 'T', 'N']
+    bases = ["A", "C", "G", "T", "N"]
     hamming_index = defaultdict(set)
-
+
     for valid_cb in whitelist:
-        # For each position, generate all possible single-base substitutions
         for i in range(len(valid_cb)):
             for base in bases:
                 if base != valid_cb[i]:
-                    variant = valid_cb[:i] + base + valid_cb[i+1:]
+                    variant = valid_cb[:i] + base + valid_cb[i + 1 :]
                     hamming_index[variant].add(valid_cb)
-
+
     return hamming_index
 
 
 def load_cb_whitelist(whitelist_path):
     """
-    Load a cell barcode whitelist (one CB per line).
-    Build a Hamming distance = 1 index for efficient correction.
-
-    Returns:
-        whitelist: set of valid cell barcodes
-        hamming_index: dict mapping variants to valid CBs
+    Load a cell barcode whitelist (one CB per line) and build a Hamming distance = 1 index.
+    Returns: (whitelist_set, hamming_index)
     """
     whitelist = set()
-
-    open_func = gzip.open if whitelist_path.endswith('.gz') else open
-    mode = 'rt' if whitelist_path.endswith('.gz') else 'r'
+
+    open_func = gzip.open if whitelist_path.endswith(".gz") else open
+    mode = "rt" if whitelist_path.endswith(".gz") else "r"
 
     try:
         with open_func(whitelist_path, mode) as f:
@@ -86,60 +55,43 @@
     print(f"Loaded whitelist from {whitelist_path}")
     print(f" Valid cell barcodes: {len(whitelist)}")
-
+
     print("Building Hamming distance index...")
     hamming_index = build_hamming_index(whitelist)
     print(f" Indexed {len(hamming_index)} variants")
-
+
     return whitelist, hamming_index
 
 
 def correct_cell_barcode(raw_cb, quality_scores, whitelist, hamming_index, correction_cache):
     """
     Correct a raw cell barcode using 10x-style correction:
-    1. Check for perfect match
-    2. If not, find all candidates with Hamming distance = 1
-    3. Among candidates, select the one where the differing base has the lowest quality score
-
-    Args:
-        raw_cb: Raw cell barcode string
-        quality_scores: List of quality scores for the CB region
-        whitelist: Set of valid cell barcodes
-        hamming_index: Dict mapping variants to valid CBs
-        correction_cache: Dict for caching corrections
-
-    Returns:
-        Corrected CB string, or None if no valid correction found
+    1) perfect match
+    2) candidates at Hamming distance = 1 (via index)
+    3) if multiple, choose the candidate whose differing position has the lowest Q
+    Returns corrected CB or None.
""" - # Check cache first if raw_cb in correction_cache: return correction_cache[raw_cb] - - # Check for perfect match + if raw_cb in whitelist: correction_cache[raw_cb] = raw_cb return raw_cb - - # Find candidates with Hamming distance = 1 + candidates = hamming_index.get(raw_cb, set()) - if not candidates: correction_cache[raw_cb] = None return None - + if len(candidates) == 1: - # Only one candidate, use it corrected = next(iter(candidates)) correction_cache[raw_cb] = corrected return corrected - - # Multiple candidates: select based on quality scores - # Find the differing position and pick candidate with lowest quality at that position + best_candidate = None - lowest_quality = float('inf') - + lowest_quality = float("inf") + for candidate in candidates: - # Find the position where they differ for i, (raw_base, cand_base) in enumerate(zip(raw_cb, candidate)): if raw_base != cand_base: qual = quality_scores[i] @@ -147,122 +99,199 @@ def correct_cell_barcode(raw_cb, quality_scores, whitelist, hamming_index, corre lowest_quality = qual best_candidate = candidate break - + correction_cache[raw_cb] = best_candidate return best_candidate -def parse_10x_barcode_from_r1(sequence, cb_length=16, umi_length=10, tso_length=13): +def infer_umi_and_tso_lengths_from_r1_records( + r1_records, + search_string="TTTCTTATATGGG", + cb_length=16, + min_records=10, +): + """ + Infer UMI length by locating the TSO motif in R1 reads. + Assumes structure: [CB][UMI][TSO...][cDNA...] + umi_length = tso_start - cb_length + tso_length = len(search_string) (the removed prefix length) + """ + tso_starts = [] + for rec in r1_records: + seq = str(rec.seq) + pos = seq.find(search_string) + if pos != -1: + tso_starts.append(pos) + if len(tso_starts) >= min_records: + break + + if len(tso_starts) < min_records: + raise ValueError( + f"Unable to infer UMI length: fewer than {min_records} reads contained TSO motif {search_string}" + ) + + unique = set(tso_starts) + if len(unique) != 1: + raise ValueError( + f"Unable to infer UMI length: TSO motif found at inconsistent positions in R1: {sorted(unique)}" + ) + + tso_start = next(iter(unique)) + umi_length = tso_start - cb_length + if umi_length <= 0: + raise ValueError( + f"Inferred UMI length <= 0 (tso_start={tso_start}, cb_length={cb_length}). Check motif/structure." + ) + + tso_length = len(search_string) + + print(f"Inferred UMI length: {umi_length}") + print(f"Using TSO length: {tso_length} (len of search_string)") + return umi_length, tso_length + + +def parse_10x_barcode_from_r1(sequence, cb_length=16, umi_length=12, tso_length=0): """ - Parse cell barcode and UMI from 10x R1 read sequence. - Returns (cell_barcode, umi, remaining_sequence) + Parse CB and UMI from R1 sequence, then drop optional TSO. + Returns (cell_barcode, umi, remaining_sequence). 
""" - if len(sequence) < cb_length + umi_length: + prefix_len = cb_length + umi_length + tso_length + if len(sequence) < prefix_len: return None, None, "" cell_barcode = sequence[:cb_length] - umi = sequence[cb_length:cb_length + umi_length] - remaining_sequence = sequence[cb_length + umi_length + tso_length:] + umi = sequence[cb_length : cb_length + umi_length] + remaining_sequence = sequence[prefix_len:] return cell_barcode, umi, remaining_sequence -def process_pair(r1_record, r2_record, whitelist, hamming_index, correction_cache, stats, cb_length=16, umi_length=10, tso_length = 13): +def process_pair( + r1_record, + r2_record, + whitelist, + hamming_index, + correction_cache, + stats, + cb_length=16, + umi_length=12, + tso_length=0, +): """ - Process a single FASTQ pair and return (r1_bam, r2_bam), or None if skipped. - Correction logic: - - Correct CB using 10x-style Hamming distance = 1 correction with quality scores - - Use raw UMI (no correction) + Process a single FASTQ pair; return (r1_bam, r2_bam) or None if skipped. + - Correct CB (Hamming distance 0/1) using CB qualities. + - Use raw UMI. + - Drop (CB+UMI+TSO) from R1 sequence and qualities consistently. """ - # Normalize names (handle /1 and /2 suffixes if present) - r1_name = r1_record.id.removesuffix('/1') - r2_name = r2_record.id.removesuffix('/2') + r1_name = r1_record.id.removesuffix("/1") + r2_name = r2_record.id.removesuffix("/2") if r1_name != r2_name: - stats['name_mismatch'] += 1 + stats["name_mismatch"] += 1 return None r1_seq = str(r1_record.seq) - raw_cb, umi, remaining_r1_seq = parse_10x_barcode_from_r1(r1_seq, cb_length, umi_length, tso_length) + raw_cb, umi, remaining_r1_seq = parse_10x_barcode_from_r1( + r1_seq, cb_length=cb_length, umi_length=umi_length, tso_length=tso_length + ) if raw_cb is None or umi is None: - stats['too_short'] += 1 + stats["too_short"] += 1 return None if len(remaining_r1_seq) == 0: - stats['no_remaining_seq'] += 1 + stats["no_remaining_seq"] += 1 return None - # Get quality scores for the CB region + # CB qualities: first cb_length bases cb_quality_scores = r1_record.letter_annotations["phred_quality"][:cb_length] - - # Correct cell barcode using 10x-style correction + corrected_cb = correct_cell_barcode(raw_cb, cb_quality_scores, whitelist, hamming_index, correction_cache) - if corrected_cb is None: - stats['cb_no_correction'] += 1 + stats["cb_no_correction"] += 1 return None - - # Track correction statistics + if corrected_cb == raw_cb: - stats['cb_perfect_match'] += 1 + stats["cb_perfect_match"] += 1 else: - stats['cb_corrected'] += 1 + stats["cb_corrected"] += 1 - barcode_length = cb_length + umi_length + prefix_len = cb_length + umi_length + tso_length - # Build unaligned BAM records with corrected CB and raw UMI r1_bam = pysam.AlignedSegment() r1_bam.query_name = r1_name r1_bam.query_sequence = remaining_r1_seq - # Biopython stores qualities as list[int]; slice past CB+UMI - r1_bam.query_qualities = r1_record.letter_annotations["phred_quality"][barcode_length:] - r1_bam.flag = 77 # 0x004D: paired, first in pair, unmapped, mate unmapped + # IMPORTANT: drop the same prefix from qualities as from sequence + r1_bam.query_qualities = r1_record.letter_annotations["phred_quality"][prefix_len:] + r1_bam.flag = 77 # paired, first in pair, unmapped, mate unmapped r1_bam.reference_id = -1 r1_bam.reference_start = -1 r1_bam.mapping_quality = 0 r1_bam.set_tag("CB", corrected_cb) - r1_bam.set_tag("UB", umi) # Use raw UMI + r1_bam.set_tag("UB", umi) r2_bam = pysam.AlignedSegment() 
     r2_bam.query_name = r2_name
     r2_bam.query_sequence = str(r2_record.seq)
     r2_bam.query_qualities = r2_record.letter_annotations["phred_quality"]
-    r2_bam.flag = 141  # 0x008D: paired, second in pair, unmapped, mate unmapped
+    r2_bam.flag = 141  # paired, second in pair, unmapped, mate unmapped
     r2_bam.reference_id = -1
     r2_bam.reference_start = -1
     r2_bam.mapping_quality = 0
     r2_bam.set_tag("CB", corrected_cb)
-    r2_bam.set_tag("UB", umi)  # Use raw UMI
+    r2_bam.set_tag("UB", umi)
 
     return r1_bam, r2_bam
 
 
-def fastq_to_bam_with_barcodes(r1_fastq, r2_fastq, cb_whitelist_file, output_bam, num_cores=1, cb_length=16, umi_length=12):
+def fastq_to_bam_with_barcodes(
+    r1_fastq,
+    r2_fastq,
+    cb_whitelist_file,
+    output_bam,
+    num_cores=1,
+    cb_length=16,
+    umi_length=None,
+    infer_umi=True,
+    tso_search_string="TTTCTTATATGGG",
+    infer_prefix_pairs=2000,
+    min_records_with_tso=10,
+):
     """
     Convert paired FASTQ files to unaligned BAM with CB/UB tags using multiple threads.
-    Performs 10x-style cell barcode correction using a whitelist.
+    - Corrects CB using whitelist-based Hamming distance 0/1 correction with CB qualities.
+    - Optionally infers UMI length by locating a TSO motif in R1 and removes the TSO from R1.
+      (If infer_umi=False, the provided umi_length is used as-is and no TSO bases are
+       removed from R1.)
 
     Args:
-        r1_fastq: path to R1 FASTQ(.gz)
-        r2_fastq: path to R2 FASTQ(.gz)
-        cb_whitelist_file: path to cell barcode whitelist file (one CB per line, .gz or plain text)
-        output_bam: path to output BAM
+        r1_fastq, r2_fastq: FASTQ(.gz) paths
+        cb_whitelist_file: whitelist file
+        output_bam: output BAM path
         num_cores: threads
-        cb_length: length of cell barcode (default 16)
-        umi_length: length of UMI (default 12)
+        cb_length: cell barcode length
+        umi_length: required if infer_umi=False
+        infer_umi: infer umi_length and remove the TSO motif from R1
+        tso_search_string: TSO motif used for inference; tso_length = len(tso_search_string)
+        infer_prefix_pairs: number of pairs to buffer for inference
+        min_records_with_tso: minimum reads containing motif to accept inference
     """
     print("Loading cell barcode whitelist...")
     whitelist, hamming_index = load_cb_whitelist(cb_whitelist_file)
-
-    # Initialize correction cache (shared across threads, but thread-safe via GIL for dict operations)
+
     correction_cache = {}
-
     stats = defaultdict(int)
 
-    r1_open_func = gzip.open if r1_fastq.endswith('.gz') else open
-    r2_open_func = gzip.open if r2_fastq.endswith('.gz') else open
-    r1_mode = 'rt' if r1_fastq.endswith('.gz') else 'r'
-    r2_mode = 'rt' if r2_fastq.endswith('.gz') else 'r'
+    r1_open_func = gzip.open if r1_fastq.endswith(".gz") else open
+    r2_open_func = gzip.open if r2_fastq.endswith(".gz") else open
+    r1_mode = "rt" if r1_fastq.endswith(".gz") else "r"
+    r2_mode = "rt" if r2_fastq.endswith(".gz") else "r"
 
     header = {
-        'HD': {'VN': '1.6', 'SO': 'queryname'},
-        'PG': [{'ID': 'nimble-fastq-to-bam', 'PN': 'nimble', 'VN': '1.2', 'CL': 'whitelist-based CB correction'}]
+        "HD": {"VN": "1.6", "SO": "queryname"},
+        "PG": [
+            {
+                "ID": "nimble-fastq-to-bam",
+                "PN": "nimble",
+                "VN": "1.2",
+                "CL": "whitelist-based CB correction (+ optional UMI inference + TSO removal)",
+            }
+        ],
     }
 
     print(f"Processing paired FASTQ files with {num_cores} threads...")
@@ -275,34 +304,65 @@
     r1_iter = SeqIO.parse(r1_handle, "fastq")
     r2_iter = SeqIO.parse(r2_handle, "fastq")
 
-    umi_length = infer_umi_length(r1_iter)
+    # Buffer a small matched prefix for inference so we don't desync iterators.
+    prefix_pairs = list(islice(zip(r1_iter, r2_iter), infer_prefix_pairs))
+    if not prefix_pairs:
+        raise ValueError("Input FASTQs are empty or could not be read")
+
+    if infer_umi:
+        prefix_r1 = [p[0] for p in prefix_pairs]
+        inferred_umi_length, tso_length = infer_umi_and_tso_lengths_from_r1_records(
+            prefix_r1,
+            search_string=tso_search_string,
+            cb_length=cb_length,
+            min_records=min_records_with_tso,
+        )
+        umi_length_use = inferred_umi_length
+    else:
+        if umi_length is None:
+            raise ValueError("umi_length must be provided when infer_umi=False")
+        umi_length_use = int(umi_length)
+        tso_length = 0
+
+    # Reconstruct full streams
+    r1_full = chain((p[0] for p in prefix_pairs), r1_iter)
+    r2_full = chain((p[1] for p in prefix_pairs), r2_iter)
 
     with ThreadPoolExecutor(max_workers=num_cores) as executor:
         futures = {}
 
-        for idx, (r1_record, r2_record) in enumerate(zip(r1_iter, r2_iter), start=1):
-            stats['total_pairs'] += 1
-            fut = executor.submit(process_pair, r1_record, r2_record,
-                                  whitelist, hamming_index, correction_cache, stats,
-                                  cb_length, umi_length)
+        for idx, (r1_record, r2_record) in enumerate(zip(r1_full, r2_full), start=1):
+            stats["total_pairs"] += 1
+            fut = executor.submit(
+                process_pair,
+                r1_record,
+                r2_record,
+                whitelist,
+                hamming_index,
+                correction_cache,
+                stats,
+                cb_length,
+                umi_length_use,
+                tso_length,
+            )
             futures[fut] = True
 
             # Throttle in-flight futures to limit memory use
             if len(futures) >= num_cores * 100:
                 done_any = 0
-                for done in as_completed(list(futures)[:num_cores * 10]):
+                for done in as_completed(list(futures)[: num_cores * 10]):
                     res = done.result()
                     if res:
                         r1_bam, r2_bam = res
                         bam_out.write(r1_bam)
                         bam_out.write(r2_bam)
-                        stats['written_pairs'] += 1
+                        stats["written_pairs"] += 1
                     del futures[done]
                     done_any += 1
                     if done_any >= num_cores * 10:
                         break
 
-            if stats['total_pairs'] % 1_000_000 == 0:
+            if stats["total_pairs"] % 1_000_000 == 0:
                 print(f"Processed {stats['total_pairs']} read pairs...")
 
         # Drain remaining futures
@@ -312,40 +372,43 @@
             res = done.result()
             if res:
                 r1_bam, r2_bam = res
                 bam_out.write(r1_bam)
                 bam_out.write(r2_bam)
-                stats['written_pairs'] += 1
+                stats["written_pairs"] += 1
             del futures[done]
 
     except Exception as e:
         print(f"Error during processing: {e}", file=sys.stderr)
         sys.exit(1)
 
-    # Print summary with correction statistics
     print("\n=== Processing Statistics ===")
     print(f"Total read pairs: {stats.get('total_pairs', 0)}")
     print(f"Written pairs: {stats.get('written_pairs', 0)}")
-    print(f"\nCell Barcode Correction:")
+
+    print("\nCell Barcode Correction:")
     print(f" Perfect matches: {stats.get('cb_perfect_match', 0)}")
     print(f" Corrected (1-edit): {stats.get('cb_corrected', 0)}")
     print(f" No valid correction: {stats.get('cb_no_correction', 0)}")
-
-    total_cb_processed = stats.get('cb_perfect_match', 0) + stats.get('cb_corrected', 0) + stats.get('cb_no_correction', 0)
+
+    total_cb_processed = (
+        stats.get("cb_perfect_match", 0)
+        + stats.get("cb_corrected", 0)
+        + stats.get("cb_no_correction", 0)
+    )
+
     if total_cb_processed > 0:
-        perfect_pct = 100.0 * stats.get('cb_perfect_match', 0) / total_cb_processed
-        corrected_pct = 100.0 * stats.get('cb_corrected', 0) / total_cb_processed
-        dropped_pct = 100.0 * stats.get('cb_no_correction', 0) / total_cb_processed
+        perfect_pct = 100.0 * stats.get("cb_perfect_match", 0) / total_cb_processed
+        corrected_pct = 100.0 * stats.get("cb_corrected", 0) / total_cb_processed
+        dropped_pct = 100.0 * stats.get("cb_no_correction", 0) / total_cb_processed
         print(f" Correction rate: {perfect_pct:.2f}% perfect, {corrected_pct:.2f}% corrected, {dropped_pct:.2f}% dropped")
 
-        if (dropped_pct == 100.0):
-            raise ValueError("There were no passing cellbarcodes. This likely indicates an error with the inclusionlist")
-        else:
-            raise ValueError("No cellbarcodes were processed. There is likely a problem with the input files")
+        if dropped_pct == 100.0:
+            raise ValueError("There were no passing cell barcodes. This likely indicates an error with the whitelist.")
+    else:
+        raise ValueError("No cell barcodes were processed. There is likely a problem with the input files.")
 
-    print(f"\nOther filters:")
+    print("\nOther filters:")
     print(f" Name mismatch: {stats.get('name_mismatch', 0)}")
     print(f" Too short: {stats.get('too_short', 0)}")
     print(f" No remaining sequence: {stats.get('no_remaining_seq', 0)}")
-
-    # Print unique cache size
+
     print(f"\nCorrection cache size: {len(correction_cache)} unique raw CBs")
-
-    print(f"\nOutput BAM written to: {output_bam}")
+    print(f"\nOutput BAM written to: {output_bam}")
\ No newline at end of file
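[Editor's note — usage sketch, not part of the patch series]
With patch 2 applied, fastq-to-bam runs in two modes. A hedged sketch of both, invoked here as
python -m nimble (the package exposes nimble/__main__.py; the file names are placeholders, and an
installed console script may exist instead):

    # Default mode: infer the UMI length from R1 and strip the TSO motif
    python -m nimble fastq-to-bam \
        --r1-fastq sample_R1.fastq.gz \
        --r2-fastq sample_R2.fastq.gz \
        --map cb_whitelist.txt.gz \
        --output sample.unaligned.bam \
        -c 4

    # Explicit mode: no inference, no TSO stripping; the UMI length must be given
    python -m nimble fastq-to-bam \
        --r1-fastq sample_R1.fastq.gz \
        --r2-fastq sample_R2.fastq.gz \
        --map cb_whitelist.txt.gz \
        --output sample.unaligned.bam \
        --no-infer-umi --umi-length 12

Inference buffers the first --infer-prefix-pairs read pairs (default 2000), infers the UMI length
from the buffered R1 records, then re-chains the buffer onto the live iterators so no pairs are lost.
[End note]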
From c511cb63bc7b9ffaf24c480571de302be75534e6 Mon Sep 17 00:00:00 2001
From: Sebastian Benjamin
Date: Fri, 16 Jan 2026 13:49:10 -0800
Subject: [PATCH 3/4] Update actions/checkout and actions/setup-python

---
 .github/workflows/ci.yml | 16 ++++++++++------
 1 file changed, 10 insertions(+), 6 deletions(-)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 5cbb8cb..fe7e26d 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -1,18 +1,22 @@
 name: Run Tests
 on: [push, pull_request]
-jobs:
+
+jobs:
   test:
     runs-on: ubuntu-latest
     steps:
-      - uses: actions/checkout@v2
-      - name: Install Python 3
-        uses: actions/setup-python@v1
+      - uses: actions/checkout@v4
+
+      - name: Install Python
+        uses: actions/setup-python@v4
         with:
-          python-version: 3.9
+          python-version: "3.9"
+
       - name: Install dependencies
         run: |
           python -m pip install --upgrade pip
-          pip install --upgrade pip setuptools==57.5.0
+          pip install setuptools==57.5.0
           pip install -r requirements.txt
+
       - name: Run tests with unittest
         run: python -m unittest test/test.py

From 9a2aaaae822e70b64446b4fcbf3ba7451e37eb52 Mon Sep 17 00:00:00 2001
From: Sebastian Benjamin
Date: Fri, 16 Jan 2026 15:20:07 -0800
Subject: [PATCH 4/4] Add the skip_tso_trimming flag to nimble align

---
 nimble/__main__.py | 29 +++++++++++++++++++++++++----
 1 file changed, 25 insertions(+), 4 deletions(-)

diff --git a/nimble/__main__.py b/nimble/__main__.py
index 2b95302..5e5bf30 100755
--- a/nimble/__main__.py
+++ b/nimble/__main__.py
@@ -31,6 +31,7 @@
 ALIGN_TRIES = 10
 ALIGN_TRIES_THRESHOLD = 0
+DOWNLOAD_THRESHOLD = 3
 
 def validate_gzip(file_path):
     try:
@@ -150,7 +151,7 @@ def download(release):
 
 
 # Check if the aligner exists -- if it does, call it with the given parameters.
-def align(reference, output, input, num_cores, strand_filter, trim, tmpdir):
+def align(reference, output, input, num_cores, strand_filter, trim, tmpdir, skip_tso_trimming):
     path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "aligner")
 
     if not os.path.exists(path):
@@ -159,11 +160,11 @@
         global ALIGN_TRIES
         ALIGN_TRIES = ALIGN_TRIES + 1
 
-        if ALIGN_TRIES >= ALIGN_THRESHOLD:
+        if ALIGN_TRIES >= DOWNLOAD_THRESHOLD:
             print("Error -- could not find or download aligner.")
             sys.exit()
 
-        return align(reference, output, input, num_cores, strand_filter, trim, tmpdir)
+        return align(reference, output, input, num_cores, strand_filter, trim, tmpdir, skip_tso_trimming)
 
     print("Aligning input data to the reference libraries")
     sys.stdout.flush()
@@ -190,6 +191,9 @@
     if trim != "":
         processed_param_list.extend(["-t", trim])
+
+    if skip_tso_trimming:
+        processed_param_list.extend(["--skip_tso_trimming"])
 
     print(processed_param_list)
     proc = subprocess.Popen([path] + processed_param_list)
@@ -399,6 +403,12 @@
     align_parser.add_argument('--strand_filter', help='Filter reads based on strand information.', type=str, default="unstranded")
     align_parser.add_argument('--trim', help='Configuration for trimming read-data, in the format :, comma-separated, one entry for each passed library', type=str, default="")
     align_parser.add_argument('--tmpdir', help='Path to a temporary directory for sorting .bam files', type=str, default=None)
+    align_parser.add_argument(
+        '--skip_tso_trimming',
+        help='Skip trimming 13bp off the R1 read for the TSO',
+        action='store_true',
+        default=False
+    )
 
     report_parser = subparsers.add_parser('report')
     report_parser.add_argument('-i', '--input', help='The input file.', type=str, required=True)
@@ -476,7 +486,18 @@
     elif args.subcommand == 'generate':
         generate(args.file, args.opt_file, args.output_path)
     elif args.subcommand == 'align':
-        sys.exit(align(args.reference, args.output, args.input, args.num_cores, args.strand_filter, args.trim, args.tmpdir))
+        sys.exit(
+            align(
+                args.reference,
+                args.output,
+                args.input,
+                args.num_cores,
+                args.strand_filter,
+                args.trim,
+                args.tmpdir,
+                args.skip_tso_trimming
+            )
+        )
     elif args.subcommand == 'report':
         summarize_columns_list = args.summarize.split(',') if args.summarize else None
         report(args.input, args.output, summarize_columns_list, args.threshold, args.disable_thresholding)
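[Editor's note — illustrative sketch, not part of the patch series]
The whitelist correction that patches 1-2 rely on works because build_hamming_index precomputes
every 1-edit variant of each whitelisted barcode, turning correction into a dictionary lookup
instead of a scan of the whole whitelist. A self-contained toy version (toy_index is a hypothetical
name; it mirrors build_hamming_index / correct_cell_barcode in nimble/fastq_barcode_processor.py):

    from collections import defaultdict

    def toy_index(whitelist):
        """Map every 1-edit variant of each whitelisted CB to the CBs it could come from."""
        index = defaultdict(set)
        for cb in whitelist:
            for i in range(len(cb)):
                for base in "ACGTN":
                    if base != cb[i]:
                        index[cb[:i] + base + cb[i + 1:]].add(cb)
        return index

    index = toy_index({"AAAA", "AAAT"})
    print(index["AAAC"])  # {'AAAA', 'AAAT'}: ambiguous; the patch breaks the tie by
                          # picking the candidate whose mismatched base has the lowest quality
    print(index["TAAA"])  # {'AAAA'}: a single unambiguous 1-edit correction

The memory cost is len(whitelist) * cb_length * 4 index entries, which is the trade the patch makes
for O(1) candidate lookup per read.
[End note]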