From 928b48f38a7b622e27a9d0d94415c21c56f04240 Mon Sep 17 00:00:00 2001
From: bbimber
Date: Wed, 14 Jan 2026 22:12:21 -0800
Subject: [PATCH 1/4] Add check for TSO and infer UMI length

---
 nimble/fastq_barcode_processor.py | 41 +++++++++++++++++++++++++++----
 1 file changed, 36 insertions(+), 5 deletions(-)

diff --git a/nimble/fastq_barcode_processor.py b/nimble/fastq_barcode_processor.py
index 5412133..40b9dfb 100644
--- a/nimble/fastq_barcode_processor.py
+++ b/nimble/fastq_barcode_processor.py
@@ -14,6 +14,30 @@ def hamming_distance(s1, s2):
     return sum(c1 != c2 for c1, c2 in zip(s1, s2))
 
 
+def infer_umi_length(r1_iter, search_string = "TTTCTTATATGGG", max_records = 100, min_records = 10, cb_length = 16):
+    tso_starts = []
+    for idx, (r1_record) in enumerate(zip(r1_iter), start=1):
+        pos = r1_record.find(search_string)
+        if (pos > -1):
+            tso_starts.append(pos)
+            if (len(tso_starts) >= min_records):
+                break
+
+        if (idx > max_records):
+            break
+
+    if (len(tso_starts) < min_records):
+        raise ValueError("Unable to infer UMI length. Insufficent R1 sequences contained " + search_string)
+
+    tso_starts = set(tso_starts)
+    if (len(tso_starts) > 1):
+        raise ValueError("Unable to infer UMI length. TSO sequence found at inconsistent positions of R1: " + ",".join(tso_starts))
+
+    umi_length = tso_starts[1] - cb_length
+    print("Inferred UMI length of: " + umi_length)
+
+    return(umi_length)
+
 def build_hamming_index(whitelist):
     """
     Build an index mapping each valid CB to all possible 1-edit variants.
@@ -128,7 +152,7 @@ def correct_cell_barcode(raw_cb, quality_scores, whitelist, hamming_index, corre
     return best_candidate
 
 
-def parse_10x_barcode_from_r1(sequence, cb_length=16, umi_length=12):
+def parse_10x_barcode_from_r1(sequence, cb_length=16, umi_length=10, tso_length=13):
     """
     Parse cell barcode and UMI from 10x R1 read sequence.
     Returns (cell_barcode, umi, remaining_sequence)
@@ -137,11 +161,11 @@
         return None, None, ""
     cell_barcode = sequence[:cb_length]
     umi = sequence[cb_length:cb_length + umi_length]
-    remaining_sequence = sequence[cb_length + umi_length:]
+    remaining_sequence = sequence[cb_length + umi_length + tso_length:]
     return cell_barcode, umi, remaining_sequence
 
 
-def process_pair(r1_record, r2_record, whitelist, hamming_index, correction_cache, stats, cb_length=16, umi_length=12):
+def process_pair(r1_record, r2_record, whitelist, hamming_index, correction_cache, stats, cb_length=16, umi_length=10, tso_length = 13):
     """
     Process a single FASTQ pair and return (r1_bam, r2_bam), or None if skipped.
     Correction logic:
@@ -156,7 +180,7 @@
         return None
 
     r1_seq = str(r1_record.seq)
-    raw_cb, umi, remaining_r1_seq = parse_10x_barcode_from_r1(r1_seq, cb_length, umi_length)
+    raw_cb, umi, remaining_r1_seq = parse_10x_barcode_from_r1(r1_seq, cb_length, umi_length, tso_length)
     if raw_cb is None or umi is None:
         stats['too_short'] += 1
         return None
@@ -251,6 +275,8 @@
     r1_iter = SeqIO.parse(r1_handle, "fastq")
     r2_iter = SeqIO.parse(r2_handle, "fastq")
 
+    umi_length = infer_umi_length(r1_iter)
+
     with ThreadPoolExecutor(max_workers=num_cores) as executor:
         futures = {}
 
@@ -308,7 +334,12 @@
         corrected_pct = 100.0 * stats.get('cb_corrected', 0) / total_cb_processed
         dropped_pct = 100.0 * stats.get('cb_no_correction', 0) / total_cb_processed
         print(f" Correction rate: {perfect_pct:.2f}% perfect, {corrected_pct:.2f}% corrected, {dropped_pct:.2f}% dropped")
-
+
+        if (dropped_pct == 100.0):
+            raise ValueError("There were no passing cellbarcodes. This likely indicates an error with the inclusionlist")
+        else:
+            raise ValueError("No cellbarcodes were processed. There is likely a problem with the input files")
+
     print(f"\nOther filters:")
     print(f" Name mismatch: {stats.get('name_mismatch', 0)}")
     print(f" Too short: {stats.get('too_short', 0)}")
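[Editor's note — illustrative sketch, not part of the patch series]
Patch 1 introduces the idea the rest of the series builds on: R1 is assumed to be laid out as
[CB][UMI][TSO...][cDNA...] (as patch 2's docstring later states), so locating the fixed TSO motif
(default TTTCTTATATGGG) in a sample of R1 reads yields the UMI length as (motif start - CB length).
The patch-1 implementation above has rough edges that patch 2 below rewrites; as a reference, the
core technique over plain R1 sequence strings looks roughly like this (function and variable names
here are hypothetical, not taken from the patch):

    from collections import Counter

    def sketch_infer_umi_length(r1_sequences, tso="TTTCTTATATGGG", cb_length=16, min_hits=10):
        """UMI length = bases between the end of the CB and the start of the TSO motif."""
        starts = Counter()  # motif start position -> observation count
        for seq in r1_sequences:
            pos = seq.find(tso)
            if pos != -1:
                starts[pos] += 1
            if sum(starts.values()) >= min_hits:
                break
        if sum(starts.values()) < min_hits:
            raise ValueError(f"fewer than {min_hits} reads contained the TSO motif")
        if len(starts) != 1:
            raise ValueError(f"TSO motif found at inconsistent positions: {sorted(starts)}")
        (tso_start,) = starts  # the single agreed-upon motif start
        return tso_start - cb_length

    # Ten synthetic reads: 16bp CB, 10bp UMI, TSO motif, then cDNA.
    reads = ["A" * 16 + "C" * 10 + "TTTCTTATATGGG" + "G" * 40 for _ in range(10)]
    print(sketch_infer_umi_length(reads))  # -> 10

Requiring the motif at a single consistent position is what makes the inference safe: chimeric or
adapter-contaminated reads that contain the motif elsewhere cause a hard failure rather than a
silently wrong UMI length.
[End note]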
From dd9c4c6db4c3da9343bc8e92acc1e97c6c8fd273 Mon Sep 17 00:00:00 2001
From: Sebastian Benjamin
Date: Fri, 16 Jan 2026 10:33:17 -0800
Subject: [PATCH 2/4] UMI length detection and TSO stripping + CLI args

---
 nimble/__main__.py                |  67 +++++-
 nimble/fastq_barcode_processor.py | 363 ++++++++++++++++++------------
 2 files changed, 271 insertions(+), 159 deletions(-)

diff --git a/nimble/__main__.py b/nimble/__main__.py
index 119ba15..2b95302 100755
--- a/nimble/__main__.py
+++ b/nimble/__main__.py
@@ -424,11 +424,50 @@ def sort_input_bam(bam, cores, tmp_dir):
     fastq_to_bam_parser = subparsers.add_parser('fastq-to-bam')
     fastq_to_bam_parser.add_argument('--r1-fastq', help='Path to R1 FASTQ file.', type=str, required=True)
     fastq_to_bam_parser.add_argument('--r2-fastq', help='Path to R2 FASTQ file.', type=str, required=True)
-    fastq_to_bam_parser.add_argument("--map", required=True, help="Cell barcode whitelist file (one CB per line, .gz or plain text)")
+    fastq_to_bam_parser.add_argument(
+        "--map",
+        required=True,
+        help="Cell barcode whitelist file (one CB per line, .gz or plain text)"
+    )
     fastq_to_bam_parser.add_argument('--output', help='Path for output BAM file.', type=str, required=True)
     fastq_to_bam_parser.add_argument('-c', '--num_cores', help='The number of cores to use for processing.', type=int, default=1)
     fastq_to_bam_parser.add_argument('--cb-length', help='Length of cell barcode (default: 16).', type=int, default=16)
-    fastq_to_bam_parser.add_argument('--umi-length', help='Length of UMI (default: 12).', type=int, default=12)
+    fastq_to_bam_parser.add_argument(
+        '--infer-umi',
+        help='Infer UMI length from R1 by locating the TSO motif and strip the motif from R1 (default: on).',
+        action='store_true',
+        default=True
+    )
+    fastq_to_bam_parser.add_argument(
+        '--no-infer-umi',
+        help='Disable UMI inference. Requires --umi-length. Does not strip TSO from R1.',
+        action='store_true',
+        default=False
+    )
+    fastq_to_bam_parser.add_argument(
+        '--umi-length',
+        help='UMI length (required if --no-infer-umi is set). Ignored when inference is enabled.',
+        type=int,
+        default=None
+    )
+    fastq_to_bam_parser.add_argument(
+        '--tso-search-string',
+        help='TSO motif to locate in R1 for UMI inference (default: TTTCTTATATGGG).',
+        type=str,
+        default="TTTCTTATATGGG"
+    )
+    fastq_to_bam_parser.add_argument(
+        '--infer-prefix-pairs',
+        help='Number of read pairs to buffer for UMI inference (default: 2000).',
+        type=int,
+        default=2000
+    )
+    fastq_to_bam_parser.add_argument(
+        '--min-records-with-tso',
+        help='Minimum number of R1 reads containing the TSO motif required to accept inference (default: 10).',
+        type=int,
+        default=10
+    )
 
     args = parser.parse_args()
 
@@ -442,14 +481,24 @@ def sort_input_bam(bam, cores, tmp_dir):
         summarize_columns_list = args.summarize.split(',') if args.summarize else None
         report(args.input, args.output, summarize_columns_list, args.threshold, args.disable_thresholding)
     elif args.subcommand == 'fastq-to-bam':
+        infer_umi = (not args.no_infer_umi) and args.infer_umi
+
+        if not infer_umi and args.umi_length is None:
+            print("Error: --umi-length is required when --no-infer-umi is set.", file=sys.stderr)
+            sys.exit(2)
+
         fastq_to_bam_with_barcodes(
-            args.r1_fastq,
-            args.r2_fastq,
-            args.map,
-            args.output,
-            args.num_cores,
-            args.cb_length,
-            args.umi_length
+            r1_fastq=args.r1_fastq,
+            r2_fastq=args.r2_fastq,
+            cb_whitelist_file=args.map,
+            output_bam=args.output,
+            num_cores=args.num_cores,
+            cb_length=args.cb_length,
+            umi_length=args.umi_length,
+            infer_umi=infer_umi,
+            tso_search_string=args.tso_search_string,
+            infer_prefix_pairs=args.infer_prefix_pairs,
+            min_records_with_tso=args.min_records_with_tso,
         )
     elif args.subcommand == 'plot':
         if os.path.getsize(args.input_file) > 0:

diff --git a/nimble/fastq_barcode_processor.py b/nimble/fastq_barcode_processor.py
index 40b9dfb..a711b38 100644
--- a/nimble/fastq_barcode_processor.py
+++ b/nimble/fastq_barcode_processor.py
@@ -3,6 +3,7 @@
 import sys
 from collections import defaultdict
 from concurrent.futures import ThreadPoolExecutor, as_completed
+from itertools import islice, chain
 import pysam
 from Bio import SeqIO
 
@@ -10,69 +11,37 @@
 def hamming_distance(s1, s2):
     """Calculate Hamming distance between two strings of equal length."""
     if len(s1) != len(s2):
-        return float('inf')
+        return float("inf")
     return sum(c1 != c2 for c1, c2 in zip(s1, s2))
 
 
-def infer_umi_length(r1_iter, search_string = "TTTCTTATATGGG", max_records = 100, min_records = 10, cb_length = 16):
-    tso_starts = []
-    for idx, (r1_record) in enumerate(zip(r1_iter), start=1):
-        pos = r1_record.find(search_string)
-        if (pos > -1):
-            tso_starts.append(pos)
-            if (len(tso_starts) >= min_records):
-                break
-
-        if (idx > max_records):
-            break
-
-    if (len(tso_starts) < min_records):
-        raise ValueError("Unable to infer UMI length. Insufficent R1 sequences contained " + search_string)
-
-    tso_starts = set(tso_starts)
-    if (len(tso_starts) > 1):
-        raise ValueError("Unable to infer UMI length. TSO sequence found at inconsistent positions of R1: " + ",".join(tso_starts))
-
-    umi_length = tso_starts[1] - cb_length
-    print("Inferred UMI length of: " + umi_length)
-
-    return(umi_length)
-
 def build_hamming_index(whitelist):
     """
     Build an index mapping each valid CB to all possible 1-edit variants.
-    This allows efficient lookup of correction candidates.
-
-    Returns:
-        dict[str variant] = set[str valid_cb]
+    Returns: dict[str variant] = set[str valid_cb]
     """
-    bases = ['A', 'C', 'G', 'T', 'N']
+    bases = ["A", "C", "G", "T", "N"]
     hamming_index = defaultdict(set)
-
+
     for valid_cb in whitelist:
-        # For each position, generate all possible single-base substitutions
         for i in range(len(valid_cb)):
             for base in bases:
                 if base != valid_cb[i]:
-                    variant = valid_cb[:i] + base + valid_cb[i+1:]
+                    variant = valid_cb[:i] + base + valid_cb[i + 1 :]
                     hamming_index[variant].add(valid_cb)
-
+
     return hamming_index
 
 
 def load_cb_whitelist(whitelist_path):
     """
-    Load a cell barcode whitelist (one CB per line).
-    Build a Hamming distance = 1 index for efficient correction.
-
-    Returns:
-        whitelist: set of valid cell barcodes
-        hamming_index: dict mapping variants to valid CBs
+    Load a cell barcode whitelist (one CB per line) and build a Hamming distance = 1 index.
+    Returns: (whitelist_set, hamming_index)
     """
     whitelist = set()
-
-    open_func = gzip.open if whitelist_path.endswith('.gz') else open
-    mode = 'rt' if whitelist_path.endswith('.gz') else 'r'
+
+    open_func = gzip.open if whitelist_path.endswith(".gz") else open
+    mode = "rt" if whitelist_path.endswith(".gz") else "r"
 
     try:
         with open_func(whitelist_path, mode) as f:
@@ -86,60 +55,43 @@
     print(f"Loaded whitelist from {whitelist_path}")
     print(f" Valid cell barcodes: {len(whitelist)}")
-
+
     print("Building Hamming distance index...")
     hamming_index = build_hamming_index(whitelist)
     print(f" Indexed {len(hamming_index)} variants")
-
+
     return whitelist, hamming_index
 
 
 def correct_cell_barcode(raw_cb, quality_scores, whitelist, hamming_index, correction_cache):
     """
     Correct a raw cell barcode using 10x-style correction:
-    1. Check for perfect match
-    2. If not, find all candidates with Hamming distance = 1
-    3. Among candidates, select the one where the differing base has the lowest quality score
-
-    Args:
-        raw_cb: Raw cell barcode string
-        quality_scores: List of quality scores for the CB region
-        whitelist: Set of valid cell barcodes
-        hamming_index: Dict mapping variants to valid CBs
-        correction_cache: Dict for caching corrections
-
-    Returns:
-        Corrected CB string, or None if no valid correction found
+    1) perfect match
+    2) candidates at Hamming distance = 1 (via index)
+    3) if multiple, choose the candidate whose differing position has the lowest Q
+    Returns corrected CB or None.
""" - # Check cache first if raw_cb in correction_cache: return correction_cache[raw_cb] - - # Check for perfect match + if raw_cb in whitelist: correction_cache[raw_cb] = raw_cb return raw_cb - - # Find candidates with Hamming distance = 1 + candidates = hamming_index.get(raw_cb, set()) - if not candidates: correction_cache[raw_cb] = None return None - + if len(candidates) == 1: - # Only one candidate, use it corrected = next(iter(candidates)) correction_cache[raw_cb] = corrected return corrected - - # Multiple candidates: select based on quality scores - # Find the differing position and pick candidate with lowest quality at that position + best_candidate = None - lowest_quality = float('inf') - + lowest_quality = float("inf") + for candidate in candidates: - # Find the position where they differ for i, (raw_base, cand_base) in enumerate(zip(raw_cb, candidate)): if raw_base != cand_base: qual = quality_scores[i] @@ -147,122 +99,199 @@ def correct_cell_barcode(raw_cb, quality_scores, whitelist, hamming_index, corre lowest_quality = qual best_candidate = candidate break - + correction_cache[raw_cb] = best_candidate return best_candidate -def parse_10x_barcode_from_r1(sequence, cb_length=16, umi_length=10, tso_length=13): +def infer_umi_and_tso_lengths_from_r1_records( + r1_records, + search_string="TTTCTTATATGGG", + cb_length=16, + min_records=10, +): + """ + Infer UMI length by locating the TSO motif in R1 reads. + Assumes structure: [CB][UMI][TSO...][cDNA...] + umi_length = tso_start - cb_length + tso_length = len(search_string) (the removed prefix length) + """ + tso_starts = [] + for rec in r1_records: + seq = str(rec.seq) + pos = seq.find(search_string) + if pos != -1: + tso_starts.append(pos) + if len(tso_starts) >= min_records: + break + + if len(tso_starts) < min_records: + raise ValueError( + f"Unable to infer UMI length: fewer than {min_records} reads contained TSO motif {search_string}" + ) + + unique = set(tso_starts) + if len(unique) != 1: + raise ValueError( + f"Unable to infer UMI length: TSO motif found at inconsistent positions in R1: {sorted(unique)}" + ) + + tso_start = next(iter(unique)) + umi_length = tso_start - cb_length + if umi_length <= 0: + raise ValueError( + f"Inferred UMI length <= 0 (tso_start={tso_start}, cb_length={cb_length}). Check motif/structure." + ) + + tso_length = len(search_string) + + print(f"Inferred UMI length: {umi_length}") + print(f"Using TSO length: {tso_length} (len of search_string)") + return umi_length, tso_length + + +def parse_10x_barcode_from_r1(sequence, cb_length=16, umi_length=12, tso_length=0): """ - Parse cell barcode and UMI from 10x R1 read sequence. - Returns (cell_barcode, umi, remaining_sequence) + Parse CB and UMI from R1 sequence, then drop optional TSO. + Returns (cell_barcode, umi, remaining_sequence). 
""" - if len(sequence) < cb_length + umi_length: + prefix_len = cb_length + umi_length + tso_length + if len(sequence) < prefix_len: return None, None, "" cell_barcode = sequence[:cb_length] - umi = sequence[cb_length:cb_length + umi_length] - remaining_sequence = sequence[cb_length + umi_length + tso_length:] + umi = sequence[cb_length : cb_length + umi_length] + remaining_sequence = sequence[prefix_len:] return cell_barcode, umi, remaining_sequence -def process_pair(r1_record, r2_record, whitelist, hamming_index, correction_cache, stats, cb_length=16, umi_length=10, tso_length = 13): +def process_pair( + r1_record, + r2_record, + whitelist, + hamming_index, + correction_cache, + stats, + cb_length=16, + umi_length=12, + tso_length=0, +): """ - Process a single FASTQ pair and return (r1_bam, r2_bam), or None if skipped. - Correction logic: - - Correct CB using 10x-style Hamming distance = 1 correction with quality scores - - Use raw UMI (no correction) + Process a single FASTQ pair; return (r1_bam, r2_bam) or None if skipped. + - Correct CB (Hamming distance 0/1) using CB qualities. + - Use raw UMI. + - Drop (CB+UMI+TSO) from R1 sequence and qualities consistently. """ - # Normalize names (handle /1 and /2 suffixes if present) - r1_name = r1_record.id.removesuffix('/1') - r2_name = r2_record.id.removesuffix('/2') + r1_name = r1_record.id.removesuffix("/1") + r2_name = r2_record.id.removesuffix("/2") if r1_name != r2_name: - stats['name_mismatch'] += 1 + stats["name_mismatch"] += 1 return None r1_seq = str(r1_record.seq) - raw_cb, umi, remaining_r1_seq = parse_10x_barcode_from_r1(r1_seq, cb_length, umi_length, tso_length) + raw_cb, umi, remaining_r1_seq = parse_10x_barcode_from_r1( + r1_seq, cb_length=cb_length, umi_length=umi_length, tso_length=tso_length + ) if raw_cb is None or umi is None: - stats['too_short'] += 1 + stats["too_short"] += 1 return None if len(remaining_r1_seq) == 0: - stats['no_remaining_seq'] += 1 + stats["no_remaining_seq"] += 1 return None - # Get quality scores for the CB region + # CB qualities: first cb_length bases cb_quality_scores = r1_record.letter_annotations["phred_quality"][:cb_length] - - # Correct cell barcode using 10x-style correction + corrected_cb = correct_cell_barcode(raw_cb, cb_quality_scores, whitelist, hamming_index, correction_cache) - if corrected_cb is None: - stats['cb_no_correction'] += 1 + stats["cb_no_correction"] += 1 return None - - # Track correction statistics + if corrected_cb == raw_cb: - stats['cb_perfect_match'] += 1 + stats["cb_perfect_match"] += 1 else: - stats['cb_corrected'] += 1 + stats["cb_corrected"] += 1 - barcode_length = cb_length + umi_length + prefix_len = cb_length + umi_length + tso_length - # Build unaligned BAM records with corrected CB and raw UMI r1_bam = pysam.AlignedSegment() r1_bam.query_name = r1_name r1_bam.query_sequence = remaining_r1_seq - # Biopython stores qualities as list[int]; slice past CB+UMI - r1_bam.query_qualities = r1_record.letter_annotations["phred_quality"][barcode_length:] - r1_bam.flag = 77 # 0x004D: paired, first in pair, unmapped, mate unmapped + # IMPORTANT: drop the same prefix from qualities as from sequence + r1_bam.query_qualities = r1_record.letter_annotations["phred_quality"][prefix_len:] + r1_bam.flag = 77 # paired, first in pair, unmapped, mate unmapped r1_bam.reference_id = -1 r1_bam.reference_start = -1 r1_bam.mapping_quality = 0 r1_bam.set_tag("CB", corrected_cb) - r1_bam.set_tag("UB", umi) # Use raw UMI + r1_bam.set_tag("UB", umi) r2_bam = pysam.AlignedSegment() 
     r2_bam.query_name = r2_name
     r2_bam.query_sequence = str(r2_record.seq)
     r2_bam.query_qualities = r2_record.letter_annotations["phred_quality"]
-    r2_bam.flag = 141  # 0x008D: paired, second in pair, unmapped, mate unmapped
+    r2_bam.flag = 141  # paired, second in pair, unmapped, mate unmapped
     r2_bam.reference_id = -1
     r2_bam.reference_start = -1
     r2_bam.mapping_quality = 0
     r2_bam.set_tag("CB", corrected_cb)
-    r2_bam.set_tag("UB", umi)  # Use raw UMI
+    r2_bam.set_tag("UB", umi)
 
     return r1_bam, r2_bam
 
 
-def fastq_to_bam_with_barcodes(r1_fastq, r2_fastq, cb_whitelist_file, output_bam, num_cores=1, cb_length=16, umi_length=12):
+def fastq_to_bam_with_barcodes(
+    r1_fastq,
+    r2_fastq,
+    cb_whitelist_file,
+    output_bam,
+    num_cores=1,
+    cb_length=16,
+    umi_length=None,
+    infer_umi=True,
+    tso_search_string="TTTCTTATATGGG",
+    infer_prefix_pairs=2000,
+    min_records_with_tso=10,
+):
     """
     Convert paired FASTQ files to unaligned BAM with CB/UB tags using multiple threads.
-    Performs 10x-style cell barcode correction using a whitelist.
+    - Corrects CB using whitelist-based Hamming distance 0/1 correction with CB qualities.
+    - Optionally infers UMI length by locating a TSO motif in R1 and removes the TSO from R1.
+      (If infer_umi=False, the provided umi_length is used as-is and no TSO bases are
+       removed from R1.)
 
     Args:
-        r1_fastq: path to R1 FASTQ(.gz)
-        r2_fastq: path to R2 FASTQ(.gz)
-        cb_whitelist_file: path to cell barcode whitelist file (one CB per line, .gz or plain text)
-        output_bam: path to output BAM
+        r1_fastq, r2_fastq: FASTQ(.gz) paths
+        cb_whitelist_file: whitelist file
+        output_bam: output BAM path
         num_cores: threads
-        cb_length: length of cell barcode (default 16)
-        umi_length: length of UMI (default 12)
+        cb_length: cell barcode length
+        umi_length: required if infer_umi=False
+        infer_umi: infer umi_length and remove the TSO motif from R1
+        tso_search_string: TSO motif used for inference; tso_length = len(tso_search_string)
+        infer_prefix_pairs: number of pairs to buffer for inference
+        min_records_with_tso: minimum reads containing motif to accept inference
     """
     print("Loading cell barcode whitelist...")
     whitelist, hamming_index = load_cb_whitelist(cb_whitelist_file)
-
-    # Initialize correction cache (shared across threads, but thread-safe via GIL for dict operations)
+
     correction_cache = {}
-
     stats = defaultdict(int)
 
-    r1_open_func = gzip.open if r1_fastq.endswith('.gz') else open
-    r2_open_func = gzip.open if r2_fastq.endswith('.gz') else open
-    r1_mode = 'rt' if r1_fastq.endswith('.gz') else 'r'
-    r2_mode = 'rt' if r2_fastq.endswith('.gz') else 'r'
+    r1_open_func = gzip.open if r1_fastq.endswith(".gz") else open
+    r2_open_func = gzip.open if r2_fastq.endswith(".gz") else open
+    r1_mode = "rt" if r1_fastq.endswith(".gz") else "r"
+    r2_mode = "rt" if r2_fastq.endswith(".gz") else "r"
 
     header = {
-        'HD': {'VN': '1.6', 'SO': 'queryname'},
-        'PG': [{'ID': 'nimble-fastq-to-bam', 'PN': 'nimble', 'VN': '1.2', 'CL': 'whitelist-based CB correction'}]
+        "HD": {"VN": "1.6", "SO": "queryname"},
+        "PG": [
+            {
+                "ID": "nimble-fastq-to-bam",
+                "PN": "nimble",
+                "VN": "1.2",
+                "CL": "whitelist-based CB correction (+ optional UMI inference + TSO removal)",
+            }
+        ],
     }
 
     print(f"Processing paired FASTQ files with {num_cores} threads...")
@@ -275,34 +304,65 @@
     r1_iter = SeqIO.parse(r1_handle, "fastq")
     r2_iter = SeqIO.parse(r2_handle, "fastq")
 
-    umi_length = infer_umi_length(r1_iter)
+    # Buffer a small matched prefix for inference so we don't desync iterators.
+    prefix_pairs = list(islice(zip(r1_iter, r2_iter), infer_prefix_pairs))
+    if not prefix_pairs:
+        raise ValueError("Input FASTQs are empty or could not be read")
+
+    if infer_umi:
+        prefix_r1 = [p[0] for p in prefix_pairs]
+        inferred_umi_length, tso_length = infer_umi_and_tso_lengths_from_r1_records(
+            prefix_r1,
+            search_string=tso_search_string,
+            cb_length=cb_length,
+            min_records=min_records_with_tso,
+        )
+        umi_length_use = inferred_umi_length
+    else:
+        if umi_length is None:
+            raise ValueError("umi_length must be provided when infer_umi=False")
+        umi_length_use = int(umi_length)
+        tso_length = 0
+
+    # Reconstruct full streams
+    r1_full = chain((p[0] for p in prefix_pairs), r1_iter)
+    r2_full = chain((p[1] for p in prefix_pairs), r2_iter)
 
     with ThreadPoolExecutor(max_workers=num_cores) as executor:
         futures = {}
 
-        for idx, (r1_record, r2_record) in enumerate(zip(r1_iter, r2_iter), start=1):
-            stats['total_pairs'] += 1
-            fut = executor.submit(process_pair, r1_record, r2_record,
-                                  whitelist, hamming_index, correction_cache, stats,
-                                  cb_length, umi_length)
+        for idx, (r1_record, r2_record) in enumerate(zip(r1_full, r2_full), start=1):
+            stats["total_pairs"] += 1
+            fut = executor.submit(
+                process_pair,
+                r1_record,
+                r2_record,
+                whitelist,
+                hamming_index,
+                correction_cache,
+                stats,
+                cb_length,
+                umi_length_use,
+                tso_length,
+            )
             futures[fut] = True
 
             # Throttle in-flight futures to limit memory use
             if len(futures) >= num_cores * 100:
                 done_any = 0
-                for done in as_completed(list(futures)[:num_cores * 10]):
+                for done in as_completed(list(futures)[: num_cores * 10]):
                     res = done.result()
                     if res:
                         r1_bam, r2_bam = res
                         bam_out.write(r1_bam)
                         bam_out.write(r2_bam)
-                        stats['written_pairs'] += 1
+                        stats["written_pairs"] += 1
                     del futures[done]
                     done_any += 1
                     if done_any >= num_cores * 10:
                         break
 
-            if stats['total_pairs'] % 1_000_000 == 0:
+            if stats["total_pairs"] % 1_000_000 == 0:
                 print(f"Processed {stats['total_pairs']} read pairs...")
 
         # Drain remaining futures
@@ -312,40 +372,43 @@
             res = done.result()
             if res:
                 r1_bam, r2_bam = res
                 bam_out.write(r1_bam)
                 bam_out.write(r2_bam)
-                stats['written_pairs'] += 1
+                stats["written_pairs"] += 1
             del futures[done]
 
     except Exception as e:
         print(f"Error during processing: {e}", file=sys.stderr)
         sys.exit(1)
 
-    # Print summary with correction statistics
     print("\n=== Processing Statistics ===")
     print(f"Total read pairs: {stats.get('total_pairs', 0)}")
     print(f"Written pairs: {stats.get('written_pairs', 0)}")
-    print(f"\nCell Barcode Correction:")
+
+    print("\nCell Barcode Correction:")
     print(f" Perfect matches: {stats.get('cb_perfect_match', 0)}")
     print(f" Corrected (1-edit): {stats.get('cb_corrected', 0)}")
     print(f" No valid correction: {stats.get('cb_no_correction', 0)}")
-
-    total_cb_processed = stats.get('cb_perfect_match', 0) + stats.get('cb_corrected', 0) + stats.get('cb_no_correction', 0)
+
+    total_cb_processed = (
+        stats.get("cb_perfect_match", 0)
+        + stats.get("cb_corrected", 0)
+        + stats.get("cb_no_correction", 0)
+    )
+
     if total_cb_processed > 0:
-        perfect_pct = 100.0 * stats.get('cb_perfect_match', 0) / total_cb_processed
-        corrected_pct = 100.0 * stats.get('cb_corrected', 0) / total_cb_processed
-        dropped_pct = 100.0 * stats.get('cb_no_correction', 0) / total_cb_processed
+        perfect_pct = 100.0 * stats.get("cb_perfect_match", 0) / total_cb_processed
+        corrected_pct = 100.0 * stats.get("cb_corrected", 0) / total_cb_processed
+        dropped_pct = 100.0 * stats.get("cb_no_correction", 0) / total_cb_processed
         print(f" Correction rate: {perfect_pct:.2f}% perfect, {corrected_pct:.2f}% corrected, {dropped_pct:.2f}% dropped")
 
-        if (dropped_pct == 100.0):
-            raise ValueError("There were no passing cellbarcodes. This likely indicates an error with the inclusionlist")
-        else:
-            raise ValueError("No cellbarcodes were processed. There is likely a problem with the input files")
+        if dropped_pct == 100.0:
+            raise ValueError("There were no passing cell barcodes. This likely indicates an error with the whitelist.")
+    else:
+        raise ValueError("No cell barcodes were processed. There is likely a problem with the input files.")
 
-    print(f"\nOther filters:")
+    print("\nOther filters:")
     print(f" Name mismatch: {stats.get('name_mismatch', 0)}")
     print(f" Too short: {stats.get('too_short', 0)}")
     print(f" No remaining sequence: {stats.get('no_remaining_seq', 0)}")
-
-    # Print unique cache size
+
     print(f"\nCorrection cache size: {len(correction_cache)} unique raw CBs")
-
-    print(f"\nOutput BAM written to: {output_bam}")
+    print(f"\nOutput BAM written to: {output_bam}")
\ No newline at end of file
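[Editor's note — usage sketch, not part of the patch series]
With patch 2 applied, fastq-to-bam runs in two modes. A hedged sketch of both, invoked here as
python -m nimble (the package exposes nimble/__main__.py; the file names are placeholders, and an
installed console script may exist instead):

    # Default mode: infer the UMI length from R1 and strip the TSO motif
    python -m nimble fastq-to-bam \
        --r1-fastq sample_R1.fastq.gz \
        --r2-fastq sample_R2.fastq.gz \
        --map cb_whitelist.txt.gz \
        --output sample.unaligned.bam \
        -c 4

    # Explicit mode: no inference, no TSO stripping; the UMI length must be given
    python -m nimble fastq-to-bam \
        --r1-fastq sample_R1.fastq.gz \
        --r2-fastq sample_R2.fastq.gz \
        --map cb_whitelist.txt.gz \
        --output sample.unaligned.bam \
        --no-infer-umi --umi-length 12

Inference buffers the first --infer-prefix-pairs read pairs (default 2000), infers the UMI length
from the buffered R1 records, then re-chains the buffer onto the live iterators so no pairs are lost.
[End note]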
From c511cb63bc7b9ffaf24c480571de302be75534e6 Mon Sep 17 00:00:00 2001
From: Sebastian Benjamin
Date: Fri, 16 Jan 2026 13:49:10 -0800
Subject: [PATCH 3/4] Update actions/checkout and actions/setup-python

---
 .github/workflows/ci.yml | 16 ++++++++++------
 1 file changed, 10 insertions(+), 6 deletions(-)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 5cbb8cb..fe7e26d 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -1,18 +1,22 @@
 name: Run Tests
 on: [push, pull_request]
-jobs:
+
+jobs:
   test:
     runs-on: ubuntu-latest
     steps:
-      - uses: actions/checkout@v2
-      - name: Install Python 3
-        uses: actions/setup-python@v1
+      - uses: actions/checkout@v4
+
+      - name: Install Python
+        uses: actions/setup-python@v4
         with:
-          python-version: 3.9
+          python-version: "3.9"
+
       - name: Install dependencies
         run: |
           python -m pip install --upgrade pip
-          pip install --upgrade pip setuptools==57.5.0
+          pip install setuptools==57.5.0
           pip install -r requirements.txt
+
       - name: Run tests with unittest
         run: python -m unittest test/test.py

From 9a2aaaae822e70b64446b4fcbf3ba7451e37eb52 Mon Sep 17 00:00:00 2001
From: Sebastian Benjamin
Date: Fri, 16 Jan 2026 15:20:07 -0800
Subject: [PATCH 4/4] Add the skip_tso_trimming flag to nimble align

---
 nimble/__main__.py | 29 +++++++++++++++++++++++++----
 1 file changed, 25 insertions(+), 4 deletions(-)

diff --git a/nimble/__main__.py b/nimble/__main__.py
index 2b95302..5e5bf30 100755
--- a/nimble/__main__.py
+++ b/nimble/__main__.py
@@ -31,6 +31,7 @@
 ALIGN_TRIES = 10
 ALIGN_TRIES_THRESHOLD = 0
+DOWNLOAD_THRESHOLD = 3
 
 def validate_gzip(file_path):
     try:
@@ -150,7 +151,7 @@ def download(release):
 
 
 # Check if the aligner exists -- if it does, call it with the given parameters.
-def align(reference, output, input, num_cores, strand_filter, trim, tmpdir):
+def align(reference, output, input, num_cores, strand_filter, trim, tmpdir, skip_tso_trimming):
     path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "aligner")
 
     if not os.path.exists(path):
@@ -159,11 +160,11 @@
         global ALIGN_TRIES
         ALIGN_TRIES = ALIGN_TRIES + 1
 
-        if ALIGN_TRIES >= ALIGN_THRESHOLD:
+        if ALIGN_TRIES >= DOWNLOAD_THRESHOLD:
             print("Error -- could not find or download aligner.")
             sys.exit()
 
-        return align(reference, output, input, num_cores, strand_filter, trim, tmpdir)
+        return align(reference, output, input, num_cores, strand_filter, trim, tmpdir, skip_tso_trimming)
 
     print("Aligning input data to the reference libraries")
     sys.stdout.flush()
@@ -190,6 +191,9 @@
     if trim != "":
         processed_param_list.extend(["-t", trim])
+
+    if skip_tso_trimming:
+        processed_param_list.extend(["--skip_tso_trimming"])
 
     print(processed_param_list)
     proc = subprocess.Popen([path] + processed_param_list)
@@ -399,6 +403,12 @@
     align_parser.add_argument('--strand_filter', help='Filter reads based on strand information.', type=str, default="unstranded")
     align_parser.add_argument('--trim', help='Configuration for trimming read-data, in the format :, comma-separated, one entry for each passed library', type=str, default="")
     align_parser.add_argument('--tmpdir', help='Path to a temporary directory for sorting .bam files', type=str, default=None)
+    align_parser.add_argument(
+        '--skip_tso_trimming',
+        help='Skip trimming 13bp off the R1 read for the TSO',
+        action='store_true',
+        default=False
+    )
 
     report_parser = subparsers.add_parser('report')
     report_parser.add_argument('-i', '--input', help='The input file.', type=str, required=True)
@@ -476,7 +486,18 @@
     elif args.subcommand == 'generate':
         generate(args.file, args.opt_file, args.output_path)
     elif args.subcommand == 'align':
-        sys.exit(align(args.reference, args.output, args.input, args.num_cores, args.strand_filter, args.trim, args.tmpdir))
+        sys.exit(
+            align(
+                args.reference,
+                args.output,
+                args.input,
+                args.num_cores,
+                args.strand_filter,
+                args.trim,
+                args.tmpdir,
+                args.skip_tso_trimming
+            )
+        )
     elif args.subcommand == 'report':
         summarize_columns_list = args.summarize.split(',') if args.summarize else None
         report(args.input, args.output, summarize_columns_list, args.threshold, args.disable_thresholding)
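[Editor's note — illustrative sketch, not part of the patch series]
The whitelist correction that patches 1-2 rely on works because build_hamming_index precomputes
every 1-edit variant of each whitelisted barcode, turning correction into a dictionary lookup
instead of a scan of the whole whitelist. A self-contained toy version (toy_index is a hypothetical
name; it mirrors build_hamming_index / correct_cell_barcode in nimble/fastq_barcode_processor.py):

    from collections import defaultdict

    def toy_index(whitelist):
        """Map every 1-edit variant of each whitelisted CB to the CBs it could come from."""
        index = defaultdict(set)
        for cb in whitelist:
            for i in range(len(cb)):
                for base in "ACGTN":
                    if base != cb[i]:
                        index[cb[:i] + base + cb[i + 1:]].add(cb)
        return index

    index = toy_index({"AAAA", "AAAT"})
    print(index["AAAC"])  # {'AAAA', 'AAAT'}: ambiguous; the patch breaks the tie by
                          # picking the candidate whose mismatched base has the lowest quality
    print(index["TAAA"])  # {'AAAA'}: a single unambiguous 1-edit correction

The memory cost is len(whitelist) * cb_length * 4 index entries, which is the trade the patch makes
for O(1) candidate lookup per read.
[End note]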