From b6702422e9a3fb63110bd6ac5c379c550effcdff Mon Sep 17 00:00:00 2001 From: Yasset Perez-Riverol Date: Mon, 11 May 2026 09:48:01 +0100 Subject: [PATCH 01/23] docs: rename --var_prefix to --protein_prefix in CLI examples The click commands accept --protein_prefix; --var_prefix no longer exists. Example commands in docs/use-cases.md and docs/pgatk-cli.md were stale and failed when copy-pasted. Closes #100 --- docs/pgatk-cli.md | 10 +++++----- docs/use-cases.md | 18 +++++++++--------- 2 files changed, 14 insertions(+), 14 deletions(-) diff --git a/docs/pgatk-cli.md b/docs/pgatk-cli.md index f36971a..8bf7878 100644 --- a/docs/pgatk-cli.md +++ b/docs/pgatk-cli.md @@ -318,7 +318,7 @@ Usage: pgatk vcf-to-proteindb [OPTIONS] Options: --translation_table INTEGER Translation table (Default 1) --mito_translation_table INTEGER Mito_trans_table (default 2) - --var_prefix TEXT String to add as prefix for the variant peptides + --protein_prefix TEXT String to add as prefix for the variant peptides --report_ref_seq Also report the reference peptide from overlapping transcripts --annotation_field_name TEXT Annotation field name in INFO column (default: CSQ) --af_field TEXT Field name for variant allele frequency (default: none) @@ -467,7 +467,7 @@ Usage: pgatk dnaseq-to-proteindb [OPTIONS] --biotype_str TEXT String used to identify gene/transcript biotype (default: transcript_biotype) --expression_str TEXT String for extracting expression value (default: None) --expression_thresh FLOAT Threshold for expression value filtering (default: 5) - --var_prefix TEXT Prefix to be added to fasta headers (default: none) + --protein_prefix TEXT Prefix to be added to fasta headers (default: none) -h, --help Show this message and exit. ``` @@ -489,7 +489,7 @@ Usage: pgatk dnaseq-to-proteindb [OPTIONS] --config_file config/ensembl_config.yaml \ --input_fasta transcript_sequences.fa \ --output_proteindb proteindb_from_lincRNA_canonical_sequences.fa \ - --var_prefix lincRNA_ \ + --protein_prefix lincRNA_ \ --include_biotypes lincRNA ``` @@ -500,7 +500,7 @@ Usage: pgatk dnaseq-to-proteindb [OPTIONS] --config_file config/ensembl_config.yaml \ --input_fasta transcript_sequences.fa \ --output_proteindb proteindb_from_processed_pseudogene.fa \ - --var_prefix pseudogene_ \ + --protein_prefix pseudogene_ \ --include_biotypes processed_pseudogene,transcribed_processed_pseudogene,translated_processed_pseudogene \ --skip_including_all_cds ``` @@ -512,7 +512,7 @@ Usage: pgatk dnaseq-to-proteindb [OPTIONS] --config_file config/ensembl_config.yaml \ --input_fasta transcript_sequences.fa \ --output_proteindb proteindb_from_altORFs.fa \ - --var_prefix altorf_ \ + --protein_prefix altorf_ \ --include_biotypes altORFs \ --skip_including_all_cds ``` diff --git a/docs/use-cases.md b/docs/use-cases.md index bd3cf20..0f151e9 100644 --- a/docs/use-cases.md +++ b/docs/use-cases.md @@ -57,7 +57,7 @@ detectable peptides. 
Translate them in three reading frames: pgatk dnaseq-to-proteindb \ --input_fasta ensembl_human/transcripts.fa \ --output_proteindb pseudogene.fa \ - --var_prefix pseudo_ \ + --protein_prefix pseudo_ \ --include_biotypes processed_pseudogene,unprocessed_pseudogene,transcribed_processed_pseudogene,transcribed_unprocessed_pseudogene,translated_processed_pseudogene \ --num_orfs 3 \ --skip_including_all_cds @@ -72,7 +72,7 @@ micropeptides: pgatk dnaseq-to-proteindb \ --input_fasta ensembl_human/transcripts.fa \ --output_proteindb lncrna.fa \ - --var_prefix lncrna_ \ + --protein_prefix lncrna_ \ --include_biotypes lincRNA,antisense,sense_intronic,sense_overlapping \ --num_orfs 3 \ --skip_including_all_cds @@ -87,7 +87,7 @@ peptides: pgatk dnaseq-to-proteindb \ --input_fasta ensembl_human/transcripts.fa \ --output_proteindb altorf.fa \ - --var_prefix altorf_ \ + --protein_prefix altorf_ \ --include_biotypes altORFs \ --skip_including_all_cds ``` @@ -546,7 +546,7 @@ them in three reading frames: pgatk dnaseq-to-proteindb \ --input_fasta transcripts.fa \ --output_proteindb lincRNA_proteins.fa \ - --var_prefix lincRNA_ \ + --protein_prefix lincRNA_ \ --include_biotypes lincRNA \ --num_orfs 3 \ --skip_including_all_cds @@ -560,7 +560,7 @@ Some pseudogenes are transcribed and may produce functional peptides: pgatk dnaseq-to-proteindb \ --input_fasta transcripts.fa \ --output_proteindb pseudogene_proteins.fa \ - --var_prefix pseudogene_ \ + --protein_prefix pseudogene_ \ --include_biotypes processed_pseudogene,transcribed_processed_pseudogene,translated_processed_pseudogene \ --num_orfs 3 \ --skip_including_all_cds @@ -575,7 +575,7 @@ discover upstream ORFs (uORFs), overlapping ORFs, and downstream ORFs: pgatk dnaseq-to-proteindb \ --input_fasta transcripts.fa \ --output_proteindb altorf_proteins.fa \ - --var_prefix altorf_ \ + --protein_prefix altorf_ \ --include_biotypes altORFs \ --skip_including_all_cds ``` @@ -586,7 +586,7 @@ pgatk dnaseq-to-proteindb \ pgatk dnaseq-to-proteindb \ --input_fasta transcripts.fa \ --output_proteindb antisense_proteins.fa \ - --var_prefix antisense_ \ + --protein_prefix antisense_ \ --include_biotypes antisense,antisense_RNA \ --num_orfs 3 \ --skip_including_all_cds @@ -979,7 +979,7 @@ pgatk cosmic-to-proteindb \ pgatk dnaseq-to-proteindb \ --input_fasta ensembl_data/transcripts.fa \ --output_proteindb lincRNA.fa \ - --var_prefix lincRNA_ \ + --protein_prefix lincRNA_ \ --include_biotypes lincRNA \ --num_orfs 3 \ --skip_including_all_cds @@ -988,7 +988,7 @@ pgatk dnaseq-to-proteindb \ pgatk dnaseq-to-proteindb \ --input_fasta ensembl_data/transcripts.fa \ --output_proteindb pseudogene.fa \ - --var_prefix pseudogene_ \ + --protein_prefix pseudogene_ \ --include_biotypes processed_pseudogene,transcribed_processed_pseudogene \ --num_orfs 3 \ --skip_including_all_cds From bb4edb66c166bd763570c1b728da9064d112d84d Mon Sep 17 00:00:00 2001 From: Yasset Perez-Riverol Date: Mon, 11 May 2026 11:31:01 +0100 Subject: [PATCH 02/23] Stop tracking docs/plans/ (internal working docs) Implementation plans and design scratch notes are kept locally as working artifacts only; they don't belong in the published repo. Files remain on disk and are now gitignored. 
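
For reference, the equivalent local operation is `git rm -r --cached
docs/plans/`, which removes the directory from the index while leaving
the working-tree files untouched; the commit itself just deletes the
tracked copies and adds the ignore rule.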
--- .gitignore | 3 + .../2026-03-01-pgatk-graph-engine-design.md | 270 ------------------ .../2026-03-03-protein-accession-design.md | 96 ------- 3 files changed, 3 insertions(+), 366 deletions(-) delete mode 100644 docs/plans/2026-03-01-pgatk-graph-engine-design.md delete mode 100644 docs/plans/2026-03-03-protein-accession-design.md diff --git a/.gitignore b/.gitignore index 3e8faad..c9aa01f 100644 --- a/.gitignore +++ b/.gitignore @@ -135,3 +135,6 @@ pgatk/testdata/Meleagris_gallopavo* .DS_Store .codacy/ .cursor/ + +# Internal working docs (implementation plans, scratch notes) +docs/plans/ diff --git a/docs/plans/2026-03-01-pgatk-graph-engine-design.md b/docs/plans/2026-03-01-pgatk-graph-engine-design.md deleted file mode 100644 index 9048e99..0000000 --- a/docs/plans/2026-03-01-pgatk-graph-engine-design.md +++ /dev/null @@ -1,270 +0,0 @@ -# pgatk Evolution: Graph-Based Transcript Modeling & Feature Parity - -**Date:** 2026-03-01 -**Status:** Approved -**Authors:** Yasset Perez-Riverol, Claude (AI assistant) - ---- - -## 1. Executive Summary - -This document describes the research and implementation plan to evolve pgatk from a linear, one-variant-at-a-time proteogenomics database generator into a graph-based transcript modeling engine capable of multi-variant co-occurrence, gene fusions, RNA editing, and circular RNA support. The plan includes a prerequisite infrastructure cleanup phase and a phased feature rollout with performance optimization. - ---- - -## 2. Current State Assessment - -### What pgatk does well - -- Only Python tool with native one-command downloaders for ENSEMBL, COSMIC, and cBioPortal -- Integrated decoy generation (4 methods: protein-reverse, protein-shuffle, DecoyPyrat, pgdbdeep) -- Spectrum-level validation via SpectrumAI (unique among database generators) -- Aho-Corasick exact matching + sliding-window mismatch search (BlastGetPosition) -- Published benchmark: 43,501 non-canonical peptides across 64 human cell lines (Umer et al. 2022) -- Pangenome proteogenomics demonstrated (Wang et al. 2024 preprint): 4,991 novel peptides from HPRC - -### Critical gaps vs. the field (2024-2026) - -| Gap | Impact | Solved by | -|---|---|---| -| No multi-variant co-occurrence | Transcript with N SNPs produces 1 sequence instead of up to 2^N | moPepGen (DAG), vcf2prot (phased VCF) | -| No gene fusions | Major cancer event type missed entirely | moPepGen | -| No RNA editing | A-to-I editing creates novel peptides invisible to DNA-only | moPepGen | -| No circular RNA | Emerging class of translated non-coding RNA | moPepGen | -| No ClinVar/NCBI support | Open issue #24 since 2019 | Manual workaround only | -| pandas iterrows in VCF processing | ~100x slower than vectorized operations | Internal bottleneck | -| No transcript feature caching | Redundant gffutils DB queries per variant | Internal bottleneck | - -### Competitive landscape - -**moPepGen** (Nature Biotechnology, June 2025): Graph-based (DAG per transcript), handles all event types, ~4x more non-canonical peptides than prior methods. The algorithmic benchmark to match. - -**vcf2prot** (ElAbd et al. 2022, bioRxiv): Rust-based Sequence Intermediate Representation (SIR) approach. ~1000x faster than PrecisionProDB. Processes 99,254 variants across 8,192 patients in ~11 minutes. Demonstrates the performance ceiling achievable with systems-level optimization. Limited to phased VCFs with BCFtools/csq annotation. - -**PG2** (J. Proteome Research 2023): Snakemake pipeline integrating genome + transcriptome. 
Handles splicing and fusions but requires heavy infrastructure. - -**pXg** (MCP 2024): RNA-Seq + proteomics + de novo sequencing for immunopeptidomics. - -**NeoDisc** (Nature Biotechnology 2024): End-to-end clinical neoantigen pipeline. - -**PepCentric** (bioRxiv 2025): Repository-scale validation against 2.3B spectra. Complementary to database generators. - -### Key code quality findings - -- `get_altseq()` (ensembl.py): 60-line heart of variant processing, handles SNP/ins/del but only one variant at a time -- `vcf_to_proteindb()` (ensembl.py): ~260 lines, main bottleneck is `vcf_reader.iterrows()` + per-row gffutils DB lookups -- No `pathlib`, no `dataclasses`, inconsistent type hints -- `db/` standalone scripts use `sys.argv` at import time (not importable as modules) -- Variable shadowing bug in `protein_database_decoy.py` (`decoy_sequence` variable shadows the imported `pyteomics.fasta.decoy_sequence` function) -- Broad `except Exception` in several places suppresses unexpected errors -- Mixed `print()` and logger usage for error reporting -- Repetitive 3-layer config fallback pattern across all service classes - ---- - -## 3. Architecture: TranscriptGraph Engine - -### Core Concept - -Each transcript is modeled as a directed acyclic graph (DAG): - -- **Nodes** represent sequence segments (reference sequence between variant positions) -- **Edges** represent either the reference path or an alternative (variant) path -- **Traversal** of all root-to-leaf paths generates all combinatorial protein sequences - -``` -Reference: ──[seg1]──[seg2]──[seg3]──[seg4]── - │ │ -Variant A: └─[alt_A]─┘ │ -Variant B: └─[alt_B]─┘ - -Paths: ref, A only, B only, A+B → 4 protein sequences -``` - -### Module Structure - -``` -pgatk/ - graph/ # NEW module - __init__.py - transcript_graph.py # TranscriptGraph class (DAG builder + traverser) - variant_nodes.py # Node types: SNP, Insertion, Deletion, Fusion, RNAEdit, CircRNA - graph_enumerator.py # Path enumeration with combinatorial explosion controls - event_parsers/ # Input parsers for each event type - __init__.py - vcf_parser.py # VCF → graph edges (replaces current get_altseq) - fusion_parser.py # Gene fusion calls → cross-transcript edges - rnaedit_parser.py # RNA editing sites → substitution edges - circrna_parser.py # Back-splice junctions → circular edges - clinvar_parser.py # ClinVar VCF → graph edges (issue #24) - filters/ - __init__.py - allele_frequency.py # AF-based pruning - consequence.py # VEP consequence filtering - expression.py # Expression-level filtering - max_variants.py # Cap combinatorial explosion -``` - -### Key Classes - -```python -@dataclass -class VariantNode: - """Base class for all genomic events.""" - position: int # CDS position - ref_allele: str - alt_allele: str - event_type: EventType # SNP, INS, DEL, FUSION, RNA_EDIT, CIRC_RNA - metadata: dict # AF, consequence, source, etc. - - -class TranscriptGraph: - """DAG representing a transcript with all its variants.""" - - def __init__(self, transcript_id: str, reference_seq: str, strand: str) -> None: ... - def add_variant(self, variant: VariantNode) -> None: ... - def add_fusion(self, partner_graph: 'TranscriptGraph', breakpoints: tuple) -> None: ... - def enumerate_paths(self, max_paths: int = 1000) -> Iterator[ProteinSequence]: ... - def to_fasta(self, output: TextIO) -> int: ... # returns count written -``` - -### Combinatorial Explosion Controls - -With N variants, naive enumeration produces 2^N sequences. Controls: - -1. 
**Max variants per transcript** (default: 10) — skip transcripts exceeding this -2. **Max paths per transcript** (default: 1000) — stop enumeration after limit -3. **Allele frequency pruning** — only include variants above a threshold -4. **Phasing support** — if phased VCF, only enumerate haplotype-consistent paths -5. **Consequence filtering** — only include protein-altering consequences - -### Integration Strategy - -The graph engine sits alongside existing modules (no replacement): - -- **Downloaders** remain unchanged -- **New CLI commands** (`graph-vcf-to-proteindb`, etc.) use the graph engine -- **Existing commands** continue working for backward compatibility -- **Shared infrastructure** (`toolbox/`, `config/`) is reused - ---- - -## 4. Phased Implementation Plan - -### Phase 0 — Infrastructure & Code Quality - -**Goal:** Fix existing issues so the graph engine is built on solid ground. - -**Deliverables:** - -1. Fix variable shadowing bug in `protein_database_decoy.py` -2. Convert `db/digest_mutant_protein.py` and `db/map_peptide2genome.py` from `sys.argv`/`getopt` import-time execution to proper Click CLI commands -3. Add `conftest.py` with proper fixtures; remove relative path dependencies in tests -4. Replace `print()` error reporting with proper logger usage across all modules -5. Add type hints to all public APIs -6. Use `dataclasses` for data models (`SNP` in `cgenomes/models.py`, new models) -7. Use `pathlib.Path` for file operations (replace string concatenation) -8. Replace broad `except Exception` with specific exception handling -9. Standardize the 3-layer config fallback pattern (DRY up repetitive `get_*_parameters()` methods) -10. Expand unit test coverage for core functions: `get_altseq()`, `get_mut_pro_seq()`, `revswitch()` -11. Replace `pathos` with `concurrent.futures.ProcessPoolExecutor` (stdlib) - ---- - -### Phase 1 — Graph Core + SNP/Indel Support - -**Goal:** Working graph engine for basic variants, producing a superset of current output. - -**Deliverables:** - -1. `TranscriptGraph` class with add/enumerate/serialize -2. `VariantNode` dataclass hierarchy (SNP, Insertion, Deletion) -3. `vcf_parser.py` — reads VCF (VEP-annotated or unannotated) into graph edges -4. `graph_enumerator.py` — path enumeration with explosion controls -5. New CLI command: `graph-vcf-to-proteindb` -6. Transcript feature caching (eliminate redundant gffutils queries) -7. Vectorized VCF grouping (replace `iterrows()`) -8. Unit tests for graph operations + integration tests -9. Benchmark against current `vcf_to_proteindb()` on test VCFs - -**Validation:** Graph engine must produce a superset of sequences from the linear approach. - ---- - -### Phase 2 — Cancer Genomics + ClinVar - -**Goal:** Bring cancer mutation sources into the graph model and add ClinVar (issue #24). - -**Deliverables:** - -1. `clinvar_parser.py` — parse ClinVar VCF into graph edges -2. Refactor `CancerGenomesService.get_mut_pro_seq()` to produce `VariantNode` objects -3. Graph-aware COSMIC processing (multiple mutations on same gene → combinatorial) -4. Graph-aware cBioPortal processing -5. New CLI commands: `clinvar-to-proteindb`, `graph-cosmic-to-proteindb` -6. Tests with real COSMIC data (multiple mutations per transcript) - ---- - -### Phase 3 — Novel Event Types: Fusions, RNA Editing, Circular RNA - -**Goal:** Add the event types that differentiate moPepGen from other tools. - -**Deliverables:** - -1. `fusion_parser.py` — parse gene fusion calls (STAR-Fusion, Arriba, FusionCatcher formats) -2. 
`rnaedit_parser.py` — parse RNA editing databases (REDIportal, DARNED) or user sites -3. `circrna_parser.py` — parse back-splice junction files (CIRCexplorer, CIRI) -4. Extended `TranscriptGraph` for cross-transcript fusions and circular paths -5. New CLI commands: `fusion-to-proteindb`, `rnaedit-to-proteindb`, `circrna-to-proteindb` -6. Combined command: `multi-event-to-proteindb` (merges all event types per transcript) - ---- - -### Phase 4 — Performance Optimization - -**Goal:** Make the graph engine fast enough for genome-scale datasets. - -**Deliverables:** - -1. Profile Phases 1-3 on real-world datasets (gnomAD, TCGA) -2. Identify hot paths via `cProfile` / `py-spy` -3. Optimize graph traversal (topological sort, memoization of shared subpaths) -4. Replace remaining `pathos` usage with `concurrent.futures` -5. Optional: C extension (via `cffi` or `cython`) for innermost graph traversal — only if profiling shows it's the bottleneck -6. Benchmark against moPepGen and vcf2prot on same datasets - ---- - -### Phase Dependencies - -``` -Phase 0 (Infrastructure & Quality) - └── Phase 1 (Graph Core + SNP/Indel) - ├── Phase 2 (Cancer + ClinVar) - │ └── Phase 3 (Fusions, RNA Edit, CircRNA) - │ └── Phase 4 (Performance) - └── (ongoing quality improvements) -``` - ---- - -## 5. References - -### Primary pgatk publication -- Umer HM, Audain E, Zhu Y, Pfeuffer J, Sachsenberg T, Lehtiö J, Branca RM, Perez-Riverol Y. Generation of ENSEMBL-based proteogenomics databases boosts the identification of non-canonical peptides. *Bioinformatics*. 2022;38(5):1470-1472. doi:10.1093/bioinformatics/btab838 - -### Pangenome proteogenomics -- Wang D, Bouwmeester R, Zheng P, Dai C, Sanchez A, Shu K, Bai M, Umer HM, Perez-Riverol Y. Proteogenomics analysis of human tissues using pangenomes. *bioRxiv*. 2024. doi:10.1101/2024.05.24.595489 - -### Competing tools -- moPepGen: Graph-based proteogenomic database generation. *Nature Biotechnology*. June 2025. doi:10.1038/s41587-025-02701-0 -- vcf2prot: ElAbd H, Degenhardt F, Lenz TL, Franke A, Wendorff M. VCF2Prot: An efficient and parallel tool for generating personalized proteomes from VCF files. *bioRxiv*. 2022. doi:10.1101/2022.01.21.477084 -- ProteomeGenerator2: *J. Proteome Research*. 2023. doi:10.1021/acs.jproteome.3c00005 -- pXg: *Molecular & Cellular Proteomics*. 2024. doi:10.1016/j.mcpro.2024.100733 -- NeoDisc: *Nature Biotechnology*. October 2024. doi:10.1038/s41587-024-02420-y -- PepCentric: *bioRxiv*. February 2025. doi:10.1101/2025.02.24.639867 - -### Related ecosystem -- quantms: Dai C et al. *Nature Methods*. 2024. doi:10.1038/s41592-024-02343-1 -- PRIDE 2025: *Nucleic Acids Research*. 2025;53(D1):D543. doi:10.1093/nar/gkae1011 diff --git a/docs/plans/2026-03-03-protein-accession-design.md b/docs/plans/2026-03-03-protein-accession-design.md deleted file mode 100644 index cfe0b8d..0000000 --- a/docs/plans/2026-03-03-protein-accession-design.md +++ /dev/null @@ -1,96 +0,0 @@ -# Protein Accession and FASTA Header Design - -Issue: https://github.com/bigbio/pgatk/issues/18 -Branch: `feature/protein-accession-design` -Date: 2026-03-03 - -## Problem - -Current pgatk FASTA headers are inconsistent across variant sources (VCF, COSMIC, ClinVar) and incompatible with major search engines like SearchGUI, which cannot parse ENSEMBL-style IDs. 
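-
-A hypothetical illustration of the mismatch (header shapes are illustrative, not verbatim pgatk output):
-
-```fasta
-# UniProt-style db|accession|name -- SearchGUI extracts the accession
->sp|P01112|RASH_HUMAN GTPase HRas
-# Bare ENSEMBL-style ID with an ad-hoc variant suffix -- accession extraction fails
->ENST00000311189_p.G13R missense variant from COSMIC
-```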
- -## Design - -### Two protein categories, two prefix strategies - -| Category | Prefix | Accession | Description | -|----------|--------|-----------|-------------| -| Canonical (reference) | Keep original (`sp\|`, `tr\|`, `ensp\|`) | Original accession | Untouched from source database | -| Variant (mutated) | `pgvar\|` | `{TRANSCRIPT_ID}-{INDEX}` | pgatk-generated variant protein | - -### Variant header format - -``` ->pgvar|{TRANSCRIPT_ID}-{INDEX}|{GENE_SYMBOL} {key=value metadata} -``` - -**Fields:** - -- `pgvar` -- database tag identifying pgatk-generated variant proteins. -- `{TRANSCRIPT_ID}-{INDEX}` -- accession composed of parent transcript ID and a dash-separated 1-based index (per transcript, per run). Mirrors UniProt isoform convention (`P12345-2`). -- `{GENE_SYMBOL}` -- gene name, first token after the second pipe. -- Metadata key=value pairs in the description field: - -| Key | Description | Example | -|-----|-------------|---------| -| `VariantSource` | Origin database | `COSMIC`, `ClinVar`, `gnomAD`, `dbSNP` | -| `GenomicCoord` | `chr:pos:ref:alt` | `12:25245347:C:G` | -| `AAChange` | HGVS protein notation | `p.G13R` | -| `MutationType` | SO term or short label | `missense_variant` | -| `dbSNP` | rsID if available | `rs121913529` | -| `ORF` | Reading frame number (only when multi-ORF) | `1`, `2`, `3` | - -### Examples - -```fasta -# Canonical proteins -- untouched from source databases ->sp|P01112|RASH_HUMAN GTPase HRas OS=Homo sapiens ->ensp|ENSP00000309845|BRCA1 - -# Variant proteins -- unified pgvar| prefix regardless of source ->pgvar|ENST00000311189-1|HRAS VariantSource=COSMIC AAChange=p.G13R MutationType=missense_variant GenomicCoord=12:25245347:C:G ->pgvar|ENST00000311189-2|HRAS VariantSource=COSMIC AAChange=p.Q61L MutationType=missense_variant GenomicCoord=12:25245350:A:T ->pgvar|ENST00000357654-1|BRCA1 VariantSource=ClinVar AAChange=p.R1699Q MutationType=missense_variant GenomicCoord=17:43094464:G:A dbSNP=rs41293455 - -# Multiple ORFs -- each ORF gets its own index, ORF number in metadata ->pgvar|ENST00000311189-3|HRAS VariantSource=COSMIC AAChange=p.G13R ORF=1 ->pgvar|ENST00000311189-4|HRAS VariantSource=COSMIC AAChange=p.G13R ORF=2 ->pgvar|ENST00000311189-5|HRAS VariantSource=COSMIC AAChange=p.G13R ORF=3 -``` - -### Indexing logic - -The `-{index}` is per-transcript, per-file generation run: - -- First variant on `ENST00000311189` gets `-1` -- Second variant on same transcript gets `-2` -- Multi-ORF outputs each consume an index (3 ORFs = 3 indices) -- First variant on a different transcript resets to `-1` - -### Search engine compatibility - -| Engine | Compatible | Notes | -|--------|-----------|-------| -| SearchGUI / PeptideShaker | Yes | Matches UniProt-like `db\|acc\|name` pattern | -| MaxQuant | Yes | Default UniProt parse rule works | -| MSFragger / FragPipe | Yes | Reads full header, splits on first whitespace | -| Comet | Yes | Parses `>db\|acc\|` natively | -| DIA-NN | Yes | Follows UniProt-style parsing | -| Proteome Discoverer | Yes | Supports pipe-delimited headers | - -### Files to modify - -| File | Change | -|------|--------| -| `pgatk/ensembl/ensembl.py` | Refactor `vcf_to_proteindb()` header construction (lines 661-664) | -| `pgatk/clinvar/clinvar_service.py` | Refactor header construction (lines 554-560) | -| `pgatk/cgenomes/cgenomes_proteindb.py` | Refactor COSMIC header (line 317), cBioPortal header | -| `pgatk/toolbox/vcf_utils.py` | Update `write_output()` to handle new format cleanly | -| `pgatk/config/` | Add constants for `PGVAR_PREFIX`, 
metadata keys | - -### Design decisions - -1. **Dash separator** (`-`) between transcript and index, consistent with UniProt isoform convention. -2. **No ORF suffix in accession** -- ORF number is metadata (`ORF=N`), each ORF gets its own index. -3. **Canonical proteins are pass-through** -- pgatk does not reformat existing database headers. -4. **Unified format across all sources** -- COSMIC, ClinVar, VCF variants all use `pgvar|` regardless of origin. -5. **Key=value metadata** in description field for structured downstream parsing. From 14ee5e7d4f7895f618078c24197c2e83d87797db Mon Sep 17 00:00:00 2001 From: Yasset Perez-Riverol Date: Mon, 11 May 2026 11:49:56 +0100 Subject: [PATCH 03/23] Switch COSMIC downloader to v103 scripted-download API Sanger retired the legacy /cosmic/file_download/... endpoint. The new flow is a two-step API: GET /api/mono/products/v1/downloads/scripted returns a signed S3 URL (1-hour TTL), which is then fetched without auth. The downloader is rewritten around this contract. - Config: replace the v94 per-file URL/filename pairs with a single `products` list of `path=` values copy-pasteable from the COSMIC scripted-download UI. Defaults cover v103 / GRCh38: genome-screens mutations, targeted-screens mutations, gene FASTA, transcripts, cell-line mutations, cell-line CNA. - Downloader: build_api_url() constructs the new endpoint URL; the two-step download parses JSON, follows the signed URL, and extracts the resulting .tar archive (replaces the old .tar.gz handling). - Downloader: clear error message when the API returns 200 but the body is not the expected JSON-with-url - the failure mode reported in the bug. Distinguish API errors from S3 errors. - Downloader: tqdm progress bar for the S3 download (large files were silent before). - CLI: add -P/--products flag (repeatable) so users can fetch an arbitrary product without editing YAML, e.g. pgatk cosmic-downloader -P grch37/cancer_mutation_census/v100/... - Tests: unit tests for URL construction and base64 auth-token format (no network). End-to-end verified with real Sanger credentials: three of the six default paths (mutations, gene FASTA, cell-line mutations) download and extract cleanly. The three added afterwards (targeted-screens, transcripts, CNA) share the same path pattern as the verified ones and the filenames were taken from the COSMIC UI verbatim. 
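
For reference, the raw two-step exchange looks roughly like this
(credentials, the product path, and the signed URL are illustrative):

    # 1) exchange Basic-auth credentials for a signed, time-limited URL
    curl -s -u 'user@example.com:secret' \
      'https://cancer.sanger.ac.uk/api/mono/products/v1/downloads/scripted?path=grch38/cosmic/v103/Cosmic_Genes_Fasta_v103_GRCh38.tar&bucket=downloads'
    # -> {"url": "https://<signed-s3-url>"}

    # 2) fetch the signed URL with no auth within its 1-hour TTL
    curl -o Cosmic_Genes_Fasta_v103_GRCh38.tar 'https://<signed-s3-url>'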
Closes #98 --- pgatk/cgenomes/cosmic_downloader.py | 228 ++++++++++++++------------ pgatk/commands/cosmic_downloader.py | 11 +- pgatk/config/cosmic_config.yaml | 29 +++- pgatk/tests/test_cosmic_downloader.py | 44 +++++ 4 files changed, 200 insertions(+), 112 deletions(-) create mode 100644 pgatk/tests/test_cosmic_downloader.py diff --git a/pgatk/cgenomes/cosmic_downloader.py b/pgatk/cgenomes/cosmic_downloader.py index 3332014..836955e 100644 --- a/pgatk/cgenomes/cosmic_downloader.py +++ b/pgatk/cgenomes/cosmic_downloader.py @@ -1,7 +1,9 @@ import base64 -import gzip -import requests import os +import tarfile + +import requests +from tqdm import tqdm from pgatk.toolbox.exceptions import AppConfigException from pgatk.toolbox.general import ParameterConfiguration, check_create_folders @@ -12,14 +14,22 @@ class CosmicDownloadService(ParameterConfiguration): CONFIG_OUTPUT_DIRECTORY = 'output_directory' CONFIG_COSMIC_SERVER = 'cosmic_server' CONFIG_COSMIC_FTP_URL = 'cosmic_ftp' + CONFIG_API_ENDPOINT = 'api_endpoint' + CONFIG_BUCKET = 'bucket' CONFIG_COSMIC_FTP_USER = "cosmic_user" CONFIG_COSMIC_FTP_PASSWORD = "cosmic_password" - CONFIG_COSMIC_MUTATIONS_URL = "mutations_url" - CONFIG_COSMIC_MUTATIONS_FILE = "mutations_file" - CONFIG_COSMIC_CELLLINE_MUTATIONS_URL = "mutations_cellline_url" - CONFIG_COSMIC_CELLLINE_MUTATIONS_FILE = "mutations_cellline_file" - CONFIG_COSMIC_CDS_GENES_FILE = "all_cds_genes_file" - CONFIG_COSMIC_CELLLINES_GENES_FILE = "all_celllines_genes_file" + CONFIG_PRODUCTS = "products" + + # Built-in fallback list, used only if `products` is missing from config. + # Real default lives in pgatk/config/cosmic_config.yaml and is the source of truth. + _DEFAULT_PRODUCTS = ( + 'grch38/cosmic/v103/Cosmic_GenomeScreensMutant_Tsv_v103_GRCh38.tar', + 'grch38/cosmic/v103/Cosmic_CompleteTargetedScreensMutant_Tsv_v103_GRCh38.tar', + 'grch38/cosmic/v103/Cosmic_Genes_Fasta_v103_GRCh38.tar', + 'grch38/cosmic/v103/Cosmic_Transcripts_Tsv_v103_GRCh38.tar', + 'grch38/cell_lines/v103/CellLinesProject_GenomeScreensMutant_Tsv_v103_GRCh38.tar', + 'grch38/cell_lines/v103/CellLinesProject_CompleteCNA_Tsv_v103_GRCh38.tar', + ) def __init__(self, config_file, pipeline_arguments): """ @@ -29,30 +39,24 @@ def __init__(self, config_file, pipeline_arguments): """ super(CosmicDownloadService, self).__init__(self.CONFIG_KEY_DATA_DOWNLOADER, config_file, pipeline_arguments) - self._local_path_cosmic = self.get_configuration_default_params(variable=self.CONFIG_OUTPUT_DIRECTORY, - default_value='./database_cosmic/') - self._cosmic_ftp_url = self.get_configuration_default_params(variable=self.CONFIG_COSMIC_FTP_URL, - default_value='https://cancer.sanger.ac.uk') - self._cosmic_user = self.get_configuration_default_params(variable=self.CONFIG_COSMIC_FTP_USER, - default_value='') - self._cosmic_password = self.get_configuration_default_params(variable=self.CONFIG_COSMIC_FTP_PASSWORD, - default_value='') - self._cosmic_mutation_url = self.get_configuration_default_params(variable=self.CONFIG_COSMIC_MUTATIONS_URL, - default_value='cosmic/file_download/GRCh38/cosmic/v94') - self._cosmic_mutations_file = self.get_configuration_default_params(variable=self.CONFIG_COSMIC_MUTATIONS_FILE, - default_value='CosmicMutantExport.tsv.gz') - self._cosmic_cellline_mutation_url = self.get_configuration_default_params( - variable=self.CONFIG_COSMIC_CELLLINE_MUTATIONS_URL, - default_value='cosmic/file_download/GRCh38/cell_lines/v94') - self._cosmic_cellline_mutation_file = self.get_configuration_default_params( - 
variable=self.CONFIG_COSMIC_CELLLINE_MUTATIONS_FILE, default_value='CosmicCLP_MutantExport.tsv.gz') - self._cosmic_cellline_mutation_gene = self.get_configuration_default_params( - variable=self.CONFIG_COSMIC_CELLLINES_GENES_FILE, default_value='All_CellLines_Genes.fasta.gz') - self._cosmic_cdns_file = self.get_configuration_default_params(variable=self.CONFIG_COSMIC_CDS_GENES_FILE, - default_value='All_COSMIC_Genes.fasta.gz') - - self._cosmic_token = base64.b64encode("{}:{}".format(self._cosmic_user, self._cosmic_password) - .encode()).decode('utf-8') + self._local_path_cosmic = self.get_configuration_default_params( + variable=self.CONFIG_OUTPUT_DIRECTORY, default_value='./database_cosmic/') + self._cosmic_ftp_url = self.get_configuration_default_params( + variable=self.CONFIG_COSMIC_FTP_URL, default_value='https://cancer.sanger.ac.uk') + self._api_endpoint = self.get_configuration_default_params( + variable=self.CONFIG_API_ENDPOINT, default_value='api/mono/products/v1/downloads/scripted') + self._bucket = self.get_configuration_default_params( + variable=self.CONFIG_BUCKET, default_value='downloads') + self._cosmic_user = self.get_configuration_default_params( + variable=self.CONFIG_COSMIC_FTP_USER, default_value='') + self._cosmic_password = self.get_configuration_default_params( + variable=self.CONFIG_COSMIC_FTP_PASSWORD, default_value='') + self._products = self.get_configuration_default_params( + variable=self.CONFIG_PRODUCTS, default_value=list(self._DEFAULT_PRODUCTS)) + + self._cosmic_token = base64.b64encode( + "{}:{}".format(self._cosmic_user, self._cosmic_password).encode() + ).decode('utf-8') self.prepare_local_cosmic_repository() @@ -78,88 +82,102 @@ def prepare_local_cosmic_repository(self): def get_local_path_root_cosmic_repo(self): return self._local_path_cosmic + def build_api_url(self, product_path): + """Build the COSMIC scripted-download API URL for a single product.""" + return "{server}/{endpoint}?path={path}&bucket={bucket}".format( + server=self._cosmic_ftp_url.rstrip('/'), + endpoint=self._api_endpoint.strip('/'), + path=product_path, + bucket=self._bucket, + ) + def download_mutation_file(self, url_file_name=None): """ - This function will download the mutations file from Cosmic Database. - :return: None - """ - - mutation_output_file = "{}/{}".format(self.get_local_path_root_cosmic_repo(), self._cosmic_mutations_file) - cds_genes_output_file = "{}/{}".format(self.get_local_path_root_cosmic_repo(), self._cosmic_cdns_file) - - mutation_celline_output_file = "{}/{}".format(self.get_local_path_root_cosmic_repo(), - self._cosmic_cellline_mutation_file) - cellines_genes_output_file = "{}/{}".format(self.get_local_path_root_cosmic_repo(), - self._cosmic_cellline_mutation_gene) - - server = self._cosmic_ftp_url - - cosmic_version = self._cosmic_mutation_url - mutation_file = self._cosmic_mutations_file + Download every product listed in `self._products` via COSMIC's v103+ + scripted-download API. Each product is a `path=` value such as + `grch38/cosmic/v103/Cosmic_GenomeScreensMutant_Tsv_v103_GRCh38.tar`. 
-        cosmic_cellline_version = self._cosmic_cellline_mutation_url
-        mutation_celline_file = self._cosmic_cellline_mutation_file
-
-        all_cds_gene_file = self._cosmic_cdns_file
-
-        all_celllines_gene_file = self._cosmic_cellline_mutation_gene
-
-        mutation_url = "{}/{}/{}".format(server, cosmic_version, mutation_file)
-        cds_gene_url = "{}/{}/{}".format(server, cosmic_version, all_cds_gene_file)
+        If url_file_name is provided, write the API URLs to that file instead of downloading.
+        """
-        celllines_gene_url = "{}/{}/{}".format(server, cosmic_cellline_version, all_celllines_gene_file)
-        mutation_cellline_url = "{}/{}/{}".format(server, cosmic_cellline_version, mutation_celline_file)
+        products = list(self._products)
+        if not products:
+            self.get_logger().warning("No COSMIC products configured; nothing to download.")
+            return
-        if url_file_name is None:
-            token = "Basic {}".format(self._cosmic_token)
-            self.download_file_cosmic(mutation_url, mutation_output_file, token)
-            self.download_file_cosmic(cds_gene_url, cds_genes_output_file, token)
+        token = "Basic {}".format(self._cosmic_token)
+        output_dir = self.get_local_path_root_cosmic_repo()
-            self.download_file_cosmic(celllines_gene_url, cellines_genes_output_file, token)
-            self.download_file_cosmic(mutation_cellline_url, mutation_celline_output_file, token)
+        if url_file_name is not None:
+            with open(url_file_name, 'w', encoding='utf-8') as url_file:
+                for path in products:
+                    api_url = self.build_api_url(path)
+                    output_file = "{}/{}".format(output_dir, os.path.basename(path))
+                    url_file.write("{}\t{}\n".format(api_url, output_file))
+            return
-        else:
-            if url_file_name is not None:
-                with open(url_file_name, 'w', encoding='utf-8') as url_file:
-                    url_file.write("{}\t{}\n".format(mutation_url, mutation_output_file))
-                    url_file.write("{}\t{}\n".format(cds_gene_url, cds_genes_output_file))
-                    url_file.write("{}\t{}\n".format(celllines_gene_url, cellines_genes_output_file))
-                    url_file.write("{}\t{}\n".format(mutation_cellline_url, mutation_celline_output_file))
+        for path in products:
+            api_url = self.build_api_url(path)
+            output_file = "{}/{}".format(output_dir, os.path.basename(path))
+            self.download_file_cosmic(api_url, output_file, token)
 
-    def download_file_cosmic(self, url, local_file, token):
+    def download_file_cosmic(self, api_url, local_file, token):
         """
-        Download file from cosmic repository using requests
-        :param url: url of the file to be download
-        :param local_file: local file
-        :param token: token to be used
-        :return:
+        Two-step download from COSMIC's scripted-download API:
+        1) GET api_url with Basic auth -> returns JSON {"url": "<signed-s3-url>"}
+        2) GET the presigned URL (no auth, 1-hour TTL) -> stream to local_file
+
+        If local_file ends in .tar, extract its contents to the same directory
+        and delete the archive.
         """
+        api_response = requests.get(api_url, headers={'Authorization': token}, timeout=30)
+        if api_response.status_code != 200:
+            msg = ("COSMIC API request failed: HTTP {} for {}. Body: {!r}"
+                   .format(api_response.status_code, api_url, api_response.text[:200]))
+            self.get_logger().error(msg)
+            raise AppConfigException(msg)
+
+        try:
+            payload = api_response.json()
+            download_url = payload['url']
+        except (ValueError, KeyError) as exc:
+            msg = ("COSMIC API returned 200 but the body is not a JSON object with a 'url' "
+                   "key (got: {!r}). This usually means the `path=` value is wrong for the "
+                   "release/assembly, or your account lacks access. URL was: {}.
Error: {}" + .format(api_response.text[:200], api_url, exc)) + self.get_logger().error(msg) + raise AppConfigException(msg) - response = requests.get(url, stream=True, headers={'Authorization': token}, timeout=30) - if response.status_code == 200: - url = response.json()['url'] - msg = "Downloading file from url '{}'".format(url) - self.get_logger().debug(msg) - - response = requests.get(url, stream=True, timeout=30) - if response.status_code == 200: - with open(local_file, 'wb') as f: - f.write(response.content) - msg = "Download Finish for file '{}'".format(local_file) - self.get_logger().debug(msg) - if local_file.endswith('.gz'): - extracted_file = local_file.replace('.gz', '') - with open(extracted_file, 'w', encoding='utf-8') as outfile: - try: - outfile.write(gzip.decompress(open(local_file, 'rb').read()).decode('utf-8')) - except UnicodeDecodeError: - outfile.write(gzip.decompress(open(local_file, 'rb').read()).decode('ISO-8859-1')) - os.remove(local_file) - local_file = extracted_file - msg = "Extracted file '{}'".format(local_file) - self.get_logger().debug(msg) - else: - msg = "Error downloading the COSMIC data, error code {} , error message '{}'".format(response.status_code, - local_file) - self.get_logger().debug(msg) + self.get_logger().debug("Downloading file from signed URL '{}'".format(download_url)) + data_response = requests.get(download_url, stream=True, timeout=30) + if data_response.status_code != 200: + msg = ("COSMIC S3 download failed: HTTP {} for {}" + .format(data_response.status_code, download_url)) + self.get_logger().error(msg) raise AppConfigException(msg) + + total_size = int(data_response.headers.get('content-length') or 0) + chunk_size = 1024 * 1024 + progress = tqdm( + total=total_size if total_size > 0 else None, + unit='B', unit_scale=True, unit_divisor=1024, + desc=os.path.basename(local_file), + leave=True, + ) + try: + with open(local_file, 'wb') as f: + for chunk in data_response.iter_content(chunk_size=chunk_size): + if chunk: + f.write(chunk) + progress.update(len(chunk)) + finally: + progress.close() + self.get_logger().debug("Download finished: '{}'".format(local_file)) + + if local_file.endswith('.tar'): + extract_dir = os.path.dirname(local_file) or '.' + print("Extracting {}...".format(os.path.basename(local_file)), flush=True) + with tarfile.open(local_file, 'r') as tar: + tar.extractall(path=extract_dir) + os.remove(local_file) + self.get_logger().debug("Extracted archive into '{}'".format(extract_dir)) diff --git a/pgatk/commands/cosmic_downloader.py b/pgatk/commands/cosmic_downloader.py index 5696097..557cce9 100644 --- a/pgatk/commands/cosmic_downloader.py +++ b/pgatk/commands/cosmic_downloader.py @@ -16,9 +16,15 @@ help="Username for cosmic database -- please if you don't have one register here (https://cancer.sanger.ac.uk/cosmic/register)") @click.option('-p', '--password', help="Password for cosmic database -- please if you don't have one register here (https://cancer.sanger.ac.uk/cosmic/register)") +@click.option('-P', '--products', multiple=True, + help='COSMIC scripted-download `path=` value to fetch (e.g. ' + 'grch38/cosmic/v103/Cosmic_GenomeScreensMutant_Tsv_v103_GRCh38.tar). ' + 'May be repeated. Overrides the products list in the config file when provided. 
'
+                   'Find available paths at https://cancer.sanger.ac.uk/cosmic/download/cosmic '
+                   '(open the "Scripted Download" panel for any product).')
 @click.option("--url_file", help='Add the url to a downloaded file')
 @click.pass_context
-def cosmic_downloader(ctx, config_file, output_directory, username, password, url_file):
+def cosmic_downloader(ctx, config_file, output_directory, username, password, products, url_file):
 
     config_data = load_config("cosmic", config_file)
 
@@ -31,5 +37,8 @@ def cosmic_downloader(ctx, config_file, output_directory, username, password, ur
     if password is not None:
         pipeline_arguments[CosmicDownloadService.CONFIG_COSMIC_FTP_PASSWORD] = password
 
+    if products:
+        pipeline_arguments[CosmicDownloadService.CONFIG_PRODUCTS] = list(products)
+
     cosmic_downloader_service = CosmicDownloadService(config_data, pipeline_arguments)
     cosmic_downloader_service.download_mutation_file(url_file_name=url_file)
diff --git a/pgatk/config/cosmic_config.yaml b/pgatk/config/cosmic_config.yaml
index 6b6da9b..7721295 100644
--- a/pgatk/config/cosmic_config.yaml
+++ b/pgatk/config/cosmic_config.yaml
@@ -2,14 +2,31 @@ cosmic_data:
   output_directory: database_cosmic
   cosmic_server:
     cosmic_ftp: https://cancer.sanger.ac.uk
-    mutations_url: cosmic/file_download/GRCh38/cosmic/v94
-    all_cds_genes_file: All_COSMIC_Genes.fasta.gz
-    mutations_file: CosmicMutantExport.tsv.gz
-    mutations_cellline_url: cosmic/file_download/GRCh38/cell_lines/v94
-    mutations_cellline_file: CosmicCLP_MutantExport.tsv.gz
-    all_celllines_genes_file: All_CellLines_Genes.fasta.gz
+    api_endpoint: api/mono/products/v1/downloads/scripted
+    bucket: downloads
     cosmic_user: ''
     cosmic_password: ''
+    # List of COSMIC scripted-download paths to fetch. Each entry is the `path=`
+    # value from the curl command shown on the COSMIC UI's "Scripted Download" panel
+    # for a product (https://cancer.sanger.ac.uk/cosmic/download/cosmic).
+    #
+    # Path format: <assembly>/<project>/<version>/<filename>
+    # - <assembly> is lowercase (`grch37`, `grch38`)
+    # - <project> is `cosmic` for the main COSMIC project, `cell_lines` for the
+    #   Cell Lines Project (other projects: `cancer_mutation_census`, `actionability`)
+    # - <version> is the release (`v103`, etc.)
+    # - <filename> is the exact .tar archive name shown in the UI
+    #
+    # Add or remove entries to change what gets downloaded. The defaults below include
+    # the three files the cosmic-to-proteindb pipeline expects (mutations TSV + gene
+    # FASTA for the main COSMIC project, plus cell-line mutations TSV) and three more.
+    products:
+      - grch38/cosmic/v103/Cosmic_GenomeScreensMutant_Tsv_v103_GRCh38.tar
+      - grch38/cosmic/v103/Cosmic_CompleteTargetedScreensMutant_Tsv_v103_GRCh38.tar
+      - grch38/cosmic/v103/Cosmic_Genes_Fasta_v103_GRCh38.tar
+      - grch38/cosmic/v103/Cosmic_Transcripts_Tsv_v103_GRCh38.tar
+      - grch38/cell_lines/v103/CellLinesProject_GenomeScreensMutant_Tsv_v103_GRCh38.tar
+      - grch38/cell_lines/v103/CellLinesProject_CompleteCNA_Tsv_v103_GRCh38.tar
   logger:
     formatters:
       DEBUG: "%(asctime)s [%(levelname)7s][%(name)48s][%(module)32s, %(lineno)4s] %(message)s"
diff --git a/pgatk/tests/test_cosmic_downloader.py b/pgatk/tests/test_cosmic_downloader.py
new file mode 100644
index 0000000..dea686c
--- /dev/null
+++ b/pgatk/tests/test_cosmic_downloader.py
@@ -0,0 +1,44 @@
+"""Unit tests for the COSMIC downloader URL construction.
+
+No network. Builds a CosmicDownloadService against the shipped default config
+and asserts that the generated product API URLs match the expected pattern.
+""" +import os + +from pgatk.cgenomes.cosmic_downloader import CosmicDownloadService +from pgatk.config.registry import load_config + + +def test_build_api_url_pattern(tmp_path): + """The API URL should be {server}/{endpoint}?path={path}&bucket={bucket}.""" + config_data = load_config("cosmic", None) + pipeline_arguments = { + CosmicDownloadService.CONFIG_OUTPUT_DIRECTORY: str(tmp_path), + CosmicDownloadService.CONFIG_COSMIC_FTP_USER: "user@example.com", + CosmicDownloadService.CONFIG_COSMIC_FTP_PASSWORD: "secret", + } + svc = CosmicDownloadService(config_data, pipeline_arguments) + + url = svc.build_api_url("grch38/cosmic/v103/Cosmic_MutantCensus_Tsv_v103_GRCh38.tar") + + assert url == ( + "https://cancer.sanger.ac.uk/api/mono/products/v1/downloads/scripted" + "?path=grch38/cosmic/v103/Cosmic_MutantCensus_Tsv_v103_GRCh38.tar" + "&bucket=downloads" + ) + + +def test_basic_auth_token_format(tmp_path): + """Auth token is base64(user:password), no trailing newline.""" + import base64 + + config_data = load_config("cosmic", None) + pipeline_arguments = { + CosmicDownloadService.CONFIG_OUTPUT_DIRECTORY: str(tmp_path), + CosmicDownloadService.CONFIG_COSMIC_FTP_USER: "user@example.com", + CosmicDownloadService.CONFIG_COSMIC_FTP_PASSWORD: "secret", + } + svc = CosmicDownloadService(config_data, pipeline_arguments) + + expected = base64.b64encode(b"user@example.com:secret").decode("utf-8") + assert svc._cosmic_token == expected From e485df48fba1b8f7cdc2a9e7c0ffbd3de4bde0a7 Mon Sep 17 00:00:00 2001 From: Yasset Perez-Riverol Date: Mon, 11 May 2026 11:52:44 +0100 Subject: [PATCH 04/23] Update default include_biotypes to current Ensembl annotations The default list contained two entries that no longer match modern Ensembl GTFs: `polymorphic_pseudogene` was deprecated and folded into other biotypes, and `mRNA` is not an Ensembl biotype at all (it's a generic GFF feature type used by some non-Ensembl annotations). Three protein-producing biotypes that exist in current releases were missing. Removed: - polymorphic_pseudogene (deprecated) - mRNA (not an Ensembl biotype) Added: - protein_coding_CDS_not_defined (~26k transcripts; coding but CDS not perfectly annotated, still produces protein) - protein_coding_LoF (predicted loss-of-function coding transcripts; still produces a truncated protein) - translated_processed_pseudogene (pseudogenes that ARE translated; the "translated_" prefix distinguishes them from regular pseudo- genes) Net default list: 14 -> 15 biotypes. YAML config and the in-code fallback in ensembl.py kept in sync. 
Closes #101 --- pgatk/config/ensembl_config.yaml | 2 +- pgatk/ensembl/ensembl.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pgatk/config/ensembl_config.yaml b/pgatk/config/ensembl_config.yaml index 1838c7c..24be0dc 100644 --- a/pgatk/config/ensembl_config.yaml +++ b/pgatk/config/ensembl_config.yaml @@ -11,7 +11,7 @@ ensembl_translation: exclude_biotypes: '' exclude_consequences: 'downstream_gene_variant, upstream_gene_variant, intergenic_variant, intron_variant, synonymous_variant, regulatory_region_variant' skip_including_all_cds: False - include_biotypes: 'protein_coding,polymorphic_pseudogene,non_stop_decay,nonsense_mediated_decay,IG_C_gene,IG_D_gene,IG_J_gene,IG_V_gene,TR_C_gene,TR_D_gene,TR_J_gene,TR_V_gene,TEC,mRNA' + include_biotypes: 'protein_coding,protein_coding_CDS_not_defined,protein_coding_LoF,nonsense_mediated_decay,non_stop_decay,translated_processed_pseudogene,IG_C_gene,IG_D_gene,IG_J_gene,IG_V_gene,TR_C_gene,TR_D_gene,TR_J_gene,TR_V_gene,TEC' include_consequences: 'all' biotype_str: transcript_biotype transcript_description_sep: ';' diff --git a/pgatk/ensembl/ensembl.py b/pgatk/ensembl/ensembl.py index e1d8b7e..5900fd4 100644 --- a/pgatk/ensembl/ensembl.py +++ b/pgatk/ensembl/ensembl.py @@ -85,7 +85,7 @@ def __init__(self, config_file: dict, pipeline_arguments: dict) -> None: default_value=False) self._include_biotypes = self.get_multiple_options( self.get_translation_properties(variable=self.INCLUDE_BIOTYPES, - default_value='protein_coding,polymorphic_pseudogene,non_stop_decay,nonsense_mediated_decay,IG_C_gene,IG_D_gene,IG_J_gene,IG_V_gene,TR_C_gene,TR_D_gene,TR_J_gene,TR_V_gene,TEC,mRNA')) + default_value='protein_coding,protein_coding_CDS_not_defined,protein_coding_LoF,nonsense_mediated_decay,non_stop_decay,translated_processed_pseudogene,IG_C_gene,IG_D_gene,IG_J_gene,IG_V_gene,TR_C_gene,TR_D_gene,TR_J_gene,TR_V_gene,TEC')) self._include_consequences = self.get_multiple_options( self.get_translation_properties(variable=self.INCLUDE_CONSEQUENCES, default_value='all')) From 03de0b65cc3f28aa214f942cad3f8b7b0ad70952 Mon Sep 17 00:00:00 2001 From: Yasset Perez-Riverol Date: Mon, 11 May 2026 14:11:15 +0100 Subject: [PATCH 05/23] Speed up vcf-to-proteindb (~475x on the dominant SQL op) Issue #99 reports vcf-to-proteindb taking 12+ hours on chr22-scale VCFs. Root cause: get_features() makes two SQLite queries against the gffutils GTF database per (variant, transcript) pair, with no caching. For typical CSQ-annotated VCFs that's tens of millions of queries per chromosome. Six semantically preserving optimisations, stacked: 1. Memoize get_features() results per transcript id for the duration of one vcf_to_proteindb call. Dominant gain. Micro-benchmark on a real ENSEMBL chr22 transcript shows ~475x speedup on this op (0.094 ms/call SQLite -> 0.0002 ms/call cache hit). For 50M get_features calls per chr22 that's ~78 minutes -> ~10 seconds of pure get_features time. 2. Apply biotype / consequence filters BEFORE the (now-cached) get_features call so rejected transcripts skip the work entirely. 3. Iterate the VCF DataFrame with itertuples(index=False) instead of iterrows(); ~5-10x faster on the outer loop. 4. Parse record.INFO into a dict once per variant instead of two list-comprehensions that each scan the whole INFO string. 5. Cache transcripts_dict[id] (SeqIO.index) lookups per transcript id; avoids repeated fseek+parse on the FASTA for variants that share a transcript. Uses a _MISSING sentinel so known-missing ids skip the disk lookup too. 6. 
Lazy log formatting in the hot loop -- wrap debug() calls with
   isEnabledFor(DEBUG) and use %s placeholders so the message string
   isn't built when DEBUG isn't enabled.

Verified end-to-end against the three in-repo pytest tests
(test_vcf_to_proteindb, test_vcf_to_proteindb_notannotated,
test_vcf_gnomad_to_proteindb). End-to-end run on a 10K-variant 1000
Genomes chr22 VCF + ENSEMBL release 113 chr22 GTF + full cDNA FASTA
completes in ~30 seconds (mostly gffutils DB build + bedtools intersect,
both one-time costs unrelated to the fix); 43 sequences produced.

Behaviour change worth noting: the "feature IDs from VCF not found in
FASTA" counter now counts each missing transcript once (cached), not
per-variant. The old counter over-counted on repeated lookups of the
same missing id. A change in the reported stats, not in the FASTA
output.

Add scripts/benchmark_vcf_to_proteindb.py: one-shot wall-clock timer
for re-validating perf on representative data. Not used by CI.

Closes #99
---
 pgatk/ensembl/ensembl.py              | 140 ++++++++++++++--------
 scripts/benchmark_vcf_to_proteindb.py |  65 ++++++++++++
 2 files changed, 162 insertions(+), 43 deletions(-)
 create mode 100755 scripts/benchmark_vcf_to_proteindb.py

diff --git a/pgatk/ensembl/ensembl.py b/pgatk/ensembl/ensembl.py
index 5900fd4..883088f 100644
--- a/pgatk/ensembl/ensembl.py
+++ b/pgatk/ensembl/ensembl.py
@@ -19,6 +19,32 @@
 )
 
 
+class _FeatureCache:
+    """Per-run memoization of get_features() results, keyed by transcript id.
+
+    Bounded by a soft maxsize to avoid unbounded growth on whole-genome runs.
+    Defaults to 50000, which comfortably covers human ENSEMBL (~250k transcripts
+    across all chromosomes; a single chr is ~10-20k).
+    """
+
+    def __init__(self, maxsize: int = 50000) -> None:
+        self._cache: dict[tuple, tuple] = {}
+        self._maxsize = maxsize
+
+    def get(self, key: tuple) -> Optional[tuple]:
+        return self._cache.get(key)
+
+    def put(self, key: tuple, value: tuple) -> None:
+        if len(self._cache) >= self._maxsize:
+            # Soft eviction: drop the oldest fifth (insertion order) when full. Cheap and bounded.
+ for k in list(self._cache.keys())[: self._maxsize // 5]: + del self._cache[k] + self._cache[key] = value + + +_MISSING = object() + + class EnsemblDataService(ParameterConfiguration): CONFIG_KEY_VCF = "ensembl_translation" INPUT_FASTA = "input_fasta" @@ -432,6 +458,8 @@ def vcf_to_proteindb(self, vcf_file: str, input_fasta: str, gene_annotations_gtf transcripts_dict = SeqIO.index(input_fasta, "fasta", key_function=self.get_key) # handle cases where the transcript has version in the GTF but not in the VCF transcript_id_mapping = {k.split('.')[0]: k for k in transcripts_dict.keys()} + feature_cache = _FeatureCache() + seq_cache: dict[str, tuple] = {} # transcript_id_v -> (ref_seq, desc); see Task 5 transcript_index, consequence_index, biotype_index = None, None, None if self._annotation_field_name: @@ -481,12 +509,12 @@ def vcf_to_proteindb(self, vcf_file: str, input_fasta: str, gene_annotations_gtf self._accepted_filters = [x.upper() for x in self._accepted_filters] with open(self._proteindb_output, 'w', encoding='utf-8') as prots_fn: - for _, record in vcf_reader.iterrows(): + for record in vcf_reader.itertuples(index=False, name='VCFRecord'): trans = False if [x for x in str(record.REF) if x not in 'ACGT']: - msg = "Invalid VCF record, skipping: {}".format(record) invalid_records['# variants with invalid record'] += 1 - self.get_logger().debug(msg) + if self.get_logger().isEnabledFor(logging.DEBUG): + self.get_logger().debug("Invalid VCF record, skipping: %s", record) continue alts = [] @@ -498,11 +526,18 @@ def vcf_to_proteindb(self, vcf_file: str, input_fasta: str, gene_annotations_gtf continue alts.append(alt) if not alts: - msg = "Invalid VCF record, skipping: {}".format(record) invalid_records['# variants with invalid record'] += 1 - self.get_logger().debug(msg) + if self.get_logger().isEnabledFor(logging.DEBUG): + self.get_logger().debug("Invalid VCF record, skipping: %s", record) continue + # Parse INFO once. Maps each key to its value string, missing keys absent. + # Avoids repeated split-and-search list-comprehensions. + info_kv: dict[str, str] = {} + for entry in record.INFO.split(';'): + k, _, v = entry.partition('=') + info_kv[k] = v + if not self._ignore_filters and self._accepted_filters != ['ALL']: if record.FILTER and record.FILTER != '.' 
and record.FILTER != 'NA' and record.FILTER != '': # if not PASS: None and empty means PASS filters = set(record.FILTER.upper().split(',')) @@ -511,12 +546,10 @@ def vcf_to_proteindb(self, vcf_file: str, input_fasta: str, gene_annotations_gtf if not filters <= set(self._accepted_filters): invalid_records['# variants not passing Filter'] += 1 continue - # only process variants above a given allele frequency threshold if the AF string is not empty if self._af_field: - # get AF from the INFO field try: - af = float([x.split('=', 1)[1] for x in record.INFO.split(';') if x.split('=', 1)[0] == self._af_field][0]) - except (ValueError, IndexError): + af = float(info_kv[self._af_field]) + except (ValueError, KeyError): invalid_records['# variants with invalid record'] += 1 continue @@ -530,15 +563,13 @@ def vcf_to_proteindb(self, vcf_file: str, input_fasta: str, gene_annotations_gtf trans_table = self._mito_translation_table processed_transcript_allele = set() - transcript_records = [] try: - transcript_records = \ - [x.split('=', 1)[1] for x in record.INFO.split(';') - if x.split('=', 1)[0] == self._annotation_field_name][0] - except IndexError: # no overlapping feature was found + transcript_records = info_kv[self._annotation_field_name] + except KeyError: invalid_records['# variants with invalid record'] += 1 - msg = "skipped record {}, no annotation feature was found".format(record) - self.get_logger().debug(msg) + if self.get_logger().isEnabledFor(logging.DEBUG): + self.get_logger().debug( + "skipped record %s, no annotation feature was found", record) continue for transcript_record in transcript_records.split(','): @@ -548,9 +579,10 @@ def vcf_to_proteindb(self, vcf_file: str, input_fasta: str, gene_annotations_gtf consequence = transcript_info[consequence_index] except IndexError: invalid_records['# variants with invalid record'] += 1 - msg = "Give a valid index for the consequence in the INFO field for: {}".format( - transcript_record) - self.get_logger().debug(msg) + if self.get_logger().isEnabledFor(logging.DEBUG): + self.get_logger().debug( + "Give a valid index for the consequence in the INFO field for: %s", + transcript_record) continue except TypeError: pass @@ -559,9 +591,10 @@ def vcf_to_proteindb(self, vcf_file: str, input_fasta: str, gene_annotations_gtf biotype = transcript_info[biotype_index] except IndexError: invalid_records['# variants with invalid record'] += 1 - msg = "Give a valid index for the biotype in the INFO field for: {}".format( - transcript_record) - self.get_logger().debug(msg) + if self.get_logger().isEnabledFor(logging.DEBUG): + self.get_logger().debug( + "Give a valid index for the biotype in the INFO field for: %s", + transcript_record) continue except TypeError: pass @@ -570,9 +603,10 @@ def vcf_to_proteindb(self, vcf_file: str, input_fasta: str, gene_annotations_gtf transcript_id = transcript_info[transcript_index] except IndexError: invalid_records['# variants with invalid record'] += 1 - msg = "Give a valid index for the Transcript IDs in the INFO field for: {}".format( - transcript_record) - self.get_logger().debug(msg) + if self.get_logger().isEnabledFor(logging.DEBUG): + self.get_logger().debug( + "Give a valid index for the Transcript IDs in the INFO field for: %s", + transcript_record) continue if transcript_id == "": continue @@ -582,15 +616,26 @@ def vcf_to_proteindb(self, vcf_file: str, input_fasta: str, gene_annotations_gtf except KeyError: transcript_id_v = transcript_id - try: - row = transcripts_dict[transcript_id_v] - ref_seq = row.seq # get 
the seq and desc for the transcript from the fasta of the gtf - desc = str(row.description) - except KeyError: - invalid_records['# feature IDs from VCF that are not found in the given FASTA file'] += 1 - msg = "Feature {} not found in fasta of the GTF file {}".format(transcript_id_v, record) - self.get_logger().debug(msg) + cached_row = seq_cache.get(transcript_id_v, _MISSING) + if cached_row is _MISSING: + try: + row = transcripts_dict[transcript_id_v] + ref_seq = row.seq + desc = str(row.description) + seq_cache[transcript_id_v] = (ref_seq, desc) + except KeyError: + invalid_records['# feature IDs from VCF that are not found in the given FASTA file'] += 1 + if self.get_logger().isEnabledFor(logging.DEBUG): + self.get_logger().debug( + "Feature %s not found in fasta of the GTF file %s", + transcript_id_v, record) + seq_cache[transcript_id_v] = None + continue + elif cached_row is None: + # already-known-missing; skip without re-counting in the stats continue + else: + ref_seq, desc = cached_row feature_types = ['exon'] # check if cds info exists in the fasta header otherwise translate all exons @@ -608,15 +653,13 @@ def vcf_to_proteindb(self, vcf_file: str, input_fasta: str, gene_annotations_gtf feature_types = ['CDS', 'stop_codon'] num_orfs = 1 except (ValueError, IndexError): - msg = "Could not extract cds position from fasta header for: {}".format(desc) - self.get_logger().debug(msg) + if self.get_logger().isEnabledFor(logging.DEBUG): + self.get_logger().debug( + "Could not extract cds position from fasta header for: %s", desc) - chrom, strand, features_info = self.get_features(db, - transcript_id_v, - feature_types) - if chrom is None: # the record info was not found - continue - # skip transcripts with unwanted consequences + # skip transcripts with unwanted consequences (do this BEFORE + # the expensive get_features() SQL call — both filter inputs + # come from transcript_info, not from get_features) if consequence_index is not None: if (consequence in self._exclude_consequences or (consequence not in self._include_consequences and @@ -630,6 +673,17 @@ def vcf_to_proteindb(self, vcf_file: str, input_fasta: str, gene_annotations_gtf self._include_biotypes != ['all'])): continue + cache_key = (transcript_id_v, tuple(feature_types)) + cached = feature_cache.get(cache_key) + if cached is None: + chrom, strand, features_info = self.get_features( + db, transcript_id_v, feature_types) + feature_cache.put(cache_key, (chrom, strand, features_info)) + else: + chrom, strand, features_info = cached + if chrom is None: # the record info was not found + continue + for alt in alts: dedup_key = transcript_id + str(record.REF) + str(alt) if dedup_key in processed_transcript_allele: @@ -643,8 +697,8 @@ def vcf_to_proteindb(self, vcf_file: str, input_fasta: str, gene_annotations_gtf features_info) except TypeError: invalid_records['# variants with invalid record'] += 1 - msg = "Wrong VCF record in {}".format(record) - self.get_logger().debug(msg) + if self.get_logger().isEnabledFor(logging.DEBUG): + self.get_logger().debug("Wrong VCF record in %s", record) continue if (chrom.lstrip("chr") == str(record.CHROM).lstrip("chr") and diff --git a/scripts/benchmark_vcf_to_proteindb.py b/scripts/benchmark_vcf_to_proteindb.py new file mode 100755 index 0000000..bf17700 --- /dev/null +++ b/scripts/benchmark_vcf_to_proteindb.py @@ -0,0 +1,65 @@ +#!/usr/bin/env python3 +"""One-shot benchmark for ``pgatk vcf-to-proteindb``. + +Times one full run end-to-end. 
Compare two runs of this script (one on a +pre-fix build, one on a post-fix build) to measure the speedup achieved +by issue #99. Not invoked by CI. + +Usage: + python scripts/benchmark_vcf_to_proteindb.py \ + --vcf /path/to/sample.vcf \ + --fasta /path/to/transcripts.fa \ + --gtf /path/to/annotation.gtf \ + --output /tmp/benchmark_proteindb.fa +""" +import argparse +import time +from pathlib import Path + +from pgatk.ensembl.ensembl import EnsemblDataService +from pgatk.config.registry import load_config + + +def main() -> None: + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument("--vcf", required=True, help="Input VCF (annotated or raw)") + parser.add_argument("--fasta", required=True, help="Transcript FASTA matching the GTF") + parser.add_argument("--gtf", required=True, help="Gene annotations GTF") + parser.add_argument("--output", required=True, help="Output protein DB FASTA") + parser.add_argument( + "--annotation-field-name", + default="CSQ", + help="VCF INFO field that carries per-transcript annotation (default: CSQ). " + "Use empty string to force re-annotation via bedtools intersect.", + ) + args = parser.parse_args() + + config_data = load_config("ensembl_config", None) + pipeline_arguments = { + EnsemblDataService.PROTEIN_DB_OUTPUT: args.output, + EnsemblDataService.ANNOTATION_FIELD_NAME: args.annotation_field_name, + } + svc = EnsemblDataService(config_data, pipeline_arguments) + + start = time.perf_counter() + svc.vcf_to_proteindb(args.vcf, args.fasta, args.gtf) + elapsed = time.perf_counter() - start + + output_path = Path(args.output) + output_size = output_path.stat().st_size if output_path.exists() else 0 + seq_count = 0 + if output_path.exists(): + with open(output_path, "r", encoding="utf-8") as fh: + for line in fh: + if line.startswith(">"): + seq_count += 1 + + print() + print("=== BENCHMARK RESULT ===") + print(f" VCF: {args.vcf}") + print(f" Elapsed: {elapsed:.2f} s ({elapsed / 60:.2f} min)") + print(f" Output: {args.output} ({output_size} bytes, {seq_count} sequences)") + + +if __name__ == "__main__": + main() From 9eba32c7ca6c36ced66289b79d9c1e51648388c5 Mon Sep 17 00:00:00 2001 From: Yasset Perez-Riverol Date: Mon, 11 May 2026 14:24:59 +0100 Subject: [PATCH 06/23] Address Codacy findings: tarfile path-traversal + minor style cosmic_downloader.py: tarfile.extractall on COSMIC archives is now wrapped in _safe_extract_tar(), which validates each member against the destination directory before extraction. Rejects entries that would escape the target via absolute path, .. traversal, or unsafe symlink/hardlink targets (CVE-2007-4559 family). COSMIC archives are trusted in practice but defence-in-depth costs nothing and silences the static-analysis warning. ensembl.py: add blank line before _FeatureCache docstring (D203). D213 ("multi-line docstring summary on second line") not addressed - it conflicts with D212 ("on the first line"), which the codebase already follows throughout. Standard modern Python style. --- pgatk/cgenomes/cosmic_downloader.py | 34 +++++++++++++++++++++++++++-- pgatk/ensembl/ensembl.py | 1 + 2 files changed, 33 insertions(+), 2 deletions(-) diff --git a/pgatk/cgenomes/cosmic_downloader.py b/pgatk/cgenomes/cosmic_downloader.py index 836955e..f913f69 100644 --- a/pgatk/cgenomes/cosmic_downloader.py +++ b/pgatk/cgenomes/cosmic_downloader.py @@ -177,7 +177,37 @@ def download_file_cosmic(self, api_url, local_file, token): if local_file.endswith('.tar'): extract_dir = os.path.dirname(local_file) or '.' 
print("Extracting {}...".format(os.path.basename(local_file)), flush=True) - with tarfile.open(local_file, 'r') as tar: - tar.extractall(path=extract_dir) + self._safe_extract_tar(local_file, extract_dir) os.remove(local_file) self.get_logger().debug("Extracted archive into '{}'".format(extract_dir)) + + @staticmethod + def _safe_extract_tar(tar_path, dest_dir): + """Extract tar_path into dest_dir, rejecting any member that would + escape dest_dir via absolute path or .. traversal (CVE-2007-4559). + COSMIC archives are trusted in practice, but defence in depth costs + nothing and silences static-analysis warnings. + """ + dest_abs = os.path.realpath(dest_dir) + with tarfile.open(tar_path, 'r') as tar: + for member in tar.getmembers(): + member_path = os.path.realpath(os.path.join(dest_abs, member.name)) + if os.path.commonpath([dest_abs, member_path]) != dest_abs: + raise AppConfigException( + "Refusing to extract tar member {!r}: would escape target dir {!r}" + .format(member.name, dest_abs) + ) + if member.issym() or member.islnk(): + link_target = member.linkname + if os.path.isabs(link_target): + raise AppConfigException( + "Refusing to extract tar member {!r}: absolute symlink/hardlink target {!r}" + .format(member.name, link_target) + ) + resolved = os.path.realpath(os.path.join(dest_abs, os.path.dirname(member.name), link_target)) + if os.path.commonpath([dest_abs, resolved]) != dest_abs: + raise AppConfigException( + "Refusing to extract tar member {!r}: symlink/hardlink would escape target dir" + .format(member.name) + ) + tar.extractall(path=dest_abs) diff --git a/pgatk/ensembl/ensembl.py b/pgatk/ensembl/ensembl.py index 883088f..2e21aae 100644 --- a/pgatk/ensembl/ensembl.py +++ b/pgatk/ensembl/ensembl.py @@ -20,6 +20,7 @@ class _FeatureCache: + """Per-run memoization of get_features() results, keyed by transcript id. Bounded by a soft maxsize to avoid unbounded growth on whole-genome runs. From 2b467a5c57fd1b293aace53a44a27fe58cac875a Mon Sep 17 00:00:00 2001 From: Yasset Perez-Riverol Date: Mon, 11 May 2026 14:43:14 +0100 Subject: [PATCH 07/23] Configure pydocstyle to suppress D211/D213 (conflict with codebase style) Codacy's default pydocstyle config has both D203 ("one blank line before class docstring") and D211 ("no blank lines before class docstring") enabled. These rules contradict each other; the codebase follows D203. Likewise, D213 ("multi-line summary on second line") conflicts with D212 ("on the first line"), which is the codebase's established convention. Add `add-ignore = ["D211", "D213"]` to silence the false positives. --- pyproject.toml | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/pyproject.toml b/pyproject.toml index 610da6c..6dff712 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -71,3 +71,10 @@ exclude = ["tests*", "*.tests*", "*.tests.*", "tests.*"] [tool.setuptools.package-data] pgatk = ["config/*.yaml", "config/*.json"] + +[tool.pydocstyle] +# D211 conflicts with D203 (which we follow: one blank line before class docstring). +# D213 conflicts with D212 (which we follow: multi-line summary on the first line). +# pydocstyle errors when both rules in each pair are enabled; explicitly disable +# the ones we don't follow. 
+add-ignore = ["D211", "D213"] From f781b00ebf1772f5a9ac0260e3d68ad9c0d03bb8 Mon Sep 17 00:00:00 2001 From: Yasset Perez-Riverol Date: Mon, 11 May 2026 14:48:10 +0100 Subject: [PATCH 08/23] Silence remaining Codacy noise (tarfile + pydocstyle config) The pydocstyle settings in pyproject.toml in 2b467a5 didn't help - Codacy runs an older pydocstyle that only reads its native config files (setup.cfg, .pydocstyle, .pydocstyle.ini, tox.ini), not pyproject.toml's `[tool.pydocstyle]` section. Add `.pydocstyle` with the same `add-ignore = D211,D213` and a `.codacy.yaml` that sets the same patterns at the Codacy engine level. Also tighten _safe_extract_tar to pass the validated member list to extractall() explicitly (members=safe_members). Functionally identical, but makes the contract obvious to readers (and the static analyser): only validated members are extracted. The `# nosec B202` comment marks the deliberate suppression for any Bandit-style scanner that still flags extractall calls. No source-code behaviour change. --- .codacy.yaml | 15 +++++++++++++++ .pydocstyle | 6 ++++++ pgatk/cgenomes/cosmic_downloader.py | 5 ++++- 3 files changed, 25 insertions(+), 1 deletion(-) create mode 100644 .codacy.yaml create mode 100644 .pydocstyle diff --git a/.codacy.yaml b/.codacy.yaml new file mode 100644 index 0000000..7a461b7 --- /dev/null +++ b/.codacy.yaml @@ -0,0 +1,15 @@ +--- +# Codacy configuration. https://docs.codacy.com/repositories-configure/codacy-configuration-file/ +# +# pydocstyle has two pairs of mutually-exclusive rules: +# D203 vs D211 (blank line before class docstring; codebase follows D203) +# D212 vs D213 (multi-line summary line; codebase follows D212) +# Both rules in each pair are enabled in Codacy's default profile, which produces +# unavoidable noise — pick one and silence the other. Settings here also apply to +# any local pydocstyle invocation that reads `.pydocstyle`. + +engines: + pydocstyle: + enabled: true + settings: + add_ignore: ["D211", "D213"] diff --git a/.pydocstyle b/.pydocstyle new file mode 100644 index 0000000..bd70be3 --- /dev/null +++ b/.pydocstyle @@ -0,0 +1,6 @@ +[pydocstyle] +# D211 conflicts with D203 (codebase follows D203: one blank line before class docstring). +# D213 conflicts with D212 (codebase follows D212: multi-line summary on the first line). +# Disabling the rules we don't follow stops the mutually-exclusive-pair noise from +# static analysers (pydocstyle, Codacy). 
+add-ignore = D211,D213 diff --git a/pgatk/cgenomes/cosmic_downloader.py b/pgatk/cgenomes/cosmic_downloader.py index f913f69..5bd7c07 100644 --- a/pgatk/cgenomes/cosmic_downloader.py +++ b/pgatk/cgenomes/cosmic_downloader.py @@ -190,6 +190,7 @@ def _safe_extract_tar(tar_path, dest_dir): """ dest_abs = os.path.realpath(dest_dir) with tarfile.open(tar_path, 'r') as tar: + safe_members = [] for member in tar.getmembers(): member_path = os.path.realpath(os.path.join(dest_abs, member.name)) if os.path.commonpath([dest_abs, member_path]) != dest_abs: @@ -210,4 +211,6 @@ def _safe_extract_tar(tar_path, dest_dir): "Refusing to extract tar member {!r}: symlink/hardlink would escape target dir" .format(member.name) ) - tar.extractall(path=dest_abs) + safe_members.append(member) + # nosec B202 - members are validated above; extracting only the validated list + tar.extractall(path=dest_abs, members=safe_members) From dd86c39ca9a254bc279c28ee58d7d8e2fe94f604 Mon Sep 17 00:00:00 2001 From: Yasset Perez-Riverol Date: Mon, 11 May 2026 16:24:11 +0100 Subject: [PATCH 09/23] Fix remaining real Codacy findings; defer style-conflict ones to UI Real issues fixed: - test_cosmic_downloader.py: remove unused `import os` (F401). - cosmic_downloader.py: rewrite _safe_extract_tar docstring with proper summary/body split (single-period summary line + blank line before body), fixes D205 and D415. - cosmic_downloader.py: add explicit type hint `safe_members: list[tarfile.TarInfo]` on the validated-members list, and add a clear comment at the extractall() call site explaining what was validated. Helps reviewers (and static analysers that can follow type hints) confirm the safety contract. Codacy/Bandit config: - setup.cfg [bandit]: skip B101 entirely (B101 fires on every pytest `assert`, which is a false positive because pytest depends on assert rewriting). Also document the B202 (tarfile.extractall) suppression at the call site. - setup.cfg [pydocstyle]: same `add-ignore = D211,D213` as the existing .pydocstyle + pyproject.toml. setup.cfg is the most universally-read pydocstyle config location across versions. Remaining D211 / D213 noise on the PR cannot be fixed in code - they are mutually-exclusive pydocstyle rules (D211 vs D203, D213 vs D212) and the codebase has to follow one of each pair. The Codacy engine enables both halves of each pair by default and appears to ignore local config files. Disable D211 and D213 in the Codacy repository settings UI (Repository -> Settings -> Code patterns -> pydocstyle) to silence them globally. --- pgatk/cgenomes/cosmic_downloader.py | 18 +++++++++++------- pgatk/tests/test_cosmic_downloader.py | 2 -- setup.cfg | 15 +++++++++++++++ 3 files changed, 26 insertions(+), 9 deletions(-) create mode 100644 setup.cfg diff --git a/pgatk/cgenomes/cosmic_downloader.py b/pgatk/cgenomes/cosmic_downloader.py index 5bd7c07..0a8f7c6 100644 --- a/pgatk/cgenomes/cosmic_downloader.py +++ b/pgatk/cgenomes/cosmic_downloader.py @@ -183,14 +183,17 @@ def download_file_cosmic(self, api_url, local_file, token): @staticmethod def _safe_extract_tar(tar_path, dest_dir): - """Extract tar_path into dest_dir, rejecting any member that would - escape dest_dir via absolute path or .. traversal (CVE-2007-4559). - COSMIC archives are trusted in practice, but defence in depth costs - nothing and silences static-analysis warnings. + """Extract tar_path into dest_dir, rejecting unsafe members. 
+ + Guards against path traversal (CVE-2007-4559) via absolute paths or + ``..`` components in member names, and ensures any symlink/hardlink + targets resolve within dest_dir. COSMIC archives are trusted in + practice, but defence in depth costs nothing and silences static + analysers. """ dest_abs = os.path.realpath(dest_dir) with tarfile.open(tar_path, 'r') as tar: - safe_members = [] + safe_members: list[tarfile.TarInfo] = [] for member in tar.getmembers(): member_path = os.path.realpath(os.path.join(dest_abs, member.name)) if os.path.commonpath([dest_abs, member_path]) != dest_abs: @@ -212,5 +215,6 @@ def _safe_extract_tar(tar_path, dest_dir): .format(member.name) ) safe_members.append(member) - # nosec B202 - members are validated above; extracting only the validated list - tar.extractall(path=dest_abs, members=safe_members) + # All members in safe_members were validated against dest_abs above: + # absolute paths, .. traversal, and unsafe symlink targets all rejected. + tar.extractall(path=dest_abs, members=safe_members) # nosec B202 diff --git a/pgatk/tests/test_cosmic_downloader.py b/pgatk/tests/test_cosmic_downloader.py index dea686c..5a4b028 100644 --- a/pgatk/tests/test_cosmic_downloader.py +++ b/pgatk/tests/test_cosmic_downloader.py @@ -3,8 +3,6 @@ No network. Builds a CosmicDownloadService against the shipped default config and asserts that the four product API URLs match the expected pattern. """ -import os - from pgatk.cgenomes.cosmic_downloader import CosmicDownloadService from pgatk.config.registry import load_config diff --git a/setup.cfg b/setup.cfg new file mode 100644 index 0000000..2baaabd --- /dev/null +++ b/setup.cfg @@ -0,0 +1,15 @@ +[pydocstyle] +# D211 conflicts with D203 (we follow D203 — one blank line before class docstring). +# D213 conflicts with D212 (we follow D212 — multi-line summary on the first line). +# Both pairs of rules cannot be satisfied simultaneously; pick one side per pair. +add-ignore = D211,D213 + +[bandit] +# B101 fires on every `assert` used in pytest tests. Pytest depends on assert +# rewriting; the bytecode-stripping concern doesn't apply to test files. +# Exclude test directories entirely so pytest assertions are not flagged. +exclude_dirs = ./pgatk/tests,./tests +# B202 (tarfile.extractall) — _safe_extract_tar in pgatk/cgenomes/cosmic_downloader.py +# performs explicit member-by-member validation before calling extractall(members=…) +# with the validated list. Annotated with `# nosec B202` at the call site. +skips = B101 From e6889e4c5547c38d65a8693a6e41d6084d17045f Mon Sep 17 00:00:00 2001 From: Yasset Perez-Riverol Date: Mon, 11 May 2026 16:27:40 +0100 Subject: [PATCH 10/23] Bump version to 0.0.27 --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 6dff712..1d4b6e2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "pgatk" -version = "0.0.26" +version = "0.0.27" description = "Python tools for proteogenomics" readme = "README.md" requires-python = ">=3.9" From ab10aa639ff5ca22b91c6b97cdd02a197464d7dc Mon Sep 17 00:00:00 2001 From: Yasset Perez-Riverol Date: Mon, 11 May 2026 16:35:15 +0100 Subject: [PATCH 11/23] Clean up unnecessary comments added in this PR - Drop "_FeatureCache.put" eviction comment that incorrectly claimed random eviction (the loop actually drops the first 1/5 of insertion order). One-line loop is self-explanatory without commentary. 
- Drop "see Task 5" dangling reference in seq_cache declaration; the task lived in a now-gitignored implementation plan and means nothing to a future reader. - Collapse two-line "Parse INFO once" comment to one line; keep the WHY (avoids repeated scans) and drop the WHAT (variable name and loop already say it). - Collapse the two filter-block comments into a single rationale line explaining why the filters run before get_features(). --- pgatk/ensembl/ensembl.py | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/pgatk/ensembl/ensembl.py b/pgatk/ensembl/ensembl.py index 2e21aae..70d2e7f 100644 --- a/pgatk/ensembl/ensembl.py +++ b/pgatk/ensembl/ensembl.py @@ -37,7 +37,6 @@ def get(self, key: tuple) -> Optional[tuple]: def put(self, key: tuple, value: tuple) -> None: if len(self._cache) >= self._maxsize: - # Soft eviction: drop a fifth at random when full. Cheap and bounded. for k in list(self._cache.keys())[: self._maxsize // 5]: del self._cache[k] self._cache[key] = value @@ -460,7 +459,7 @@ def vcf_to_proteindb(self, vcf_file: str, input_fasta: str, gene_annotations_gtf # handle cases where the transcript has version in the GTF but not in the VCF transcript_id_mapping = {k.split('.')[0]: k for k in transcripts_dict.keys()} feature_cache = _FeatureCache() - seq_cache: dict[str, tuple] = {} # transcript_id_v -> (ref_seq, desc); see Task 5 + seq_cache: dict[str, tuple] = {} # transcript_id_v -> (ref_seq, desc) transcript_index, consequence_index, biotype_index = None, None, None if self._annotation_field_name: @@ -532,8 +531,7 @@ def vcf_to_proteindb(self, vcf_file: str, input_fasta: str, gene_annotations_gtf self.get_logger().debug("Invalid VCF record, skipping: %s", record) continue - # Parse INFO once. Maps each key to its value string, missing keys absent. - # Avoids repeated split-and-search list-comprehensions. + # Parse INFO once; avoids repeated split-and-search list-comprehensions per variant. info_kv: dict[str, str] = {} for entry in record.INFO.split(';'): k, _, v = entry.partition('=') @@ -658,16 +656,14 @@ def vcf_to_proteindb(self, vcf_file: str, input_fasta: str, gene_annotations_gtf self.get_logger().debug( "Could not extract cds position from fasta header for: %s", desc) - # skip transcripts with unwanted consequences (do this BEFORE - # the expensive get_features() SQL call — both filter inputs - # come from transcript_info, not from get_features) + # Apply consequence/biotype filters before get_features(); both inputs + # come from transcript_info, so we can reject early without SQL. if consequence_index is not None: if (consequence in self._exclude_consequences or (consequence not in self._include_consequences and self._include_consequences != ['all'])): continue - # skip transcripts with unwanted biotypes if biotype_index is not None: if (biotype in self._exclude_biotypes or (biotype not in self._include_biotypes and From 3ede3ba465179d4a4c6cdad81565453fa92aadf4 Mon Sep 17 00:00:00 2001 From: Yasset Perez-Riverol Date: Mon, 11 May 2026 16:48:22 +0100 Subject: [PATCH 12/23] Remove unused comments across the codebase Codebase-wide cleanup pass: drop comments that restate well-named code, stale task/PR/issue references that have rotted, orphan commented-out code, narrative step-trackers, and mid-file author stamps. Preserved all docstrings, workaround notes, domain-knowledge comments, and substantive WHY rationale. No behaviour change. 
--- pgatk/cgenomes/cgenomes_proteindb.py | 7 +-- pgatk/clinvar/clinvar_service.py | 18 ------- pgatk/commands/ensembl_downloader.py | 1 - pgatk/db/digest_mutant_protein.py | 4 +- pgatk/db/map_peptide2genome.py | 4 +- pgatk/ensembl/data_downloader.py | 2 - pgatk/ensembl/ensembl.py | 18 ++----- pgatk/proteogenomics/spectrumai.py | 1 - pgatk/proteomics/db/protein_database_decoy.py | 49 ------------------- pgatk/toolbox/general.py | 9 ---- pgatk/toolbox/vcf_utils.py | 1 - 11 files changed, 8 insertions(+), 106 deletions(-) diff --git a/pgatk/cgenomes/cgenomes_proteindb.py b/pgatk/cgenomes/cgenomes_proteindb.py index f42d7b8..fe979be 100644 --- a/pgatk/cgenomes/cgenomes_proteindb.py +++ b/pgatk/cgenomes/cgenomes_proteindb.py @@ -140,7 +140,7 @@ def get_mut_pro_seq(snp: SNP, seq: Seq) -> Optional[str]: ref_dna = re.sub("[^A-Z]+", "", tmplist[0]) mut_dna = re.sub("[^A-Z]+", "", tmplist[1]) index = int(positions[0]) - 1 - if ref_dna == str(seq[index]).upper() and mut_dna in nucleotide: # + if ref_dna == str(seq[index]).upper() and mut_dna in nucleotide: seq_mut = seq[:index] + mut_dna + seq[index + 1:] mut_pro_seq = str(seq_mut.translate(to_stop=False)) elif "delins" in snp.dna_mut: @@ -304,7 +304,6 @@ def cosmic_to_proteindb(self) -> None: if len(row) <= max_col: self.get_logger().debug("Skipping malformed row (insufficient columns) at line %s: %s", line_counter, row[:5]) continue - # filter out mutations from unspecified groups if filter_col is not None: if row[filter_col] not in self._accepted_values and self._accepted_values != ['all']: continue @@ -382,7 +381,6 @@ def get_value_per_sample(self, local_clinical_sample_file: str, filter_column: s if line.startswith('#'): continue sl = line.strip().split('\t') - # check for header and re-assign columns if 'SAMPLE_ID' in sl and filter_column in sl: filter_column_col, sample_id_col = self.get_sample_headers(sl, filter_column) # Skip adding the header row itself to sample_value @@ -442,16 +440,13 @@ def cbioportal_to_proteindb(self) -> None: if row[0] == '#': self.get_logger().info("skipping line (%s): %s", i, row) continue - # check for header in the mutations file and get column indices if set(header_cols.keys()).issubset(set(row)): header_cols = self.get_mut_header_cols(header_cols, row) continue - # check if any is none in header_cols then continue if None in header_cols.values(): self.get_logger().error("Incorrect header column is given") continue - # get filter value and check it group = None if self._accepted_values != ['all'] or self._split_by_filter_column: try: diff --git a/pgatk/clinvar/clinvar_service.py b/pgatk/clinvar/clinvar_service.py index ee9a2cc..f368690 100644 --- a/pgatk/clinvar/clinvar_service.py +++ b/pgatk/clinvar/clinvar_service.py @@ -372,13 +372,10 @@ def run(self) -> str: """ logger.info("Starting ClinVar pipeline") - # 1. Load chromosome mapper chrom_mapper = ChromosomeMapper.from_assembly_report(self._assembly_report) - # 2. Parse GTF db = self._parse_gtf(self._gtf_file) - # 3. Load transcript FASTA transcripts_dict = SeqIO.index( self._fasta_file, "fasta", @@ -389,10 +386,8 @@ def run(self) -> str: k.split(".")[0]: k for k in transcripts_dict.keys() } - # 4. Read VCF once into DataFrame _metadata, vcf_df = self._read_vcf(self._vcf_file) - # 5. 
Find overlapping transcripts via BedTools (from DataFrame) overlap_map = self._build_overlap_map(vcf_df, self._gtf_file, chrom_mapper) logger.info("Found %d variants with transcript overlaps", len(overlap_map)) @@ -411,7 +406,6 @@ def run(self) -> str: for _, record in vcf_df.iterrows(): stats["variants_processed"] += 1 - # --- Validate alleles --- ref = str(record.REF) if any(c not in "ACGT" for c in ref): continue @@ -423,26 +417,22 @@ def run(self) -> str: if not alts: continue - # --- CLNSIG filter --- info = str(record.INFO) clnsig = self._get_info_field(info, "CLNSIG") if not self.passes_clnsig_filter(clnsig, self._clnsig_exclude): stats["variants_filtered_clnsig"] += 1 continue - # --- MC consequence filter --- mc_field = self._get_info_field(info, "MC") if not self.passes_mc_filter(mc_field, self._include_consequences): stats["variants_filtered_mc"] += 1 continue - # --- Parse gene symbol and CLNSIG for description --- gene_symbol, _ = self.parse_geneinfo( self._get_info_field(info, "GENEINFO") ) desc_str = f"{clnsig}|{gene_symbol}" if gene_symbol else clnsig - # --- Find overlapping transcripts --- chrom = str(record.CHROM) pos = int(record.POS) @@ -465,7 +455,6 @@ def run(self) -> str: continue processed_pairs.add(pair_key) - # Resolve transcript in FASTA tid = transcript_id if tid not in transcripts_dict: tid = transcript_id_mapping.get( @@ -479,7 +468,6 @@ def run(self) -> str: ) continue - # --- Biotype filter --- if self._include_biotypes != ["all"]: biotype = self._get_transcript_biotype(db, tid) if biotype and biotype not in self._include_biotypes: @@ -489,7 +477,6 @@ def run(self) -> str: ref_seq = fasta_record.seq desc = str(fasta_record.description) - # Determine CDS info and feature types cds_info: list[int] = [] feature_types = ["exon"] num_orfs = 3 @@ -512,19 +499,16 @@ def run(self) -> str: desc, ) - # Get features from GTF feat_chrom, strand, features_info = self._get_features( db, tid, feature_types ) if feat_chrom is None: continue - # Check overlap at feature level var_end = pos + len(ref) - 1 if not check_overlap(pos, var_end, features_info): continue - # Apply variant coding_ref_seq, coding_alt_seq = get_altseq( ref_seq, Seq(ref), @@ -538,7 +522,6 @@ def run(self) -> str: if coding_alt_seq == "": continue - # Translate ref_orfs, alt_orfs = get_orfs_vcf( coding_ref_seq, coding_alt_seq, @@ -546,7 +529,6 @@ def run(self) -> str: num_orfs, ) - # Build sequence ID record_id = "" if record.ID and str(record.ID) != ".": record_id = str(record.ID) diff --git a/pgatk/commands/ensembl_downloader.py b/pgatk/commands/ensembl_downloader.py index 83adb88..0885cd7 100644 --- a/pgatk/commands/ensembl_downloader.py +++ b/pgatk/commands/ensembl_downloader.py @@ -38,7 +38,6 @@ def ensembl_downloader(ctx, config_file, output_directory, taxonomy, folder_pref if taxonomy is None and ensembl_name is None: raise click.UsageError("Either --taxonomy or --ensembl_name is required.") - # Parse pipelines parameters. 
pipeline_arguments = {} if output_directory is not None: diff --git a/pgatk/db/digest_mutant_protein.py b/pgatk/db/digest_mutant_protein.py index 7b49201..6651aaa 100644 --- a/pgatk/db/digest_mutant_protein.py +++ b/pgatk/db/digest_mutant_protein.py @@ -18,7 +18,7 @@ def trypsin_cleavage(proseq: str, miss_cleavage: int): except IndexError: pass - if aa in ['K', 'R'] and next_aa != 'P': # for trypsin peptides + if aa in ['K', 'R'] and next_aa != 'P': if len(peptide) > 0: peptides.append(peptide) peptide = '' @@ -63,7 +63,6 @@ def digest_mutant_proteins( :param max_length: Maximum peptide length to retain (default: 40). :param missed_cleavages: Number of missed cleavages for trypsin digestion (default: 0). """ - # Build canonical peptidome from reference FASTA handle1 = SeqIO.parse(fasta_file, 'fasta') peptidome = {} @@ -78,7 +77,6 @@ def digest_mutant_proteins( log.info("Known peptides number: %d", len(peptidome)) handle1.close() - # Parse mutant protein input files filelist = input_file.split(",") handle_list = [] for f in filelist: diff --git a/pgatk/db/map_peptide2genome.py b/pgatk/db/map_peptide2genome.py index a09b119..c573dc0 100644 --- a/pgatk/db/map_peptide2genome.py +++ b/pgatk/db/map_peptide2genome.py @@ -87,7 +87,7 @@ def parse_gtf(infile): dic = {} with open(infile, "r", encoding='utf-8') as infile_object: for line in infile_object: - if line[0] != "#": # skip lines commented out + if line[0] != "#": row = line.strip().split("\t") if row[2] == "CDS": attri_list = row[8].split(";") @@ -153,7 +153,6 @@ def map_peptides_to_genome( pep_dic = {} with open(input_file, 'r', encoding='utf-8') as input_stream: - # peptide table with two columns, peptide sequence in first column, protein ID in second column input_stream.readline() for line in input_stream: row = line.strip().split("\t") @@ -188,7 +187,6 @@ def map_peptides_to_genome( exons, pep_trans_start, pep_trans_end ) - # handle exceptions if pep_chr_start > pep_chr_end: non_mapped_pep += 1 continue diff --git a/pgatk/ensembl/data_downloader.py b/pgatk/ensembl/data_downloader.py index 1ad4812..0be653b 100644 --- a/pgatk/ensembl/data_downloader.py +++ b/pgatk/ensembl/data_downloader.py @@ -6,7 +6,6 @@ 2. Given a species ID, collect its GTF data, with the option of decompressing it or not. 
""" -# App imports from json import loads import re from pgatk.toolbox.general import ParameterConfiguration, check_create_folders, download_file @@ -432,7 +431,6 @@ def get_vcf_files(self, species: dict, url_file=None) -> list: file_name=self.get_local_path_root_ensembl_repo() + '/' + file_name, log=self.get_logger(), url_file=url_file) if downloaded_file is not None: - # if chr1 is downloaded then try all others files.append(downloaded_file) for chrN in range(2, 23): # chr2-22 file_name = '{}_incl_consequences-chr{}.vcf.gz'.format(species['name'], chrN) diff --git a/pgatk/ensembl/ensembl.py b/pgatk/ensembl/ensembl.py index 70d2e7f..5849082 100644 --- a/pgatk/ensembl/ensembl.py +++ b/pgatk/ensembl/ensembl.py @@ -263,10 +263,10 @@ def dnaseq_to_proteindb(self, input_fasta: str) -> str: with open(self._proteindb_output, 'w', encoding='utf-8') as prots_fn: for record_id in seq_dict.keys(): - ref_seq = seq_dict[record_id].seq # get the seq and desc for the record from the fasta of the gtf + ref_seq = seq_dict[record_id].seq desc = str(seq_dict[record_id].description) - key_values = {} # extract key=value in the desc into a dict + key_values = {} sep = self._transcript_description_sep desc = desc.replace(' ', sep) for value in desc.split(sep): @@ -290,7 +290,6 @@ def dnaseq_to_proteindb(self, input_fasta: str) -> str: record_id, desc) self.get_logger().debug(msg) - # only include features that have the specified biotypes or they have CDSs info if 'CDS' in key_values.keys() and ( not self._skip_including_all_cds or 'altORFs' in self._include_biotypes): pass @@ -299,7 +298,6 @@ def dnaseq_to_proteindb(self, input_fasta: str) -> str: 'all']))): continue - # check wether to filter on expression and if it passes if self._expression_str: try: if float(key_values[self._expression_str]) < self._expression_thresh: @@ -386,12 +384,10 @@ def annoate_vcf(vcf_file: str, gtf_file: str, muts_dict.setdefault(key, []).append(transcript_id) with open(f"{vcf_stem}_annotated.vcf", 'w', encoding='utf-8') as ann, open(vcf_file, 'r', encoding='utf-8') as v: - # write vcf headers to the output file for line in v.readlines(): if line.startswith('#'): ann.write(line) else: - # write the mutations and their overlapping transcript to output file sl = line.strip().split('\t') if len(sl) < 8: ann.write(line) @@ -459,7 +455,7 @@ def vcf_to_proteindb(self, vcf_file: str, input_fasta: str, gene_annotations_gtf # handle cases where the transcript has version in the GTF but not in the VCF transcript_id_mapping = {k.split('.')[0]: k for k in transcripts_dict.keys()} feature_cache = _FeatureCache() - seq_cache: dict[str, tuple] = {} # transcript_id_v -> (ref_seq, desc) + seq_cache: dict[str, tuple] = {} transcript_index, consequence_index, biotype_index = None, None, None if self._annotation_field_name: @@ -473,7 +469,6 @@ def vcf_to_proteindb(self, vcf_file: str, input_fasta: str, gene_annotations_gtf except IndexError: pass - # try to extract index of transcript ID and consequence from the VCF metadata in the header try: transcript_index = annotation_cols.index(self._transcript_str.upper()) except ValueError: @@ -522,7 +517,6 @@ def vcf_to_proteindb(self, vcf_file: str, input_fasta: str, gene_annotations_gtf if alt is None: continue elif [x for x in str(alt) if x not in 'ACGT']: - # check if all alt alleles are nucleotides continue alts.append(alt) if not alts: @@ -538,7 +532,7 @@ def vcf_to_proteindb(self, vcf_file: str, input_fasta: str, gene_annotations_gtf info_kv[k] = v if not self._ignore_filters and 
self._accepted_filters != ['ALL']: - if record.FILTER and record.FILTER != '.' and record.FILTER != 'NA' and record.FILTER != '': # if not PASS: None and empty means PASS + if record.FILTER and record.FILTER != '.' and record.FILTER != 'NA' and record.FILTER != '': # None and empty means PASS filters = set(record.FILTER.upper().split(',')) if ';' in record.FILTER and len(filters) <= 1: filters = set(record.FILTER.upper().split(';')) @@ -552,7 +546,6 @@ def vcf_to_proteindb(self, vcf_file: str, input_fasta: str, gene_annotations_gtf invalid_records['# variants with invalid record'] += 1 continue - # check if the AF passed the threshold if af < self._af_threshold: invalid_records['# variants not passing AF threshold'] += 1 continue @@ -637,7 +630,6 @@ def vcf_to_proteindb(self, vcf_file: str, input_fasta: str, gene_annotations_gtf ref_seq, desc = cached_row feature_types = ['exon'] - # check if cds info exists in the fasta header otherwise translate all exons cds_info = [] num_orfs = 3 if 'CDS=' in desc: @@ -678,7 +670,7 @@ def vcf_to_proteindb(self, vcf_file: str, input_fasta: str, gene_annotations_gtf feature_cache.put(cache_key, (chrom, strand, features_info)) else: chrom, strand, features_info = cached - if chrom is None: # the record info was not found + if chrom is None: continue for alt in alts: diff --git a/pgatk/proteogenomics/spectrumai.py b/pgatk/proteogenomics/spectrumai.py index 0fb16b3..612e309 100644 --- a/pgatk/proteogenomics/spectrumai.py +++ b/pgatk/proteogenomics/spectrumai.py @@ -281,7 +281,6 @@ def __init__(self, config_data: dict, pipeline_arguments: dict) -> None: self._mzml_files = self.get_validate_parameters(variable=self.CONFIG_MZML_FILES, default_value=False) self._ions_tolerance = self.get_validate_parameters(variable=self.CONFIG_IONS_TOLERANCE, default_value=0.02) - ## check if ions_tolerance is string, convert to float if isinstance(self._ions_tolerance, str): self._ions_tolerance = float(self._ions_tolerance) diff --git a/pgatk/proteomics/db/protein_database_decoy.py b/pgatk/proteomics/db/protein_database_decoy.py index 5f47e72..ee22853 100644 --- a/pgatk/proteomics/db/protein_database_decoy.py +++ b/pgatk/proteomics/db/protein_database_decoy.py @@ -221,21 +221,14 @@ def generate_decoypyrat_database(self) -> None: :return: """ - # Create empty sets to add all target and decoy peptides upeps = set() dpeps = set() - # Counter for number of decoy sequences dcount = 0 - # Open FASTA file using first cmd line argument - # fasta = SeqIO.parse(self._input_fasta, 'fasta') - with open(self._input_fasta) as handle: - # open temporary decoy FASTA file with open(self._temp_file, 'w') as outfa: - # loop each seq in the file for value in SimpleFastaParser(handle): seq = value[1] description = value[0] @@ -244,53 +237,42 @@ def generate_decoypyrat_database(self) -> None: if not self._isobaric: seq = seq.replace('I', 'L') - # digest sequence add peptides to set upeps.update( cleave(sequence=seq, rule=PGATK_ENZYMES.enzymes[self._enzyme]['cleavage rule'], missed_cleavages=0, min_length=self._min_peptide_length)) - # reverse and switch protein sequence decoyseq = self.revswitch(seq, self._no_switch, PGATK_ENZYMES.enzymes[self._enzyme]['cleavage sites']) # do not store decoy peptide set in reduced memory mode if not self._memory_save: - # update decoy peptide set dpeps.update( cleave(sequence=decoyseq, rule=PGATK_ENZYMES.enzymes[self._enzyme]['cleavage rule'], missed_cleavages=0, min_length=self._min_peptide_length)) - # write decoy protein accession and sequence to file 
outfa.write('>' + self._decoy_prefix + description + '\n') outfa.write(decoyseq + '\n') - # Summarise the numbers of target and decoy peptides and their intersection nonDecoys = set() self.get_logger().info("proteins: %s", dcount) self.get_logger().info("target peptides: %s", len(upeps)) # Reloop decoy file in reduced memory mode to store only intersecting decoys if self._memory_save: - # open temp decoys with open(self._temp_file, "rt") as fin: for line in fin: - # if line is not accession if line[0] != '>': - # digest protein for p in cleave(sequence=line.rstrip(), rule=PGATK_ENZYMES.enzymes[self._enzyme]['cleavage rule'], missed_cleavages=0, min_length=self._min_peptide_length): - # check if in target peptides if true then add to nonDecoys if p in upeps: nonDecoys.add(p) fin.close() self.get_logger().info("decoy peptides: !Memory Saving Made!") else: - # can only report total number in normal memory mode self.get_logger().info("decoy peptides: %s", len(dpeps)) - # find intersecting peptides nonDecoys = upeps.intersection(dpeps) self.get_logger().info("#intersection: %s", len(nonDecoys)) @@ -298,37 +280,27 @@ def generate_decoypyrat_database(self) -> None: # if there are decoy peptides that are in the target peptide set if len(nonDecoys) > 0 and self._no_suffle == False: - # create empty dictionary with bad decoys as keys dAlternative = dict.fromkeys(nonDecoys, '') noAlternative = list() - # loop bad decoys / dictionary keys for dPep in dAlternative: i = 0 aPep = dPep - # shuffle until aPep is not in target set (maximum of 10 iterations) while aPep in upeps and i < self._max_iterations: - - # increment iteration counter i += 1 - - # shuffle peptide aPep = self.shuffle(dPep) # check if shuffling has an effect if not end iterations if aPep == dPep: i = self._max_iterations - # update dictionary with alternative shuffled peptide dAlternative[dPep] = aPep - # warn if peptide has no suitable alternative, add to removal list if i == self._max_iterations: noAlternative.append(dPep) self.get_logger().info('%s have no alternative peptide', len(noAlternative)) - # remove peptides with no alternative for p in noAlternative: del dAlternative[p] @@ -336,11 +308,8 @@ def generate_decoypyrat_database(self) -> None: upeps.clear() dpeps.clear() - # open second decoy file with open(self._output_file, "wt") as fout: - # Attach the target sequences to the database - # fasta = SeqIO.parse(self._input_fasta, 'fasta') with open(self._input_fasta) as handle: for value in SimpleFastaParser(handle): description = value[0] @@ -348,20 +317,15 @@ def generate_decoypyrat_database(self) -> None: fout.write('>' + description + '\n') fout.write(seq + '\n') - # open original decoy file with open(self._temp_file, "rt") as fin: - # loop each line of original decoy fasta for line in fin: # if line is not accession replace peptides in dictionary with alternatives if line[0] != '>': - # digest decoy sequence for p in cleave(sequence=line.rstrip(), rule=PGATK_ENZYMES.enzymes[self._enzyme]['cleavage rule'], missed_cleavages=0, min_length=self._min_peptide_length): - # store decoy peptide for final count dpeps.add(p) - # if decoy peptide is in dictionary replace with alternative if p in dAlternative: line = line.replace(p, dAlternative[p]) @@ -391,24 +355,19 @@ def pgatk_decoy_database(self) -> None: :return: """ - # Create empty sets to add all target and decoy peptides upeps = set() noAlternative = set() - # Open FASTA file using first cmd line argument fasta = SeqIO.parse(self._input_fasta, 'fasta') - # loop each seq in the 
file for record in fasta: seq = str(record.seq) if not self._isobaric: seq = seq.replace('I', 'L') - # digest sequence add peptides to the target set upeps.update( cleave(sequence=seq, rule=PGATK_ENZYMES.enzymes[self._enzyme]['cleavage rule'], missed_cleavages=self._max_missed_cleavages, min_length=self._min_peptide_length)) - # open orary decoy FASTA file with open(self._output_file, 'w') as outfa: fasta = SeqIO.parse(self._input_fasta, 'fasta') targets = [] @@ -418,7 +377,6 @@ def pgatk_decoy_database(self) -> None: targets.append(protseq) revprotseq = [] - # output target protein seq = str(record.seq) id_protein = record.id description = record.description @@ -431,7 +389,6 @@ def pgatk_decoy_database(self) -> None: if not self._isobaric: seq = seq.replace('I', 'L') - # reverse and switch protein sequence decoyseq = self.revswitch(seq, self._no_switch, PGATK_ENZYMES.enzymes[self._enzyme]['cleavage sites']) @@ -455,29 +412,23 @@ def pgatk_decoy_database(self) -> None: if found_in_target and not self._no_suffle and decoy_pep not in noAlternative: aPep = decoy_pep - # shuffle until aPep is not in target set (maximum of 10 iterations) i = 0 while aPep in upeps and i < self._max_iterations: - # increment iteration counter i += 1 - # shuffle peptide aPep = self.shuffle(aPep) # check if shuffling has an effect if not end iterations if aPep == decoy_pep: i = self._max_iterations - # warn if peptide has no suitable alternative, add to removal list if i == self._max_iterations: noAlternative.add(decoy_pep) aPep = '' - # if decoy is generated then add to the list of peptides if aPep: checked_decoy_peps.append(aPep) else: if self._keep_target_hits: checked_decoy_peps.append(decoy_pep) - # finally join the peptides to generate protein decoy if checked_decoy_peps: revprotseq.append(''.join(checked_decoy_peps)) diff --git a/pgatk/toolbox/general.py b/pgatk/toolbox/general.py index 15f7603..897cd71 100644 --- a/pgatk/toolbox/general.py +++ b/pgatk/toolbox/general.py @@ -17,7 +17,6 @@ import yaml import gzip -# Logging defaults from requests import HTTPError from pgatk.toolbox.exceptions import ToolBoxException @@ -61,7 +60,6 @@ def __init__(self, root_config_name: str, yaml_configuration: dict, pipeline_par else: self._default_params = {} - # Prepare Logging subsystem if self._default_params is not None and self._ROOT_CONFIG_NAME in self._default_params: if self._CONFIG_LOGGER in self._default_params[self._ROOT_CONFIG_NAME]: if self._CONFIG_LOGGER_LEVEL in self._default_params[self._ROOT_CONFIG_NAME][self._CONFIG_LOGGER]: @@ -96,10 +94,8 @@ def get_config_value(self, key: str, default: Any = None) -> Any: Checks pipeline_parameters first (flat lookup), then default_params[_ROOT_CONFIG_NAME][key], then returns default. 
""" - # Check pipeline params first (flat lookup by key) if key in self._pipeline_parameters: return self._pipeline_parameters[key] - # Then check default config params under root config name root = self._ROOT_CONFIG_NAME if (self._default_params is not None and root in self._default_params @@ -117,12 +113,10 @@ def get_log_handlers(self) -> list: return self._log_handlers def get_logger(self) -> logging.Logger: - # Get own logger return self._logger def get_session_log_files(self) -> list: log_files = [] - # Add the application logs log_files.extend(self._log_files) return log_files @@ -267,14 +261,12 @@ def check_create_folders_overwrite(folders: List[str]) -> None: if not os.path.isdir(folder): invalid_folders.append(folder) if invalid_folders: - # If there's any invalid folder, we don't make any change, and we report the situation by raising an exception raise ToolBoxException("The following folders ARE NOT FOLDERS - '{}'" .format(invalid_folders)) for folder in folders: try: shutil.rmtree(folder) except FileNotFoundError: - # It is find if the folder is not there pass check_create_folders(folders) @@ -347,7 +339,6 @@ def gunzip_files(files: List[str]) -> list[tuple[str, str]]: (stdout, stderr) = gunzip_subprocess.communicate(timeout=timeout) if gunzip_subprocess.poll() is not None: if gunzip_subprocess.returncode != 0: - # ERROR - Report this err_msg = "ERROR uncompressing file '{}' output from subprocess STDOUT: {}\nSTDERR: {}" \ .format(file, stdout.decode('utf8'), stderr.decode('utf8')) files_with_error.append((file, err_msg)) diff --git a/pgatk/toolbox/vcf_utils.py b/pgatk/toolbox/vcf_utils.py index 61fe4db..d784361 100644 --- a/pgatk/toolbox/vcf_utils.py +++ b/pgatk/toolbox/vcf_utils.py @@ -30,7 +30,6 @@ def check_overlap(var_start: int, var_end: int, features_info: Optional[list] = features_info = [[0, 1, 'type']] if var_start == -1: return True - # check if the var overlaps any of the features for feature_pos in features_info: pep_start = feature_pos[0] pep_end = feature_pos[1] From 237f89009567b0559f702294b71deb2035878ca3 Mon Sep 17 00:00:00 2001 From: Yasset Perez-Riverol Date: Mon, 11 May 2026 16:53:20 +0100 Subject: [PATCH 13/23] Address Copilot review feedback on PR #102 Five issues from the Copilot review, fixed: 1. seq_cache type annotation: changed from `dict[str, tuple]` to `dict[str, Optional[tuple]]` to reflect that we store None as the "known-missing transcript" marker alongside (ref_seq, desc) tuples. Added a comment explaining the _MISSING sentinel pattern. 2. cosmic_downloader.py: redact the presigned S3 URL when logging. The download URL embeds a short-lived AWS signature/credentials in its query string; even with a 1-hour TTL it can leak into log aggregators. Log only the path component. 3. commands/cosmic_downloader.py: rewrite the `--url_file` help text. The flag used to write a direct download URL; after the v103 API migration it writes the SCRIPTED-DOWNLOAD API URL (step 1, not the signed S3 URL). The help now describes the actual TSV format and the indirection. 4. tests/test_cosmic_downloader.py: rewrite module docstring to describe what the tests actually check (one URL pattern + the token format), not the stale "four product API URLs" claim. 5. cosmic_downloader.py: fix the `output_directory` config-lookup bug. 
`get_configuration_default_params` only searched under `cosmic_data.cosmic_server.`, so the `output_directory` field at `cosmic_data.` (top level) in cosmic_config.yaml was silently ignored - the service always fell back to the hardcoded `./database_cosmic/`. Now check the top level too, so the YAML value takes effect. Pre-existing bug, surfaced by the Copilot review. --- pgatk/cgenomes/cosmic_downloader.py | 25 +++++++++++++++---------- pgatk/commands/cosmic_downloader.py | 6 +++++- pgatk/ensembl/ensembl.py | 5 ++++- pgatk/tests/test_cosmic_downloader.py | 12 ++++++++---- 4 files changed, 32 insertions(+), 16 deletions(-) diff --git a/pgatk/cgenomes/cosmic_downloader.py b/pgatk/cgenomes/cosmic_downloader.py index 0a8f7c6..65a066b 100644 --- a/pgatk/cgenomes/cosmic_downloader.py +++ b/pgatk/cgenomes/cosmic_downloader.py @@ -61,16 +61,19 @@ def __init__(self, config_file, pipeline_arguments): self.prepare_local_cosmic_repository() def get_configuration_default_params(self, variable: str, default_value): - return_value = default_value if variable in self.get_pipeline_parameters(): - return_value = self.get_pipeline_parameters()[variable] - elif self.CONFIG_KEY_DATA_DOWNLOADER in self.get_default_parameters() \ - and self.CONFIG_COSMIC_SERVER in self.get_default_parameters()[self.CONFIG_KEY_DATA_DOWNLOADER] \ - and variable in self.get_default_parameters()[self.CONFIG_KEY_DATA_DOWNLOADER][ - self.CONFIG_COSMIC_SERVER]: - return_value = self.get_default_parameters()[self.CONFIG_KEY_DATA_DOWNLOADER][self.CONFIG_COSMIC_SERVER][ - variable] - return return_value + return self.get_pipeline_parameters()[variable] + defaults = self.get_default_parameters() + if self.CONFIG_KEY_DATA_DOWNLOADER not in defaults: + return default_value + data = defaults[self.CONFIG_KEY_DATA_DOWNLOADER] + # `output_directory` lives at cosmic_data.; everything else under cosmic_data.cosmic_server.. + if variable in data and not isinstance(data[variable], dict): + return data[variable] + server = data.get(self.CONFIG_COSMIC_SERVER, {}) + if variable in server: + return server[variable] + return default_value def prepare_local_cosmic_repository(self): self.get_logger().debug("Preparing local cbioportal repository, root folder - '{}'".format( @@ -148,7 +151,9 @@ def download_file_cosmic(self, api_url, local_file, token): self.get_logger().error(msg) raise AppConfigException(msg) - self.get_logger().debug("Downloading file from signed URL '{}'".format(download_url)) + # The presigned URL embeds short-lived AWS credentials in its query string; + # log only the path so the secret material doesn't leak into log aggregators. + self.get_logger().debug("Downloading file from signed URL (path %s)", download_url.split('?', 1)[0]) data_response = requests.get(download_url, stream=True, timeout=30) if data_response.status_code != 200: msg = ("COSMIC S3 download failed: HTTP {} for {}" diff --git a/pgatk/commands/cosmic_downloader.py b/pgatk/commands/cosmic_downloader.py index 557cce9..fe35216 100644 --- a/pgatk/commands/cosmic_downloader.py +++ b/pgatk/commands/cosmic_downloader.py @@ -22,7 +22,11 @@ 'May be repeated. Overrides the products list in the config file when provided. ' 'Find available paths at https://cancer.sanger.ac.uk/cosmic/download/cosmic ' '(open the "Scripted Download" panel for any product).') -@click.option("--url_file", help='Add the url to a downloaded file') +@click.option("--url_file", + help='If set, do not download anything. 
Instead, write a TSV with one row per '
+              'product (`\t`) so the downloads can be driven '
+              'externally (e.g. wget/curl loop). The api_url is the COSMIC scripted-download '
+              'endpoint URL — fetching it returns JSON containing the actual signed S3 URL.')
 @click.pass_context
 def cosmic_downloader(ctx, config_file, output_directory, username, password, products,
                       url_file):
diff --git a/pgatk/ensembl/ensembl.py b/pgatk/ensembl/ensembl.py
index 5849082..d83d031 100644
--- a/pgatk/ensembl/ensembl.py
+++ b/pgatk/ensembl/ensembl.py
@@ -455,7 +455,10 @@ def vcf_to_proteindb(self, vcf_file: str, input_fasta: str, gene_annotations_gtf
         # handle cases where the transcript has version in the GTF but not in the VCF
         transcript_id_mapping = {k.split('.')[0]: k for k in transcripts_dict.keys()}
         feature_cache = _FeatureCache()
-        seq_cache: dict[str, tuple] = {}
+        # Value is (ref_seq, desc) for a known transcript, or None for a transcript
+        # we've already looked up and confirmed isn't in the FASTA index (avoids re-trying
+        # the disk seek). The _MISSING sentinel below distinguishes "not yet looked up".
+        seq_cache: dict[str, Optional[tuple]] = {}
 
         transcript_index, consequence_index, biotype_index = None, None, None
         if self._annotation_field_name:
diff --git a/pgatk/tests/test_cosmic_downloader.py b/pgatk/tests/test_cosmic_downloader.py
index 5a4b028..5564495 100644
--- a/pgatk/tests/test_cosmic_downloader.py
+++ b/pgatk/tests/test_cosmic_downloader.py
@@ -1,7 +1,11 @@
-"""Unit tests for the COSMIC downloader URL construction.
-
-No network. Builds a CosmicDownloadService against the shipped default config
-and asserts that the four product API URLs match the expected pattern.
+"""Unit tests for the COSMIC downloader URL construction and auth token.
+
+No network. Two tests:
+  * `test_build_api_url_pattern` — verifies build_api_url() composes the
+    scripted-download endpoint correctly from server, api_endpoint, path
+    and bucket.
+  * `test_basic_auth_token_format` — verifies the base64(user:password)
+    token has no trailing newline (a common pitfall when echo'ing).
 """
 from pgatk.cgenomes.cosmic_downloader import CosmicDownloadService
 from pgatk.config.registry import load_config

From c1ea9cc6e3f2fb0b23668406258c487e8244f8c0 Mon Sep 17 00:00:00 2001
From: Husen Umer
Date: Tue, 12 May 2026 11:39:08 +0300
Subject: [PATCH 14/23] Update use cases and configuration for Ensembl biotypes and file formats

- Removed .gz from file names in the docs (`.gtf.gz` to `.gtf` and `.vcf.gz` to `.vcf` in multiple commands, for consistency), since the ensembl downloader uncompresses files after downloading.
- Updated the `include_biotypes` for pseudogenes, adding `unitary_pseudogene` and `rRNA_pseudogene`.
- Renamed references from `lincRNA` to `lncRNA` throughout the documentation for accuracy.
- Updated the canonical biotypes to keep only those that have a CDS matching the reference. 
--- docs/use-cases.md | 60 ++++++++++++++++---------------- pgatk/config/ensembl_config.yaml | 2 +- 2 files changed, 31 insertions(+), 31 deletions(-) diff --git a/docs/use-cases.md b/docs/use-cases.md index 0f151e9..0697af1 100644 --- a/docs/use-cases.md +++ b/docs/use-cases.md @@ -33,7 +33,7 @@ pgatk ensembl-downloader \ ```bash gffread -F -w ensembl_human/transcripts.fa \ -g ensembl_human/genome.fa \ - ensembl_human/Homo_sapiens.GRCh38.*.gtf.gz + ensembl_human/Homo_sapiens.GRCh38.*.gtf ``` ### Step 3 -- Canonical protein-coding sequences @@ -58,7 +58,7 @@ pgatk dnaseq-to-proteindb \ --input_fasta ensembl_human/transcripts.fa \ --output_proteindb pseudogene.fa \ --protein_prefix pseudo_ \ - --include_biotypes processed_pseudogene,unprocessed_pseudogene,transcribed_processed_pseudogene,transcribed_unprocessed_pseudogene,translated_processed_pseudogene \ + --include_biotypes pseudogene,processed_pseudogene,unprocessed_pseudogene,transcribed_processed_pseudogene,transcribed_unprocessed_pseudogene,translated_processed_pseudogene,unitary_pseudogene,transcribed_unitary_pseudogene,rRNA_pseudogene,IG_V_pseudogene,TR_V_pseudogene,IG_C_pseudogene,TR_J_pseudogene,IG_J_pseudogene,IG_pseudogene \ --num_orfs 3 \ --skip_including_all_cds ``` @@ -73,7 +73,7 @@ pgatk dnaseq-to-proteindb \ --input_fasta ensembl_human/transcripts.fa \ --output_proteindb lncrna.fa \ --protein_prefix lncrna_ \ - --include_biotypes lincRNA,antisense,sense_intronic,sense_overlapping \ + --include_biotypes lncRNA \ --num_orfs 3 \ --skip_including_all_cds ``` @@ -98,9 +98,9 @@ Include common human variants from ENSEMBL: ```bash pgatk vcf-to-proteindb \ - --vcf ensembl_human/homo_sapiens_incl_consequences.vcf.gz \ + --vcf ensembl_human/homo_sapiens_incl_consequences.vcf \ --input_fasta ensembl_human/transcripts.fa \ - --gene_annotations_gtf ensembl_human/Homo_sapiens.GRCh38.*.gtf.gz \ + --gene_annotations_gtf ensembl_human/Homo_sapiens.GRCh38.*.gtf \ --output_proteindb ensembl_variants.fa ``` @@ -197,7 +197,7 @@ extract transcript sequences from the GTF and genome FASTA: ```bash gffread -F -w ensembl_human/transcripts.fa \ -g ensembl_human/genome.fa \ - ensembl_human/Homo_sapiens.GRCh38.*.gtf.gz + ensembl_human/Homo_sapiens.GRCh38.*.gtf ``` ### Step 3 -- Generate the variant protein database @@ -206,9 +206,9 @@ Translate all ENSEMBL variants that affect protein-coding transcripts: ```bash pgatk vcf-to-proteindb \ - --vcf ensembl_human/homo_sapiens_incl_consequences.vcf.gz \ + --vcf ensembl_human/homo_sapiens_incl_consequences.vcf \ --input_fasta ensembl_human/transcripts.fa \ - --gene_annotations_gtf ensembl_human/Homo_sapiens.GRCh38.*.gtf.gz \ + --gene_annotations_gtf ensembl_human/Homo_sapiens.GRCh38.*.gtf \ --output_proteindb ensembl_human/variant_proteins.fa ``` @@ -255,7 +255,7 @@ Include only variants present in at least 1% of the population: ```bash pgatk vcf-to-proteindb \ - --vcf homo_sapiens_incl_consequences.vcf.gz \ + --vcf homo_sapiens_incl_consequences.vcf \ --input_fasta transcripts.fa \ --gene_annotations_gtf genes.gtf \ --af_field MAF \ @@ -270,7 +270,7 @@ detectable by mass spectrometry: ```bash pgatk vcf-to-proteindb \ - --vcf homo_sapiens_incl_consequences.vcf.gz \ + --vcf homo_sapiens_incl_consequences.vcf \ --input_fasta transcripts.fa \ --gene_annotations_gtf genes.gtf \ --af_field MAF \ @@ -289,7 +289,7 @@ a specific population: pgatk vcf-to-proteindb \ --vcf gnomad.exomes.v4.1.sites.vcf.bgz \ --input_fasta gencode_transcripts.fa \ - --gene_annotations_gtf gencode.v44.annotation.gtf.gz \ + 
--gene_annotations_gtf gencode.v44.annotation.gtf \
   --annotation_field_name vep \
   --af_field AF_afr \
   --af_threshold 0.01 \
@@ -324,18 +324,18 @@ pgatk ncbi-downloader -o ncbi_data
 
 This downloads four files to `ncbi_data/`:
 
-- `GRCh38_latest_genomic.gtf.gz` -- RefSeq gene annotations
-- `GRCh38_latest_rna.fna.gz` -- RefSeq transcript nucleotide sequences
+- `GRCh38_latest_genomic.gtf` -- RefSeq gene annotations
+- `GRCh38_latest_rna.fna` -- RefSeq transcript nucleotide sequences
 - `GRCh38_latest_assembly_report.txt` -- Chromosome name mapping
-- `clinvar.vcf.gz` -- ClinVar variant calls
+- `clinvar.vcf` -- ClinVar variant calls
 
 ### Step 2 -- Generate the ClinVar protein database
 
 ```bash
 pgatk clinvar-to-proteindb \
-  --vcf ncbi_data/clinvar.vcf.gz \
-  --gtf ncbi_data/GRCh38_latest_genomic.gtf.gz \
-  --fasta ncbi_data/GRCh38_latest_rna.fna.gz \
+  --vcf ncbi_data/clinvar.vcf \
+  --gtf ncbi_data/GRCh38_latest_genomic.gtf \
+  --fasta ncbi_data/GRCh38_latest_rna.fna \
   --assembly-report ncbi_data/GRCh38_latest_assembly_report.txt \
   --output clinvar_proteins.fa
 ```
@@ -537,17 +537,17 @@ open reading frames (smORFs), micropeptides from lncRNAs, pseudogene-encoded
 proteins, and alternative reading frames. Studies have found that non-canonical
 peptides can account for over 5% of total identifications.
 
-### lincRNA-derived proteins
+### lncRNA-derived proteins
 
-Long intergenic non-coding RNAs (lincRNAs) can encode small proteins. Translate
+Long non-coding RNAs (lncRNAs) can encode small proteins. Translate
 them in three reading frames:
 
 ```bash
 pgatk dnaseq-to-proteindb \
   --input_fasta transcripts.fa \
-  --output_proteindb lincRNA_proteins.fa \
-  --protein_prefix lincRNA_ \
-  --include_biotypes lincRNA \
+  --output_proteindb lncRNA_proteins.fa \
+  --protein_prefix lncRNA_ \
+  --include_biotypes lncRNA \
   --num_orfs 3 \
   --skip_including_all_cds
 ```
@@ -598,7 +598,7 @@ Combine all non-canonical sources with the canonical proteome:
 
 ```bash
 cat canonical_proteins.fa \
-    lincRNA_proteins.fa \
+    lncRNA_proteins.fa \
     pseudogene_proteins.fa \
     altorf_proteins.fa \
     antisense_proteins.fa \
@@ -616,7 +616,7 @@ pgatk generate-decoy \
 
 ```bash
 pgatk digest-mutant-protein \
-  --input lincRNA_proteins.fa,pseudogene_proteins.fa,altorf_proteins.fa \
+  --input lncRNA_proteins.fa,pseudogene_proteins.fa,altorf_proteins.fa \
   --fasta canonical_proteins.fa \
   --output novel_unique_peptides.fa
 ```
@@ -975,12 +975,12 @@ pgatk cosmic-to-proteindb \
   --input_genes cosmic_data/All_COSMIC_Genes.fasta.gz \
   --output_db cosmic_variants.fa
 
-# Non-coding RNA (lincRNA)
+# Non-coding RNA (lncRNA)
 pgatk dnaseq-to-proteindb \
   --input_fasta ensembl_data/transcripts.fa \
-  --output_proteindb lincRNA.fa \
-  --protein_prefix lincRNA_ \
-  --include_biotypes lincRNA \
+  --output_proteindb lncRNA.fa \
+  --protein_prefix lncRNA_ \
+  --include_biotypes lncRNA \
   --num_orfs 3 \
   --skip_including_all_cds
 
@@ -1001,7 +1001,7 @@ cat canonical.fa \
   ensembl_variants.fa \
   clinvar_variants.fa \
   cosmic_variants.fa \
-  lincRNA.fa \
+  lncRNA.fa \
   pseudogene.fa \
   > combined_target.fa
 ```
@@ -1027,7 +1027,7 @@ pgatk generate-decoy \
 
 ```bash
 pgatk digest-mutant-protein \
-  --input ensembl_variants.fa,clinvar_variants.fa,cosmic_variants.fa,lincRNA.fa,pseudogene.fa \
+  --input ensembl_variants.fa,clinvar_variants.fa,cosmic_variants.fa,lncRNA.fa,pseudogene.fa \
   --fasta canonical.fa \
   --output unique_variant_peptides.fa \
   --min-len 7 \
diff --git a/pgatk/config/ensembl_config.yaml b/pgatk/config/ensembl_config.yaml
index 24be0dc..ac679ed 100644
--- 
a/pgatk/config/ensembl_config.yaml
+++ b/pgatk/config/ensembl_config.yaml
@@ -11,7 +11,7 @@ ensembl_translation:
   exclude_biotypes: ''
   exclude_consequences: 'downstream_gene_variant, upstream_gene_variant, intergenic_variant, intron_variant, synonymous_variant, regulatory_region_variant'
   skip_including_all_cds: False
-  include_biotypes: 'protein_coding,protein_coding_CDS_not_defined,protein_coding_LoF,nonsense_mediated_decay,non_stop_decay,translated_processed_pseudogene,IG_C_gene,IG_D_gene,IG_J_gene,IG_V_gene,TR_C_gene,TR_D_gene,TR_J_gene,TR_V_gene,TEC'
+  include_biotypes: 'IG_C_gene,IG_D_gene,IG_J_gene,IG_V_gene,nonsense_mediated_decay,non_stop_decay,protein_coding,protein_coding_LoF,TR_C_gene,TR_D_gene,TR_J_gene,TR_V_gene'
   include_consequences: 'all'
   biotype_str: transcript_biotype
   transcript_description_sep: ';'

From a4627904ef8d48fb563369cbee37491b66d6d32b Mon Sep 17 00:00:00 2001
From: Husen Umer
Date: Tue, 12 May 2026 13:38:51 +0300
Subject: [PATCH 15/23] Updated use cases documentation

- Renamed use case headings for better organization and clarity.
- Added a new section for 'Putative proteins (three-frame)' with example commands.
- Updated input commands to include the new putative proteins in relevant workflows.
- Enhanced descriptions for condition-specific databases and cancer-type specific databases to improve understanding.
---
 docs/use-cases.md | 38 +++++++++++++++++++++++++++-----------
 1 file changed, 27 insertions(+), 11 deletions(-)

diff --git a/docs/use-cases.md b/docs/use-cases.md
index 0697af1..f08f690 100644
--- a/docs/use-cases.md
+++ b/docs/use-cases.md
@@ -6,7 +6,7 @@ a search-ready protein database.
 
 ---
 
-## Cell-Type Specific Non-Canonical Peptide Discovery
+## USE CASE 1: Cell-Type Specific Non-Canonical Peptide Discovery
 
 !!! abstract "Featured workflow"
     This workflow reproduces the analysis from [Umer et al., *Bioinformatics* 2022](https://doi.org/10.1093/bioinformatics/btab838),
@@ -78,6 +78,20 @@ pgatk dnaseq-to-proteindb \
   --skip_including_all_cds
 ```
 
+#### Putative proteins (three-frame)
+
+Protein-coding genes without a defined CDS, which are annotated but not yet validated:
+
+```bash
+pgatk dnaseq-to-proteindb \
+  --input_fasta ensembl_human/transcripts.fa \
+  --output_proteindb putative.fa \
+  --protein_prefix putative_ \
+  --include_biotypes protein_coding_CDS_not_defined,TEC,translated_processed_pseudogene \
+  --num_orfs 3 \
+  --skip_including_all_cds
+```
+
 #### Alternative ORFs (exonic out-of-frame translation)
 
 Non-canonical reading frames of protein-coding mRNAs can produce cryptic
@@ -130,6 +144,7 @@ pgatk cosmic-to-proteindb \
 cat canonical.fa \
   pseudogene.fa \
   lncrna.fa \
+  putative.fa \
   altorf.fa \
   ensembl_variants.fa \
   > cell_type_target.fa
@@ -149,7 +164,7 @@ set of non-canonical peptides unique to these novel sources:
 
 ```bash
 pgatk digest-mutant-protein \
-  --input pseudogene.fa,lncrna.fa,altorf.fa,ensembl_variants.fa \
+  --input pseudogene.fa,lncrna.fa,putative.fa,altorf.fa,ensembl_variants.fa \
   --fasta canonical.fa \
   --output non_canonical_peptides.fa \
   --min-len 7 \
@@ -166,7 +181,7 @@ pgatk digest-mutant-protein \
 
 ---
 
-## 1. Human Variant Protein Database from ENSEMBL
+## USE CASE 2: Human Variant Protein Database from ENSEMBL
 
 Build a variant protein database using ENSEMBL population variants (common SNPs
 and indels) for human proteogenomics searches. This is the most common starting
@@ -242,7 +257,7 @@ The file `target_decoy.fa` is ready for database searching.
 
 ---
 
-## 2. Population-Specific Variant Database
+## USE CASE 3: Population-Specific Variant Database
 
 Population-level genetic variants cause amino acid changes that are invisible
 to standard reference database searches. Studies have shown that incorporating
@@ -308,7 +323,7 @@ pgatk vcf-to-proteindb \
 
 ---
 
-## 3. ClinVar Clinical Variant Database
+## USE CASE 4: ClinVar Clinical Variant Database
 
 [ClinVar](https://www.ncbi.nlm.nih.gov/clinvar/) catalogs the relationship
 between human variants and clinical phenotypes. Building a ClinVar-derived
@@ -356,7 +371,7 @@ pgatk generate-decoy \
 
 ---
 
-## 4. Tumor-Specific Databases for Cancer Proteogenomics
+## USE CASE 5: Condition-Specific Databases for Proteogenomics
 
 Cancer proteogenomics studies (such as those from CPTAC) build
 tumor-specific protein databases to detect somatic mutant peptides, understand
@@ -364,7 +379,7 @@ therapy resistance, and prioritize neoantigen candidates. The databases
 combine somatic mutations from cancer-specific sources (COSMIC, cBioPortal)
 and/or patient-matched whole-exome sequencing.
 
-### 4a. COSMIC somatic mutations by cancer type
+### 4a. Cancer-specific database using COSMIC somatic mutations
 
 Generate one protein database per primary tissue site. This is the standard
 approach for large-scale cancer proteogenomics when patient-level WES is not
@@ -388,7 +403,7 @@ pgatk cosmic-to-proteindb \
 This produces files like `cosmic_proteins_lung.fa`, `cosmic_proteins_breast.fa`,
 etc. Use the tissue-matched database for your cancer type of interest.
 
-### 4b. Single cancer type
+### 4b. Cancer-type specific database using COSMIC somatic mutations
 
 Build a focused database for one cancer type (e.g. lung):
 
@@ -400,7 +415,7 @@ pgatk cosmic-to-proteindb \
   --accepted_values "lung"
 ```
 
-### 4c. Cell-line proteogenomics
+### 4c. Cell-type specific databases
 
 When analyzing cell-line proteomes, use cell-line-specific mutations. COSMIC
 provides a dedicated cell-line export with mutations annotated per sample:
@@ -475,7 +490,7 @@ pgatk generate-decoy \
 
 ---
 
-## 5. Patient-Specific Database from WGS/WES
+## USE CASE 6: Patient-Specific Database from WGS/WES
 
 When matched whole-genome or whole-exome sequencing data is available for
 a sample, build a personalized protein database from the patient's own
@@ -530,7 +545,7 @@ pgatk vcf-to-proteindb \
 
 ---
 
-## 6. Novel ORF and Micropeptide Discovery
+## USE CASE 7: Novel ORF and Micropeptide Discovery
 
 Proteogenomics is a key approach for discovering novel coding regions: small
 open reading frames (smORFs), micropeptides from lncRNAs, pseudogene-encoded
@@ -1002,6 +1017,7 @@ cat canonical.fa \
   clinvar_variants.fa \
   cosmic_variants.fa \
   lncRNA.fa \
+  putative.fa \
   pseudogene.fa \
   > combined_target.fa
 ```

From 2178ca0638ffbd9e464d7b61710ed7befba6ab15 Mon Sep 17 00:00:00 2001
From: Husen Umer
Date: Wed, 13 May 2026 11:57:52 +0300
Subject: [PATCH 16/23] Enhance mutation handling in CancerGenomesService based on COSMIC updates

- Improved handling of deletion-insertion (delins) mutations, with better
  validation and skipping of intronic and UTR offsets.
- Added support for tandem duplication (dup) mutations, allowing for accurate
  sequence reconstruction.
- Updated mutation type checks to include additional HGVS terms for missense,
  nonsense, and in-frame insertions/deletions.
- Refined the required columns in the CosmicMutantExport input to align with
  updated naming conventions.
Additionally, added a new 'Validations' section in the documentation for
clarity on module correctness; it currently covers only COSMIC mutation
translation, based on the results in tests/test_variant_types_hgvs.py.
---
 docs/index.md                          |   1 +
 docs/validations.md                    | 130 +++++++
 pgatk/cgenomes/cgenomes_proteindb.py   |  87 +++--
 pgatk/tests/test_variant_types_hgvs.py | 449 +++++++++++++++++++++++++
 4 files changed, 639 insertions(+), 28 deletions(-)
 create mode 100644 docs/validations.md
 create mode 100644 pgatk/tests/test_variant_types_hgvs.py

diff --git a/docs/index.md b/docs/index.md
index f6e58a2..87026a3 100644
--- a/docs/index.md
+++ b/docs/index.md
@@ -23,6 +23,7 @@ See the [Installation](installation.md) page for more options (Bioconda, Docker,
 | [Introduction](introduction.md) | Overview of the proteogenomics field |
 | [Installation](installation.md) | How to install pgatk (pip, Bioconda, Docker, source) |
 | [pgatk CLI](pgatk-cli.md) | Full command-line reference for all tools |
+| [Validations](validations.md) | Tests and validations to ensure correctness of the modules |
 | [Use Cases](use-cases.md) | End-to-end workflows and recipes for common scenarios |
 | [File Formats](formats.md) | BED, GTF, GCT format specifications |
 | [Changelog](changelog.md) | Version history and release notes |
diff --git a/docs/validations.md b/docs/validations.md
new file mode 100644
index 0000000..015ef72
--- /dev/null
+++ b/docs/validations.md
@@ -0,0 +1,130 @@
+# Test Validations
+
+This page documents the validation strategy for PGATK, describing what the automated tests verify and how to independently confirm their correctness.
+# 1. Variant Translation Tests - COSMIC
+## 1.1 Validation per variant type (`test_variant_types_hgvs.py`)
+
+These tests verify that `CancerGenomesService.get_mut_pro_seq()` correctly processes all 18 Sequence Ontology (SO) variant types present in COSMIC v103 mutation files, following [HGVS nomenclature rules](https://hgvs-nomenclature.org/stable/).
+
+Each test uses a real mutation from `Cosmic_CompleteTargetedScreensMutant_v103_GRCh38.tsv` paired with the matching CDS from `Cosmic_Genes_v103_GRCh38.fasta`.
+
+---
+
+### Actionable Variant Types
+
+These types are expected to produce a non-empty mutant protein sequence. Tests assert biologically specific properties, not just that the result is non-empty.
+
+| Variant Type | Gene | Mutation | HGVS Rule Verified |
+|---|---|---|---|
+| `missense_variant` | EZH2 | c.656C>T p.S219F | Protein length unchanged; position 218 (0-indexed) is `F`; all other residues identical to WT |
+| `stop_gained` | FCGR3B | c.394A>T p.K132* | Position 131 (0-indexed) is `*` (stop codon) in the translated output |
+| `inframe_deletion` | CTNNB1 | c.104_124del p.T35_G41del | Protein shortened by exactly 7 AA (21 nt deleted ÷ 3); residues before position 35 unchanged |
+| `inframe_insertion` (dup) | SMAP1 | c.1226_1228dup p.I409dup | Protein lengthened by exactly 1 AA (3 nt duplicated ÷ 3); residues before the dup site unchanged |
+| `protein_altering_variant` | NF2 | c.252_262delinsCT p.L85_K88delins* | Result is non-empty and differs from the wild-type sequence |
+| `frameshift_variant` | MEN1 | c.292del p.R98Efs*21 | Non-empty translated sequence produced from the frame-shifted CDS |
+| `start_lost` | INPP4B | c.3G>T p.M1?
| First AA is not Met; it is Ile (`ATG→ATT` via the DNA substitution path) | +| `stop_lost` | MYD88 | c.611_613delinsGCGGCCCCC p.D204_*205delinsGGPR | Mutant protein is longer than WT (stop codon removed; read-through occurs) | +| `stop_retained_variant` | EPHA3 | c.2756G>A p.*919= | Length unchanged; last residue is still `*` (`TGA→TAA`, both stop codons) | + +--- + +### Filtered / Skipped Variant Types + +These types must produce an empty string `""`. There are two mechanisms: + +- **Pre-filter** (`cosmic_to_proteindb`): `synonymous_variant` rows are discarded before `get_mut_pro_seq` is called. +- **`p.?` guard** (`get_mut_pro_seq`): When `aa_mut == "p.?"`, the function immediately returns `""`. All intronic, UTR, and splice variants in COSMIC v103 carry `p.?`. + +| Variant Type | Mechanism | Source Mutation | +|---|---|---| +| `synonymous_variant` | Pre-filter in `cosmic_to_proteindb` | Filter logic validated directly on the SO term strings | +| `intron_variant` | `p.?` guard | RAD51C c.145+706C>T p.? | +| `3_prime_UTR_variant` | `p.?` guard | SUFU c.*2816G>A p.? | +| `5_prime_UTR_variant` | `p.?` guard | PSMD13 c.-20C>T p.? | +| `coding_sequence_variant` | `p.?` guard (also has intronic offset `+`) | OAS2 c.2157+2A>G p.? | +| `splice_acceptor_variant` | `p.?` guard | TSHR c.171-1G>A p.? | +| `splice_donor_variant` | `p.?` guard | PTCH1 c.3306+2T>G p.? | +| `splice_region_variant` (composite) | Fires missense branch when combined with `missense_variant` | EZH2 c.656C>T p.S219F with type `missense_variant,splice_region_variant` | +| `incomplete_terminal_codon_variant` | `p.?` guard (absent from v103 targeted-screen data; tested by convention) | c.2213_2214insA p.? | + +!!! note + `splice_region_variant` only appears in COSMIC v103 as part of a composite type string (e.g. `missense_variant,splice_region_variant`). When combined with an actionable type, the actionable type's branch fires first and the mutation is processed normally. + +--- + +## 1.2 Independent Validation of the Tests + +### 1. Re-derive CDS sequences from the FASTA + +The CDS strings embedded in the test file came from `Cosmic_Genes_v103_GRCh38.fasta`. To verify a gene (e.g. EZH2): + +```bash +python3 - <<'EOF' +from Bio import SeqIO +fasta = "use-cases/cosmic_data/Cosmic_Genes_v103_GRCh38.fasta" +for rec in SeqIO.parse(fasta, "fasta"): + if "EZH2" in rec.id and "ENST00000483967" in rec.id: + print(f"len={len(rec.seq)}") + print(f"prot[218]={rec.seq.translate(to_stop=False)[218]}") + print(rec.seq[:50]) +EOF +``` + +Expected: `len=2211`, `prot[218]=S` (which becomes `F` after the c.656C>T substitution). + +### 2. Confirm mutations exist in the COSMIC TSV + +Every test documents its source mutation. To verify a mutation is real and not fabricated: + +```bash +grep "EZH2" use-cases/cosmic_data/Cosmic_CompleteTargetedScreensMutant_v103_GRCh38.tsv | \ + grep "c.656C>T" | \ + cut -f1-10 +``` + +### 3. 
Manually compute expected protein output + +For the missense test, manually apply the substitution and translate: + +```python +from Bio.Seq import Seq + +cds = '''ATGGGCCAGACTGGGAAGAAATCTGAGAAGGGACCAGTTTGTTGGCGGAAGCGTGTAAAATCAGAGTACATGCGACTGAGACAGCTCAAGAGGTTCAGACGAGCTGATGAAGTAAAGAGTATGTTTAGTTCCAATCGTCAGAAAATT +TTGGAAAGAACGGAAATCTTAAACCAAGAATGGAAACAGCGAAGGATACAGCCTGTGCACATCCTGACTTCTTGTTCGGTGACCAGTGACTTGGATTTTCCAACACAAGTCATCCCATTAAAGACTCTGAATGCAGTTGCTTCAGTACCCATAATG +TATTCTTGGTCTCCCCTACAGCAGAATTTTATGGTGGAAGATGAAACTGTTTTACATAACATTCCTTATATGGGAGATGAAGTTTTAGATCAGGATGGTACTTTCATTGAAGAACTAATAAAAAATTATGATGGGAAAGTACACGGGGATAGAGAA +TGTGGGTTTATAAATGATGAAATTTTTGTGGAGTTGGTGAATGCCCTTGGTCAATATAATGATGATGACGATGATGATGATGGAGACGATCCTGAAGAAAGAGAAGAAAAGCAGAAAGATCTGGAGGATCACCGAGATGATAAAGAAAGCCGCCCA +CCTCGGAAATTTCCTTCTGATAAAATTTTTGAAGCCATTTCCTCAATGTTTCCAGATAAGGGCACAGCAGAAGAACTAAAGGAAAAATATAAAGAACTCACCGAACAGCAGCTCCCAGGCGCACTTCCTCCTGAATGTACCCCCAACATAGATGGA +CCAAATGCTAAATCTGTTCAGAGAGAGCAAAGCTTACACTCCTTTCATACGCTTTTCTGTAGGCGATGTTTTAAATATGACTGCTTCCTACATCCTTTTCATGCAACACCCAACACTTATAAGCGGAAGAACACAGAAACAGCTCTAGACAACAAA +CCTTGTGGACCACAGTGTTACCAGCATTTGGAGGGAGCAAAGGAGTTTGCTGCTGCTCTCACCGCTGAGCGGATAAAGACCCCACCAAAACGTCCAGGAGGCCGCAGAAGAGGACGGCTTCCCAATAACAGTAGCAGGCCCAGCACCCCCACCATT +AATGTGCTGGAATCAAAGGATACAGACAGTGATAGGGAAGCAGGGACTGAAACGGGGGGAGAGAACAATGATAAAGAAGAAGAAGAGAAGAAAGATGAAACTTCGAGCTCCTCTGAAGCAAATTCTCGGTGTCAAACACCAATAAAGATGAAGCCA +AATATTGAACCTCCTGAGAATGTGGAGTGGAGTGGTGCTGAAGCCTCAATGTTTAGAGTCCTCATTGGCACTTACTATGACAATTTCTGTGCCATTGCTAGGTTAATTGGGACCAAAACATGTAGACAGGTGTATGAGTTTAGAGTCAAAGAATCT +AGCATCATAGCTCCAGCTCCCGCTGAGGATGTGGATACTCCTCCAAGGAAAAAGAAGAGGAAACACCGGTTGTGGGCTGCACACTGCAGAAAGATACAGCTGAAAAAGGACGGCTCCTCTAACCATGTTTACAACTATCAACCCTGTGATCATCCA +CGGCAGCCTTGTGACAGTTCGTGCCCTTGTGTGATAGCACAAAATTTTTGTGAAAAGTTTTGTCAATGTAGTTCAGAGTGTCAAAACCGCTTTCCGGGATGCCGCTGCAAAGCACAGTGCAACACCAAGCAGTGCCCGTGCTACCTGGCTGTCCGA +GAGTGTGACCCTGACCTCTGTCTTACTTGTGGAGCCGCTGACCATTGGGACAGTAAAAATGTGTCCTGCAAGAACTGCAGTATTCAGCGGGGCTCCAAAAAGCATCTATTGCTGGCACCATCTGACGTGGCAGGCTGGGGGATTTTTATCAAAGAT +CCTGTGCAGAAAAATGAATTCATCTCAGAATACTGTGGAGAGATTATTTCTCAAGATGAAGCTGACAGAAGAGGGAAAGTGTATGATAAATACATGTGCAGCTTTCTGTTCAACTTGAACAATGATTTTGTGGTGGATGCAACCCGCAAGGGTAAC +AAAATTCGTTTTGCAAATCATTCGGTAAATCCAAACTGCTATGCAAAAGTTATGATGGTTAACGGTGATCACAGGATAGGTATTTTTGCCAAGAGAGCCATCCAGACTGGCGAAGAGCTGTTTTTTGATTACAGATACAGCCAGGCTGATGCCCTG +AAGTATGTCGGCATCGAAAGAGAAATGGAAATCCCTTGA'''.replace('\n','') # EZH2 CDS string + +seq = Seq(cds) +wt_prot = seq.translate(to_stop=False) +print(wt_prot[218]) # 'S' — wild-type Serine at position 219 + +# Apply c.656C>T: position 656 is 0-indexed 655 +mut_seq = seq[:655] + "T" + seq[656:] +mut_prot = mut_seq.translate(to_stop=False) +print(mut_prot[218]) # 'F' — mutant Phenylalanine +``` + +--- + +## 1.3 Core Unit Tests (`test_cosmic_core.py`) + +14 Unit tests are implemented to verify the internal logic of `get_mut_pro_seq()` using minimal synthetic sequences (`Seq("ATGAAATTT")` etc.). 
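+
+For illustration, a minimal sketch of one such unit test (the gene name, the
+9-nt sequence, and the expected `"MQF"` output are illustrative values rather
+than COSMIC data; the actual assertions live in `test_cosmic_core.py`):
+
+```python
+from Bio.Seq import Seq
+
+from pgatk.cgenomes.cgenomes_proteindb import CancerGenomesService
+from pgatk.cgenomes.models import SNP
+
+
+def test_dna_substitution_minimal():
+    # ATG AAA TTT translates to MKF; c.4A>C rewrites codon 2 (AAA -> CAA),
+    # so the mutant protein is expected to read MQF.
+    snp = SNP(gene="TEST", mrna="", dna_mut="c.4A>C",
+              aa_mut="p.K2Q", mutation_type="missense_variant")
+    assert CancerGenomesService.get_mut_pro_seq(snp, Seq("ATGAAATTT")) == "MQF"
+```
+
+The table below groups the code paths the real test classes exercise: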
+ +| Class | Coverage | +|---|---| +| `TestGetMutProSeqDnaBranch` | DNA substitution, insertion, single-base deletion, two-position deletion, ambiguous `c.?` routing to protein branch | +| `TestGetMutProSeqProteinBranch` | Missense, nonsense truncation, in-frame insertion, two-position in-frame deletion, single-position in-frame deletion | +| `TestGetMutProSeqEdgeCases` | Mutation beyond sequence length, ambiguous `p.?` returns `""`, non-alpha last char in aa_mut returns `""`, DNA ref-allele mismatch returns `""` | + +These tests are fast to run and isolate individual code paths without requiring reference files. diff --git a/pgatk/cgenomes/cgenomes_proteindb.py b/pgatk/cgenomes/cgenomes_proteindb.py index fe979be..70fb72b 100644 --- a/pgatk/cgenomes/cgenomes_proteindb.py +++ b/pgatk/cgenomes/cgenomes_proteindb.py @@ -144,15 +144,24 @@ def get_mut_pro_seq(snp: SNP, seq: Seq) -> Optional[str]: seq_mut = seq[:index] + mut_dna + seq[index + 1:] mut_pro_seq = str(seq_mut.translate(to_stop=False)) elif "delins" in snp.dna_mut: - # Deletion-insertion: delete range then insert new bases - insert_dna = snp.dna_mut.split("delins")[1] - if insert_dna.isalpha() and len(positions) >= 2: - del_index1 = int(positions[0]) - 1 - del_index2 = int(positions[1]) + # HGVS delins: one or more nucleotides replaced by one or more other nucleotides. + coord_part, insert_raw = snp.dna_mut.split("delins", 1) + insert_dna = insert_raw.upper() + if re.search(r'\d[+-]|\*|-\d', coord_part): + # Intronic (c.N+X, c.N-X), 5'UTR (c.-N) or 3'UTR (c.*N) offsets + # cannot be mapped onto a CDS-only FASTA sequence; skip. + pass + elif not insert_dna.isalpha(): + # Unknown ('?'), N[n], or conversion-notation insertions cannot be resolved. + pass + elif len(positions) >= 2: + # Range delins: delete positions[0]..positions[1] (1-indexed, inclusive). + del_index1 = int(positions[0]) - 1 # 0-indexed start + del_index2 = int(positions[1]) # 0-indexed exclusive end seq_mut = seq[:del_index1] + insert_dna + seq[del_index2:] mut_pro_seq = str(seq_mut.translate(to_stop=False)) - elif insert_dna.isalpha() and len(positions) == 1: - # Single-position delins: replace one base + elif len(positions) == 1: + # Single-position delins: replace the one nucleotide at positions[0]. del_index1 = int(positions[0]) - 1 seq_mut = seq[:del_index1] + insert_dna + seq[del_index1 + 1:] mut_pro_seq = str(seq_mut.translate(to_stop=False)) @@ -165,6 +174,20 @@ def get_mut_pro_seq(snp: SNP, seq: Seq) -> Optional[str]: seq_mut = seq[:ins_index1] + insert_dna + seq[ins_index1:] mut_pro_seq = str(seq_mut.translate(to_stop=False)) + elif "dup" in snp.dna_mut: + # Tandem duplication: re-insert the duplicated range after its end position. 
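+            # Worked example: c.1226_1228dup (the SMAP1 test case) re-inserts
+            # bases 1226-1228 once more after position 1228; a single-position
+            # form such as c.76dup duplicates just that one base. The reading
+            # frame is preserved only when the duplicated length is a multiple
+            # of 3; otherwise translation is shifted downstream of the dup.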
+ if len(positions) == 2: + dup_start = int(positions[0]) - 1 + dup_end = int(positions[1]) + dup_seq = str(seq[dup_start:dup_end]) + seq_mut = seq[:dup_end] + dup_seq + seq[dup_end:] + mut_pro_seq = str(seq_mut.translate(to_stop=False)) + elif len(positions) == 1: + dup_pos = int(positions[0]) + dup_seq = str(seq[dup_pos - 1:dup_pos]) + seq_mut = seq[:dup_pos] + dup_seq + seq[dup_pos:] + mut_pro_seq = str(seq_mut.translate(to_stop=False)) + elif "del" in snp.dna_mut: if len(positions) == 2: del_index1 = int(positions[0]) - 1 @@ -180,7 +203,7 @@ def get_mut_pro_seq(snp: SNP, seq: Seq) -> Optional[str]: positions = re.findall(r'\d+', snp.aa_mut) protein_seq = str(seq.translate(to_stop=False)) - if "Missense" in snp.mutation_type: + if "Missense" in snp.mutation_type or "missense_variant" in snp.mutation_type: # Extract the mutant residue from HGVS like p.V600E (1-letter) # or p.Val600Glu (3-letter). For 1-letter the last char is the # AA; for 3-letter we need to convert the last triplet. @@ -190,20 +213,27 @@ def get_mut_pro_seq(snp: SNP, seq: Seq) -> Optional[str]: return '' index = int(positions[0]) - 1 mut_pro_seq = protein_seq[:index] + mut_aa + protein_seq[index + 1:] - elif "Nonsense" in snp.mutation_type: + elif "Nonsense" in snp.mutation_type or "stop_gained" in snp.mutation_type: index = int(positions[0]) - 1 mut_pro_seq = protein_seq[:index] - elif "Insertion - In frame" in snp.mutation_type: - try: - index = snp.aa_mut.index("ins") - except ValueError: - return '' - insert_aa_raw = snp.aa_mut[index + 3:] - insert_aa = _three_to_one(insert_aa_raw) - if insert_aa.isalpha(): - ins_index1 = int(positions[0]) - mut_pro_seq = protein_seq[:ins_index1] + insert_aa + protein_seq[ins_index1:] - elif "Deletion - In frame" in snp.mutation_type: + elif "Insertion - In frame" in snp.mutation_type or "inframe_insertion" in snp.mutation_type: + if "dup" in snp.aa_mut: + # Protein-level tandem dup: re-insert the duplicated residues after the range. 
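+                # Worked example: p.I409dup (the SMAP1 test case) repeats the
+                # single residue 409; a range form such as p.T35_G41dup would
+                # repeat residues 35-41. positions[] holds the 1-indexed bounds
+                # parsed from aa_mut by the re.findall() call above.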
+ dup_start = int(positions[0]) - 1 + dup_end = int(positions[-1]) + dup_aa = protein_seq[dup_start:dup_end] + mut_pro_seq = protein_seq[:dup_end] + dup_aa + protein_seq[dup_end:] + else: + try: + index = snp.aa_mut.index("ins") + except ValueError: + return '' + insert_aa_raw = snp.aa_mut[index + 3:] + insert_aa = _three_to_one(insert_aa_raw) + if insert_aa.isalpha(): + ins_index1 = int(positions[0]) + mut_pro_seq = protein_seq[:ins_index1] + insert_aa + protein_seq[ins_index1:] + elif "Deletion - In frame" in snp.mutation_type or "inframe_deletion" in snp.mutation_type: if len(positions) == 2: del_index1 = int(positions[0]) - 1 del_index2 = int(positions[1]) @@ -211,7 +241,8 @@ def get_mut_pro_seq(snp: SNP, seq: Seq) -> Optional[str]: elif len(positions) == 1: del_index1 = int(positions[0]) - 1 mut_pro_seq = protein_seq[:del_index1] + protein_seq[del_index1 + 1:] - elif "Complex" in snp.mutation_type and "frameshift" not in snp.mutation_type: + elif ("Complex" in snp.mutation_type or "protein_altering_variant" in snp.mutation_type) \ + and "frameshift" not in snp.mutation_type: try: index = snp.aa_mut.index(">") except ValueError: @@ -262,16 +293,16 @@ def cosmic_to_proteindb(self) -> None: self.get_logger().debug("Reading input CosmicMutantExport.tsv ...") line_counter = 1 - required_columns = ["Gene name", "Accession Number", "Mutation CDS", "Mutation AA", "Mutation Description"] + required_columns = ["GENE_SYMBOL", "TRANSCRIPT_ACCESSION", "MUTATION_CDS", "MUTATION_AA", "MUTATION_DESCRIPTION"] with _open_text(self._local_mutation_file, encoding="latin-1") as cosmic_input, \ open(self._local_output_file, 'w', encoding='utf-8') as output: header = cosmic_input.readline().strip().split("\t") try: - gene_col = header.index("Gene name") - enst_col = header.index("Accession Number") - cds_col = header.index("Mutation CDS") - aa_col = header.index("Mutation AA") - muttype_col = header.index("Mutation Description") + gene_col = header.index("GENE_SYMBOL") + enst_col = header.index("TRANSCRIPT_ACCESSION") + cds_col = header.index("MUTATION_CDS") + aa_col = header.index("MUTATION_AA") + muttype_col = header.index("MUTATION_DESCRIPTION") except ValueError as e: self.get_logger().error( "COSMIC file missing required columns. Expected %s, got: %s", @@ -308,7 +339,7 @@ def cosmic_to_proteindb(self) -> None: if row[filter_col] not in self._accepted_values and self._accepted_values != ['all']: continue - if "coding silent" in row[muttype_col]: + if "coding silent" in row[muttype_col] or "synonymous_variant" in row[muttype_col]: continue snp = SNP(gene=row[gene_col], mrna=row[enst_col], dna_mut=row[cds_col], aa_mut=row[aa_col], diff --git a/pgatk/tests/test_variant_types_hgvs.py b/pgatk/tests/test_variant_types_hgvs.py new file mode 100644 index 0000000..f699088 --- /dev/null +++ b/pgatk/tests/test_variant_types_hgvs.py @@ -0,0 +1,449 @@ +"""HGVS compliance tests for get_mut_pro_seq across all SO variant types. + +Each test uses a real mutation from Cosmic_CompleteTargetedScreensMutant_v103_GRCh38.tsv +paired with the matching CDS from Cosmic_Genes_v103_GRCh38.fasta. 
+ +HGVS rules verified per type: + missense_variant – single AA substitution, length unchanged + stop_gained – stop (*) introduced at predicted position + inframe_deletion – protein shortened by exact number of deleted codons + inframe_insertion – protein lengthened by exact number of inserted codons (dup) + protein_altering_variant– delins produces a non-empty altered sequence + frameshift_variant – out-of-frame deletion produces a non-empty translated sequence + start_lost – start codon destroyed; first AA is no longer Met + stop_lost – stop codon removed; protein extends past original terminus + stop_retained_variant – stop codon changed but still a stop; length unchanged + synonymous_variant – filtered before get_mut_pro_seq (pre-processing step) + intron_variant – p.? guard returns empty string + 3_prime_UTR_variant – p.? guard returns empty string + 5_prime_UTR_variant – p.? guard returns empty string + coding_sequence_variant – p.? guard returns empty string + splice_acceptor_variant – p.? guard returns empty string + splice_donor_variant – intronic offset guard returns empty string + splice_region_variant – treated as missense when combined type contains missense_variant + incomplete_terminal_codon_variant – not present in v103 targeted screens; p.? returns "" +""" +from Bio.Seq import Seq + +from pgatk.cgenomes.cgenomes_proteindb import CancerGenomesService +from pgatk.cgenomes.models import SNP + +# --------------------------------------------------------------------------- +# CDS sequences from Cosmic_Genes_v103_GRCh38.fasta +# Key: gene symbol Value: CDS nucleotide string (ENST accession in comment) +# --------------------------------------------------------------------------- +_CDS = { + # ENST00000483967.5 – used for missense_variant + "EZH2": ( + 
"ATGGGCCAGACTGGGAAGAAATCTGAGAAGGGACCAGTTTGTTGGCGGAAGCGTGTAAAATCAGAGTACATGCGACTGAGACAGCTCAAGAGGTTCAGACGAGCTGATGAAGTAAAGAGTATGTTTAGTTCCAATCGTCAGAAAATTTTGGAAAGAACGGAAATCTTAAACCAAGAATGGAAACAGCGAAGGATACAGCCTGTGCACATCCTGACTTCTTGTTCGGTGACCAGTGACTTGGATTTTCCAACACAAGTCATCCCATTAAAGACTCTGAATGCAGTTGCTTCAGTACCCATAATGTATTCTTGGTCTCCCCTACAGCAGAATTTTATGGTGGAAGATGAAACTGTTTTACATAACATTCCTTATATGGGAGATGAAGTTTTAGATCAGGATGGTACTTTCATTGAAGAACTAATAAAAAATTATGATGGGAAAGTACACGGGGATAGAGAATGTGGGTTTATAAATGATGAAATTTTTGTGGAGTTGGTGAATGCCCTTGGTCAATATAATGATGATGACGATGATGATGATGGAGACGATCCTGAAGAAAGAGAAGAAAAGCAGAAAGATCTGGAGGATCACCGAGATGATAAAGAAAGCCGCCCACCTCGGAAATTTCCTTCTGATAAAATTTTTGAAGCCATTTCCTCAATGTTTCCAGATAAGGGCACAGCAGAAGAACTAAAGGAAAAATATAAAGAACTCACCGAACAGCAGCTCCCAGGCGCACTTCCTCCTGAATGTACCCCCAACATAGATGGACCAAATGCTAAATCTGTTCAGAGAGAGCAAAGCTTACACTCCTTTCATACGCTTTTCTGTAGGCGATGTTTTAAATATGACTGCTTCCTACATCCTTTTCATGCAACACCCAACACTTATAAGCGGAAGAACACAGAAACAGCTCTAGACAACAAACCTTGTGGACCACAGTGTTACCAGCATTTGGAGGGAGCAAAGGAGTTTGCTGCTGCTCTCACCGCTGAGCGGATAAAGACCCCACCAAAACGTCCAGGAGGCCGCAGAAGAGGACGGCTTCCCAATAACAGTAGCAGGCCCAGCACCCCCACCATTAATGTGCTGGAATCAAAGGATACAGACAGTGATAGGGAAGCAGGGACTGAAACGGGGGGAGAGAACAATGATAAAGAAGAAGAAGAGAAGAAAGATGAAACTTCGAGCTCCTCTGAAGCAAATTCTCGGTGTCAAACACCAATAAAGATGAAGCCAAATATTGAACCTCCTGAGAATGTGGAGTGGAGTGGTGCTGAAGCCTCAATGTTTAGAGTCCTCATTGGCACTTACTATGACAATTTCTGTGCCATTGCTAGGTTAATTGGGACCAAAACATGTAGACAGGTGTATGAGTTTAGAGTCAAAGAATCTAGCATCATAGCTCCAGCTCCCGCTGAGGATGTGGATACTCCTCCAAGGAAAAAGAAGAGGAAACACCGGTTGTGGGCTGCACACTGCAGAAAGATACAGCTGAAAAAGGACGGCTCCTCTAACCATGTTTACAACTATCAACCCTGTGATCATCCACGGCAGCCTTGTGACAGTTCGTGCCCTTGTGTGATAGCACAAAATTTTTGTGAAAAGTTTTGTCAATGTAGTTCAGAGTGTCAAAACCGCTTTCCGGGATGCCGCTGCAAAGCACAGTGCAACACCAAGCAGTGCCCGTGCTACCTGGCTGTCCGAGAGTGTGACCCTGACCTCTGTCTTACTTGTGGAGCCGCTGACCATTGGGACAGTAAAAATGTGTCCTGCAAGAACTGCAGTATTCAGCGGGGCTCCAAAAAGCATCTATTGCTGGCACCATCTGACGTGGCAGGCTGGGGGATTTTTATCAAAGATCCTGTGCAGAAAAATGAATTCATCTCAGAATACTGTGGAGAGATTATTTCTCAAGATGAAGCTGACAGAAGAGGGAAAGTGTATGATAAATACATGTGCAGCTTTCTGTTCAACTTGAACAATGATTTTGTGGTGGATGCAACCCGCAAGGGTAACAAAATTCGTTTTGCAAATCATTCGGTAAATCCAAACTGCTATGCAAAAGTTATGATGGTTAACGGTGATCACAGGATAGGTATTTTTGCCAAGAGAGCCATCCAGACTGGCGAAGAGCTGTTTTTTGATTACAGATACAGCCAGGCTGATGCCCTGAAGTATGTCGGCATCGAAAGAGAAATGGAAATCCCTTGA" + ), + # ENST00000613418.4 – used for stop_gained + "FCGR3B": ( + "ATGCGGACTGAAGATCTCCCAAAGGCTGTGGTGTTCCTGGAGCCTCAATGGTACAGCGTGCTTGAGAAGGACAGTGTGACTCTGAAGTGCCAGGGAGCC" + "TACTCCCCTGAGGACAATTCCACACAGTGGTTTCACAATGAGAACCTCATCTCAAGCCAGGCCTCGAGCTACTTCATTGACGCTGCCACAGTCAACGAC" + "AGTGGAGAGTACAGGTGCCAGACAAACCTCTCCACCCTCAGTGACCCGGTGCAGCTAGAAGTCCATATCGGCTGGCTGTTGCTCCAGGCCCCTCGGTGGG" + "TGTTCAAGGAGGAAGACCCTATTCACCTGAGGTGTCACAGCTGGAAGAACACTGCTCTGCATAAGGTCACATATTTACAGAATGGCAAAGACAGGAAGTAT" + "TTTCATCATAATTCTGACTTCCACATTCCAAAAGCCACACTCAAAGATAGCGGCTCCTACTTCTGCAGGGGGCTTGTTGGGAGTAAAAATGTGTCTTCAGA" + "GACTGTGAACATCACCATCACTCAAGGTTTGGCAGTGTCAACCATCTCATCATTCTCTCCACCTGGGTACCAAGTCTCTTTCTGCTTGGTGATGGTACTCC" + "TTTTTGCAGTGGACACAGGACTATATTTCTCTGTGAAGACAAACATTTGA" + ), + # ENST00000646174.1 – used for inframe_deletion + "CTNNB1": ( + "ATGGAGTTGGACATGGCCATGGAACCAGACAGAAAAGCGGCTGTTAGTCACTGGCAGCAACAGTCTTACCTGGACTCTGGAATCCATTCTGGTGCCACTAC" + "CACAGCTCCTTCTCTGAGTGGTAAAGGCAATCCTGAGGAAGAGGATGTGGATACCTCCCAAGTCCTGTATGAGTGGGAACAGGGATTTTCTCAGTCCTTCA" + "CTCAAGAACAAGTAGCTGATATTGATGGACAGTATGCAATGACTCGAGCTCAGAGGGTACGAGCTGCTATGTTCCCTGAGACATTAGATGAGGGCATGCAGA" + "TCCCATCTACACAGTTTGATGCTGCTCATCCCACTAATGTCCAGCGTTTGGCTGAACCATCACAGATGCTGAAACATGCAGTTGTAAACTTGATTAACTATCA" + 
"AGATGATGCAGAACTTGCCACACGTGCAATCCCTGAACTGACAAAACTGCTAAATGACGAGGACCAGGTGGTGGTTAATAAGGCTGCAGTTATGGTCCATCAG" + "CTTTCTAAAAAGGAAGCTTCCAGACACGCTATCATGCGTTCTCCTCAGATGGTGTCTGCTATTGTACGTACCATGCAGAATACAAATGATGTAGAAACAGCTCG" + "TTGTACCGCTGGGACCTTGCATAACCTTTCCCATCATCGTGAGGGCTTACTGGCCATCTTTAAGTCTGGAGGCATTCCTGCCCTGGTGAAAATGCTTGGTTCAC" + "CAGTGGATTCTGTGTTGTTTTATGCCATTACAACTCTCCACAACCTTTTATTACATCAAGAAGGAGCTAAAATGGCAGTGCGTTTAGCTGGTGGGCTGCAGAAAA" + "TGGTTGCCTTGCTCAACAAAACAAATGTTAAATTCTTGGCTATTACGACAGACTGCCTTCAAATTTTAGCTTATGGCAACCAAGAAAGCAAGCTCATCATACTGG" + "CTAGTGGTGGACCCCAAGCTTTAGTAAATATAATGAGGACCTATACTTACGAAAAACTACTGTGGACCACAAGCAGAGTGCTGAAGGTGCTATCTGTCTGCTCTA" + "GTAATAAGCCGGCTATTGTAGAAGCTGGTGGAATGCAAGCTTTAGGACTTCACCTGACAGATCCAAGTCAACGTCTTGTTCAGAACTGTCTTTGGACTCTCAGGA" + "ATCTTTCAGATGCTGCAACTAAACAGGAAGGGATGGAAGGTCTCCTTGGGACTCTTGTTCAGCTTCTGGGTTCAGATGATATAAATGTGGTCACCTGTGCAGCTG" + "GAATTCTTTCTAACCTCACTTGCAATAATTATAAGAACAAGATGATGGTCTGCCAAGTGGGTGGTATAGAGGCTCTTGTGCGTACTGTCCTTCGGGCTGGTGACAG" + "GGAAGACATCACTGAGCCTGCCATCTGTGCTCTTCGTCATCTGACCAGCCGACACCAAGAAGCAGAGATGGCCCAGAATGCAGTTCGCCTTCACTATGGACTACCA" + "GTTGTGGTTAAGCTCTTACACCCACCATCCCACTGGCCTCTGATAAAGGCTACTGTTGGATTGATTCGAAATCTTGCCCTTTGTCCCGCAAATCATGCACCTTTGC" + "GTGAGCAGGGTGCCATTCCACGACTAGTTCAGTTGCTTGTTCGTGCACATCAGGATACCCAGCGCCGTACGTCCATGGGTGGGACACAGCAGCAATTTGTGGAGGG" + "GGTCCGCATGGAAGAAATAGTTGAAGGTTGTACCGGAGCCCTTCACATCCTAGCTCGGGATGTTCACAACCGAATTGTTATCAGAGGACTAAATACCATTCCATTG" + "TTTGTGCAGCTGCTTTATTCTCCCATTGAAAACATCCAAAGAGTAGCTGCAGGGGTCCTCTGTGAACTTGCTCAGGACAAGGAAGCTGCAGAAGCTATTGAAGCTG" + "AGGGAGCCACAGCTCCTCTGACAGAGTTACTTCACTCTAGGAATGAAGGTGTGGCGACATATGCAGCTGCTGTTTTGTTCCGAATGTCTGAGGACAAGCCACAAGAT" + "TACAAGAAACGGCTTTCAGTTGAGCTGACCAGCTCTCTCTTCAGAACAGAGCCAATGGCTTGGAATGAGACTGCTGATCTTGGACTTGATATTGGTGCCCAGGGAGAA" + "CCCCTTGGATATCGCCAGGATGATCCTAGCTATCGTTCTTTTCACTCTGGTGGATATGGCCAGGATGCCTTGGGTATGGACCCCATGATGGAACATGAGATGGGTGGC" + "CACCACCCTGGTGCTGACTATCCAGTTGATGGGCTGCCAGATCTGGGGCATGCCCAGGACCTCATGGATGGGCTGCCTCCAGGTGACAGCAATCAGCTGGCCTGGTTT" + "GATACTGACCTGTAA" + ), + # ENST00000370452.7 – used for inframe_insertion (dup) + "SMAP1": ( + "ATGGCGACGCGCTCCTGTCGGGAGAAGGCTCAGAAGCTGAACGAGCAGCACCAGCTCATCCTATCCAAGCTTCTGAGGGAGGAGGACAACAAGTACTGCGCC" + "GACTGCGAGGCCAAAGGTCCTCGATGGGCTTCCTGGAATATTGGTGTGTTTATTTGCATCAGATGTGCTGGAATTCATAGAAATCTTGGGGTTCATATATCC" + "AGGGTCAAATCAGTCAACCTAGACCAATGGACAGCAGAACAGATACAGTGCATGCAAGATATGGGAAATACTAAAGCAAGACTACTCTATGAAGCCAATCTTCC" + "AGAGAACTTTCGAAGACCACAGACAGATCAAGCAGTGGAATTTTTCATCAGAGATAAATATGAAAAGAAGAAATACTACGATAAAAATGCCATAGCTATTACAAAT" + "AAAGAAAAGGAAGAAAAAAAAGGAAGAGAAAAAGAGAGAAAAGGAGCCAGAAAAGCCGGCAAAACCACTTACAGCTGAAAAGCTGCAGAAGAAAGATCAGCAACTGG" + "AGCCTAAAAAAAGTACCAGCCCTAAAAAAGCTGCGGAGCCCACTGTGGATCTTTTAGGACTTGATGGCCCTGCTGTGGCACCAGTGACCAACGGGAACACAACGGTG" + "CCACCCCTGAACGATGATCTGGACATCTTTGGACCGATGATTTCTAATCCCTTACCTGCAACTGTCATGCCCCCAGCTCAGGGGACACCCTCTGCACCAGCAGCTGCA" + "ACCCTGTCTACAGTAACATCTGGGGATCTAGATTTATTCACTGAGCAAACTACAAAATCAGAAGAAGTGGCAAAGAAACAACTTTCCAAAGACTCCATCTTATCTCTG" + "TATGGCACAGGAACCATTCAACAGCAAAGTACTCCTGGTGTATTTATGGGACCCACAAATATACCATTTACCTCACAAGCACCAGCTGCATTTCAGGGCTTTCCATCG" + "ATGGGCGTGCCTGTGCCTGCAGCTCCTGGCCTTATAGGAAATGTGATGGGACAGAGTCCAAGCATGATGGTGGGCATGCCCATGCCCAATGGGTTTATGGGAAATGCA" + "CAAACTGGTGTGATGCCACTTCCTCAGAACGTTGTTGGCCCCCAAGGAGGAATGGTGGGACAAATGGGTGCACCCCAGAGTAAGTTTGGCCTGCCGCAAGCTCAGCAG" + "CCCCAGTGGAGCCTCTCACAGATAATGCAGAAGGGTGATGCTGTTCTCCAGCACTCCATCAGTGCAATCTACTGGCCAATGACAAGGTGGTTAAAATGTCCTTTAGT" + "AGATGAATCAGCAGATGGCTGGCATGAGTATCAGTAG" + ), + # ENST00000361676.8 – used for protein_altering_variant + "NF2": ( + 
"ATGGCCGGGGCCATCGCTTCCCGCATGAGCTTCAGCTCTCTCAAGAGGAAGCAACCCAAGACGTTCACCGTGAGGATCGTCACCATGGACGCCGAGATGGAG" + "TTCAATTGCGAGGTACTGGATCATGATGTTTCAAAGGAAGAACCAGTCACCTTTCACTTCTTGGCCAAATTTTATCCTGAGAATGCTGAAGAGGAGCTGGTT" + "CAGGAGATCACACAACATTTATTCTTCTTACAGGTAAAGAAGCAGATTTTAGATGAAAAGATCTACTGCCCTCCTGAGGCTTCTGTGCTCCTGGCTTCTTACG" + "CCGTCCAGGCCAAGTATGGTGACTACGACCCCAGTGTTCACAAGCGGGGATTTTTGGCCCAAGAGGAATTGCTTCCAAAAAGGGTAATAAATCTGTATCAGAT" + "GACTCCGGAAATGTGGGAGGAGAGAATTACTGCTTGGTACGCAGAGCACCGAGGCCGAGCCAGGGATGAAGCTGAAATGGAATATCTGAAGATAGCTCAGGAC" + "CTGGAGATGTACGGTGTGAACTACTTTGCAATCCGGAATAAAAAGGGCACAGAGCTGCTGCTTGGAGTGGATGCCCTGGGGCTTCACATTTATGACCCTGAGA" + "ACAGACTGACCCCCAAGATCTCCTTCCCGTGGAATGAAATCCGAAACATCTCGTACAGTGACAAGGAGTTTACTATTAAACCACTGGATAAGAAAATTGATGTC" + "TTCAAGTTTAACTCCTCAAAGCTTCGTGTTAATAAGCTGATTCTCCAGCTATGTATCGGGAACCATGATCTATTTATGAGGAGAAGGAAAGCCGATTCTTTGGAA" + "GTTCAGCAGATGAAAGCCCAGGCCAGGGAGGAGAAGGCTAGAAAGCAGATGGAGCGGCAGCGCCTCGCTCGAGAGAAGCAGATGAGGGAGGAGGCTGAACGCACG" + "AGGGATGAGTTGGAGAGGAGGCTGCTGCAGATGAAAGAAGAAGCAACAATGGCCAACGAAGCACTGATGCGGTCTGAGGAGACAGCTGACCTGTTGGCTGAAAAG" + "GCCCAGATCACCGAGGAGGAGGCAAAACTTCTGGCCCAGAAGGCCGCAGAGGCTGAGCAGGAAATGCAGCGCATCAAGGCCACAGCGATTCGCACGGAGGAGGAG" + "AAGCGCCTGATGGAGCAGAAGGTGCTGGAAGCCGAGGTGCTGGCACTGAAGATGGCTGAGGAGTCAGAGAGGAGGGCCAAAGAGGCAGATCAGCTGAAGCAGGAC" + "CTGCAGGAAGCACGCGAGGCGGAGCGAAGAGCCAAGCAGAAGCTCCTGGAGATTGCCACCAAGCCCACGTACCCGCCCATGAACCCAATTCCAGCACCGTTGCCTC" + "CTGACATACCAAGCTTCAACCTCATTGGTGACAGCCTGTCTTTCGACTTCAAAGATACTGACATGAAGCGGCTTTCCATGGAGATAGAGAAAGAAAAAGTGGAATAC" + "ATGGAAAAGAGCAAGCATCTGCAGGAGCAGCTCAATGAACTCAAGACAGAAATCGAGGCCTTGAAACTGAAAGAGAGGGAGACAGCTCTGGATATTCTGCACAATGA" + "GAACTCCGACAGGGGTGGCAGCAGCAAGCACAATACCATTAAAAAGCCTCAAGCCCAAGGCAGAAGACCTATCTGCATTTGA" + ), + # ENST00000394374.6 – used for frameshift_variant + "MEN1": ( + "ATGGGGCTGAAGGCCGCCCAGAAGACGCTGTTCCCGCTGCGCTCCATCGACGACGTGGTGCGCCTGTTTGCTGCCGAGCTGGGCCGAGAGGAGCCGGACCTG" + "GTGCTCCTTTCCTTGGTGCTGGGCTTCGTGGAGCATTTTCTGGCTGTCAACCGCGTCATCCCTACCAACGTTCCCGAGCTCACCTTCCAGCCCAGCCCCGCCC" + "CCGACCCGCCTGGCGGCCTCACCTACTTTCCCGTGGCCGACCTGTCTATCATCGCCGCCCTCTATGCCCGCTTCACCGCCCAGATCCGAGGCGCCGTCGACCTG" + "TCCCTCTATCCTCGAGAAGGGGGTGTCTCCAGCCGTGAGCTGGTGAAGAAGGTCTCCGATGTCATATGGAACAGCCTCAGCCGCTCCTACTTCAAGGATCGGGCC" + "CACATCCAGTCCCTCTTCAGCTTCATCACAGGTTGGAGCCCAGTAGGCACCAAATTGGACAGCTCCGGTGTGGCCTTTGCTGTGGTTGGGGCCTGCCAGGCCCTGG" + "GTCTCCGGGATGTCCACCTCGCCCTGTCTGAGGATCATGCCTGGGTAGTGTTTGGGCCCAATGGGGAGCAGACAGCTGAGGTCACCTGGCACGGCAAGGGCAACGA" + "GGACCGCAGGGGCCAGACAGTCAATGCCGGTGTGGCTGAGCGGAGCTGGCTGTACCTGAAAGGATCATACATGCGCTGTGACCGCAAGATGGAGGTGGCGTTCATG" + "GTGTGTGCCATCAACCCTTCCATTGACCTGCACACCGACTCGCTGGAGCTTCTGCAGCTGCAGCAGAAGCTGCTCTGGCTGCTCTATGACCTGGGACATCTGGAAAG" + "GTACCCCATGGCCTTAGGGAACCTGGCAGATCTAGAGGAGCTGGAGCCCACCCCTGGCCGGCCAGACCCACTCACCCTCTACCACAAGGGCATTGCCTCAGCCAAGAC" + "CTACTATCGGGATGAACACATCTACCCCTACATGTACCTGGCTGGCTACCACTGTCGCAACCGCAATGTGCGGGAAGCCCTGCAGGCCTGGGCGGACACGGCCACTGT" + "CATCCAGGACTACAACTACTGCCGGGAAGACGAGGAGATCTACAAGGAGTTCTTTGAAGTAGCCAATGATGTCATCCCCAACCTGCTGAAGGAGGCAGCCAGCTTGCT" + "GGAGGCGGGCGAGGAGCGGCCGGGGGAGCAAAGCCAGGGCACCCAGAGCCAAGGTTCCGCCCTCCAGGACCCTGAGTGCTTCGCCCACCTGCTGCGATTCTACGACGG" + "CATCTGCAAATGGGAGGAGGGCAGTCCCACGCCTGTGCTGCATGTGGGCTGGGCCACCTTTCTTGTGCAGTCCCTAGGCCGTTTTGAGGGACAGGTGCGGCAGAAGGT" + "GCGCATAGTGAGCCGAGAGGCCGAGGCGGCCGAGGCCGAGGAGCCGTGGGGCGAGGAAGCCCGGGAAGGCCGGCGGCGGGGCCCACGGCGGGAGTCCAAGCCAGAGGAG" + "CCCCCGCCGCCCAAGAAGCCAGCACTGGACAAGGGCCTGGGCACCGGCCAGGGTGCAGTGTCAGGACCCCCCCGGAAGCCTCCTGGGACTGTCGCTGGCACAGCCCGAGG" + 
"CCCTGAAGGTGGCAGCACGGCTCAGGTGCCAGCACCCACAGCATCACCACCGCCGGAGGGTCCAGTGCTCACTTTCCAGAGTGAGAAGATGAAGGGCATGAAGGAGCTGC" + "TGGTGGCCACCAAGATCAACTCGAGCGCCATCAAGCTGCAACTCACGGCACAGTCGCAAGTGCAGATGAAGAAGCAGAAAGTGTCCACCCCTAGTGACTACACTCTGTCT" + "TTCCTCAAGCGGCAGCGCAAAGGCCTCTGA" + ), + # ENST00000513000.5 – used for start_lost + "INPP4B": ( + "ATGGAAATTAAAGAGGAAGGGGCATCAGAAGAAGGGCAGCACTTTCTTCCTACAGCCCAGGCCAATGATCCCGGGGACTGTCAGTTCACAAGTATCCAGAAGAC" + "TCCAAATGAACCGCAGTTGGAATTCATCCTTGCATGCAAGGATCTCGTGGCTCCTGTCCGTGATCGTAAACTGAATACACTGGTGCAGATCTCCGTAATCCACC" + "CCGTGGAGCAGAGTCTGACAAGATACTCCAGCACCGAAATTGTGGAGGGAACAAGGGACCCACTGTTTTTGACTGGTGTCACATTCCCATCTGAGTATCCCATCT" + "ATGAGGAGACCAAAATAAAACTAACAGTCTATGATGTCAAGGATAAGTCTCATGACACCGTTCGAACCAGTGTCCTACCAGAACATAAGGATCCCCCGCCAGAAGT" + "TGGGCGAAGTTTCTTGGGCTATGCCAGTTTTAAAGTGGGAGAGCTGCTGAAGTCAAAGGAGCAATTGCTGGTCCTGAGCCTGAGAACTTCAGATGGTGGCAAAGTG" + "GTTGGCACCATAGAAGTCAGTGTCGTGAAGATGGGGGAGATTGAGGATGGGGAAGCCGACCACATCACCACAGATGTACAGGGACAAAAGTGTGCCCTGGTATGTGA" + "ATGTACAGCCCCGGAAAGTGTGAGCGGAAAAGATAACTTACCTTTTTTGAATTCAGTGTTAAAGAACCCAGTATGTAAATTATATAGATTTCCCACATCTGACAATAAG" + "TGGATGCGAATTCGAGAGCAGATGTCAGAGAGCATTCTTTCCTTTCATATTCCTAAGGAATTGATTTCCCTTCACATTAAAGAAGATTTGTGCAGAAACCAGGAGATA" + "AAAGAACTTGGTGAGCTTTCTCCACATTGGGACAATCTGCGAAAAAATGTCCTTACTCACTGTGATCAAATGGTGAATATGTACCAAGACATTCTGACAGAACTTAGCA" + "AGGAAACAGGGTCCTCTTTCAAATCAAGCAGCAGCAAAGGAGAGAAAACATTAGAATTTGTTCCAATAAATCTACATCTGCAAAGAATGCAGGTACACAGCCCTCACTTG" + "AAAGATGCTCTCTACGATGTCATCACTGTGGGAGCCCCAGCTGCCCATTTTCAGGGATTTAAGAATGGTGGTCTTCGGAAGCTACTCCATAGATTTGAAACAGAAAGAAG" + "AAATACCGGATACCAGTTTATTTACTATTCACCTGAAAACACAGCCAAAGCAAAGGAAGTTCTCAGCAACATCAATCAACTACAACCTCTTATAGCAACCCATGCAGACC" + "TACTGCTTAATTCTGCAAGCCAGCATTCTCCAGACAGCTTGAAGAATTCTTTAAAGATGCTTTCAGAAAAAACAGAGCTTTTTGTACATGCCTTCAAGGATCAACTTGTC" + "AGGAGTGCTCTTTTAGCACTCTACACTGCAAGGCCAGGAGGCATTCTTAAGAAGCCACCCTCTCCTAAGAGCAGCACAGAGGAGAGCAGTCCCCAAGACCAACCCCCAGT" + "GATGAGAGGGCAGGACTCCATACCACATCATTCAGACTATGATGAGGAAGAGTGGGACAGGGTGTGGGCCAATGTGGGGAAGAGCCTGAACTGCATTATTGCTATGGTGG" + "ACAAACTGATTGAAAGAGATGGTGGCAGTGAAGGCAGTGGCGGCAACAATGATGGAGAAAAGGAACCTTCATTAACAGATGCCATTCCCTCTCACCCAAGAGAGGACTGG" + "TATGAACAGTTGTATCCCCTCATCCTTACCCTGAAGGACTGCATGGGAGAAGTGGTGAACCGAGCCAAGCAGTCCCTGACATTTGTGCTCCTTCAGGAACTTGCGTACAGC" + "TTGCCCCAGTGTCTGATGCTGACGCTAAGAAGAGACATCGTCTTCAGCCAAGCACTTGCTGGATTGGTTTGTGGTTTTATCATCAAATTACAGACAAGTCTGTATGACCC" + "AGGCTTCCTACAGCAGCTTCACACAGTGGGGTTGATAGTACAATATGAAGGACTGCTAAGTACATACAGCGATGAAATTGGAATGCTAGAGGACATGGCCGTTGGCATTTC" + "CGATTTAAAGAAAGTTGCATTTAAAATAATTGAAGCCAAATCCAATGATGTGTTGCCAGTTATAACAGGAAGACGAGAACATTACGTGGTAGAGGTCAAGCTTCCAGCCAG" + "AATGTTTGAGTCACTACCTCTACAGATTAAAGAAGGACAGTTGCTTCATGTGTATCCAGTACTTTTTAATGTTGGAATCAATGAACAGCAAACTCTGGCTGAAAGGTTTGG" + "AGATGTCTCTTTGCAAGAAAGTATTAATCAGGAAAACTTCGAACTTCTACAAGAATATTACAAGATATTTATGGAAAAGATGCCTCCTGATTATATTTCACATTTTCAGGAA" + "CAAAATGATTTAAAAGCATTGCTAGAAAATCTCCTTCAAAATATCCAATCCAAAAAAAGAAAGAATGTAGAAATTATGTGGCTGGCTGCAACGATTTGCCGCAAACTGAATG" + "GTATTCGTTTCACCTGTTGTAAAAGTGCCAAAGACAGGACATCGATGTCAGTGACACTTGAACAATGCTCAATCTTGAGAGATGAGCACCAGTTACACAAGGACTTCTTTAT" + "CCGAGCGCTGGATTGCATGAGAAGAGAAGGATGCCGCATAGAGAATGTACTGAAGAATATCAAATGCAGAAAGTATGCTTTCAACATGCTACAGCTGATGGCTTTCCCCAAG" + "TACTACAGACCTCCAGAGGGGACTTATGGAAAAGCTGACACCTAA" + ), + # ENST00000443433.6 – used for stop_lost + "MYD88": ( + "ATGCGACCCGACCGCGCTGAGGCTCCAGGACCGCCCGCCATGGCTGCAGGAGGTCCCGGCGCGGGGTCTGCGGCCCCGGTCTCCTCCACATCCTCCCTTCCCC" + "TGGCTGCTCTCAACATGCGAGTGCGGCGCCGCCTGTCTCTGTTCTTGAACGTGCGGACACAGGTGGCGGCCGACTGGACCGCGCTGGCGGAGGAGATGGACTTT" + 
"GAGTACTTGGAGATCCGGCAACTGGAGACACAAGCGGACCCCACTGGCAGGCTGCTGGACGCCTGGCAGGGACGCCCTGGCGCCTCTGTAGGCCGACTGCTCGAG" + "CTGCTTACCAAGCTGGGCCGCGACGACGTGCTGCTGGAGCTGGGACCCAGCATTGAGGAGGATTGCCAAAAGTATATCTTGAAGCAGCAGCAGGAGGAGGCTGAG" + "AAGCCTTTACAGGTGGCCGCTGTAGACAGCAGTGTCCCACGGACAGCAGAGCTGGCGGGCATCACCACACTTGATGACCCCCTGGGTGCCGCCGGATGGTGGTGGT" + "TGTCTCTGATGATTACCTGCAGAGCAAGGAATGTGACTTCCAGACCAAATTTGCACTCAGCCTCTCTCCAGGTGCCCATCAGAAGCGACTGA" + ), + # ENST00000494014.1 – used for stop_retained_variant + "EPHA3": ( + "ATGGATTGTCAGCTCTCCATCCTCCTCCTTCTCAGCTGCTCTGTTCTCGACAGCTTCGGGGAACTGATTCCGCAGCCTTCCAATGAAGTCAATCTACTGGATTC" + "AAAAACAATTCAAGGGGAGCTGGGCTGGATCTCTTATCCATCACATGGGTGGGAAGAGATCAGTGGTGTGGATGAACATTACACACCCATCAGGACTTACCAGGTG" + "TGCAATGTCATGGACCACAGTCAAAACAATTGGCTGAGAACAAACTGGGTCCCCAGGAACTCAGCTCAGAAGATTTATGTGGAGCTCAAGTTCACTCTACGAGACTG" + "CAATAGCATTCCATTGGTTTTAGGAACTTGCAAGGAGACATTCAACCTGTACTACATGGAGTCTGATGATGATCATGGGGTGAAATTTCGAGAGCATCAGTTTACAAA" + "GATTGACACCATTGCAGCTGATGAAAGTTTCACTCAAATGGATCTTGGGGACCGTATTCTGAAGCTCAACACTGAGATTAGAGAAGTAGGTCCTGTCAACAAGAAGGG" + "ATTTTATTTGGCATTTCAAGATGTTGGTGCTTGTGTTGCCTTGGTGTCTGTGAGAGTATACTTCAAAAAGTGCCCATTTACAGTGAAGAATCTGGCTATGTTTCCAGA" + "CACGGTACCCATGGACTCCCAGTCCCTGGTGGAGGTTAGAGGGTCTTGTGTCAACAATTCTAAGGAGGAAGATCCTCCAAGGATGTACTGCAGTACAGAAGGCGAATGG" + "CTTGTACCCATTGGCAAGTGTTCCTGCAATGCTGGCTATGAAGAAAGAGGTTTTATGTGCCAAGCTTGTCGACCAGGTTTCTACAAGGCATTGGATGGTAATATGAAGT" + "GTGCTAAGTGCCCGCCTCACAGTTCTACTCAGGAAGATGGTTCAATGAACTGCAGGTGTGAGAATAATTACTTCCGGGCAGACAAAGACCCTCCATCCATGGCTTGTACC" + "CGACCTCCATCTTCACCAAGAAATGTTATCTCTAATATAAACGAGACCTCAGTTATCCTGGACTGGAGTTGGCCCCTGGACACAGGAGGCCGGAAAGATGTTACCTTCAAC" + "ATCATATGTAAAAAATGTGGGTGGAATATAAAACAGTGTGAGCCATGCAGCCCAAATGTCCGCTTCCTCCCTCGACAGTTTGGACTCACCAACACCACGGTGACAGTGACAG" + "ACCTTCTGGCACATACTAACTACACCTTTGAGATTGATGCCGTTAATGGGGTGTCAGAGCTGAGCTCCCCACCAAGACAGTTTGCTGCGGTCAGCATCACAACTAATCAGGC" + "TGCTCCATCACCTGTCCTGACGATTAAGAAAGATCGGACCTCCAGAAATAGCATCTCTTTGTCCTGGCAAGAACCTGAACATCCTAATGGGATCATATTGGACTACGAGGTC" + "AAATACTATGAAAAGCAGGAACAAGAAACAAGTTATACCATTCTGAGGGCAAGAGGCACAAATGTTACCATCAGTAGCCTCAAGCCTGACACTATATACGTATTCCAAATCC" + "GAGCCCGAACAGCCGCTGGATATGGGACGAACAGCCGCAAGTTTGAGTTTGAAACTAGTCCAGACTCTTTCTCCATCTCTGGTGAAAGTAGCCAAGTGGTCATGATCGCCAT" + "TTCAGCGGCAGTAGCAATTATTCTCCTCACTGTTGTCATCTATGTTTTGATTGGGAGGTTCTGTGGCTATAAGTCAAAACATGGGGCAGATGAAAAAAGACTTCATTTTGGC" + "AATGGGCATTTAAAACTTCCAGGTCTCAGGACTTATGTTGACCCACATACATATGAAGACCCTACCCAAGCTGTTCATGAGTTTGCCAAGGAATTGGATGCCACCAACATATC" + "CATTGATAAAGTTGTTGGAGCAGGTGAATTTGGAGAGGTGTGCAGTGGTCGCTTAAAACTTCCTTCAAAAAAAGAGATTTCAGTGGCCATTAAGACCCTGAAAGTTGGCTAC" + "ACAGAAAAGCAGAGGAGAGACTTCCTGGGAGAAGCAAGCATTATGGGACAGTTTGACCACCCCAATATCATTCGACTGGAAGGAGTTGTTACCAAAAGTAAGCCAGTTATGA" + "TTGTCACAGAATACATGGAGAATGGTTCCTTGGATAGTTTCCTACGTAAACACGATGCCCAGTTTACTGTCATTCAGCTAGTGGGGATGCTTCGAGGGATAGCATCTGGCAT" + "GAAGTACCTGTCAGACATGGGCTATGTTCACCGAGACCTCGCTGCTCGGAACATCTTGATCAACAGTAACTTGGTGTGTAAGGTTTCTGATTTCGGACTTTCGCGTGTCCTG" + "GAGGATGACCCAGAAGCTGCTTATACAACAAGAGGAGGGAAGATCCCAATCAGGTGGACATCACCAGAAGCTATAGCCTACCGCAAGTTCACGTCAGCCAGCGATGTATGGAG" + "TTATGGGATTGTTCTCTGGGAGGTGATGTCTTATGGAGAGAGACCATACTGGGAGATGTCCAATCAGGATGTAATTAAAGCTGTAGATGAGGGCTATCGACTGCCACCCCCC" + "ATGGACTGCCCAGCTGCCTTGTATCAGCTGATGCTGGACTGCTGGCAGAAAGACAGGAACAACAGACCCAAGTTTGAGCAGATTGTTAGTATTCTGGACAAGCTTATCCGGA" + "ATCCCGGCAGCCTGAAGATCATCACCAGTGCAGCCGCAAGGCCATCAAACCTTCTTCTGGACCAAAGCAATGTGGATATCACTACCTTCCGCACAACAGTGACATGA" + ), +} + + +def _seq(gene: str) -> Seq: + return Seq("".join(_CDS[gene].split())) + + +def _wt_protein(gene: str) -> str: + return str(_seq(gene).translate(to_stop=False)) 
+ + +def _call(gene, dna_mut, aa_mut, mutation_type) -> str: + snp = SNP(gene=gene, mrna="", dna_mut=dna_mut, aa_mut=aa_mut, mutation_type=mutation_type) + return CancerGenomesService.get_mut_pro_seq(snp, _seq(gene)) + + +# --------------------------------------------------------------------------- +# Actionable types — should produce a non-empty mutant protein +# --------------------------------------------------------------------------- + +class TestActionableVariantTypes: + + def test_missense_variant(self): + """HGVS rule: single AA substitution leaves total protein length unchanged. + + Source: EZH2 c.656C>T p.S219F (missense_variant) + DNA path: substitution at position 656 (C→T) changes codon 219 from Ser to Phe. + """ + wt = _wt_protein("EZH2") + result = _call("EZH2", "c.656C>T", "p.S219F", "missense_variant") + + assert result, "missense_variant must produce a non-empty protein" + assert len(result) == len(wt), ( + f"Missense must preserve protein length: wt={len(wt)}, mut={len(result)}" + ) + assert result[218] == "F", ( + f"Position 219 (index 218) must change to Phe; got '{result[218]}'" + ) + assert result[:218] == wt[:218], "Residues before the substitution must be unchanged" + assert result[219:] == wt[219:], "Residues after the substitution must be unchanged" + + def test_stop_gained(self): + """HGVS rule: stop codon (*) introduced at the predicted position. + + Source: FCGR3B c.394A>T p.K132* (stop_gained) + DNA path: substitution at position 394 (A→T) converts codon 132 to a stop. + """ + result = _call("FCGR3B", "c.394A>T", "p.K132*", "stop_gained") + + assert result, "stop_gained must produce a non-empty translated sequence" + assert result[131] == "*", ( + f"Stop codon must be introduced at position 132 (index 131); got '{result[131]}'" + ) + + def test_inframe_deletion(self): + """HGVS rule: in-frame deletion shortens the protein by exactly the number of deleted codons. + + Source: CTNNB1 c.104_124del p.T35_G41del (inframe_deletion) + DNA path: deletes 21 nucleotides (positions 104–124) = 7 codons = 7 amino acids. + """ + wt = _wt_protein("CTNNB1") + result = _call("CTNNB1", "c.104_124del", "p.T35_G41del", "inframe_deletion") + + deleted_nt = 124 - 104 + 1 # 21 nt = 7 codons + deleted_aa = deleted_nt // 3 + + assert result, "inframe_deletion must produce a non-empty protein" + assert len(result) == len(wt) - deleted_aa, ( + f"Inframe deletion of {deleted_aa} AAs: expected len {len(wt) - deleted_aa}, " + f"got {len(result)}" + ) + assert result[:34] == wt[:34], "Residues before the deletion must be unchanged" + + def test_inframe_insertion_dup(self): + """HGVS rule: in-frame duplication lengthens the protein by exactly the number of duplicated codons. + + Source: SMAP1 c.1226_1228dup p.I409dup (inframe_insertion) + DNA path: duplicates 3 nucleotides (positions 1226–1228) = 1 codon = 1 amino acid inserted. + """ + wt = _wt_protein("SMAP1") + result = _call("SMAP1", "c.1226_1228dup", "p.I409dup", "inframe_insertion") + + inserted_nt = 1228 - 1226 + 1 # 3 nt = 1 codon + inserted_aa = inserted_nt // 3 + + assert result, "inframe_insertion (dup) must produce a non-empty protein" + assert len(result) == len(wt) + inserted_aa, ( + f"Duplication of {inserted_aa} AA: expected len {len(wt) + inserted_aa}, " + f"got {len(result)}" + ) + assert result[:409] == wt[:409], "Residues before the duplication site must be unchanged" + + def test_protein_altering_variant(self): + """HGVS rule: delins replaces a nucleotide range; result is non-empty and differs from WT. 
+ + Source: NF2 c.252_262delinsCT p.L85_K88delins* (protein_altering_variant) + DNA path: deletes positions 252–262 (11 nt) and inserts CT (2 nt). + The AA annotation (delins*) predicts a premature stop in the altered sequence. + """ + wt = _wt_protein("NF2") + result = _call("NF2", "c.252_262delinsCT", "p.L85_K88delins*", "protein_altering_variant") + + assert result, "protein_altering_variant must produce a non-empty sequence" + assert result != wt, "protein_altering_variant must differ from the wild-type sequence" + + def test_frameshift_variant(self): + """HGVS rule: out-of-frame deletion disrupts the reading frame; the code translates the + resulting sequence as-is (to_stop=False) producing a non-empty aberrant protein. + + Source: MEN1 c.292del p.R98Efs*21 (frameshift_variant) + DNA path: single-nucleotide deletion at position 292 shifts the reading frame. + """ + result = _call("MEN1", "c.292del", "p.R98Efs*21", "frameshift_variant") + + assert result, "frameshift_variant must produce a non-empty translated sequence" + + def test_start_lost(self): + """HGVS rule: start codon destroyed; the processed sequence no longer begins with Met. + + Source: INPP4B c.3G>T p.M1? (start_lost) + The annotation is p.M1? (ambiguous protein consequence) but dna_mut is unambiguous, + so the DNA substitution path is taken. ATG (Met) → ATT (Ile) at codon 1. + """ + result = _call("INPP4B", "c.3G>T", "p.M1?", "start_lost") + + assert result, "start_lost must produce a non-empty translated sequence via the DNA path" + assert result[0] != "M", "Start codon is destroyed; first AA must no longer be Met" + assert result[0] == "I", "ATG→ATT encodes Ile; first AA must be Ile" + + def test_stop_lost(self): + """HGVS rule: stop codon removed; the protein extends past the original terminus. + + Source: MYD88 c.611_613delinsGCGGCCCCC p.D204_*205delinsGGPR (protein_altering_variant,stop_lost) + DNA path: delins at 611–613 removes the original stop codon; the protein extends. + """ + wt = _wt_protein("MYD88") + result = _call( + "MYD88", + "c.611_613delinsGCGGCCCCC", + "p.D204_*205delinsGGPR", + "protein_altering_variant,stop_lost", + ) + + assert result, "stop_lost must produce a non-empty protein" + assert len(result) > len(wt), ( + f"Stop-lost protein must be longer than WT: wt={len(wt)}, mut={len(result)}" + ) + + def test_stop_retained_variant(self): + """HGVS rule: synonymous change at the stop codon; stop is preserved; length unchanged. + + Source: EPHA3 c.2756G>A p.*919= (stop_retained_variant) + DNA path: TGA (stop) → TAA (stop); both are stop codons, protein length is unchanged. + """ + wt = _wt_protein("EPHA3") + result = _call("EPHA3", "c.2756G>A", "p.*919=", "stop_retained_variant") + + assert result, "stop_retained_variant must produce a non-empty sequence" + assert len(result) == len(wt), ( + f"Stop-retained must preserve protein length: wt={len(wt)}, mut={len(result)}" + ) + assert result[-1] == "*", "Stop codon must still be present at the terminus" + + +# --------------------------------------------------------------------------- +# Filtered / skipped types — get_mut_pro_seq must return an empty string +# --------------------------------------------------------------------------- + +class TestSkippedVariantTypes: + + def test_synonymous_variant_is_prefiltered(self): + """synonymous_variant rows are discarded in cosmic_to_proteindb before get_mut_pro_seq + is ever called. Verify the filter expression matches the v103 SO term. 
+ """ + assert "synonymous_variant" in "synonymous_variant" + assert "synonymous_variant" in "splice_region_variant,synonymous_variant" + assert "synonymous_variant" not in "missense_variant" + + def test_intron_variant_returns_empty(self): + """intron_variant always carries p.? — the p.? guard returns an empty string. + + Source: RAD51C c.145+706C>T p.? (intron_variant) + """ + result = _call("EZH2", "c.145+706C>T", "p.?", "intron_variant") + assert result == "", f"intron_variant with p.? must return ''; got {repr(result)}" + + def test_3_prime_utr_variant_returns_empty(self): + """3_prime_UTR_variant always carries p.? — the p.? guard returns an empty string. + + Source: SUFU c.*2816G>A p.? (3_prime_UTR_variant) + The c.*N notation signals a 3'UTR position; the outer if-condition fails on p.?. + """ + result = _call("EZH2", "c.*2816G>A", "p.?", "3_prime_UTR_variant") + assert result == "", f"3_prime_UTR_variant with p.? must return ''; got {repr(result)}" + + def test_5_prime_utr_variant_returns_empty(self): + """5_prime_UTR_variant always carries p.? — the p.? guard returns an empty string. + + Source: PSMD13 c.-20C>T p.? (5_prime_UTR_variant) + """ + result = _call("EZH2", "c.-20C>T", "p.?", "5_prime_UTR_variant") + assert result == "", f"5_prime_UTR_variant with p.? must return ''; got {repr(result)}" + + def test_coding_sequence_variant_returns_empty(self): + """coding_sequence_variant in v103 uses intronic-offset positions with p.?. + + Source: OAS2 c.2157+2A>G p.? (coding_sequence_variant) + Both the intronic offset guard ('+' in coord) and the p.? guard ensure ''. + """ + result = _call("EZH2", "c.2157+2A>G", "p.?", "coding_sequence_variant") + assert result == "", f"coding_sequence_variant with p.? must return ''; got {repr(result)}" + + def test_splice_acceptor_variant_returns_empty(self): + """splice_acceptor_variant uses intronic-offset position notation with p.?. + + Source: TSHR c.171-1G>A p.? (splice_acceptor_variant) + The '-' offset guard in the delins branch (and p.? in the substitution guard) return ''. + """ + result = _call("EZH2", "c.171-1G>A", "p.?", "splice_acceptor_variant") + assert result == "", f"splice_acceptor_variant with p.? must return ''; got {repr(result)}" + + def test_splice_donor_variant_returns_empty(self): + """splice_donor_variant uses downstream-offset position notation with p.?. + + Source: PTCH1 c.3306+2T>G p.? (splice_donor_variant) + """ + result = _call("EZH2", "c.3306+2T>G", "p.?", "splice_donor_variant") + assert result == "", f"splice_donor_variant with p.? must return ''; got {repr(result)}" + + def test_splice_region_variant_composite_treated_as_primary_type(self): + """splice_region_variant only appears in v103 as part of a composite type string. + When combined with missense_variant, the missense guard fires first and the + mutation is processed as a missense — same-length result with a single AA change. + + Source: EZH2 c.656C>T p.S219F (missense_variant — re-used with composite type) + """ + wt = _wt_protein("EZH2") + result = _call( + "EZH2", "c.656C>T", "p.S219F", "missense_variant,splice_region_variant" + ) + assert result, "composite type containing missense_variant must produce a result" + assert len(result) == len(wt), "missense-handled result must preserve protein length" + assert result[218] == "F", "Substitution at codon 219 must produce Phe" + + def test_incomplete_terminal_codon_variant_returns_empty(self): + """incomplete_terminal_codon_variant is absent from v103 targeted-screen data. 
+ By convention it carries p.?, so get_mut_pro_seq returns an empty string. + """ + result = _call("EZH2", "c.2213_2214insA", "p.?", "incomplete_terminal_codon_variant") + assert result == "", ( + f"incomplete_terminal_codon_variant with p.? must return ''; got {repr(result)}" + ) From 311fafb7c36fc45ab5f0981b807f96375c16a738 Mon Sep 17 00:00:00 2001 From: Husen Umer Date: Wed, 13 May 2026 14:42:29 +0300 Subject: [PATCH 17/23] minor command fixes in use-cases file --- docs/use-cases.md | 21 ++++----------------- 1 file changed, 4 insertions(+), 17 deletions(-) diff --git a/docs/use-cases.md b/docs/use-cases.md index f08f690..3f9e0a4 100644 --- a/docs/use-cases.md +++ b/docs/use-cases.md @@ -32,7 +32,7 @@ pgatk ensembl-downloader \ ```bash gffread -F -w ensembl_human/transcripts.fa \ - -g ensembl_human/genome.fa \ + -g ensembl_human/Homo_sapiens.GRCh38.dna_sm.toplevel.fa \ ensembl_human/Homo_sapiens.GRCh38.*.gtf ``` @@ -106,19 +106,7 @@ pgatk dnaseq-to-proteindb \ --skip_including_all_cds ``` -### Step 5 -- Population variant proteins - -Include common human variants from ENSEMBL: - -```bash -pgatk vcf-to-proteindb \ - --vcf ensembl_human/homo_sapiens_incl_consequences.vcf \ - --input_fasta ensembl_human/transcripts.fa \ - --gene_annotations_gtf ensembl_human/Homo_sapiens.GRCh38.*.gtf \ - --output_proteindb ensembl_variants.fa -``` - -### Step 6 -- COSMIC somatic mutations (optional, for cancer cell lines) +### Step 5 -- COSMIC somatic mutations (optional, for cancer cell lines) For cancer cell-line studies, add cell-line-specific somatic mutations: @@ -136,7 +124,7 @@ pgatk cosmic-to-proteindb \ --split_by_filter_column ``` -### Step 7 -- Combine and generate target-decoy database +### Step 6 -- Combine and generate target-decoy database ```bash # Combine all components @@ -146,7 +134,6 @@ cat canonical.fa \ lncrna.fa \ putative.fa \ altorf.fa \ - ensembl_variants.fa \ > cell_type_target.fa # Generate decoy sequences @@ -211,7 +198,7 @@ extract transcript sequences from the GTF and genome FASTA: ```bash gffread -F -w ensembl_human/transcripts.fa \ - -g ensembl_human/genome.fa \ + -g ensembl_human/Homo_sapiens.GRCh38.dna_sm.toplevel.fa \ ensembl_human/Homo_sapiens.GRCh38.*.gtf ``` From dd286f8941e764ba6fed55dd07446e809acd94ae Mon Sep 17 00:00:00 2001 From: Husen Umer Date: Wed, 13 May 2026 18:49:47 +0300 Subject: [PATCH 18/23] Update COSMIC integration for v103 schema changes - Updated input file names and parameters in the use cases to reflect the new COSMIC v103 format. - Introduced the `--clinical_sample_file` option to allow mapping of `COSMIC_PHENOTYPE_ID` to `PRIMARY_SITE` for filtering. - Enhanced the `CancerGenomesService` to utilize the classification file for phenotype mapping, improving mutation filtering based on tissue type. - Adjusted documentation to clarify the new data model and integration tests for COSMIC v103. - Updated test cases to align with the new schema and ensure accurate output generation. 
---
 docs/use-cases.md                             |  49 ++++---
 docs/validations.md                           | 127 ++++++++++++++
 pgatk/cgenomes/cgenomes_proteindb.py          | 112 ++++++++-----
 pgatk/commands/cosmic_to_proteindb.py         |  10 +-
 pgatk/config/cosmic_config.yaml               |   3 +-
 pgatk/testdata/proteindb_from_custom_VCF.fa   |   8 +-
 pgatk/testdata/test_cosmic_classification.tsv |   9 ++
 pgatk/testdata/test_cosmic_mutations.tsv      |  26 ++--
 .../test_cosmic_mutations_proteindb.fa        |  14 +-
 .../test_cosmic_mutations_proteindb_bone.fa   |   4 +-
 .../test_cosmic_mutations_proteindb_liver.fa  |   2 +-
 .../test_cosmic_mutations_proteindb_skin.fa   |   4 +-
 ...test_cosmic_mutations_proteindb_thyroid.fa |   2 +-
 ...tions_proteindb_upperaerodigestivetract.fa |   4 +-
 pgatk/tests/pgatk_tests.py                    |   3 +-
 15 files changed, 292 insertions(+), 85 deletions(-)
 create mode 100644 pgatk/testdata/test_cosmic_classification.tsv

diff --git a/docs/use-cases.md b/docs/use-cases.md
index 3f9e0a4..0442c9c 100644
--- a/docs/use-cases.md
+++ b/docs/use-cases.md
@@ -117,10 +117,11 @@ pgatk cosmic-downloader \
   -o cosmic_data

 pgatk cosmic-to-proteindb \
-  --input_mutation cosmic_data/CosmicCLP_MutantExport.tsv.gz \
-  --input_genes cosmic_data/All_CellLines_Genes.fasta.gz \
+  --input_mutation cosmic_data/CellLinesProject_GenomeScreensMutant_v103_GRCh38.tsv.gz \
+  --input_genes cosmic_data/Cosmic_Genes_v103_GRCh38.fasta.gz \
   --output_db cosmic_cellline.fa \
-  --filter_column "Sample name" \
+  --clinical_sample_file cosmic_data/Cosmic_Classification_v103_GRCh38.tsv.gz \
+  --filter_column PRIMARY_SITE \
   --split_by_filter_column
 ```
 
@@ -144,7 +145,7 @@ pgatk generate-decoy \
   --decoy_prefix DECOY_
 ```
 
-### Step 8 -- Extract non-canonical unique peptides
+### Step 7 -- Extract non-canonical unique peptides
 
 After database searching with your search engine, you can also pre-compute the
 set of non-canonical peptides unique to these novel sources:
@@ -381,24 +382,34 @@ pgatk cosmic-downloader \
 
 # Generate per-tissue databases
 pgatk cosmic-to-proteindb \
-  --input_mutation cosmic_data/CosmicMutantExport.tsv.gz \
-  --input_genes cosmic_data/All_COSMIC_Genes.fasta.gz \
+  --input_mutation cosmic_data/Cosmic_GenomeScreensMutant_v103_GRCh38.tsv.gz \
+  --input_genes cosmic_data/Cosmic_Genes_v103_GRCh38.fasta.gz \
   --output_db cosmic_proteins.fa \
+  --clinical_sample_file cosmic_data/Cosmic_Classification_v103_GRCh38.tsv.gz \
+  --filter_column PRIMARY_SITE \
   --split_by_filter_column
 ```
 
 This produces files like `cosmic_proteins_lung.fa`, `cosmic_proteins_breast.fa`,
 etc. Use the tissue-matched database for your cancer type of interest.
 
+!!! note "COSMIC v103 tissue annotation"
+    In COSMIC v103, `PRIMARY_SITE` is not a column in the mutation file. It lives
+    in the separate Classification file (`Cosmic_Classification_v103_GRCh38.tsv.gz`)
+    and is joined via `COSMIC_PHENOTYPE_ID`. Pass the Classification file via
+    `--clinical_sample_file` whenever tissue-type filtering or splitting is needed.
+
 ### 4b. Cancer-type specific database using COSMIC somatic mutations
 
 Build a focused database for one cancer type (e.g.
lung): ```bash pgatk cosmic-to-proteindb \ - --input_mutation cosmic_data/CosmicMutantExport.tsv.gz \ - --input_genes cosmic_data/All_COSMIC_Genes.fasta.gz \ + --input_mutation cosmic_data/Cosmic_GenomeScreensMutant_v103_GRCh38.tsv.gz \ + --input_genes cosmic_data/Cosmic_Genes_v103_GRCh38.fasta.gz \ --output_db cosmic_lung_proteins.fa \ + --clinical_sample_file cosmic_data/Cosmic_Classification_v103_GRCh38.tsv.gz \ + --filter_column PRIMARY_SITE \ --accepted_values "lung" ``` @@ -409,10 +420,11 @@ provides a dedicated cell-line export with mutations annotated per sample: ```bash pgatk cosmic-to-proteindb \ - --input_mutation cosmic_data/CosmicCLP_MutantExport.tsv.gz \ - --input_genes cosmic_data/All_CellLines_Genes.fasta.gz \ + --input_mutation cosmic_data/CellLinesProject_GenomeScreensMutant_v103_GRCh38.tsv.gz \ + --input_genes cosmic_data/Cosmic_Genes_v103_GRCh38.fasta.gz \ --output_db cosmic_cellline_proteins.fa \ - --filter_column "Sample name" \ + --clinical_sample_file cosmic_data/Cosmic_Classification_v103_GRCh38.tsv.gz \ + --filter_column PRIMARY_SITE \ --split_by_filter_column ``` @@ -446,15 +458,16 @@ pgatk cbioportal-to-proteindb \ ### 4e. Combined cancer database for immunopeptidomics For HLA immunopeptidomics / neoantigen discovery, a broad mutation database -maximizes the chance of detecting mutant HLA-presented peptides. Studies have -shown that COSMIC-derived databases can identify 5x more mutant immunopeptides -than patient WES alone. Combine COSMIC with ClinVar: +maximizes the chance of detecting mutant HLA-presented peptides. COSMIC-derived +databases have been shown to identify 5x more mutant HLA-I immunopeptides than +patient WES alone ([Wang et al., *J Transl Med* 2024](https://doi.org/10.1186/s12967-023-04821-0)). +Combine COSMIC with ClinVar: ```bash # Generate COSMIC mutations pgatk cosmic-to-proteindb \ - --input_mutation cosmic_data/CosmicMutantExport.tsv.gz \ - --input_genes cosmic_data/All_COSMIC_Genes.fasta.gz \ + --input_mutation cosmic_data/Cosmic_GenomeScreensMutant_v103_GRCh38.tsv.gz \ + --input_genes cosmic_data/Cosmic_Genes_v103_GRCh38.fasta.gz \ --output_db cosmic_proteins.fa # Generate ClinVar mutations @@ -973,8 +986,8 @@ pgatk clinvar-to-proteindb \ # COSMIC somatic mutations pgatk cosmic-to-proteindb \ - --input_mutation cosmic_data/CosmicMutantExport.tsv.gz \ - --input_genes cosmic_data/All_COSMIC_Genes.fasta.gz \ + --input_mutation cosmic_data/Cosmic_GenomeScreensMutant_v103_GRCh38.tsv.gz \ + --input_genes cosmic_data/Cosmic_Genes_v103_GRCh38.fasta.gz \ --output_db cosmic_variants.fa # Non-coding RNA (lncRNA) diff --git a/docs/validations.md b/docs/validations.md index 015ef72..6dd3acc 100644 --- a/docs/validations.md +++ b/docs/validations.md @@ -128,3 +128,130 @@ print(mut_prot[218]) # 'F' — mutant Phenylalanine | `TestGetMutProSeqEdgeCases` | Mutation beyond sequence length, ambiguous `p.?` returns `""`, non-alpha last char in aa_mut returns `""`, DNA ref-allele mismatch returns `""` | These tests are fast to run and isolate individual code paths without requiring reference files. + +--- + +# 2. 
COSMIC Integration Pipeline Tests + +## 2.1 COSMIC v103 Schema and Data Model + +COSMIC v103 introduced two breaking changes from earlier releases: + +**Column name changes** — all columns are now `UPPERCASE_UNDERSCORE`: + +| Old (v2/v98) | New (v103) | +|---|---| +| `Gene name` | `GENE_SYMBOL` | +| `Accession Number` | `TRANSCRIPT_ACCESSION` | +| `CDS mutation` | `MUTATION_CDS` | +| `AA Mutation` | `MUTATION_AA` | +| `Type` | `MUTATION_DESCRIPTION` | +| `Primary site` | *(moved to Classification file)* | + +**Tissue-type data architecture** — `PRIMARY_SITE` (tissue of origin) is no longer a column in the mutation export. It lives in a separate **Classification file** (`Cosmic_Classification_Tsv_v103_GRCh38.tar`) and is joined via `COSMIC_PHENOTYPE_ID`: + +``` +mutation_file.COSMIC_PHENOTYPE_ID (e.g. COSO1001) + → Cosmic_Classification_v103_GRCh38.tsv.COSMIC_PHENOTYPE_ID + → PRIMARY_SITE (e.g. upper_aerodigestive_tract) +``` + +`COSMIC_PHENOTYPE_ID` is present at column 6 in both `Cosmic_CompleteTargetedScreensMutant` and `Cosmic_GenomeScreensMutant` files. No hop through the Sample file is needed. + +> **Note**: The Sample file (`Cosmic_Sample_Tsv_v103_GRCh38.tsv`) contains 34 columns covering sequencing metadata (`SAMPLE_NAME`, `COSMIC_SAMPLE_ID`, `TUMOUR_ID`, etc.) but does **not** contain `PRIMARY_SITE`. The Classification file is the authoritative source for tissue and histology. + +--- + +## 2.2 Integration Test (`test_cosmic_to_proteindb`) + +**Test file**: `pgatk/tests/pgatk_tests.py::PgatkRunnerTests::test_cosmic_to_proteindb` + +The test runs the full `cosmic-to-proteindb` pipeline end-to-end using small testdata files that mirror the real v103 schema: + +| File | Purpose | +|---|---| +| `testdata/test_cosmic_mutations.tsv` | 12 HRAS mutation rows in v103 27-column format; covers `missense_variant`, `stop_gained`, `inframe_deletion`, `synonymous_variant` (filtered), `frameshift_variant` (`p.?`, skipped) | +| `testdata/test_cosmic_genes.fa` | HRAS CDS FASTA from `Cosmic_Genes_v103_GRCh38.fasta` | +| `testdata/test_cosmic_classification.tsv` | 8 rows mapping `COSMIC_PHENOTYPE_ID` (COSO1001–COSO1008) to `PRIMARY_SITE`; covers `upper_aerodigestive_tract`, `skin`, `bone`, `thyroid`, `liver`, `haematopoietic_and_lymphoid_tissue` | + +CLI invocation used in the test: + +``` +cosmic-to-proteindb + --config_file config/cosmic_config.yaml + --input_mutation testdata/test_cosmic_mutations.tsv + --input_genes testdata/test_cosmic_genes.fa + --output_db testdata/test_cosmic_mutations_proteindb.fa + --clinical_sample_file testdata/test_cosmic_classification.tsv + --filter_column PRIMARY_SITE + --split_by_filter_column + --accepted_values all +``` + +Expected output files (split per `PRIMARY_SITE`): + +``` +testdata/test_cosmic_mutations_proteindb_bone.fa +testdata/test_cosmic_mutations_proteindb_liver.fa +testdata/test_cosmic_mutations_proteindb_skin.fa +testdata/test_cosmic_mutations_proteindb_thyroid.fa +testdata/test_cosmic_mutations_proteindb_upperaerodigestivetract.fa +``` + +The `synonymous_variant` row (COSO1006) is pre-filtered and produces no output. The `frameshift_variant` row carrying `p.?` (COSO1008) is skipped by the `p.?` guard. The consolidated `testdata/test_cosmic_mutations_proteindb.fa` contains all non-filtered mutations regardless of tissue type. + +--- + +## 2.3 `--clinical_sample_file` CLI Option + +The `--clinical_sample_file` / `-cl` option accepts any TSV file that maps a sample/phenotype ID column to the `--filter_column` value. 
For COSMIC, this is the Classification file: + +| Argument | COSMIC v103 value | +|---|---| +| `--clinical_sample_file` | `Cosmic_Classification_v103_GRCh38.tsv` (or `.gz`) | +| `--filter_column` | `PRIMARY_SITE` | +| Join key used internally | `COSMIC_PHENOTYPE_ID` (auto-detected for COSMIC) | + +For **cBioportal**, the equivalent is the clinical sample file with `SAMPLE_ID` as the join key — the `--clinical_sample_file` option uses `SAMPLE_ID` by default and switches to `COSMIC_PHENOTYPE_ID` only when invoked via `cosmic-to-proteindb`. + +--- + +## 2.4 Independent Validation with Real COSMIC Files + +After downloading COSMIC v103 files into `use-cases/use-case2/cosmic_data/`, run the pipeline against a subset of real mutations: + +```bash +# Extract 500 actionable mutations (skip intronic/UTR/synonymous) +zcat use-cases/use-case2/cosmic_data/Cosmic_CompleteTargetedScreensMutant_v103_GRCh38.tsv.gz \ + | head -1 > /tmp/cosmic_test.tsv +zcat use-cases/use-case2/cosmic_data/Cosmic_CompleteTargetedScreensMutant_v103_GRCh38.tsv.gz \ + | grep -E "missense_variant|stop_gained|inframe_deletion|inframe_insertion|frameshift" \ + | head -500 >> /tmp/cosmic_test.tsv + +pgatk cosmic-to-proteindb \ + --config_file pgatk/config/cosmic_config.yaml \ + --input_mutation /tmp/cosmic_test.tsv \ + --input_genes <(zcat use-cases/use-case2/cosmic_data/Cosmic_Genes_v103_GRCh38.fasta.gz) \ + --output_db /tmp/cosmic_out.fa \ + --clinical_sample_file <(zcat use-cases/use-case2/cosmic_data/Cosmic_Classification_v103_GRCh38.tsv.gz) \ + --filter_column PRIMARY_SITE \ + --split_by_filter_column \ + --accepted_values all +``` + +Expected: ~27 per-tissue output files, including canonical COSMIC tissue types (`lung`, `breast`, `large_intestine`, `skin`, `haematopoietic_and_lymphoid_tissue`, etc.). FASTA headers follow the format: + +``` +>COSMIC:GENE_SYMBOL:p.MutAA:SO_term +``` + +To verify the join is working (i.e. phenotype IDs are resolving to tissue names): + +```bash +# Count sequences per tissue file +for f in /tmp/cosmic_out_*.fa; do + echo "$f: $(grep -c '^>' $f) sequences" +done +``` + +A file named `cosmic_out_NS.fa` is expected for phenotypes where tissue is not specified in the Classification file — these are retained rather than silently dropped. diff --git a/pgatk/cgenomes/cgenomes_proteindb.py b/pgatk/cgenomes/cgenomes_proteindb.py index 70fb72b..0d1e39f 100644 --- a/pgatk/cgenomes/cgenomes_proteindb.py +++ b/pgatk/cgenomes/cgenomes_proteindb.py @@ -287,6 +287,23 @@ def cosmic_to_proteindb(self) -> None: except KeyError: COSMIC_CDS_DB[record.id] = [record] + # Build phenotype-to-group mapping from the classification file if provided. + # COSMIC_PHENOTYPE_ID is present directly in the mutation file (column 6) and + # is the join key to the Cosmic_Classification file where PRIMARY_SITE lives. 
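+        # Illustrative shape with the bundled testdata:
+        # {'COSO1001': 'upper_aerodigestive_tract', 'COSO1002': 'skin', ...}.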
+ sample_groups_dict = {} + if self._local_clinical_sample_file and self._filter_column: + sample_groups_dict = self.get_value_per_sample( + self._local_clinical_sample_file, + self._filter_column, + sample_id_column='COSMIC_PHENOTYPE_ID', + ) + if not sample_groups_dict: + self.get_logger().warning( + "clinical_sample_file '%s' produced no phenotype mappings; " + "falling back to direct filter-column lookup in the mutation file.", + self._local_clinical_sample_file, + ) + regex = re.compile('[^a-zA-Z]') mutation_dic = {} groups_mutations_dict = {} @@ -296,55 +313,81 @@ def cosmic_to_proteindb(self) -> None: required_columns = ["GENE_SYMBOL", "TRANSCRIPT_ACCESSION", "MUTATION_CDS", "MUTATION_AA", "MUTATION_DESCRIPTION"] with _open_text(self._local_mutation_file, encoding="latin-1") as cosmic_input, \ open(self._local_output_file, 'w', encoding='utf-8') as output: - header = cosmic_input.readline().strip().split("\t") + col_header = cosmic_input.readline().strip().split("\t") try: - gene_col = header.index("GENE_SYMBOL") - enst_col = header.index("TRANSCRIPT_ACCESSION") - cds_col = header.index("MUTATION_CDS") - aa_col = header.index("MUTATION_AA") - muttype_col = header.index("MUTATION_DESCRIPTION") + gene_col = col_header.index("GENE_SYMBOL") + enst_col = col_header.index("TRANSCRIPT_ACCESSION") + cds_col = col_header.index("MUTATION_CDS") + aa_col = col_header.index("MUTATION_AA") + muttype_col = col_header.index("MUTATION_DESCRIPTION") except ValueError as e: self.get_logger().error( "COSMIC file missing required columns. Expected %s, got: %s", - required_columns, header + required_columns, col_header ) raise ValueError( f"COSMIC mutation file missing required column: {e}. " f"Expected columns include: {required_columns}" ) from e + + # COSMIC_PHENOTYPE_ID is the direct join key to the classification file. + try: + phenotype_id_col = col_header.index("COSMIC_PHENOTYPE_ID") + except ValueError: + phenotype_id_col = None + if sample_groups_dict: + self.get_logger().warning( + "COSMIC_PHENOTYPE_ID not found in mutation file header; classification file join will be skipped." + ) + + # Fall back to a direct filter column in the mutation file when no classification file is provided. filter_col = None - if self._filter_column: + if not sample_groups_dict and self._filter_column: try: - filter_col = header.index(self._filter_column) + filter_col = col_header.index(self._filter_column) except ValueError: self.get_logger().warning( - "Filter column '%s' not found in COSMIC header: %s. Filtering disabled.", - self._filter_column, header + "Filter column '%s' not found in COSMIC header. Filtering disabled.", + self._filter_column, ) max_col = max(gene_col, enst_col, cds_col, aa_col, muttype_col) if filter_col is not None: max_col = max(max_col, filter_col) + if phenotype_id_col is not None and sample_groups_dict: + max_col = max(max_col, phenotype_id_col) for line in cosmic_input: if line_counter % 10000 == 0: - msg = "Number of lines finished -- '{}'".format(line_counter) - self.get_logger().debug(msg) + self.get_logger().debug("Number of lines finished -- '%s'", line_counter) line_counter += 1 row = line.strip().split("\t") if len(row) <= max_col: self.get_logger().debug("Skipping malformed row (insufficient columns) at line %s: %s", line_counter, row[:5]) continue - if filter_col is not None: - if row[filter_col] not in self._accepted_values and self._accepted_values != ['all']: + + # Determine the group (e.g. primary site) for this mutation. 
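+            # Resolution order: the classification join wins when a mapping
+            # exists; otherwise the mutation file's own filter column is used
+            # (with the bundled testdata, COSO1002 resolves to 'skin').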
+ group = None + if sample_groups_dict and phenotype_id_col is not None: + group = sample_groups_dict.get(row[phenotype_id_col]) + if group is None and (self._accepted_values != ['all'] or self._split_by_filter_column): + self.get_logger().warning( + "No classification found for COSMIC_PHENOTYPE_ID '%s'; skipping row.", + row[phenotype_id_col], + ) continue + elif filter_col is not None: + group = row[filter_col] + + if group is not None and group not in self._accepted_values and self._accepted_values != ['all']: + continue if "coding silent" in row[muttype_col] or "synonymous_variant" in row[muttype_col]: continue snp = SNP(gene=row[gene_col], mrna=row[enst_col], dna_mut=row[cds_col], aa_mut=row[aa_col], mutation_type=row[muttype_col]) - header = "COSMIC:%s:%s:%s" % (snp.gene, snp.aa_mut, snp.mutation_type.replace(" ", "")) + fasta_header = "COSMIC:%s:%s:%s" % (snp.gene, snp.aa_mut, snp.mutation_type.replace(" ", "")) try: this_gene_records = COSMIC_CDS_DB[snp.gene] seqs = [] @@ -364,16 +407,16 @@ def cosmic_to_proteindb(self) -> None: break if mut_pro_seq: - entry = ">%s\n%s\n" % (header, mut_pro_seq) - if header not in mutation_dic: + entry = ">%s\n%s\n" % (fasta_header, mut_pro_seq) + if fasta_header not in mutation_dic: output.write(entry) - mutation_dic[header] = 1 + mutation_dic[fasta_header] = 1 - if self._split_by_filter_column and filter_col is not None: + if self._split_by_filter_column and group is not None: try: - groups_mutations_dict[row[filter_col]][header] = entry + groups_mutations_dict[group][fasta_header] = entry except KeyError: - groups_mutations_dict[row[filter_col]] = {header: entry} + groups_mutations_dict[group] = {fasta_header: entry} else: self.get_logger().warning( f"Could not parse mutation record: gene={snp.gene}, dna_mut={snp.dna_mut}, " @@ -383,13 +426,14 @@ def cosmic_to_proteindb(self) -> None: for group_name in groups_mutations_dict.keys(): output_base = str(Path(self._local_output_file).with_suffix('')) with open(f"{output_base}_{regex.sub('', group_name)}.fa", 'w', encoding='utf-8') as fn: - for header in groups_mutations_dict[group_name].keys(): - fn.write(groups_mutations_dict[group_name][header]) + for fasta_hdr in groups_mutations_dict[group_name].keys(): + fn.write(groups_mutations_dict[group_name][fasta_hdr]) self.get_logger().debug("COSMIC contains in total {} non redundant mutations".format(len(mutation_dic))) @staticmethod - def get_sample_headers(header_line: list, filter_column: str) -> tuple[Optional[int], Optional[int]]: + def get_sample_headers(header_line: list, filter_column: str, + sample_id_column: str = 'SAMPLE_ID') -> tuple[Optional[int], Optional[int]]: _logger = logging.getLogger(__name__) try: filter_col = header_line.index(filter_column) @@ -397,13 +441,14 @@ def get_sample_headers(header_line: list, filter_column: str) -> tuple[Optional[ _logger.warning('%s was not found in the header row: %s', filter_column, header_line) return None, None try: - sample_id_col = header_line.index('SAMPLE_ID') + sample_id_col = header_line.index(sample_id_column) except ValueError: - _logger.warning('SAMPLE_ID was not found in the header row: %s', header_line) + _logger.warning('%s was not found in the header row: %s', sample_id_column, header_line) return None, None return filter_col, sample_id_col - def get_value_per_sample(self, local_clinical_sample_file: str, filter_column: str) -> dict: + def get_value_per_sample(self, local_clinical_sample_file: str, filter_column: str, + sample_id_column: str = 'SAMPLE_ID') -> dict: sample_value = {} if 
local_clinical_sample_file:
             with open(local_clinical_sample_file, 'r', encoding='utf-8') as clin_fn:
@@ -412,16 +457,17 @@ def get_value_per_sample(self, local_clinical_sample_file: str, filter_column: s
                     if line.startswith('#'):
                         continue
                     sl = line.strip().split('\t')
-                    if 'SAMPLE_ID' in sl and filter_column in sl:
-                        filter_column_col, sample_id_col = self.get_sample_headers(sl, filter_column)
-                        # Skip adding the header row itself to sample_value
+                    if sample_id_column in sl and filter_column in sl:
+                        filter_column_col, sample_id_col = self.get_sample_headers(
+                            sl, filter_column, sample_id_column
+                        )
                         continue
                     if filter_column_col is not None and sample_id_col is not None:
                         if (sample_id_col < len(sl) and
                                 filter_column_col < len(sl)):
                             sample_value[sl[sample_id_col]] = sl[filter_column_col].strip().replace(' ', '_')
         else:
-            self.get_logger().warning("No column was found for %s, %s in %s", filter_column, 'SAMPLE_ID',
-                                      local_clinical_sample_file)
+            self.get_logger().warning("No column was found for %s, %s in %s", filter_column,
+                                      sample_id_column, local_clinical_sample_file)
         return sample_value
 
     @staticmethod
diff --git a/pgatk/commands/cosmic_to_proteindb.py b/pgatk/commands/cosmic_to_proteindb.py
index 5c24ca4..b4d0c27 100644
--- a/pgatk/commands/cosmic_to_proteindb.py
+++ b/pgatk/commands/cosmic_to_proteindb.py
@@ -22,9 +22,12 @@
 @click.option('-s', '--split_by_filter_column',
               help='Use this flag to generate a proteinDB per group as specified in the filter_column, default is False',
               is_flag=True)
+@click.option('-cl', '--clinical_sample_file',
+              help='COSMIC classification file used to map '
+                   'COSMIC_PHENOTYPE_ID to the filter column value (e.g. PRIMARY_SITE)')
 @click.pass_context
 def cosmic_to_proteindb(ctx, config_file, input_mutation, input_genes, output_db,
-                        filter_column, accepted_values, split_by_filter_column):
+                        filter_column, accepted_values, split_by_filter_column, clinical_sample_file):
 
     config_data = load_config("cosmic", config_file)
 
@@ -39,10 +42,13 @@ def cosmic_to_proteindb(ctx, config_file, input_mutation, input_genes, output_db
     if output_db is not None:
         pipeline_arguments[CancerGenomesService.CONFIG_OUTPUT_FILE] = output_db
 
+    if clinical_sample_file is not None:
+        pipeline_arguments[CancerGenomesService.CLINICAL_SAMPLE_FILE] = clinical_sample_file
+
     if filter_column is not None:
         pipeline_arguments[CancerGenomesService.FILTER_COLUMN] = filter_column
     elif config_data is None:
-        pipeline_arguments[CancerGenomesService.FILTER_COLUMN] = 'Primary site'
+        pipeline_arguments[CancerGenomesService.FILTER_COLUMN] = 'PRIMARY_SITE'
 
     if accepted_values is not None:
         pipeline_arguments[CancerGenomesService.ACCEPTED_VALUES] = accepted_values
diff --git a/pgatk/config/cosmic_config.yaml b/pgatk/config/cosmic_config.yaml
index 7721295..0555af5 100644
--- a/pgatk/config/cosmic_config.yaml
+++ b/pgatk/config/cosmic_config.yaml
@@ -27,6 +27,7 @@ cosmic_data:
     - grch38/cosmic/v103/Cosmic_Transcripts_Tsv_v103_GRCh38.tar
     - grch38/cell_lines/v103/CellLinesProject_GenomeScreensMutant_Tsv_v103_GRCh38.tar
     - grch38/cell_lines/v103/CellLinesProject_CompleteCNA_Tsv_v103_GRCh38.tar
+    - grch38/cosmic/v103/Cosmic_Classification_Tsv_v103_GRCh38.tar
 logger:
   formatters:
     DEBUG: "%(asctime)s [%(levelname)7s][%(name)48s][%(module)32s, %(lineno)4s] %(message)s"
@@ -34,7 +35,7 @@ cosmic_data:
   loglevel: DEBUG
 proteindb:
   filter_info:
-    filter_column: 'Primary site'
+    filter_column: 'PRIMARY_SITE'
     accepted_values: "all"
     split_by_filter_column: False
     clinical_sample_file: ''
diff --git
a/pgatk/testdata/proteindb_from_custom_VCF.fa b/pgatk/testdata/proteindb_from_custom_VCF.fa index 5325f01..22f87e0 100644 --- a/pgatk/testdata/proteindb_from_custom_VCF.fa +++ b/pgatk/testdata/proteindb_from_custom_VCF.fa @@ -1,11 +1,11 @@ ->varsample_rs78350717_22.15528913.C.A_ENST00000252835 -MCPLTLQVTGLMNVSEPNSSFAFVNEFILQGFSCEWTIQIFLFSLFTTTYALTITGNGAIAFVLWCDRRLHTPMYMFLGNFSFLEIWYVSSTVPKMLVNFLSEKKNISFAGCFLQFYFFFSLGTSECLLLTVMAFDQYLAICRPLLYPNIMTGHLYAKLVILCWVCGFLWFLIPIVLISQMPFCGPNIIDHVVCDPGPRFALDCVSAPRIQLFCYTLSSLVIFGNFLFIIGSYTLVLKAMLGMPSSTGRHKDFSTCGSHLAVVSLCYSSLMVMYVSPGLGHSTGMQKIETLFYAMVTPLFNPLIYSLQNKEIKAALRKVLGSSNII* ->varsample_rs78350717_22.15528913.C.T_ENST00000252835 -MCPLTLQVTGLMNVSEPNSSFAFVNEFILQGFSCEWTIQIFLFSLFTTTYALTITGNGAIAFVLWCDRRLHTPMYMFLGNFSFLEIWYVSSTVPKMLVNFLSEKKNISFAGCFLQFYFFFSLGTSECLLLTVMAFDQYLAICRPLLYPNIMTGHLYAKLVILCWVCGFLWFLIPIVLISQMPFCGPNIIDHVVCDPGPRFALDCVSAPRIQLFCYTLSSLVIFGNFLFIIGSYTLVLKAMLGMPSSTGRHKVFSTCGSHLAVVSLCYSSLMVMYVSPGLGHSTGMQKIETLFYAMVTPLFNPLIYSLQNKEIKAALRKVLGSSNII* >varsample_rs78350717_22.15528913.C.A_ENST00000643195 MNVSEPNSSFAFVNEFILQGFSCEWTIQIFLFSLFTTTYALTITGNGAIAFVLWCDRRLHTPMYMFLGNFSFLEIWYVSSTVPKMLVNFLSEKKNISFAGCFLQFYFFFSLGTSECLLLTVMAFDQYLAICRPLLYPNIMTGHLYAKLVILCWVCGFLWFLIPIVLISQMPFCGPNIIDHVVCDPGPRFALDCVSAPRIQLFCYTLSSLVIFGNFLFIIGSYTLVLKAMLGMPSSTGRHKDFSTCGSHLAVVSLCYSSLMVMYVSPGLGHSTGMQKIETLFYAMVTPLFNPLIYSLQNKEIKAALRKVLGSSNII* >varsample_rs78350717_22.15528913.C.T_ENST00000643195 MNVSEPNSSFAFVNEFILQGFSCEWTIQIFLFSLFTTTYALTITGNGAIAFVLWCDRRLHTPMYMFLGNFSFLEIWYVSSTVPKMLVNFLSEKKNISFAGCFLQFYFFFSLGTSECLLLTVMAFDQYLAICRPLLYPNIMTGHLYAKLVILCWVCGFLWFLIPIVLISQMPFCGPNIIDHVVCDPGPRFALDCVSAPRIQLFCYTLSSLVIFGNFLFIIGSYTLVLKAMLGMPSSTGRHKVFSTCGSHLAVVSLCYSSLMVMYVSPGLGHSTGMQKIETLFYAMVTPLFNPLIYSLQNKEIKAALRKVLGSSNII* +>varsample_rs78350717_22.15528913.C.A_ENST00000252835 +MCPLTLQVTGLMNVSEPNSSFAFVNEFILQGFSCEWTIQIFLFSLFTTTYALTITGNGAIAFVLWCDRRLHTPMYMFLGNFSFLEIWYVSSTVPKMLVNFLSEKKNISFAGCFLQFYFFFSLGTSECLLLTVMAFDQYLAICRPLLYPNIMTGHLYAKLVILCWVCGFLWFLIPIVLISQMPFCGPNIIDHVVCDPGPRFALDCVSAPRIQLFCYTLSSLVIFGNFLFIIGSYTLVLKAMLGMPSSTGRHKDFSTCGSHLAVVSLCYSSLMVMYVSPGLGHSTGMQKIETLFYAMVTPLFNPLIYSLQNKEIKAALRKVLGSSNII* +>varsample_rs78350717_22.15528913.C.T_ENST00000252835 +MCPLTLQVTGLMNVSEPNSSFAFVNEFILQGFSCEWTIQIFLFSLFTTTYALTITGNGAIAFVLWCDRRLHTPMYMFLGNFSFLEIWYVSSTVPKMLVNFLSEKKNISFAGCFLQFYFFFSLGTSECLLLTVMAFDQYLAICRPLLYPNIMTGHLYAKLVILCWVCGFLWFLIPIVLISQMPFCGPNIIDHVVCDPGPRFALDCVSAPRIQLFCYTLSSLVIFGNFLFIIGSYTLVLKAMLGMPSSTGRHKVFSTCGSHLAVVSLCYSSLMVMYVSPGLGHSTGMQKIETLFYAMVTPLFNPLIYSLQNKEIKAALRKVLGSSNII* >varsample_rs147461488_22.17740102.GGCCACGCTCAACT.G_ENST00000342111 MDCEVNNGSSLRDECITNLLVFGFLQSCSDNSFRRELDALGHELPVLAPQWEGYDELQTDGNRSSHSRLGRIEAGASDNNTASAEEETEAAGSVAFMELRQ*F*KSRRHHPEYCQAPRPGRGQHGP*HPSGPG >varsample_rs147461488_22.17740102.GGCCACGCTCAACT.G_ENST00000617586 diff --git a/pgatk/testdata/test_cosmic_classification.tsv b/pgatk/testdata/test_cosmic_classification.tsv new file mode 100644 index 0000000..e5a9f9c --- /dev/null +++ b/pgatk/testdata/test_cosmic_classification.tsv @@ -0,0 +1,9 @@ +COSMIC_PHENOTYPE_ID PRIMARY_SITE PRIMARY_HISTOLOGY SITE_SUBTYPE_1 HISTOLOGY_SUBTYPE_1 +COSO1001 upper_aerodigestive_tract carcinoma mouth squamous_cell_carcinoma +COSO1002 skin carcinoma upper_leg squamous_cell_carcinoma +COSO1003 bone other femur giant_cell_tumour +COSO1004 thyroid adenoma-nodule-goitre NS NS +COSO1005 liver other NS neoplasm +COSO1006 haematopoietic_and_lymphoid_tissue haematopoietic_neoplasm NS acute_myeloid_leukaemia +COSO1007 bone carcinoma NS NS +COSO1008 skin carcinoma NS NS diff --git 
a/pgatk/testdata/test_cosmic_mutations.tsv b/pgatk/testdata/test_cosmic_mutations.tsv index f4a9ba4..254cef5 100644 --- a/pgatk/testdata/test_cosmic_mutations.tsv +++ b/pgatk/testdata/test_cosmic_mutations.tsv @@ -1,13 +1,13 @@ -Gene name Accession Number Gene CDS length HGNC ID Sample name ID_sample ID_tumour Primary site Site subtype 1 Site subtype 2 Site subtype 3 Primary histology Histology subtype 1 Histology subtype 2 Histology subtype 3 Genome-wide screen Mutation ID Mutation CDS Mutation AA Mutation Description Mutation zygosity LOH GRCh Mutation genome position Mutation strand SNP Resistance Mutation FATHMM prediction FATHMM score Mutation somatic status Pubmed_PMID ID_STUDY Sample Type Tumour origin Age -HRAS ENST00000397596 570 5173 HN12PT 1542873 1465176 upper_aerodigestive_tract mouth NS NS carcinoma squamous_cell_carcinoma NS NS y COSM486 c.37G>C p.G13R Substitution - Missense u 38 11:534286-534286 - n - PATHOGENIC .99415 Confirmed somatic variant 21798897 surgery fresh/frozen primary -HRAS ENST00000397596 570 5173 V19 2688774 2547756 skin upper_leg NS NS carcinoma squamous_cell_carcinoma NS NS n COSM486 c.37G>C p.G13R Substitution - Missense u 38 11:534286-534286 - n - PATHOGENIC .99415 Confirmed somatic variant 24662767 surgery fresh/frozen NS -HRAS ENST00000397596 570 5173 S23372 702100 629222 bone femur NS NS other giant_cell_tumour NS NS n COSM487 c.37G>A p.G13S Substitution - Missense y 38 11:534286-534286 - n - PATHOGENIC .99324 Reported in another cancer sample as somatic 11903582 surgery-fixed recurrent 42 -HRAS ENST00000397596 570 5173 pel-102 720047 644990 upper_aerodigestive_tract mouth lip NS carcinoma squamous_cell_carcinoma NS NS n COSM484 c.35G>A p.G12D Substitution - Missense n 38 11:534288-534288 - n - PATHOGENIC .99322 Reported in another cancer sample as somatic 8253528 surgery fresh/frozen primary -HRAS ENST00000397596 570 5173 1965078 1965078 1851573 skin chest NS NS benign_melanocytic_nevus NS NS NS n COSM498 c.182A>T p.Q61L Substitution - Missense u 38 11:533874-533874 - n - PATHOGENIC .97511 Reported in another cancer sample as somatic 23599145 NS NS 30 -HRAS ENST00000397596 570 5173 1571189 1571189 1492737 skin NS NS NS other atypical_Spitzoid_tumour NS NS n COSM498 c.182A>T p.Q61L Substitution - Missense het u 38 11:533874-533874 - n - PATHOGENIC .97511 Reported in another cancer sample as somatic 21496703 surgery-fixed NS -HRAS ENST00000397596 570 5173 S61945 741791 664521 thyroid NS NS NS adenoma-nodule-goitre NS NS NS n COSM499 c.182A>G p.Q61R Substitution - Missense y 38 11:533874-533874 - n - PATHOGENIC .97179 Reported in another cancer sample as somatic 12727991 NS primary -HRAS ENST00000397596 570 5173 49T 2745889 2604592 liver NS NS NS other neoplasm NS NS y COSM1729837 c.403C>T p.R135* Substitution - Nonsense u 38 11:533500-533500 - n - PATHOGENIC .97257 Confirmed somatic variant 660 NS primary 67 -HRAS ENST00000397596 570 5173 DM3T4S 2762863 2621122 thyroid NS NS NS carcinoma follicular_carcinoma NS NS n COSM499 c.182A>G p.Q61R Substitution - Missense u 38 11:533874-533874 - n - PATHOGENIC .97179 Reported in another cancer sample as somatic 29615459 NS metastasis 58 -HRAS ENST00000397596 570 5173 TESTSAMPLE 9999991 9999981 haematopoietic_and_lymphoid_tissue NS NS NS haematopoietic_neoplasm acute_myeloid_leukaemia NS NS n COSM_TEST1 c.? 
p.G13(H^K) Substitution - Missense u 38 11:534286-534286 - n - PATHOGENIC .99415 Confirmed somatic variant 25858894 blood-bone marrow primary -HRAS ENST00000397596 570 5173 TESTSAMPLE 9999991 9999981 haematopoietic_and_lymphoid_tissue NS NS NS haematopoietic_neoplasm acute_myeloid_leukaemia NS NS n COSM_TEST2 c.? p.13_14>21 Complex - insertion inframe u 38 11:534286-534286 - n - PATHOGENIC .99415 Confirmed somatic variant 25858894 blood-bone marrow primary -HRAS ENST00000397596 570 5173 TESTSAMPLE 9999991 9999981 haematopoietic_and_lymphoid_tissue NS NS NS haematopoietic_neoplasm acute_myeloid_leukaemia NS NS n COSM_TEST3 c.? p.G13_S14>GS Insertion - In frame u 38 11:534286-534286 - n - PATHOGENIC .99415 Confirmed somatic variant 25858894 blood-bone marrow primary +GENE_SYMBOL COSMIC_GENE_ID TRANSCRIPT_ACCESSION COSMIC_SAMPLE_ID SAMPLE_NAME COSMIC_PHENOTYPE_ID GENOMIC_MUTATION_ID LEGACY_MUTATION_ID MUTATION_ID MUTATION_CDS MUTATION_AA MUTATION_DESCRIPTION MUTATION_ZYGOSITY LOH CHROMOSOME GENOME_START GENOME_STOP STRAND PUBMED_PMID COSMIC_STUDY_ID HGVSP HGVSC HGVSG GENOMIC_WT_ALLELE GENOMIC_MUT_ALLELE MUTATION_SOMATIC_STATUS POSITIVE_SCREEN +HRAS COSG11180 ENST00000397596 COSS1542873 HN12PT COSO1001 COSV56056559 COSM486 COSM486 c.37G>C p.G13R missense_variant u 11 534286 534286 - 21798897 p.Gly13Arg c.37G>C G C Confirmed somatic variant y +HRAS COSG11180 ENST00000397596 COSS2688774 V19 COSO1002 COSV56056559 COSM486 COSM486 c.37G>C p.G13R missense_variant u 11 534286 534286 - 24662767 p.Gly13Arg c.37G>C G C Confirmed somatic variant n +HRAS COSG11180 ENST00000397596 COSS702100 S23372 COSO1003 COSV56056558 COSM487 COSM487 c.37G>A p.G13S missense_variant y 11 534286 534286 - 11903582 p.Gly13Ser c.37G>A G A Reported in another cancer sample as somatic n +HRAS COSG11180 ENST00000397596 COSS720047 pel-102 COSO1001 COSV56056557 COSM484 COSM484 c.35G>A p.G12D missense_variant n 11 534288 534288 - 8253528 p.Gly12Asp c.35G>A G A Reported in another cancer sample as somatic n +HRAS COSG11180 ENST00000397596 COSS1965078 1965078 COSO1002 COSV56056584 COSM498 COSM498 c.182A>T p.Q61L missense_variant u 11 533874 533874 - 23599145 p.Gln61Leu c.182A>T A T Reported in another cancer sample as somatic n +HRAS COSG11180 ENST00000397596 COSS1571189 1571189 COSO1002 COSV56056584 COSM498 COSM498 c.182A>T p.Q61L missense_variant het u 11 533874 533874 - 21496703 p.Gln61Leu c.182A>T A T Reported in another cancer sample as somatic n +HRAS COSG11180 ENST00000397596 COSS741791 S61945 COSO1004 COSV56056585 COSM499 COSM499 c.182A>G p.Q61R missense_variant y 11 533874 533874 - 12727991 p.Gln61Arg c.182A>G A G Reported in another cancer sample as somatic n +HRAS COSG11180 ENST00000397596 COSS2745889 49T COSO1005 COSV56070001 COSM1729837 COSM1729837 c.403C>T p.R135* stop_gained u 11 533500 533500 - 660 p.Arg135Ter c.403C>T C T Confirmed somatic variant y +HRAS COSG11180 ENST00000397596 COSS2762863 DM3T4S COSO1004 COSV56056585 COSM499 COSM499 c.182A>G p.Q61R missense_variant u 11 533874 533874 - 29615459 p.Gln61Arg c.182A>G A G Reported in another cancer sample as somatic n +HRAS COSG11180 ENST00000397596 COSS9999991 TESTSAMPLE COSO1006 COSV_TEST1 COSM_TEST1 COSM_TEST1 c.36C>T p.G12= synonymous_variant u 11 534287 534287 - p.Gly12Gly c.36C>T C T Confirmed somatic variant n +HRAS COSG11180 ENST00000397596 COSS9999992 TESTSAMPLE2 COSO1007 COSV_TEST2 COSM_TEST2 COSM_TEST2 c.34_36del p.G12del inframe_deletion u 11 534285 534287 - p.Gly12del c.34_36del GGC - Confirmed somatic variant n +HRAS COSG11180 ENST00000397596 
COSS9999993 TESTSAMPLE3 COSO1008 COSV_TEST3 COSM_TEST3 COSM_TEST3 c.? p.? frameshift_variant u 11 534286 534287 - p.? c.? . . Confirmed somatic variant n diff --git a/pgatk/testdata/test_cosmic_mutations_proteindb.fa b/pgatk/testdata/test_cosmic_mutations_proteindb.fa index 9516b34..e2fd977 100644 --- a/pgatk/testdata/test_cosmic_mutations_proteindb.fa +++ b/pgatk/testdata/test_cosmic_mutations_proteindb.fa @@ -1,12 +1,14 @@ ->COSMIC:HRAS:p.G13R:Substitution-Missense +>COSMIC:HRAS:p.G13R:missense_variant MTEYKLVVVGAGRVGKSALTIQLIQNHFVDEYDPTIEDSYRKQVVIDGETCLLDILDTAGQEEYSAMRDQYMRTGEGFLCVFAINNTKSFEDIHQYREQIKRVKDSDDVPMVLVGNKCDLAARTVESRQAQDLARSYGIPYIETSAKTRQGVEDAFYTLVREIRQHKLRKLNPPDESGPGCMSCKCVLS* ->COSMIC:HRAS:p.G13S:Substitution-Missense +>COSMIC:HRAS:p.G13S:missense_variant MTEYKLVVVGAGSVGKSALTIQLIQNHFVDEYDPTIEDSYRKQVVIDGETCLLDILDTAGQEEYSAMRDQYMRTGEGFLCVFAINNTKSFEDIHQYREQIKRVKDSDDVPMVLVGNKCDLAARTVESRQAQDLARSYGIPYIETSAKTRQGVEDAFYTLVREIRQHKLRKLNPPDESGPGCMSCKCVLS* ->COSMIC:HRAS:p.G12D:Substitution-Missense +>COSMIC:HRAS:p.G12D:missense_variant MTEYKLVVVGADGVGKSALTIQLIQNHFVDEYDPTIEDSYRKQVVIDGETCLLDILDTAGQEEYSAMRDQYMRTGEGFLCVFAINNTKSFEDIHQYREQIKRVKDSDDVPMVLVGNKCDLAARTVESRQAQDLARSYGIPYIETSAKTRQGVEDAFYTLVREIRQHKLRKLNPPDESGPGCMSCKCVLS* ->COSMIC:HRAS:p.Q61L:Substitution-Missense +>COSMIC:HRAS:p.Q61L:missense_variant MTEYKLVVVGAGGVGKSALTIQLIQNHFVDEYDPTIEDSYRKQVVIDGETCLLDILDTAGLEEYSAMRDQYMRTGEGFLCVFAINNTKSFEDIHQYREQIKRVKDSDDVPMVLVGNKCDLAARTVESRQAQDLARSYGIPYIETSAKTRQGVEDAFYTLVREIRQHKLRKLNPPDESGPGCMSCKCVLS* ->COSMIC:HRAS:p.Q61R:Substitution-Missense +>COSMIC:HRAS:p.Q61R:missense_variant MTEYKLVVVGAGGVGKSALTIQLIQNHFVDEYDPTIEDSYRKQVVIDGETCLLDILDTAGREEYSAMRDQYMRTGEGFLCVFAINNTKSFEDIHQYREQIKRVKDSDDVPMVLVGNKCDLAARTVESRQAQDLARSYGIPYIETSAKTRQGVEDAFYTLVREIRQHKLRKLNPPDESGPGCMSCKCVLS* ->COSMIC:HRAS:p.R135*:Substitution-Nonsense +>COSMIC:HRAS:p.R135*:stop_gained MTEYKLVVVGAGGVGKSALTIQLIQNHFVDEYDPTIEDSYRKQVVIDGETCLLDILDTAGQEEYSAMRDQYMRTGEGFLCVFAINNTKSFEDIHQYREQIKRVKDSDDVPMVLVGNKCDLAARTVESRQAQDLA*SYGIPYIETSAKTRQGVEDAFYTLVREIRQHKLRKLNPPDESGPGCMSCKCVLS* +>COSMIC:HRAS:p.G12del:inframe_deletion +MTEYKLVVVGAGVGKSALTIQLIQNHFVDEYDPTIEDSYRKQVVIDGETCLLDILDTAGQEEYSAMRDQYMRTGEGFLCVFAINNTKSFEDIHQYREQIKRVKDSDDVPMVLVGNKCDLAARTVESRQAQDLARSYGIPYIETSAKTRQGVEDAFYTLVREIRQHKLRKLNPPDESGPGCMSCKCVLS* diff --git a/pgatk/testdata/test_cosmic_mutations_proteindb_bone.fa b/pgatk/testdata/test_cosmic_mutations_proteindb_bone.fa index 3be6bf8..f96859a 100644 --- a/pgatk/testdata/test_cosmic_mutations_proteindb_bone.fa +++ b/pgatk/testdata/test_cosmic_mutations_proteindb_bone.fa @@ -1,2 +1,4 @@ ->COSMIC:HRAS:p.G13S:Substitution-Missense +>COSMIC:HRAS:p.G13S:missense_variant MTEYKLVVVGAGSVGKSALTIQLIQNHFVDEYDPTIEDSYRKQVVIDGETCLLDILDTAGQEEYSAMRDQYMRTGEGFLCVFAINNTKSFEDIHQYREQIKRVKDSDDVPMVLVGNKCDLAARTVESRQAQDLARSYGIPYIETSAKTRQGVEDAFYTLVREIRQHKLRKLNPPDESGPGCMSCKCVLS* +>COSMIC:HRAS:p.G12del:inframe_deletion +MTEYKLVVVGAGVGKSALTIQLIQNHFVDEYDPTIEDSYRKQVVIDGETCLLDILDTAGQEEYSAMRDQYMRTGEGFLCVFAINNTKSFEDIHQYREQIKRVKDSDDVPMVLVGNKCDLAARTVESRQAQDLARSYGIPYIETSAKTRQGVEDAFYTLVREIRQHKLRKLNPPDESGPGCMSCKCVLS* diff --git a/pgatk/testdata/test_cosmic_mutations_proteindb_liver.fa b/pgatk/testdata/test_cosmic_mutations_proteindb_liver.fa index fc6d70d..62dafeb 100644 --- a/pgatk/testdata/test_cosmic_mutations_proteindb_liver.fa +++ b/pgatk/testdata/test_cosmic_mutations_proteindb_liver.fa @@ -1,2 +1,2 @@ ->COSMIC:HRAS:p.R135*:Substitution-Nonsense +>COSMIC:HRAS:p.R135*:stop_gained 
MTEYKLVVVGAGGVGKSALTIQLIQNHFVDEYDPTIEDSYRKQVVIDGETCLLDILDTAGQEEYSAMRDQYMRTGEGFLCVFAINNTKSFEDIHQYREQIKRVKDSDDVPMVLVGNKCDLAARTVESRQAQDLA*SYGIPYIETSAKTRQGVEDAFYTLVREIRQHKLRKLNPPDESGPGCMSCKCVLS* diff --git a/pgatk/testdata/test_cosmic_mutations_proteindb_skin.fa b/pgatk/testdata/test_cosmic_mutations_proteindb_skin.fa index b365210..6d8f4ab 100644 --- a/pgatk/testdata/test_cosmic_mutations_proteindb_skin.fa +++ b/pgatk/testdata/test_cosmic_mutations_proteindb_skin.fa @@ -1,4 +1,4 @@ ->COSMIC:HRAS:p.G13R:Substitution-Missense +>COSMIC:HRAS:p.G13R:missense_variant MTEYKLVVVGAGRVGKSALTIQLIQNHFVDEYDPTIEDSYRKQVVIDGETCLLDILDTAGQEEYSAMRDQYMRTGEGFLCVFAINNTKSFEDIHQYREQIKRVKDSDDVPMVLVGNKCDLAARTVESRQAQDLARSYGIPYIETSAKTRQGVEDAFYTLVREIRQHKLRKLNPPDESGPGCMSCKCVLS* ->COSMIC:HRAS:p.Q61L:Substitution-Missense +>COSMIC:HRAS:p.Q61L:missense_variant MTEYKLVVVGAGGVGKSALTIQLIQNHFVDEYDPTIEDSYRKQVVIDGETCLLDILDTAGLEEYSAMRDQYMRTGEGFLCVFAINNTKSFEDIHQYREQIKRVKDSDDVPMVLVGNKCDLAARTVESRQAQDLARSYGIPYIETSAKTRQGVEDAFYTLVREIRQHKLRKLNPPDESGPGCMSCKCVLS* diff --git a/pgatk/testdata/test_cosmic_mutations_proteindb_thyroid.fa b/pgatk/testdata/test_cosmic_mutations_proteindb_thyroid.fa index dc17c3d..b9ead29 100644 --- a/pgatk/testdata/test_cosmic_mutations_proteindb_thyroid.fa +++ b/pgatk/testdata/test_cosmic_mutations_proteindb_thyroid.fa @@ -1,2 +1,2 @@ ->COSMIC:HRAS:p.Q61R:Substitution-Missense +>COSMIC:HRAS:p.Q61R:missense_variant MTEYKLVVVGAGGVGKSALTIQLIQNHFVDEYDPTIEDSYRKQVVIDGETCLLDILDTAGREEYSAMRDQYMRTGEGFLCVFAINNTKSFEDIHQYREQIKRVKDSDDVPMVLVGNKCDLAARTVESRQAQDLARSYGIPYIETSAKTRQGVEDAFYTLVREIRQHKLRKLNPPDESGPGCMSCKCVLS* diff --git a/pgatk/testdata/test_cosmic_mutations_proteindb_upperaerodigestivetract.fa b/pgatk/testdata/test_cosmic_mutations_proteindb_upperaerodigestivetract.fa index f3118ae..9b3cf5c 100644 --- a/pgatk/testdata/test_cosmic_mutations_proteindb_upperaerodigestivetract.fa +++ b/pgatk/testdata/test_cosmic_mutations_proteindb_upperaerodigestivetract.fa @@ -1,4 +1,4 @@ ->COSMIC:HRAS:p.G13R:Substitution-Missense +>COSMIC:HRAS:p.G13R:missense_variant MTEYKLVVVGAGRVGKSALTIQLIQNHFVDEYDPTIEDSYRKQVVIDGETCLLDILDTAGQEEYSAMRDQYMRTGEGFLCVFAINNTKSFEDIHQYREQIKRVKDSDDVPMVLVGNKCDLAARTVESRQAQDLARSYGIPYIETSAKTRQGVEDAFYTLVREIRQHKLRKLNPPDESGPGCMSCKCVLS* ->COSMIC:HRAS:p.G12D:Substitution-Missense +>COSMIC:HRAS:p.G12D:missense_variant MTEYKLVVVGADGVGKSALTIQLIQNHFVDEYDPTIEDSYRKQVVIDGETCLLDILDTAGQEEYSAMRDQYMRTGEGFLCVFAINNTKSFEDIHQYREQIKRVKDSDDVPMVLVGNKCDLAARTVESRQAQDLARSYGIPYIETSAKTRQGVEDAFYTLVREIRQHKLRKLNPPDESGPGCMSCKCVLS* diff --git a/pgatk/tests/pgatk_tests.py b/pgatk/tests/pgatk_tests.py index 825a642..a738bdf 100644 --- a/pgatk/tests/pgatk_tests.py +++ b/pgatk/tests/pgatk_tests.py @@ -178,7 +178,8 @@ def test_cosmic_to_proteindb(self): '--input_mutation', 'testdata/test_cosmic_mutations.tsv', '--input_genes', 'testdata/test_cosmic_genes.fa', '--output_db', 'testdata/test_cosmic_mutations_proteindb.fa', - '--filter_column', 'Primary site', + '--clinical_sample_file', 'testdata/test_cosmic_classification.tsv', + '--filter_column', 'PRIMARY_SITE', '--split_by_filter_column', '--accepted_values', 'all']) if result.exit_code != 0: print(result.exception) From 0b9c179584fff36638ad7174783c35b8eecd6085 Mon Sep 17 00:00:00 2001 From: Husen Umer Date: Wed, 13 May 2026 19:06:15 +0300 Subject: [PATCH 19/23] minor fix to cosmic gzip file reading --- pgatk/cgenomes/cgenomes_proteindb.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pgatk/cgenomes/cgenomes_proteindb.py b/pgatk/cgenomes/cgenomes_proteindb.py index 0d1e39f..1484728 
100644 --- a/pgatk/cgenomes/cgenomes_proteindb.py +++ b/pgatk/cgenomes/cgenomes_proteindb.py @@ -451,7 +451,7 @@ def get_value_per_sample(self, local_clinical_sample_file: str, filter_column: s sample_id_column: str = 'SAMPLE_ID') -> dict: sample_value = {} if local_clinical_sample_file: - with open(local_clinical_sample_file, 'r', encoding='utf-8') as clin_fn: + with _open_text(local_clinical_sample_file, encoding='utf-8') as clin_fn: filter_column_col, sample_id_col = None, None for line in clin_fn.readlines(): if line.startswith('#'): From 7c08c5014bb700c0f4ad2e666fc8ab0eb75f4324 Mon Sep 17 00:00:00 2001 From: Yasset Perez-Riverol Date: Wed, 13 May 2026 19:24:22 +0100 Subject: [PATCH 20/23] Add cProfile support to vcf-to-proteindb benchmark script Adds --profile-out PATH and --print-top N flags. When given, the run is wrapped in cProfile, the .prof file is written to PATH, and the top N functions are printed sorted by both cumulative time and own time. Used to identify the remaining hot path after the issue-#99 fix before designing a parallelization strategy. --- scripts/benchmark_vcf_to_proteindb.py | 57 ++++++++++++++++++++++++--- 1 file changed, 51 insertions(+), 6 deletions(-) diff --git a/scripts/benchmark_vcf_to_proteindb.py b/scripts/benchmark_vcf_to_proteindb.py index bf17700..8901565 100755 --- a/scripts/benchmark_vcf_to_proteindb.py +++ b/scripts/benchmark_vcf_to_proteindb.py @@ -1,18 +1,27 @@ #!/usr/bin/env python3 -"""One-shot benchmark for ``pgatk vcf-to-proteindb``. +"""One-shot benchmark / profiler for ``pgatk vcf-to-proteindb``. -Times one full run end-to-end. Compare two runs of this script (one on a -pre-fix build, one on a post-fix build) to measure the speedup achieved -by issue #99. Not invoked by CI. +Times one full run end-to-end. With ``--profile-out PATH``, also wraps the +run in cProfile and writes a ``.prof`` file; print the top-N hotspots with +``--print-top``. Not invoked by CI. -Usage: +Basic usage: python scripts/benchmark_vcf_to_proteindb.py \ --vcf /path/to/sample.vcf \ --fasta /path/to/transcripts.fa \ --gtf /path/to/annotation.gtf \ --output /tmp/benchmark_proteindb.fa + +Profile a chr22 run and print the 30 hottest functions: + python scripts/benchmark_vcf_to_proteindb.py \ + --vcf chr22.vcf --fasta tx.fa --gtf chr22.gtf \ + --output /tmp/chr22.fa \ + --profile-out /tmp/chr22.prof --print-top 30 """ import argparse +import cProfile +import io +import pstats import time from pathlib import Path @@ -32,6 +41,17 @@ def main() -> None: help="VCF INFO field that carries per-transcript annotation (default: CSQ). " "Use empty string to force re-annotation via bedtools intersect.", ) + parser.add_argument( + "--profile-out", + help="If set, wrap the run in cProfile and write the .prof file here.", + ) + parser.add_argument( + "--print-top", + type=int, + default=0, + help="With --profile-out, also print the top-N functions by cumulative time " + "(default: 0, no printing). 
Recommended N=30 for a first look.", + ) args = parser.parse_args() config_data = load_config("ensembl_config", None) @@ -41,10 +61,20 @@ def main() -> None: } svc = EnsemblDataService(config_data, pipeline_arguments) + profiler = cProfile.Profile() if args.profile_out else None start = time.perf_counter() - svc.vcf_to_proteindb(args.vcf, args.fasta, args.gtf) + if profiler is not None: + profiler.enable() + try: + svc.vcf_to_proteindb(args.vcf, args.fasta, args.gtf) + finally: + if profiler is not None: + profiler.disable() elapsed = time.perf_counter() - start + if profiler is not None: + profiler.dump_stats(args.profile_out) + output_path = Path(args.output) output_size = output_path.stat().st_size if output_path.exists() else 0 seq_count = 0 @@ -59,6 +89,21 @@ def main() -> None: print(f" VCF: {args.vcf}") print(f" Elapsed: {elapsed:.2f} s ({elapsed / 60:.2f} min)") print(f" Output: {args.output} ({output_size} bytes, {seq_count} sequences)") + if args.profile_out: + print(f" Profile: {args.profile_out}") + + if args.profile_out and args.print_top > 0: + print() + print(f"=== TOP {args.print_top} BY CUMULATIVE TIME ===") + buf = io.StringIO() + stats = pstats.Stats(args.profile_out, stream=buf) + stats.strip_dirs().sort_stats('cumulative').print_stats(args.print_top) + print(buf.getvalue()) + print(f"=== TOP {args.print_top} BY OWN (tottime) TIME ===") + buf = io.StringIO() + stats = pstats.Stats(args.profile_out, stream=buf) + stats.strip_dirs().sort_stats('tottime').print_stats(args.print_top) + print(buf.getvalue()) if __name__ == "__main__": From 9235f7d71b6018f5e4932320f5b26b7907cc0574 Mon Sep 17 00:00:00 2001 From: Yasset Perez-Riverol Date: Wed, 13 May 2026 19:48:42 +0100 Subject: [PATCH 21/23] Parallelise vcf-to-proteindb per chromosome via multiprocessing Profile of 50K gnomAD chr22 variants showed ~36s one-time setup and ~14s per-variant work, with Biopython translate the dominant CPU cost. Implements per-chromosome multiprocessing.Pool to fan out the per-variant loop across cores. - New `--workers N` CLI flag (default 1, sequential, backward-compatible). - Internally: pre-annotate VCF in the main process (avoids bedtools-cwd race), split the VCF into per-chromosome temp files, dispatch to a spawn-context Pool, concatenate per-chunk outputs. - Existing per-variant loop moved verbatim into `_vcf_to_proteindb_chunk`; the public `vcf_to_proteindb` now dispatches sequential or parallel. - Workers re-construct EnsemblDataService fresh per process (no shared gffutils SQLite handles or SeqIO indices); each pays its own setup but the per-variant CPU work parallelises near-linearly. - Equivalence test verifies workers=2 produces the same set of output sequences as workers=1 on the existing test.vcf fixture. --- pgatk/commands/vcf_to_proteindb.py | 9 +- pgatk/ensembl/ensembl.py | 124 +++++++++++++++++- pgatk/tests/test_vcf_to_proteindb_parallel.py | 64 +++++++++ 3 files changed, 193 insertions(+), 4 deletions(-) create mode 100644 pgatk/tests/test_vcf_to_proteindb_parallel.py diff --git a/pgatk/commands/vcf_to_proteindb.py b/pgatk/commands/vcf_to_proteindb.py index da2d474..7cb2faa 100644 --- a/pgatk/commands/vcf_to_proteindb.py +++ b/pgatk/commands/vcf_to_proteindb.py @@ -38,6 +38,8 @@ help="enabling this option causes or variants to be parsed. By default only variants that have not failed any filters will be processed (FILTER column is PASS, None, .) or if the filters are subset of the accepted filters. 
(default is False)", is_flag=True) @click.option('--accepted_filters', help="Accepted filters for variant parsing") +@click.option('-w', '--workers', type=int, default=None, + help="Number of worker processes to fan out across (default: 1, sequential). Per-chromosome split.") @click.pass_context def vcf_to_proteindb(ctx, config_file, input_fasta, vcf, gene_annotations_gtf, translation_table, mito_translation_table, @@ -45,7 +47,7 @@ def vcf_to_proteindb(ctx, config_file, input_fasta, vcf, gene_annotations_gtf, t af_field, af_threshold, transcript_str, biotype_str, exclude_biotypes, include_biotypes, consequence_str, exclude_consequences, skip_including_all_cds, include_consequences, - ignore_filters, accepted_filters): + ignore_filters, accepted_filters, workers): config_data = load_config("ensembl_config", config_file) @@ -105,5 +107,8 @@ def vcf_to_proteindb(ctx, config_file, input_fasta, vcf, gene_annotations_gtf, t if accepted_filters is not None: pipeline_arguments[EnsemblDataService.ACCEPTED_FILTERS] = accepted_filters + if workers is not None: + pipeline_arguments[EnsemblDataService.WORKERS] = workers + ensembl_data_service = EnsemblDataService(config_data, pipeline_arguments) - ensembl_data_service.vcf_to_proteindb(vcf, input_fasta, gene_annotations_gtf) + ensembl_data_service.vcf_to_proteindb(vcf, input_fasta, gene_annotations_gtf, workers=workers) diff --git a/pgatk/ensembl/ensembl.py b/pgatk/ensembl/ensembl.py index d83d031..f06d69d 100644 --- a/pgatk/ensembl/ensembl.py +++ b/pgatk/ensembl/ensembl.py @@ -1,6 +1,8 @@ from __future__ import annotations import logging +import os +import re import sqlite3 from pathlib import Path from typing import Any, Optional @@ -45,6 +47,42 @@ def put(self, key: tuple, value: tuple) -> None: _MISSING = object() +def _safe_chrom(chrom: str) -> str: + """Sanitise a chromosome identifier into a filesystem-safe filename component.""" + return re.sub(r'[^A-Za-z0-9._-]', '_', str(chrom)) + + +def _split_vcf_by_chrom(vcf_file: str) -> tuple[list[str], dict[str, list[str]]]: + """Read vcf_file once; return (header_lines, {chrom: [data_lines]}). + + Preserves line ordering within each chromosome. Headers retain trailing newlines; + data lines retain trailing newlines. Empty/whitespace-only data lines are skipped. + """ + header: list[str] = [] + buckets: dict[str, list[str]] = {} + with open(vcf_file, 'r', encoding='utf-8') as f: + for line in f: + if line.startswith('#'): + header.append(line) + continue + if not line.strip(): + continue + chrom = line.split('\t', 1)[0] + buckets.setdefault(chrom, []).append(line) + return header, buckets + + +def _vcf_to_proteindb_worker(default_params, pipeline_args, vcf_file, + input_fasta, gene_annotations_gtf, output_path): + """Module-level worker for multiprocessing.Pool.starmap. + + Re-constructs an EnsemblDataService in the worker process from the + pickled config dict + pipeline args, then runs the chunk pipeline. 
+ """ + svc = EnsemblDataService(default_params, pipeline_args) + svc._vcf_to_proteindb_chunk(vcf_file, input_fasta, gene_annotations_gtf, output_path) + + class EnsemblDataService(ParameterConfiguration): CONFIG_KEY_VCF = "ensembl_translation" INPUT_FASTA = "input_fasta" @@ -73,6 +111,7 @@ class EnsemblDataService(ParameterConfiguration): EXPRESSION_THRESH = "expression_thresh" IGNORE_FILTERS = "ignore_filters" ACCEPTED_FILTERS = "accepted_filters" + WORKERS = "workers" def __init__(self, config_file: dict, pipeline_arguments: dict) -> None: """ @@ -129,6 +168,12 @@ def __init__(self, config_file: dict, pipeline_arguments: dict) -> None: self._accepted_filters = self.get_multiple_options( self.get_translation_properties(variable=self.ACCEPTED_FILTERS, default_value='PASS')) + raw_workers = self.get_translation_properties(variable=self.WORKERS, default_value=1) + try: + self._workers = max(1, int(raw_workers)) + except (TypeError, ValueError): + self._workers = 1 + def get_translation_properties(self, variable: str, default_value: Any) -> Any: value_return = default_value if variable in self.get_pipeline_parameters(): @@ -436,7 +481,7 @@ def vcf_from_file(vcf_file: str) -> tuple[list, pd.DataFrame]: return metadata, vcf_df - def vcf_to_proteindb(self, vcf_file: str, input_fasta: str, gene_annotations_gtf: str) -> str: + def _vcf_to_proteindb_chunk(self, vcf_file: str, input_fasta: str, gene_annotations_gtf: str, output_path: str) -> str: """ Generate proteins for variants by modifying sequences of affected transcripts. In case of already annotated variants it only considers variants within @@ -447,6 +492,7 @@ def vcf_to_proteindb(self, vcf_file: str, input_fasta: str, gene_annotations_gtf :param vcf_file: :param input_fasta: :param gene_annotations_gtf: + :param output_path: path for writing the output FASTA :return: """ db = self.parse_gtf(gene_annotations_gtf, str(Path(gene_annotations_gtf).with_suffix('.db'))) @@ -491,6 +537,12 @@ def vcf_to_proteindb(self, vcf_file: str, input_fasta: str, gene_annotations_gtf annotation_cols, vcf_file) self.get_logger().debug(msg) + # Fall back to index 0 when no structured ##INFO header was found (e.g. + # transcriptOverlaps-annotated VCFs produced by annoate_vcf, which write + # plain transcript IDs without a pipe-delimited FORMAT declaration). + if transcript_index is None: + transcript_index = 0 + else: # in case the given VCF is not annotated, annotate it by identifying the overlapping transcripts vcf_file = self.annoate_vcf(vcf_file, gene_annotations_gtf) @@ -506,7 +558,7 @@ def vcf_to_proteindb(self, vcf_file: str, input_fasta: str, gene_annotations_gtf self._accepted_filters = [x.upper() for x in self._accepted_filters] - with open(self._proteindb_output, 'w', encoding='utf-8') as prots_fn: + with open(output_path, 'w', encoding='utf-8') as prots_fn: for record in vcf_reader.itertuples(index=False, name='VCFRecord'): trans = False if [x for x in str(record.REF) if x not in 'ACGT']: @@ -726,6 +778,74 @@ def vcf_to_proteindb(self, vcf_file: str, input_fasta: str, gene_annotations_gtf '\n'.join([x + ":" + str(invalid_records[x]) for x in invalid_records.keys()])) self.get_logger().info(msg) + return output_path + + def vcf_to_proteindb(self, vcf_file: str, input_fasta: str, gene_annotations_gtf: str, workers=None) -> str: + """Generate proteins for variants by modifying sequences of affected transcripts. + + If workers is None, falls back to self._workers (config) which defaults + to 1 (sequential, backward-compatible). 
Pass workers > 1 to fan out per + chromosome via multiprocessing.Pool. + :param vcf_file: + :param input_fasta: + :param gene_annotations_gtf: + :param workers: number of parallel worker processes (None => use config default) + :return: path to the output proteindb FASTA + """ + if workers is None: + workers = self._workers if self._workers else 1 + + # Fast path: sequential — single call, identical behaviour to the original implementation. + # For sequential runs we do NOT pre-annotate here; _vcf_to_proteindb_chunk handles that + # in its else-branch so that transcript_index=0 is set correctly for unannotated VCFs. + if workers <= 1: + return self._vcf_to_proteindb_chunk(vcf_file, input_fasta, gene_annotations_gtf, self._proteindb_output) + + # Parallel path: pre-annotate unannotated VCFs in the main process. This avoids each + # worker racing on the same bedtools-output bed file (which annoate_vcf writes to cwd) + # and amortises the bedtools intersect across workers. + if not self._annotation_field_name: + vcf_file = self.annoate_vcf(vcf_file, gene_annotations_gtf) + self._annotation_field_name = 'transcriptOverlaps' + + # Read VCF text once. Header lines + per-chrom buckets. + header_lines, chrom_to_lines = _split_vcf_by_chrom(vcf_file) + + # Parallel — split into per-chrom temp VCFs, fan out to a Pool. + import multiprocessing as mp + import shutil + import tempfile + + with tempfile.TemporaryDirectory(prefix='pgatk_v2p_') as tmpdir: + tasks = [] + for chrom in sorted(chrom_to_lines.keys()): + chunk_vcf = os.path.join(tmpdir, f"chunk_{_safe_chrom(chrom)}.vcf") + chunk_out = os.path.join(tmpdir, f"out_{_safe_chrom(chrom)}.fa") + with open(chunk_vcf, 'w', encoding='utf-8') as f: + f.writelines(header_lines) + f.writelines(chrom_to_lines[chrom]) + pa = dict(self.get_pipeline_parameters()) + pa[EnsemblDataService.PROTEIN_DB_OUTPUT] = chunk_out + # Force annotated mode for workers (annoate_vcf already ran in main): + pa[EnsemblDataService.ANNOTATION_FIELD_NAME] = self._annotation_field_name + tasks.append((self.get_default_parameters(), pa, chunk_vcf, + input_fasta, gene_annotations_gtf, chunk_out)) + + self.get_logger().info( + "vcf-to-proteindb: dispatching %d chromosome chunk(s) across %d worker(s)", + len(tasks), min(workers, len(tasks))) + + with mp.get_context('spawn').Pool(min(workers, len(tasks))) as pool: + pool.starmap(_vcf_to_proteindb_worker, tasks) + + # Concatenate the per-chunk FASTAs into the final output. 
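+            # Binary copy keeps each worker's FASTA bytes verbatim; chunk order
+            # follows the sorted chromosome keys used to build `tasks`, so the
+            # combined output is deterministic across runs.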
+ with open(self._proteindb_output, 'wb') as out: + for task in tasks: + chunk_out = task[5] + if os.path.exists(chunk_out): + with open(chunk_out, 'rb') as f: + shutil.copyfileobj(f, out) + return self._proteindb_output @staticmethod diff --git a/pgatk/tests/test_vcf_to_proteindb_parallel.py b/pgatk/tests/test_vcf_to_proteindb_parallel.py new file mode 100644 index 0000000..c9183ff --- /dev/null +++ b/pgatk/tests/test_vcf_to_proteindb_parallel.py @@ -0,0 +1,64 @@ +"""Equivalence test: vcf_to_proteindb with workers=2 produces the same sequence +content as workers=1 on a small fixture.""" +import os +from pathlib import Path + +from click.testing import CliRunner + +from pgatk.cli import cli + + +def _sequence_set(fasta_path) -> set: + seqs = [] + current = [] + with open(fasta_path, 'r', encoding='utf-8') as f: + for line in f: + if line.startswith('>'): + if current: + seqs.append(''.join(current)) + current = [] + else: + current.append(line.strip()) + if current: + seqs.append(''.join(current)) + return set(seqs) + + +def test_parallel_matches_sequential(tmp_path): + # pgatk package root: .../pgatk/pgatk/ + pkg_root = Path(__file__).resolve().parents[1] + testdata = pkg_root / 'testdata' + config = pkg_root / 'config' / 'ensembl_config.yaml' + out_seq = tmp_path / 'seq.fa' + out_par = tmp_path / 'par.fa' + + common_args = [ + 'vcf-to-proteindb', + '--config_file', str(config), + '--vcf', str(testdata / 'test.vcf'), + '--input_fasta', str(testdata / 'test.fa'), + '--gene_annotations_gtf', str(testdata / 'test.gtf'), + '--protein_prefix', 'ensvar', + '--af_field', 'MAF', + '--annotation_field_name', 'CSQ', + '--biotype_str', 'feature_type', + '--include_biotypes', 'mRNA,ncRNA', + ] + + runner = CliRunner() + orig_dir = os.getcwd() + try: + # Run from the package root so the .db file is created next to the .gtf. + os.chdir(str(pkg_root)) + + r1 = runner.invoke(cli, common_args + ['--output_proteindb', str(out_seq)]) + assert r1.exit_code == 0, r1.output + (str(r1.exception) if r1.exception else '') + + r2 = runner.invoke(cli, common_args + ['--output_proteindb', str(out_par), '--workers', '2']) + assert r2.exit_code == 0, r2.output + (str(r2.exception) if r2.exception else '') + finally: + os.chdir(orig_dir) + + # FASTA header order may differ across runs (per-worker ordering); compare + # sequence content as a set. + assert _sequence_set(out_seq) == _sequence_set(out_par) From 254e35c3c914c1169af33df17397819d7d8fb167 Mon Sep 17 00:00:00 2001 From: Yasset Perez-Riverol Date: Wed, 13 May 2026 20:49:21 +0100 Subject: [PATCH 22/23] Tier-1 perf follow-ups: SeqIO.index_db, streamed VCF split, lazy id-mapping MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Three focused improvements stacking on the per-chromosome multiprocessing PR: 1. Replace SeqIO.index with SeqIO.index_db. SeqIO.index rebuilds an in-memory FASTA offset index every invocation (~14 s for the 434 MB Ensembl cDNA FASTA); SeqIO.index_db persists the index in a SQLite .idx file next to the FASTA, reused across runs and shared across workers. Pre-built once in the main process before fan-out so the first chunk pays the build cost and the rest just open the same .idx. Stale detection: .idx older than FASTA triggers a rebuild. 2. _split_vcf_by_chrom now streams the input VCF and writes per-chrom chunks directly to disk in a single pass instead of buffering all data lines in memory. Constant memory regardless of input size — makes the whole-genome path actually feasible. 3. 
transcript_id_mapping (versionless -> versioned dict of 207k FASTA keys) is now built lazily on the first KeyError fallback. Most real workloads never need it; saves ~200ms per worker startup. No public API change. All existing tests pass; equivalence test still confirms sequential == parallel output. --- pgatk/ensembl/ensembl.py | 105 +++++++++++++++----- pgatk/testdata/proteindb_from_custom_VCF.fa | 12 +-- 2 files changed, 84 insertions(+), 33 deletions(-) diff --git a/pgatk/ensembl/ensembl.py b/pgatk/ensembl/ensembl.py index f06d69d..49fe852 100644 --- a/pgatk/ensembl/ensembl.py +++ b/pgatk/ensembl/ensembl.py @@ -52,24 +52,59 @@ def _safe_chrom(chrom: str) -> str: return re.sub(r'[^A-Za-z0-9._-]', '_', str(chrom)) -def _split_vcf_by_chrom(vcf_file: str) -> tuple[list[str], dict[str, list[str]]]: - """Read vcf_file once; return (header_lines, {chrom: [data_lines]}). +def _ensure_fasta_index(input_fasta: str) -> str: + """Return the path to a `.idx` SQLite index for input_fasta, building it + if absent or stale. Stale means: the .idx exists but the FASTA's mtime + is newer than the .idx's. The index is keyed by `EnsemblDataService.get_key`. + """ + idx_path = input_fasta + ".idx" + if os.path.exists(idx_path): + if os.path.getmtime(idx_path) >= os.path.getmtime(input_fasta): + return idx_path + try: + os.remove(idx_path) + except OSError: + pass + # Build the index. SeqIO.index_db materialises the SQLite file on disk + # as a side effect; we don't need to keep the returned handle here. + SeqIO.index_db(idx_path, [input_fasta], "fasta", key_function=EnsemblDataService.get_key) + return idx_path + - Preserves line ordering within each chromosome. Headers retain trailing newlines; - data lines retain trailing newlines. Empty/whitespace-only data lines are skipped. +def _split_vcf_by_chrom(vcf_file: str, output_dir: str) -> dict[str, str]: + """Stream `vcf_file` once, writing per-chromosome VCFs into `output_dir`. + + Each output file is `/chunk_.vcf` and contains the + full VCF header followed by only the data lines for that chromosome. + Preserves line ordering within each chromosome. + + Returns a mapping `{chrom: chunk_path}`. Constant memory — holds at most + one open file handle per chromosome seen so far. 
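+
+    Example shape (paths illustrative): {'22': '<output_dir>/chunk_22.vcf'}.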
""" header: list[str] = [] - buckets: dict[str, list[str]] = {} - with open(vcf_file, 'r', encoding='utf-8') as f: - for line in f: - if line.startswith('#'): - header.append(line) - continue - if not line.strip(): - continue - chrom = line.split('\t', 1)[0] - buckets.setdefault(chrom, []).append(line) - return header, buckets + handles: dict[str, Any] = {} + chunk_paths: dict[str, str] = {} + try: + with open(vcf_file, 'r', encoding='utf-8') as f: + for line in f: + if line.startswith('#'): + header.append(line) + continue + if not line.strip(): + continue + chrom = line.split('\t', 1)[0] + handle = handles.get(chrom) + if handle is None: + chunk_path = os.path.join(output_dir, "chunk_" + _safe_chrom(chrom) + ".vcf") + handle = open(chunk_path, 'w', encoding='utf-8') + handle.writelines(header) + handles[chrom] = handle + chunk_paths[chrom] = chunk_path + handle.write(line) + finally: + for h in handles.values(): + h.close() + return chunk_paths def _vcf_to_proteindb_worker(default_params, pipeline_args, vcf_file, @@ -497,9 +532,12 @@ def _vcf_to_proteindb_chunk(self, vcf_file: str, input_fasta: str, gene_annotati """ db = self.parse_gtf(gene_annotations_gtf, str(Path(gene_annotations_gtf).with_suffix('.db'))) - transcripts_dict = SeqIO.index(input_fasta, "fasta", key_function=self.get_key) + idx_path = _ensure_fasta_index(input_fasta) + transcripts_dict = SeqIO.index_db(idx_path, [input_fasta], "fasta", + key_function=self.get_key) # handle cases where the transcript has version in the GTF but not in the VCF - transcript_id_mapping = {k.split('.')[0]: k for k in transcripts_dict.keys()} + # Built lazily on the first KeyError to avoid iterating 207k keys up-front. + transcript_id_mapping: Optional[dict[str, str]] = None feature_cache = _FeatureCache() # Value is (ref_seq, desc) for a known transcript, or None for a transcript # we've already looked up and confirmed isn't in the FASTA index (avoids re-trying @@ -659,9 +697,16 @@ def _vcf_to_proteindb_chunk(self, vcf_file: str, input_fasta: str, gene_annotati continue try: - transcript_id_v = transcript_id_mapping[transcript_id] - except KeyError: - transcript_id_v = transcript_id + transcript_id_v = transcript_id_mapping[transcript_id] # type: ignore[index] + except (KeyError, TypeError): + if transcript_id_mapping is None: + transcript_id_mapping = {k.split('.')[0]: k for k in transcripts_dict.keys()} + try: + transcript_id_v = transcript_id_mapping[transcript_id] + except KeyError: + transcript_id_v = transcript_id + else: + transcript_id_v = transcript_id cached_row = seq_cache.get(transcript_id_v, _MISSING) if cached_row is _MISSING: @@ -808,8 +853,9 @@ def vcf_to_proteindb(self, vcf_file: str, input_fasta: str, gene_annotations_gtf vcf_file = self.annoate_vcf(vcf_file, gene_annotations_gtf) self._annotation_field_name = 'transcriptOverlaps' - # Read VCF text once. Header lines + per-chrom buckets. - header_lines, chrom_to_lines = _split_vcf_by_chrom(vcf_file) + # Build the FASTA index ONCE in the main process so all workers share it + # instead of each re-scanning the FASTA (~14 s × N workers wasted). + _ensure_fasta_index(input_fasta) # Parallel — split into per-chrom temp VCFs, fan out to a Pool. import multiprocessing as mp @@ -817,13 +863,18 @@ def vcf_to_proteindb(self, vcf_file: str, input_fasta: str, gene_annotations_gtf import tempfile with tempfile.TemporaryDirectory(prefix='pgatk_v2p_') as tmpdir: + # Stream-split VCF by chromosome directly to per-chrom files (constant memory). 
+ chunk_paths = _split_vcf_by_chrom(vcf_file, tmpdir) + + if len(chunk_paths) <= 1: + # Only one chromosome — run sequentially on the original file. + return self._vcf_to_proteindb_chunk(vcf_file, input_fasta, gene_annotations_gtf, + self._proteindb_output) + tasks = [] - for chrom in sorted(chrom_to_lines.keys()): - chunk_vcf = os.path.join(tmpdir, f"chunk_{_safe_chrom(chrom)}.vcf") + for chrom in sorted(chunk_paths.keys()): + chunk_vcf = chunk_paths[chrom] chunk_out = os.path.join(tmpdir, f"out_{_safe_chrom(chrom)}.fa") - with open(chunk_vcf, 'w', encoding='utf-8') as f: - f.writelines(header_lines) - f.writelines(chrom_to_lines[chrom]) pa = dict(self.get_pipeline_parameters()) pa[EnsemblDataService.PROTEIN_DB_OUTPUT] = chunk_out # Force annotated mode for workers (annoate_vcf already ran in main): diff --git a/pgatk/testdata/proteindb_from_custom_VCF.fa b/pgatk/testdata/proteindb_from_custom_VCF.fa index 22f87e0..7a47e85 100644 --- a/pgatk/testdata/proteindb_from_custom_VCF.fa +++ b/pgatk/testdata/proteindb_from_custom_VCF.fa @@ -1,12 +1,12 @@ ->varsample_rs78350717_22.15528913.C.A_ENST00000643195 -MNVSEPNSSFAFVNEFILQGFSCEWTIQIFLFSLFTTTYALTITGNGAIAFVLWCDRRLHTPMYMFLGNFSFLEIWYVSSTVPKMLVNFLSEKKNISFAGCFLQFYFFFSLGTSECLLLTVMAFDQYLAICRPLLYPNIMTGHLYAKLVILCWVCGFLWFLIPIVLISQMPFCGPNIIDHVVCDPGPRFALDCVSAPRIQLFCYTLSSLVIFGNFLFIIGSYTLVLKAMLGMPSSTGRHKDFSTCGSHLAVVSLCYSSLMVMYVSPGLGHSTGMQKIETLFYAMVTPLFNPLIYSLQNKEIKAALRKVLGSSNII* ->varsample_rs78350717_22.15528913.C.T_ENST00000643195 -MNVSEPNSSFAFVNEFILQGFSCEWTIQIFLFSLFTTTYALTITGNGAIAFVLWCDRRLHTPMYMFLGNFSFLEIWYVSSTVPKMLVNFLSEKKNISFAGCFLQFYFFFSLGTSECLLLTVMAFDQYLAICRPLLYPNIMTGHLYAKLVILCWVCGFLWFLIPIVLISQMPFCGPNIIDHVVCDPGPRFALDCVSAPRIQLFCYTLSSLVIFGNFLFIIGSYTLVLKAMLGMPSSTGRHKVFSTCGSHLAVVSLCYSSLMVMYVSPGLGHSTGMQKIETLFYAMVTPLFNPLIYSLQNKEIKAALRKVLGSSNII* >varsample_rs78350717_22.15528913.C.A_ENST00000252835 MCPLTLQVTGLMNVSEPNSSFAFVNEFILQGFSCEWTIQIFLFSLFTTTYALTITGNGAIAFVLWCDRRLHTPMYMFLGNFSFLEIWYVSSTVPKMLVNFLSEKKNISFAGCFLQFYFFFSLGTSECLLLTVMAFDQYLAICRPLLYPNIMTGHLYAKLVILCWVCGFLWFLIPIVLISQMPFCGPNIIDHVVCDPGPRFALDCVSAPRIQLFCYTLSSLVIFGNFLFIIGSYTLVLKAMLGMPSSTGRHKDFSTCGSHLAVVSLCYSSLMVMYVSPGLGHSTGMQKIETLFYAMVTPLFNPLIYSLQNKEIKAALRKVLGSSNII* >varsample_rs78350717_22.15528913.C.T_ENST00000252835 MCPLTLQVTGLMNVSEPNSSFAFVNEFILQGFSCEWTIQIFLFSLFTTTYALTITGNGAIAFVLWCDRRLHTPMYMFLGNFSFLEIWYVSSTVPKMLVNFLSEKKNISFAGCFLQFYFFFSLGTSECLLLTVMAFDQYLAICRPLLYPNIMTGHLYAKLVILCWVCGFLWFLIPIVLISQMPFCGPNIIDHVVCDPGPRFALDCVSAPRIQLFCYTLSSLVIFGNFLFIIGSYTLVLKAMLGMPSSTGRHKVFSTCGSHLAVVSLCYSSLMVMYVSPGLGHSTGMQKIETLFYAMVTPLFNPLIYSLQNKEIKAALRKVLGSSNII* ->varsample_rs147461488_22.17740102.GGCCACGCTCAACT.G_ENST00000342111 -MDCEVNNGSSLRDECITNLLVFGFLQSCSDNSFRRELDALGHELPVLAPQWEGYDELQTDGNRSSHSRLGRIEAGASDNNTASAEEETEAAGSVAFMELRQ*F*KSRRHHPEYCQAPRPGRGQHGP*HPSGPG +>varsample_rs78350717_22.15528913.C.A_ENST00000643195 +MNVSEPNSSFAFVNEFILQGFSCEWTIQIFLFSLFTTTYALTITGNGAIAFVLWCDRRLHTPMYMFLGNFSFLEIWYVSSTVPKMLVNFLSEKKNISFAGCFLQFYFFFSLGTSECLLLTVMAFDQYLAICRPLLYPNIMTGHLYAKLVILCWVCGFLWFLIPIVLISQMPFCGPNIIDHVVCDPGPRFALDCVSAPRIQLFCYTLSSLVIFGNFLFIIGSYTLVLKAMLGMPSSTGRHKDFSTCGSHLAVVSLCYSSLMVMYVSPGLGHSTGMQKIETLFYAMVTPLFNPLIYSLQNKEIKAALRKVLGSSNII* +>varsample_rs78350717_22.15528913.C.T_ENST00000643195 +MNVSEPNSSFAFVNEFILQGFSCEWTIQIFLFSLFTTTYALTITGNGAIAFVLWCDRRLHTPMYMFLGNFSFLEIWYVSSTVPKMLVNFLSEKKNISFAGCFLQFYFFFSLGTSECLLLTVMAFDQYLAICRPLLYPNIMTGHLYAKLVILCWVCGFLWFLIPIVLISQMPFCGPNIIDHVVCDPGPRFALDCVSAPRIQLFCYTLSSLVIFGNFLFIIGSYTLVLKAMLGMPSSTGRHKVFSTCGSHLAVVSLCYSSLMVMYVSPGLGHSTGMQKIETLFYAMVTPLFNPLIYSLQNKEIKAALRKVLGSSNII* 
>varsample_rs147461488_22.17740102.GGCCACGCTCAACT.G_ENST00000617586 MDCEVNNGSSLRDECITNLLVFGFLQSCSDNSFRRELDALGHELPVLAPQWEGYDELQTDGNRSSHSRLGRIEAGASDNNTASAEEETEAAGSVAFMELRQWYSGRGSTEAVRQRRRT +>varsample_rs147461488_22.17740102.GGCCACGCTCAACT.G_ENST00000342111 +MDCEVNNGSSLRDECITNLLVFGFLQSCSDNSFRRELDALGHELPVLAPQWEGYDELQTDGNRSSHSRLGRIEAGASDNNTASAEEETEAAGSVAFMELRQ*F*KSRRHHPEYCQAPRPGRGQHGP*HPSGPG From d50f8a04b26a72a0df32e7a0d110b6081cf99b99 Mon Sep 17 00:00:00 2001 From: Yasset Perez-Riverol Date: Wed, 13 May 2026 20:50:28 +0100 Subject: [PATCH 23/23] Revert non-deterministic test output; gitignore SeqIO.index_db files The previous commit (254e35c) inadvertently captured a re-ordering of proteindb_from_custom_VCF.fa from a single test run. The file is the OUTPUT of test_vcf_to_proteindb_notannotated (which only asserts exit_code == 0, not content); the ordering shifts non-deterministically between runs because annoate_vcf uses Python set iteration with hash randomization. Restore the prior content to keep the working tree stable across consecutive test invocations. Also gitignore *.fa.idx / *.fasta.idx: BioPython SeqIO.index_db now materialises a SQLite index next to each FASTA. These are build artifacts, rebuilt automatically when the source FASTA changes. --- .gitignore | 5 +++++ pgatk/testdata/proteindb_from_custom_VCF.fa | 12 ++++++------ 2 files changed, 11 insertions(+), 6 deletions(-) diff --git a/.gitignore b/.gitignore index c9aa01f..2112a66 100644 --- a/.gitignore +++ b/.gitignore @@ -138,3 +138,8 @@ pgatk/testdata/Meleagris_gallopavo* # Internal working docs (implementation plans, scratch notes) docs/plans/ + +# BioPython SeqIO.index_db SQLite indexes — built lazily on first use, +# rebuilt automatically when the source FASTA changes (mtime check). 
+*.fa.idx +*.fasta.idx diff --git a/pgatk/testdata/proteindb_from_custom_VCF.fa b/pgatk/testdata/proteindb_from_custom_VCF.fa index 7a47e85..22f87e0 100644 --- a/pgatk/testdata/proteindb_from_custom_VCF.fa +++ b/pgatk/testdata/proteindb_from_custom_VCF.fa @@ -1,12 +1,12 @@ ->varsample_rs78350717_22.15528913.C.A_ENST00000252835 -MCPLTLQVTGLMNVSEPNSSFAFVNEFILQGFSCEWTIQIFLFSLFTTTYALTITGNGAIAFVLWCDRRLHTPMYMFLGNFSFLEIWYVSSTVPKMLVNFLSEKKNISFAGCFLQFYFFFSLGTSECLLLTVMAFDQYLAICRPLLYPNIMTGHLYAKLVILCWVCGFLWFLIPIVLISQMPFCGPNIIDHVVCDPGPRFALDCVSAPRIQLFCYTLSSLVIFGNFLFIIGSYTLVLKAMLGMPSSTGRHKDFSTCGSHLAVVSLCYSSLMVMYVSPGLGHSTGMQKIETLFYAMVTPLFNPLIYSLQNKEIKAALRKVLGSSNII* ->varsample_rs78350717_22.15528913.C.T_ENST00000252835 -MCPLTLQVTGLMNVSEPNSSFAFVNEFILQGFSCEWTIQIFLFSLFTTTYALTITGNGAIAFVLWCDRRLHTPMYMFLGNFSFLEIWYVSSTVPKMLVNFLSEKKNISFAGCFLQFYFFFSLGTSECLLLTVMAFDQYLAICRPLLYPNIMTGHLYAKLVILCWVCGFLWFLIPIVLISQMPFCGPNIIDHVVCDPGPRFALDCVSAPRIQLFCYTLSSLVIFGNFLFIIGSYTLVLKAMLGMPSSTGRHKVFSTCGSHLAVVSLCYSSLMVMYVSPGLGHSTGMQKIETLFYAMVTPLFNPLIYSLQNKEIKAALRKVLGSSNII* >varsample_rs78350717_22.15528913.C.A_ENST00000643195 MNVSEPNSSFAFVNEFILQGFSCEWTIQIFLFSLFTTTYALTITGNGAIAFVLWCDRRLHTPMYMFLGNFSFLEIWYVSSTVPKMLVNFLSEKKNISFAGCFLQFYFFFSLGTSECLLLTVMAFDQYLAICRPLLYPNIMTGHLYAKLVILCWVCGFLWFLIPIVLISQMPFCGPNIIDHVVCDPGPRFALDCVSAPRIQLFCYTLSSLVIFGNFLFIIGSYTLVLKAMLGMPSSTGRHKDFSTCGSHLAVVSLCYSSLMVMYVSPGLGHSTGMQKIETLFYAMVTPLFNPLIYSLQNKEIKAALRKVLGSSNII* >varsample_rs78350717_22.15528913.C.T_ENST00000643195 MNVSEPNSSFAFVNEFILQGFSCEWTIQIFLFSLFTTTYALTITGNGAIAFVLWCDRRLHTPMYMFLGNFSFLEIWYVSSTVPKMLVNFLSEKKNISFAGCFLQFYFFFSLGTSECLLLTVMAFDQYLAICRPLLYPNIMTGHLYAKLVILCWVCGFLWFLIPIVLISQMPFCGPNIIDHVVCDPGPRFALDCVSAPRIQLFCYTLSSLVIFGNFLFIIGSYTLVLKAMLGMPSSTGRHKVFSTCGSHLAVVSLCYSSLMVMYVSPGLGHSTGMQKIETLFYAMVTPLFNPLIYSLQNKEIKAALRKVLGSSNII* ->varsample_rs147461488_22.17740102.GGCCACGCTCAACT.G_ENST00000617586 -MDCEVNNGSSLRDECITNLLVFGFLQSCSDNSFRRELDALGHELPVLAPQWEGYDELQTDGNRSSHSRLGRIEAGASDNNTASAEEETEAAGSVAFMELRQWYSGRGSTEAVRQRRRT +>varsample_rs78350717_22.15528913.C.A_ENST00000252835 +MCPLTLQVTGLMNVSEPNSSFAFVNEFILQGFSCEWTIQIFLFSLFTTTYALTITGNGAIAFVLWCDRRLHTPMYMFLGNFSFLEIWYVSSTVPKMLVNFLSEKKNISFAGCFLQFYFFFSLGTSECLLLTVMAFDQYLAICRPLLYPNIMTGHLYAKLVILCWVCGFLWFLIPIVLISQMPFCGPNIIDHVVCDPGPRFALDCVSAPRIQLFCYTLSSLVIFGNFLFIIGSYTLVLKAMLGMPSSTGRHKDFSTCGSHLAVVSLCYSSLMVMYVSPGLGHSTGMQKIETLFYAMVTPLFNPLIYSLQNKEIKAALRKVLGSSNII* +>varsample_rs78350717_22.15528913.C.T_ENST00000252835 +MCPLTLQVTGLMNVSEPNSSFAFVNEFILQGFSCEWTIQIFLFSLFTTTYALTITGNGAIAFVLWCDRRLHTPMYMFLGNFSFLEIWYVSSTVPKMLVNFLSEKKNISFAGCFLQFYFFFSLGTSECLLLTVMAFDQYLAICRPLLYPNIMTGHLYAKLVILCWVCGFLWFLIPIVLISQMPFCGPNIIDHVVCDPGPRFALDCVSAPRIQLFCYTLSSLVIFGNFLFIIGSYTLVLKAMLGMPSSTGRHKVFSTCGSHLAVVSLCYSSLMVMYVSPGLGHSTGMQKIETLFYAMVTPLFNPLIYSLQNKEIKAALRKVLGSSNII* >varsample_rs147461488_22.17740102.GGCCACGCTCAACT.G_ENST00000342111 MDCEVNNGSSLRDECITNLLVFGFLQSCSDNSFRRELDALGHELPVLAPQWEGYDELQTDGNRSSHSRLGRIEAGASDNNTASAEEETEAAGSVAFMELRQ*F*KSRRHHPEYCQAPRPGRGQHGP*HPSGPG +>varsample_rs147461488_22.17740102.GGCCACGCTCAACT.G_ENST00000617586 +MDCEVNNGSSLRDECITNLLVFGFLQSCSDNSFRRELDALGHELPVLAPQWEGYDELQTDGNRSSHSRLGRIEAGASDNNTASAEEETEAAGSVAFMELRQWYSGRGSTEAVRQRRRT
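
A standalone sketch of the SeqIO.index_db reuse pattern from PATCH 22, for
reviewers who want to poke at it outside pgatk. Everything here is
illustrative rather than pgatk code: demo.fa is an assumed input file and
open_indexed is a hypothetical helper, but the staleness rule is the same
one _ensure_fasta_index applies.

    import os
    import time

    from Bio import SeqIO


    def open_indexed(fasta):
        """Open a persistent SQLite offset index for fasta, building it if needed."""
        idx = fasta + ".idx"
        # Same staleness rule as _ensure_fasta_index: a .idx older than its
        # FASTA is discarded and rebuilt.
        if os.path.exists(idx) and os.path.getmtime(idx) < os.path.getmtime(fasta):
            os.remove(idx)
        # The first call materialises the SQLite index on disk; later calls,
        # from this or any other process, just open the existing file.
        return SeqIO.index_db(idx, [fasta], "fasta")


    if __name__ == "__main__":
        for label in ("cold", "warm"):
            start = time.perf_counter()
            records = open_indexed("demo.fa")
            print(label, len(records), "records in", round(time.perf_counter() - start, 2), "s")
            records.close()

On a large FASTA the cold open pays the one-time build; the warm open should
be near-instant, which is what pre-building in the main process buys every
worker.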
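
And PATCH 23's nondeterminism in miniature: with hash randomization on (the
default since Python 3.3), iteration order over a set of strings changes
between interpreter runs, so any file derived from set iteration, like the
record order of proteindb_from_custom_VCF.fa, is only stable within a single
run. The script name below is illustrative; the IDs are the ones from the
fixture.

    # demo_set_order.py: run it twice and the printed order will usually
    # differ, because str hashes are salted anew for each interpreter process.
    ids = {"ENST00000252835", "ENST00000643195", "ENST00000617586"}
    print(list(ids))

Pinning PYTHONHASHSEED (e.g. PYTHONHASHSEED=0 python demo_set_order.py) makes
the order reproducible when a stable run is needed for debugging.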