diff --git a/.codacy.yaml b/.codacy.yaml new file mode 100644 index 0000000..7a461b7 --- /dev/null +++ b/.codacy.yaml @@ -0,0 +1,15 @@ +--- +# Codacy configuration. https://docs.codacy.com/repositories-configure/codacy-configuration-file/ +# +# pydocstyle has two pairs of mutually-exclusive rules: +# D203 vs D211 (blank line before class docstring; codebase follows D203) +# D212 vs D213 (multi-line summary line; codebase follows D212) +# Both rules in each pair are enabled in Codacy's default profile, which produces +# unavoidable noise — pick one and silence the other. Settings here also apply to +# any local pydocstyle invocation that reads `.pydocstyle`. + +engines: + pydocstyle: + enabled: true + settings: + add_ignore: ["D211", "D213"] diff --git a/.gitignore b/.gitignore index 3e8faad..2112a66 100644 --- a/.gitignore +++ b/.gitignore @@ -135,3 +135,11 @@ pgatk/testdata/Meleagris_gallopavo* .DS_Store .codacy/ .cursor/ + +# Internal working docs (implementation plans, scratch notes) +docs/plans/ + +# BioPython SeqIO.index_db SQLite indexes — built lazily on first use, +# rebuilt automatically when the source FASTA changes (mtime check). +*.fa.idx +*.fasta.idx diff --git a/.pydocstyle b/.pydocstyle new file mode 100644 index 0000000..bd70be3 --- /dev/null +++ b/.pydocstyle @@ -0,0 +1,6 @@ +[pydocstyle] +# D211 conflicts with D203 (codebase follows D203: one blank line before class docstring). +# D213 conflicts with D212 (codebase follows D212: multi-line summary on the first line). +# Disabling the rules we don't follow stops the mutually-exclusive-pair noise from +# static analysers (pydocstyle, Codacy). 
+add-ignore = D211,D213 diff --git a/docs/index.md b/docs/index.md index f6e58a2..87026a3 100644 --- a/docs/index.md +++ b/docs/index.md @@ -23,6 +23,7 @@ See the [Installation](installation.md) page for more options (Bioconda, Docker, | [Introduction](introduction.md) | Overview of the proteogenomics field | | [Installation](installation.md) | How to install pgatk (pip, Bioconda, Docker, source) | | [pgatk CLI](pgatk-cli.md) | Full command-line reference for all tools | +| [Validations](validations.md) | Tests and validations to ensure correctness of the modules| | [Use Cases](use-cases.md) | End-to-end workflows and recipes for common scenarios | | [File Formats](formats.md) | BED, GTF, GCT format specifications | | [Changelog](changelog.md) | Version history and release notes | diff --git a/docs/pgatk-cli.md b/docs/pgatk-cli.md index f36971a..8bf7878 100644 --- a/docs/pgatk-cli.md +++ b/docs/pgatk-cli.md @@ -318,7 +318,7 @@ Usage: pgatk vcf-to-proteindb [OPTIONS] Options: --translation_table INTEGER Translation table (Default 1) --mito_translation_table INTEGER Mito_trans_table (default 2) - --var_prefix TEXT String to add as prefix for the variant peptides + --protein_prefix TEXT String to add as prefix for the variant peptides --report_ref_seq Also report the reference peptide from overlapping transcripts --annotation_field_name TEXT Annotation field name in INFO column (default: CSQ) --af_field TEXT Field name for variant allele frequency (default: none) @@ -467,7 +467,7 @@ Usage: pgatk dnaseq-to-proteindb [OPTIONS] --biotype_str TEXT String used to identify gene/transcript biotype (default: transcript_biotype) --expression_str TEXT String for extracting expression value (default: None) --expression_thresh FLOAT Threshold for expression value filtering (default: 5) - --var_prefix TEXT Prefix to be added to fasta headers (default: none) + --protein_prefix TEXT Prefix to be added to fasta headers (default: none) -h, --help Show this message and exit. 
``` @@ -489,7 +489,7 @@ Usage: pgatk dnaseq-to-proteindb [OPTIONS] --config_file config/ensembl_config.yaml \ --input_fasta transcript_sequences.fa \ --output_proteindb proteindb_from_lincRNA_canonical_sequences.fa \ - --var_prefix lincRNA_ \ + --protein_prefix lincRNA_ \ --include_biotypes lincRNA ``` @@ -500,7 +500,7 @@ Usage: pgatk dnaseq-to-proteindb [OPTIONS] --config_file config/ensembl_config.yaml \ --input_fasta transcript_sequences.fa \ --output_proteindb proteindb_from_processed_pseudogene.fa \ - --var_prefix pseudogene_ \ + --protein_prefix pseudogene_ \ --include_biotypes processed_pseudogene,transcribed_processed_pseudogene,translated_processed_pseudogene \ --skip_including_all_cds ``` @@ -512,7 +512,7 @@ Usage: pgatk dnaseq-to-proteindb [OPTIONS] --config_file config/ensembl_config.yaml \ --input_fasta transcript_sequences.fa \ --output_proteindb proteindb_from_altORFs.fa \ - --var_prefix altorf_ \ + --protein_prefix altorf_ \ --include_biotypes altORFs \ --skip_including_all_cds ``` diff --git a/docs/plans/2026-03-01-pgatk-graph-engine-design.md b/docs/plans/2026-03-01-pgatk-graph-engine-design.md deleted file mode 100644 index 9048e99..0000000 --- a/docs/plans/2026-03-01-pgatk-graph-engine-design.md +++ /dev/null @@ -1,270 +0,0 @@ -# pgatk Evolution: Graph-Based Transcript Modeling & Feature Parity - -**Date:** 2026-03-01 -**Status:** Approved -**Authors:** Yasset Perez-Riverol, Claude (AI assistant) - ---- - -## 1. Executive Summary - -This document describes the research and implementation plan to evolve pgatk from a linear, one-variant-at-a-time proteogenomics database generator into a graph-based transcript modeling engine capable of multi-variant co-occurrence, gene fusions, RNA editing, and circular RNA support. The plan includes a prerequisite infrastructure cleanup phase and a phased feature rollout with performance optimization. - ---- - -## 2. 
Current State Assessment - -### What pgatk does well - -- Only Python tool with native one-command downloaders for ENSEMBL, COSMIC, and cBioPortal -- Integrated decoy generation (4 methods: protein-reverse, protein-shuffle, DecoyPyrat, pgdbdeep) -- Spectrum-level validation via SpectrumAI (unique among database generators) -- Aho-Corasick exact matching + sliding-window mismatch search (BlastGetPosition) -- Published benchmark: 43,501 non-canonical peptides across 64 human cell lines (Umer et al. 2022) -- Pangenome proteogenomics demonstrated (Wang et al. 2024 preprint): 4,991 novel peptides from HPRC - -### Critical gaps vs. the field (2024-2026) - -| Gap | Impact | Solved by | -|---|---|---| -| No multi-variant co-occurrence | Transcript with N SNPs produces 1 sequence instead of up to 2^N | moPepGen (DAG), vcf2prot (phased VCF) | -| No gene fusions | Major cancer event type missed entirely | moPepGen | -| No RNA editing | A-to-I editing creates novel peptides invisible to DNA-only | moPepGen | -| No circular RNA | Emerging class of translated non-coding RNA | moPepGen | -| No ClinVar/NCBI support | Open issue #24 since 2019 | Manual workaround only | -| pandas iterrows in VCF processing | ~100x slower than vectorized operations | Internal bottleneck | -| No transcript feature caching | Redundant gffutils DB queries per variant | Internal bottleneck | - -### Competitive landscape - -**moPepGen** (Nature Biotechnology, June 2025): Graph-based (DAG per transcript), handles all event types, ~4x more non-canonical peptides than prior methods. The algorithmic benchmark to match. - -**vcf2prot** (ElAbd et al. 2022, bioRxiv): Rust-based Sequence Intermediate Representation (SIR) approach. ~1000x faster than PrecisionProDB. Processes 99,254 variants across 8,192 patients in ~11 minutes. Demonstrates the performance ceiling achievable with systems-level optimization. Limited to phased VCFs with BCFtools/csq annotation. - -**PG2** (J. 
Proteome Research 2023): Snakemake pipeline integrating genome + transcriptome. Handles splicing and fusions but requires heavy infrastructure. - -**pXg** (MCP 2024): RNA-Seq + proteomics + de novo sequencing for immunopeptidomics. - -**NeoDisc** (Nature Biotechnology 2024): End-to-end clinical neoantigen pipeline. - -**PepCentric** (bioRxiv 2025): Repository-scale validation against 2.3B spectra. Complementary to database generators. - -### Key code quality findings - -- `get_altseq()` (ensembl.py): 60-line heart of variant processing, handles SNP/ins/del but only one variant at a time -- `vcf_to_proteindb()` (ensembl.py): ~260 lines, main bottleneck is `vcf_reader.iterrows()` + per-row gffutils DB lookups -- No `pathlib`, no `dataclasses`, inconsistent type hints -- `db/` standalone scripts use `sys.argv` at import time (not importable as modules) -- Variable shadowing bug in `protein_database_decoy.py` (`decoy_sequence` variable shadows the imported `pyteomics.fasta.decoy_sequence` function) -- Broad `except Exception` in several places suppresses unexpected errors -- Mixed `print()` and logger usage for error reporting -- Repetitive 3-layer config fallback pattern across all service classes - ---- - -## 3. 
Architecture: TranscriptGraph Engine - -### Core Concept - -Each transcript is modeled as a directed acyclic graph (DAG): - -- **Nodes** represent sequence segments (reference sequence between variant positions) -- **Edges** represent either the reference path or an alternative (variant) path -- **Traversal** of all root-to-leaf paths generates all combinatorial protein sequences - -``` -Reference: ──[seg1]──[seg2]──[seg3]──[seg4]── - │ │ -Variant A: └─[alt_A]─┘ │ -Variant B: └─[alt_B]─┘ - -Paths: ref, A only, B only, A+B → 4 protein sequences -``` - -### Module Structure - -``` -pgatk/ - graph/ # NEW module - __init__.py - transcript_graph.py # TranscriptGraph class (DAG builder + traverser) - variant_nodes.py # Node types: SNP, Insertion, Deletion, Fusion, RNAEdit, CircRNA - graph_enumerator.py # Path enumeration with combinatorial explosion controls - event_parsers/ # Input parsers for each event type - __init__.py - vcf_parser.py # VCF → graph edges (replaces current get_altseq) - fusion_parser.py # Gene fusion calls → cross-transcript edges - rnaedit_parser.py # RNA editing sites → substitution edges - circrna_parser.py # Back-splice junctions → circular edges - clinvar_parser.py # ClinVar VCF → graph edges (issue #24) - filters/ - __init__.py - allele_frequency.py # AF-based pruning - consequence.py # VEP consequence filtering - expression.py # Expression-level filtering - max_variants.py # Cap combinatorial explosion -``` - -### Key Classes - -```python -@dataclass -class VariantNode: - """Base class for all genomic events.""" - position: int # CDS position - ref_allele: str - alt_allele: str - event_type: EventType # SNP, INS, DEL, FUSION, RNA_EDIT, CIRC_RNA - metadata: dict # AF, consequence, source, etc. - - -class TranscriptGraph: - """DAG representing a transcript with all its variants.""" - - def __init__(self, transcript_id: str, reference_seq: str, strand: str) -> None: ... - def add_variant(self, variant: VariantNode) -> None: ... 
- def add_fusion(self, partner_graph: 'TranscriptGraph', breakpoints: tuple) -> None: ... - def enumerate_paths(self, max_paths: int = 1000) -> Iterator[ProteinSequence]: ... - def to_fasta(self, output: TextIO) -> int: ... # returns count written -``` - -### Combinatorial Explosion Controls - -With N variants, naive enumeration produces 2^N sequences. Controls: - -1. **Max variants per transcript** (default: 10) — skip transcripts exceeding this -2. **Max paths per transcript** (default: 1000) — stop enumeration after limit -3. **Allele frequency pruning** — only include variants above a threshold -4. **Phasing support** — if phased VCF, only enumerate haplotype-consistent paths -5. **Consequence filtering** — only include protein-altering consequences - -### Integration Strategy - -The graph engine sits alongside existing modules (no replacement): - -- **Downloaders** remain unchanged -- **New CLI commands** (`graph-vcf-to-proteindb`, etc.) use the graph engine -- **Existing commands** continue working for backward compatibility -- **Shared infrastructure** (`toolbox/`, `config/`) is reused - ---- - -## 4. Phased Implementation Plan - -### Phase 0 — Infrastructure & Code Quality - -**Goal:** Fix existing issues so the graph engine is built on solid ground. - -**Deliverables:** - -1. Fix variable shadowing bug in `protein_database_decoy.py` -2. Convert `db/digest_mutant_protein.py` and `db/map_peptide2genome.py` from `sys.argv`/`getopt` import-time execution to proper Click CLI commands -3. Add `conftest.py` with proper fixtures; remove relative path dependencies in tests -4. Replace `print()` error reporting with proper logger usage across all modules -5. Add type hints to all public APIs -6. Use `dataclasses` for data models (`SNP` in `cgenomes/models.py`, new models) -7. Use `pathlib.Path` for file operations (replace string concatenation) -8. Replace broad `except Exception` with specific exception handling -9. 
Standardize the 3-layer config fallback pattern (DRY up repetitive `get_*_parameters()` methods) -10. Expand unit test coverage for core functions: `get_altseq()`, `get_mut_pro_seq()`, `revswitch()` -11. Replace `pathos` with `concurrent.futures.ProcessPoolExecutor` (stdlib) - ---- - -### Phase 1 — Graph Core + SNP/Indel Support - -**Goal:** Working graph engine for basic variants, producing a superset of current output. - -**Deliverables:** - -1. `TranscriptGraph` class with add/enumerate/serialize -2. `VariantNode` dataclass hierarchy (SNP, Insertion, Deletion) -3. `vcf_parser.py` — reads VCF (VEP-annotated or unannotated) into graph edges -4. `graph_enumerator.py` — path enumeration with explosion controls -5. New CLI command: `graph-vcf-to-proteindb` -6. Transcript feature caching (eliminate redundant gffutils queries) -7. Vectorized VCF grouping (replace `iterrows()`) -8. Unit tests for graph operations + integration tests -9. Benchmark against current `vcf_to_proteindb()` on test VCFs - -**Validation:** Graph engine must produce a superset of sequences from the linear approach. - ---- - -### Phase 2 — Cancer Genomics + ClinVar - -**Goal:** Bring cancer mutation sources into the graph model and add ClinVar (issue #24). - -**Deliverables:** - -1. `clinvar_parser.py` — parse ClinVar VCF into graph edges -2. Refactor `CancerGenomesService.get_mut_pro_seq()` to produce `VariantNode` objects -3. Graph-aware COSMIC processing (multiple mutations on same gene → combinatorial) -4. Graph-aware cBioPortal processing -5. New CLI commands: `clinvar-to-proteindb`, `graph-cosmic-to-proteindb` -6. Tests with real COSMIC data (multiple mutations per transcript) - ---- - -### Phase 3 — Novel Event Types: Fusions, RNA Editing, Circular RNA - -**Goal:** Add the event types that differentiate moPepGen from other tools. - -**Deliverables:** - -1. `fusion_parser.py` — parse gene fusion calls (STAR-Fusion, Arriba, FusionCatcher formats) -2. 
`rnaedit_parser.py` — parse RNA editing databases (REDIportal, DARNED) or user sites -3. `circrna_parser.py` — parse back-splice junction files (CIRCexplorer, CIRI) -4. Extended `TranscriptGraph` for cross-transcript fusions and circular paths -5. New CLI commands: `fusion-to-proteindb`, `rnaedit-to-proteindb`, `circrna-to-proteindb` -6. Combined command: `multi-event-to-proteindb` (merges all event types per transcript) - ---- - -### Phase 4 — Performance Optimization - -**Goal:** Make the graph engine fast enough for genome-scale datasets. - -**Deliverables:** - -1. Profile Phases 1-3 on real-world datasets (gnomAD, TCGA) -2. Identify hot paths via `cProfile` / `py-spy` -3. Optimize graph traversal (topological sort, memoization of shared subpaths) -4. Replace remaining `pathos` usage with `concurrent.futures` -5. Optional: C extension (via `cffi` or `cython`) for innermost graph traversal — only if profiling shows it's the bottleneck -6. Benchmark against moPepGen and vcf2prot on same datasets - ---- - -### Phase Dependencies - -``` -Phase 0 (Infrastructure & Quality) - └── Phase 1 (Graph Core + SNP/Indel) - ├── Phase 2 (Cancer + ClinVar) - │ └── Phase 3 (Fusions, RNA Edit, CircRNA) - │ └── Phase 4 (Performance) - └── (ongoing quality improvements) -``` - ---- - -## 5. References - -### Primary pgatk publication -- Umer HM, Audain E, Zhu Y, Pfeuffer J, Sachsenberg T, Lehtiö J, Branca RM, Perez-Riverol Y. Generation of ENSEMBL-based proteogenomics databases boosts the identification of non-canonical peptides. *Bioinformatics*. 2022;38(5):1470-1472. doi:10.1093/bioinformatics/btab838 - -### Pangenome proteogenomics -- Wang D, Bouwmeester R, Zheng P, Dai C, Sanchez A, Shu K, Bai M, Umer HM, Perez-Riverol Y. Proteogenomics analysis of human tissues using pangenomes. *bioRxiv*. 2024. doi:10.1101/2024.05.24.595489 - -### Competing tools -- moPepGen: Graph-based proteogenomic database generation. *Nature Biotechnology*. June 2025. 
doi:10.1038/s41587-025-02701-0 -- vcf2prot: ElAbd H, Degenhardt F, Lenz TL, Franke A, Wendorff M. VCF2Prot: An efficient and parallel tool for generating personalized proteomes from VCF files. *bioRxiv*. 2022. doi:10.1101/2022.01.21.477084 -- ProteomeGenerator2: *J. Proteome Research*. 2023. doi:10.1021/acs.jproteome.3c00005 -- pXg: *Molecular & Cellular Proteomics*. 2024. doi:10.1016/j.mcpro.2024.100733 -- NeoDisc: *Nature Biotechnology*. October 2024. doi:10.1038/s41587-024-02420-y -- PepCentric: *bioRxiv*. February 2025. doi:10.1101/2025.02.24.639867 - -### Related ecosystem -- quantms: Dai C et al. *Nature Methods*. 2024. doi:10.1038/s41592-024-02343-1 -- PRIDE 2025: *Nucleic Acids Research*. 2025;53(D1):D543. doi:10.1093/nar/gkae1011 diff --git a/docs/plans/2026-03-03-protein-accession-design.md b/docs/plans/2026-03-03-protein-accession-design.md deleted file mode 100644 index cfe0b8d..0000000 --- a/docs/plans/2026-03-03-protein-accession-design.md +++ /dev/null @@ -1,96 +0,0 @@ -# Protein Accession and FASTA Header Design - -Issue: https://github.com/bigbio/pgatk/issues/18 -Branch: `feature/protein-accession-design` -Date: 2026-03-03 - -## Problem - -Current pgatk FASTA headers are inconsistent across variant sources (VCF, COSMIC, ClinVar) and incompatible with major search engines like SearchGUI, which cannot parse ENSEMBL-style IDs. - -## Design - -### Two protein categories, two prefix strategies - -| Category | Prefix | Accession | Description | -|----------|--------|-----------|-------------| -| Canonical (reference) | Keep original (`sp\|`, `tr\|`, `ensp\|`) | Original accession | Untouched from source database | -| Variant (mutated) | `pgvar\|` | `{TRANSCRIPT_ID}-{INDEX}` | pgatk-generated variant protein | - -### Variant header format - -``` ->pgvar|{TRANSCRIPT_ID}-{INDEX}|{GENE_SYMBOL} {key=value metadata} -``` - -**Fields:** - -- `pgvar` -- database tag identifying pgatk-generated variant proteins. 
-- `{TRANSCRIPT_ID}-{INDEX}` -- accession composed of parent transcript ID and a dash-separated 1-based index (per transcript, per run). Mirrors UniProt isoform convention (`P12345-2`). -- `{GENE_SYMBOL}` -- gene name, first token after the second pipe. -- Metadata key=value pairs in the description field: - -| Key | Description | Example | -|-----|-------------|---------| -| `VariantSource` | Origin database | `COSMIC`, `ClinVar`, `gnomAD`, `dbSNP` | -| `GenomicCoord` | `chr:pos:ref:alt` | `12:25245347:C:G` | -| `AAChange` | HGVS protein notation | `p.G13R` | -| `MutationType` | SO term or short label | `missense_variant` | -| `dbSNP` | rsID if available | `rs121913529` | -| `ORF` | Reading frame number (only when multi-ORF) | `1`, `2`, `3` | - -### Examples - -```fasta -# Canonical proteins -- untouched from source databases ->sp|P01112|RASH_HUMAN GTPase HRas OS=Homo sapiens ->ensp|ENSP00000309845|BRCA1 - -# Variant proteins -- unified pgvar| prefix regardless of source ->pgvar|ENST00000311189-1|HRAS VariantSource=COSMIC AAChange=p.G13R MutationType=missense_variant GenomicCoord=12:25245347:C:G ->pgvar|ENST00000311189-2|HRAS VariantSource=COSMIC AAChange=p.Q61L MutationType=missense_variant GenomicCoord=12:25245350:A:T ->pgvar|ENST00000357654-1|BRCA1 VariantSource=ClinVar AAChange=p.R1699Q MutationType=missense_variant GenomicCoord=17:43094464:G:A dbSNP=rs41293455 - -# Multiple ORFs -- each ORF gets its own index, ORF number in metadata ->pgvar|ENST00000311189-3|HRAS VariantSource=COSMIC AAChange=p.G13R ORF=1 ->pgvar|ENST00000311189-4|HRAS VariantSource=COSMIC AAChange=p.G13R ORF=2 ->pgvar|ENST00000311189-5|HRAS VariantSource=COSMIC AAChange=p.G13R ORF=3 -``` - -### Indexing logic - -The `-{index}` is per-transcript, per-file generation run: - -- First variant on `ENST00000311189` gets `-1` -- Second variant on same transcript gets `-2` -- Multi-ORF outputs each consume an index (3 ORFs = 3 indices) -- First variant on a different transcript resets to `-1` - -### 
Search engine compatibility - -| Engine | Compatible | Notes | -|--------|-----------|-------| -| SearchGUI / PeptideShaker | Yes | Matches UniProt-like `db\|acc\|name` pattern | -| MaxQuant | Yes | Default UniProt parse rule works | -| MSFragger / FragPipe | Yes | Reads full header, splits on first whitespace | -| Comet | Yes | Parses `>db\|acc\|` natively | -| DIA-NN | Yes | Follows UniProt-style parsing | -| Proteome Discoverer | Yes | Supports pipe-delimited headers | - -### Files to modify - -| File | Change | -|------|--------| -| `pgatk/ensembl/ensembl.py` | Refactor `vcf_to_proteindb()` header construction (lines 661-664) | -| `pgatk/clinvar/clinvar_service.py` | Refactor header construction (lines 554-560) | -| `pgatk/cgenomes/cgenomes_proteindb.py` | Refactor COSMIC header (line 317), cBioPortal header | -| `pgatk/toolbox/vcf_utils.py` | Update `write_output()` to handle new format cleanly | -| `pgatk/config/` | Add constants for `PGVAR_PREFIX`, metadata keys | - -### Design decisions - -1. **Dash separator** (`-`) between transcript and index, consistent with UniProt isoform convention. -2. **No ORF suffix in accession** -- ORF number is metadata (`ORF=N`), each ORF gets its own index. -3. **Canonical proteins are pass-through** -- pgatk does not reformat existing database headers. -4. **Unified format across all sources** -- COSMIC, ClinVar, VCF variants all use `pgvar|` regardless of origin. -5. **Key=value metadata** in description field for structured downstream parsing. diff --git a/docs/use-cases.md b/docs/use-cases.md index bd3cf20..0442c9c 100644 --- a/docs/use-cases.md +++ b/docs/use-cases.md @@ -6,7 +6,7 @@ a search-ready protein database. --- -## Cell-Type Specific Non-Canonical Peptide Discovery +## USE CASE 1: Cell-Type Specific Non-Canonical Peptide Discovery !!! 
abstract "Featured workflow" This workflow reproduces the analysis from [Umer et al., *Bioinformatics* 2022](https://doi.org/10.1093/bioinformatics/btab838), @@ -32,8 +32,8 @@ pgatk ensembl-downloader \ ```bash gffread -F -w ensembl_human/transcripts.fa \ - -g ensembl_human/genome.fa \ - ensembl_human/Homo_sapiens.GRCh38.*.gtf.gz + -g ensembl_human/Homo_sapiens.GRCh38.dna_sm.toplevel.fa \ + ensembl_human/Homo_sapiens.GRCh38.*.gtf ``` ### Step 3 -- Canonical protein-coding sequences @@ -57,8 +57,8 @@ detectable peptides. Translate them in three reading frames: pgatk dnaseq-to-proteindb \ --input_fasta ensembl_human/transcripts.fa \ --output_proteindb pseudogene.fa \ - --var_prefix pseudo_ \ - --include_biotypes processed_pseudogene,unprocessed_pseudogene,transcribed_processed_pseudogene,transcribed_unprocessed_pseudogene,translated_processed_pseudogene \ + --protein_prefix pseudo_ \ + --include_biotypes pseudogene,processed_pseudogene,unprocessed_pseudogene,transcribed_processed_pseudogene,transcribed_unprocessed_pseudogene,translated_processed_pseudogene,unitary_pseudogene,transcribed_unitary_pseudogene,rRNA_pseudogene,IG_V_pseudogene,TR_V_pseudogene,IG_C_pseudogene,TR_J_pseudogene,IG_J_pseudogene,IG_pseudogene \ --num_orfs 3 \ --skip_including_all_cds ``` @@ -72,39 +72,41 @@ micropeptides: pgatk dnaseq-to-proteindb \ --input_fasta ensembl_human/transcripts.fa \ --output_proteindb lncrna.fa \ - --var_prefix lncrna_ \ - --include_biotypes lincRNA,antisense,sense_intronic,sense_overlapping \ + --protein_prefix lncrna_ \ + --include_biotypes lncRNA \ --num_orfs 3 \ --skip_including_all_cds ``` -#### Alternative ORFs (exonic out-of-frame translation) +#### Putative proteins (three-frame) -Non-canonical reading frames of protein-coding mRNAs can produce cryptic -peptides: +Protein coding genes without CDs that are not validated but annoated: ```bash pgatk dnaseq-to-proteindb \ --input_fasta ensembl_human/transcripts.fa \ - --output_proteindb altorf.fa \ - --var_prefix 
altorf_ \ - --include_biotypes altORFs \ + --output_proteindb putative.fa \ + --protein_prefix putative_ \ + --include_biotypes protein_coding_CDS_not_defined,TEC,translated_processed_pseudogene \ + --num_orfs 3 \ --skip_including_all_cds ``` -### Step 5 -- Population variant proteins +#### Alternative ORFs (exonic out-of-frame translation) -Include common human variants from ENSEMBL: +Non-canonical reading frames of protein-coding mRNAs can produce cryptic +peptides: ```bash -pgatk vcf-to-proteindb \ - --vcf ensembl_human/homo_sapiens_incl_consequences.vcf.gz \ +pgatk dnaseq-to-proteindb \ --input_fasta ensembl_human/transcripts.fa \ - --gene_annotations_gtf ensembl_human/Homo_sapiens.GRCh38.*.gtf.gz \ - --output_proteindb ensembl_variants.fa + --output_proteindb altorf.fa \ + --protein_prefix altorf_ \ + --include_biotypes altORFs \ + --skip_including_all_cds ``` -### Step 6 -- COSMIC somatic mutations (optional, for cancer cell lines) +### Step 5 -- COSMIC somatic mutations (optional, for cancer cell lines) For cancer cell-line studies, add cell-line-specific somatic mutations: @@ -115,14 +117,15 @@ pgatk cosmic-downloader \ -o cosmic_data pgatk cosmic-to-proteindb \ - --input_mutation cosmic_data/CosmicCLP_MutantExport.tsv.gz \ - --input_genes cosmic_data/All_CellLines_Genes.fasta.gz \ + --input_mutation cosmic_data/CellLinesProject_GenomeScreensMutant_v103_GRCh38.tsv.gz \ + --input_genes cosmic_data/Cosmic_Genes_v103_GRCh38.fasta.gz \ --output_db cosmic_cellline.fa \ - --filter_column "Sample name" \ + --clinical_sample_file cosmic_data/Cosmic_Classification_v103_GRCh38.tsv.gz \ + --filter_column PRIMARY_SITE \ --split_by_filter_column ``` -### Step 7 -- Combine and generate target-decoy database +### Step 6 -- Combine and generate target-decoy database ```bash # Combine all components @@ -130,8 +133,8 @@ pgatk cosmic-to-proteindb \ cat canonical.fa \ pseudogene.fa \ lncrna.fa \ + putative.fa \ altorf.fa \ - ensembl_variants.fa \ > cell_type_target.fa # 
Generate decoy sequences @@ -142,14 +145,14 @@ pgatk generate-decoy \ --decoy_prefix DECOY_ ``` -### Step 8 -- Extract non-canonical unique peptides +### Step 7 -- Extract non-canonical unique peptides After database searching with your search engine, you can also pre-compute the set of non-canonical peptides unique to these novel sources: ```bash pgatk digest-mutant-protein \ - --input pseudogene.fa,lncrna.fa,altorf.fa,ensembl_variants.fa \ + --input pseudogene.fa,lncrna.fa,putative.fa,altorf.fa,ensembl_variants.fa \ --fasta canonical.fa \ --output non_canonical_peptides.fa \ --min-len 7 \ @@ -166,7 +169,7 @@ pgatk digest-mutant-protein \ --- -## 1. Human Variant Protein Database from ENSEMBL +## USE CASE 2: Human Variant Protein Database from ENSEMBL Build a variant protein database using ENSEMBL population variants (common SNPs and indels) for human proteogenomics searches. This is the most common starting @@ -196,8 +199,8 @@ extract transcript sequences from the GTF and genome FASTA: ```bash gffread -F -w ensembl_human/transcripts.fa \ - -g ensembl_human/genome.fa \ - ensembl_human/Homo_sapiens.GRCh38.*.gtf.gz + -g ensembl_human/Homo_sapiens.GRCh38.dna_sm.toplevel.fa \ + ensembl_human/Homo_sapiens.GRCh38.*.gtf ``` ### Step 3 -- Generate the variant protein database @@ -206,9 +209,9 @@ Translate all ENSEMBL variants that affect protein-coding transcripts: ```bash pgatk vcf-to-proteindb \ - --vcf ensembl_human/homo_sapiens_incl_consequences.vcf.gz \ + --vcf ensembl_human/homo_sapiens_incl_consequences.vcf \ --input_fasta ensembl_human/transcripts.fa \ - --gene_annotations_gtf ensembl_human/Homo_sapiens.GRCh38.*.gtf.gz \ + --gene_annotations_gtf ensembl_human/Homo_sapiens.GRCh38.*.gtf \ --output_proteindb ensembl_human/variant_proteins.fa ``` @@ -242,7 +245,7 @@ The file `target_decoy.fa` is ready for database searching. --- -## 2. 
Population-Specific Variant Database +## USE CASE 3: Population-Specific Variant Database Population-level genetic variants cause amino acid changes that are invisible to standard reference database searches. Studies have shown that incorporating @@ -255,7 +258,7 @@ Include only variants present in at least 1% of the population: ```bash pgatk vcf-to-proteindb \ - --vcf homo_sapiens_incl_consequences.vcf.gz \ + --vcf homo_sapiens_incl_consequences.vcf \ --input_fasta transcripts.fa \ --gene_annotations_gtf genes.gtf \ --af_field MAF \ @@ -270,7 +273,7 @@ detectable by mass spectrometry: ```bash pgatk vcf-to-proteindb \ - --vcf homo_sapiens_incl_consequences.vcf.gz \ + --vcf homo_sapiens_incl_consequences.vcf \ --input_fasta transcripts.fa \ --gene_annotations_gtf genes.gtf \ --af_field MAF \ @@ -289,7 +292,7 @@ a specific population: pgatk vcf-to-proteindb \ --vcf gnomad.exomes.v4.1.sites.vcf.bgz \ --input_fasta gencode_transcripts.fa \ - --gene_annotations_gtf gencode.v44.annotation.gtf.gz \ + --gene_annotations_gtf gencode.v44.annotation.gtf \ --annotation_field_name vep \ --af_field AF_afr \ --af_threshold 0.01 \ @@ -308,7 +311,7 @@ pgatk vcf-to-proteindb \ --- -## 3. ClinVar Clinical Variant Database +## USE CASE 4: ClinVar Clinical Variant Database [ClinVar](https://www.ncbi.nlm.nih.gov/clinvar/) catalogs the relationship between human variants and clinical phenotypes. 
Building a ClinVar-derived @@ -324,18 +327,18 @@ pgatk ncbi-downloader -o ncbi_data This downloads four files to `ncbi_data/`: -- `GRCh38_latest_genomic.gtf.gz` -- RefSeq gene annotations -- `GRCh38_latest_rna.fna.gz` -- RefSeq transcript nucleotide sequences +- `GRCh38_latest_genomic.gtf` -- RefSeq gene annotations +- `GRCh38_latest_rna.fna` -- RefSeq transcript nucleotide sequences - `GRCh38_latest_assembly_report.txt` -- Chromosome name mapping -- `clinvar.vcf.gz` -- ClinVar variant calls +- `clinvar.vcf` -- ClinVar variant calls ### Step 2 -- Generate the ClinVar protein database ```bash pgatk clinvar-to-proteindb \ - --vcf ncbi_data/clinvar.vcf.gz \ - --gtf ncbi_data/GRCh38_latest_genomic.gtf.gz \ - --fasta ncbi_data/GRCh38_latest_rna.fna.gz \ + --vcf ncbi_data/clinvar.vcf \ + --gtf ncbi_data/GRCh38_latest_genomic.gtf \ + --fasta ncbi_data/GRCh38_latest_rna.fna \ --assembly-report ncbi_data/GRCh38_latest_assembly_report.txt \ --output clinvar_proteins.fa ``` @@ -356,7 +359,7 @@ pgatk generate-decoy \ --- -## 4. Tumor-Specific Databases for Cancer Proteogenomics +## USE CASE 5: Condition-Specific Databases for Proteogenomics Cancer proteogenomics studies (such as those from CPTAC) build tumor-specific protein databases to detect somatic mutant peptides, understand @@ -364,7 +367,7 @@ therapy resistance, and prioritize neoantigen candidates. The databases combine somatic mutations from cancer-specific sources (COSMIC, cBioPortal) and/or patient-matched whole-exome sequencing. -### 4a. COSMIC somatic mutations by cancer type +### 4a. Cancer-speific database using COSMIC somatic mutations Generate one protein database per primary tissue site. 
This is the standard approach for large-scale cancer proteogenomics when patient-level WES is not @@ -379,38 +382,49 @@ pgatk cosmic-downloader \ # Generate per-tissue databases pgatk cosmic-to-proteindb \ - --input_mutation cosmic_data/CosmicMutantExport.tsv.gz \ - --input_genes cosmic_data/All_COSMIC_Genes.fasta.gz \ + --input_mutation cosmic_data/Cosmic_GenomeScreensMutant_v103_GRCh38.tsv.gz \ + --input_genes cosmic_data/Cosmic_Genes_v103_GRCh38.fasta.gz \ --output_db cosmic_proteins.fa \ + --clinical_sample_file cosmic_data/Cosmic_Classification_v103_GRCh38.tsv.gz \ + --filter_column PRIMARY_SITE \ --split_by_filter_column ``` This produces files like `cosmic_proteins_lung.fa`, `cosmic_proteins_breast.fa`, etc. Use the tissue-matched database for your cancer type of interest. -### 4b. Single cancer type +!!! note "COSMIC v103 tissue annotation" + In COSMIC v103, `PRIMARY_SITE` is not a column in the mutation file. It lives + in the separate Classification file (`Cosmic_Classification_v103_GRCh38.tsv.gz`) + and is joined via `COSMIC_PHENOTYPE_ID`. Pass the Classification file via + `--clinical_sample_file` whenever tissue-type filtering or splitting is needed. + +### 4b. Cancer-type speific database using COSMIC somatic mutations Build a focused database for one cancer type (e.g. lung): ```bash pgatk cosmic-to-proteindb \ - --input_mutation cosmic_data/CosmicMutantExport.tsv.gz \ - --input_genes cosmic_data/All_COSMIC_Genes.fasta.gz \ + --input_mutation cosmic_data/Cosmic_GenomeScreensMutant_v103_GRCh38.tsv.gz \ + --input_genes cosmic_data/Cosmic_Genes_v103_GRCh38.fasta.gz \ --output_db cosmic_lung_proteins.fa \ + --clinical_sample_file cosmic_data/Cosmic_Classification_v103_GRCh38.tsv.gz \ + --filter_column PRIMARY_SITE \ --accepted_values "lung" ``` -### 4c. Cell-line proteogenomics +### 4c. Cell-type specific databases When analyzing cell-line proteomes, use cell-line-specific mutations. 
COSMIC provides a dedicated cell-line export with mutations annotated per sample: ```bash pgatk cosmic-to-proteindb \ - --input_mutation cosmic_data/CosmicCLP_MutantExport.tsv.gz \ - --input_genes cosmic_data/All_CellLines_Genes.fasta.gz \ + --input_mutation cosmic_data/CellLinesProject_GenomeScreensMutant_v103_GRCh38.tsv.gz \ + --input_genes cosmic_data/Cosmic_Genes_v103_GRCh38.fasta.gz \ --output_db cosmic_cellline_proteins.fa \ - --filter_column "Sample name" \ + --clinical_sample_file cosmic_data/Cosmic_Classification_v103_GRCh38.tsv.gz \ + --filter_column PRIMARY_SITE \ --split_by_filter_column ``` @@ -444,15 +458,16 @@ pgatk cbioportal-to-proteindb \ ### 4e. Combined cancer database for immunopeptidomics For HLA immunopeptidomics / neoantigen discovery, a broad mutation database -maximizes the chance of detecting mutant HLA-presented peptides. Studies have -shown that COSMIC-derived databases can identify 5x more mutant immunopeptides -than patient WES alone. Combine COSMIC with ClinVar: +maximizes the chance of detecting mutant HLA-presented peptides. COSMIC-derived +databases have been shown to identify 5x more mutant HLA-I immunopeptides than +patient WES alone ([Wang et al., *J Transl Med* 2024](https://doi.org/10.1186/s12967-023-04821-0)). +Combine COSMIC with ClinVar: ```bash # Generate COSMIC mutations pgatk cosmic-to-proteindb \ - --input_mutation cosmic_data/CosmicMutantExport.tsv.gz \ - --input_genes cosmic_data/All_COSMIC_Genes.fasta.gz \ + --input_mutation cosmic_data/Cosmic_GenomeScreensMutant_v103_GRCh38.tsv.gz \ + --input_genes cosmic_data/Cosmic_Genes_v103_GRCh38.fasta.gz \ --output_db cosmic_proteins.fa # Generate ClinVar mutations @@ -475,7 +490,7 @@ pgatk generate-decoy \ --- -## 5. 
Patient-Specific Database from WGS/WES +## USE CASE 6: Patient-Specific Database from WGS/WES When matched whole-genome or whole-exome sequencing data is available for a sample, build a personalized protein database from the patient's own @@ -530,24 +545,24 @@ pgatk vcf-to-proteindb \ --- -## 6. Novel ORF and Micropeptide Discovery +## USE CASE 7: Novel ORF and Micropeptide Discovery Proteogenomics is a key approach for discovering novel coding regions: small open reading frames (smORFs), micropeptides from lncRNAs, pseudogene-encoded proteins, and alternative reading frames. Studies have found that non-canonical peptides can account for over 5% of total identifications. -### lincRNA-derived proteins +### lncRNA-derived proteins -Long intergenic non-coding RNAs (lincRNAs) can encode small proteins. Translate +Long intergenic non-coding RNAs (lncRNAs) can encode small proteins. Translate them in three reading frames: ```bash pgatk dnaseq-to-proteindb \ --input_fasta transcripts.fa \ - --output_proteindb lincRNA_proteins.fa \ - --var_prefix lincRNA_ \ - --include_biotypes lincRNA \ + --output_proteindb lncRNA_proteins.fa \ + --protein_prefix lncRNA_ \ + --include_biotypes lncRNA \ --num_orfs 3 \ --skip_including_all_cds ``` @@ -560,7 +575,7 @@ Some pseudogenes are transcribed and may produce functional peptides: pgatk dnaseq-to-proteindb \ --input_fasta transcripts.fa \ --output_proteindb pseudogene_proteins.fa \ - --var_prefix pseudogene_ \ + --protein_prefix pseudogene_ \ --include_biotypes processed_pseudogene,transcribed_processed_pseudogene,translated_processed_pseudogene \ --num_orfs 3 \ --skip_including_all_cds @@ -575,7 +590,7 @@ discover upstream ORFs (uORFs), overlapping ORFs, and downstream ORFs: pgatk dnaseq-to-proteindb \ --input_fasta transcripts.fa \ --output_proteindb altorf_proteins.fa \ - --var_prefix altorf_ \ + --protein_prefix altorf_ \ --include_biotypes altORFs \ --skip_including_all_cds ``` @@ -586,7 +601,7 @@ pgatk dnaseq-to-proteindb \ pgatk 
dnaseq-to-proteindb \ --input_fasta transcripts.fa \ --output_proteindb antisense_proteins.fa \ - --var_prefix antisense_ \ + --protein_prefix antisense_ \ --include_biotypes antisense,antisense_RNA \ --num_orfs 3 \ --skip_including_all_cds @@ -598,7 +613,7 @@ Combine all non-canonical sources with the canonical proteome: ```bash cat canonical_proteins.fa \ - lincRNA_proteins.fa \ + lncRNA_proteins.fa \ pseudogene_proteins.fa \ altorf_proteins.fa \ antisense_proteins.fa \ @@ -616,7 +631,7 @@ pgatk generate-decoy \ ```bash pgatk digest-mutant-protein \ - --input lincRNA_proteins.fa,pseudogene_proteins.fa,altorf_proteins.fa \ + --input lncRNA_proteins.fa,pseudogene_proteins.fa,altorf_proteins.fa \ --fasta canonical_proteins.fa \ --output novel_unique_peptides.fa ``` @@ -971,16 +986,16 @@ pgatk clinvar-to-proteindb \ # COSMIC somatic mutations pgatk cosmic-to-proteindb \ - --input_mutation cosmic_data/CosmicMutantExport.tsv.gz \ - --input_genes cosmic_data/All_COSMIC_Genes.fasta.gz \ + --input_mutation cosmic_data/Cosmic_GenomeScreensMutant_v103_GRCh38.tsv.gz \ + --input_genes cosmic_data/Cosmic_Genes_v103_GRCh38.fasta.gz \ --output_db cosmic_variants.fa -# Non-coding RNA (lincRNA) +# Non-coding RNA (lncRNA) pgatk dnaseq-to-proteindb \ --input_fasta ensembl_data/transcripts.fa \ - --output_proteindb lincRNA.fa \ - --var_prefix lincRNA_ \ - --include_biotypes lincRNA \ + --output_proteindb lncRNA.fa \ + --protein_prefix lncRNA_ \ + --include_biotypes lncRNA \ --num_orfs 3 \ --skip_including_all_cds @@ -988,7 +1003,7 @@ pgatk dnaseq-to-proteindb \ pgatk dnaseq-to-proteindb \ --input_fasta ensembl_data/transcripts.fa \ --output_proteindb pseudogene.fa \ - --var_prefix pseudogene_ \ + --protein_prefix pseudogene_ \ --include_biotypes processed_pseudogene,transcribed_processed_pseudogene \ --num_orfs 3 \ --skip_including_all_cds @@ -1001,7 +1016,8 @@ cat canonical.fa \ ensembl_variants.fa \ clinvar_variants.fa \ cosmic_variants.fa \ - lincRNA.fa \ + lncRNA.fa \ + 
putative.fa \ pseudogene.fa \ > combined_target.fa ``` @@ -1027,7 +1043,7 @@ pgatk generate-decoy \ ```bash pgatk digest-mutant-protein \ - --input ensembl_variants.fa,clinvar_variants.fa,cosmic_variants.fa,lincRNA.fa,pseudogene.fa \ + --input ensembl_variants.fa,clinvar_variants.fa,cosmic_variants.fa,lncRNA.fa,pseudogene.fa \ --fasta canonical.fa \ --output unique_variant_peptides.fa \ --min-len 7 \ diff --git a/docs/validations.md b/docs/validations.md new file mode 100644 index 0000000..6dd3acc --- /dev/null +++ b/docs/validations.md @@ -0,0 +1,257 @@ +# Test Validations + +This page documents the validation strategy for PGATK, describing what the automated tests verify and how to independently confirm their correctness. +# 1. Variant Translation Tests - COSMIC +## 1.1 Validation per variant type (`test_variant_types_hgvs.py`) + +These tests verify that `CancerGenomesService.get_mut_pro_seq()` correctly processes all 18 Sequence Ontology (SO) variant types present in COSMIC v103 mutation files, following [HGVS nomenclature rules](https://hgvs-nomenclature.org/stable/). + +Each test uses a real mutation from `Cosmic_CompleteTargetedScreensMutant_v103_GRCh38.tsv` paired with the matching CDS from `Cosmic_Genes_v103_GRCh38.fasta`. + +--- + +### Actionable Variant Types + +These types are expected to produce a non-empty mutant protein sequence. Tests assert biologically specific properties, not just that the result is non-empty. 
+ +| Variant Type | Gene | Mutation | HGVS Rule Verified | +|---|---|---|---| +| `missense_variant` | EZH2 | c.656C>T p.S219F | Protein length unchanged; position 218 (0-indexed) is `F`; all other residues identical to WT | +| `stop_gained` | FCGR3B | c.394A>T p.K132* | Position 131 (0-indexed) is `*` (stop codon) in the translated output | +| `inframe_deletion` | CTNNB1 | c.104_124del p.T35_G41del | Protein shortened by exactly 7 AA (21 nt deleted ÷ 3); residues before position 35 unchanged | +| `inframe_insertion` (dup) | SMAP1 | c.1226_1228dup p.I409dup | Protein lengthened by exactly 1 AA (3 nt duplicated ÷ 3); residues before the dup site unchanged | +| `protein_altering_variant` | NF2 | c.252_262delinsCT p.L85_K88delins* | Result is non-empty and differs from the wild-type sequence | +| `frameshift_variant` | MEN1 | c.292del p.R98Efs*21 | Non-empty translated sequence produced from the frame-shifted CDS | +| `start_lost` | INPP4B | c.3G>T p.M1? | First AA is not Met; it is Ile (`ATG→ATT` via the DNA substitution path) | +| `stop_lost` | MYD88 | c.611_613delinsGCGGCCCCC p.D204_*205delinsGGPR | Mutant protein is longer than WT (stop codon removed; read-through occurs) | +| `stop_retained_variant` | EPHA3 | c.2756G>A p.*919= | Length unchanged; last residue is still `*` (`TGA→TAA`, both stop codons) | + +--- + +### Filtered / Skipped Variant Types + +These types must produce an empty string `""`. There are two mechanisms: + +- **Pre-filter** (`cosmic_to_proteindb`): `synonymous_variant` rows are discarded before `get_mut_pro_seq` is called. +- **`p.?` guard** (`get_mut_pro_seq`): When `aa_mut == "p.?"`, the function immediately returns `""`. All intronic, UTR, and splice variants in COSMIC v103 carry `p.?`. + +| Variant Type | Mechanism | Source Mutation | +|---|---|---| +| `synonymous_variant` | Pre-filter in `cosmic_to_proteindb` | Filter logic validated directly on the SO term strings | +| `intron_variant` | `p.?` guard | RAD51C c.145+706C>T p.? 
| +| `3_prime_UTR_variant` | `p.?` guard | SUFU c.*2816G>A p.? | +| `5_prime_UTR_variant` | `p.?` guard | PSMD13 c.-20C>T p.? | +| `coding_sequence_variant` | `p.?` guard (also has intronic offset `+`) | OAS2 c.2157+2A>G p.? | +| `splice_acceptor_variant` | `p.?` guard | TSHR c.171-1G>A p.? | +| `splice_donor_variant` | `p.?` guard | PTCH1 c.3306+2T>G p.? | +| `splice_region_variant` (composite) | Fires missense branch when combined with `missense_variant` | EZH2 c.656C>T p.S219F with type `missense_variant,splice_region_variant` | +| `incomplete_terminal_codon_variant` | `p.?` guard (absent from v103 targeted-screen data; tested by convention) | c.2213_2214insA p.? | + +!!! note + `splice_region_variant` only appears in COSMIC v103 as part of a composite type string (e.g. `missense_variant,splice_region_variant`). When combined with an actionable type, the actionable type's branch fires first and the mutation is processed normally. + +--- + +## 1.2 Independent Validation of the Tests + +### 1. Re-derive CDS sequences from the FASTA + +The CDS strings embedded in the test file came from `Cosmic_Genes_v103_GRCh38.fasta`. To verify a gene (e.g. EZH2): + +```bash +python3 - <<'EOF' +from Bio import SeqIO +fasta = "use-cases/cosmic_data/Cosmic_Genes_v103_GRCh38.fasta" +for rec in SeqIO.parse(fasta, "fasta"): + if "EZH2" in rec.id and "ENST00000483967" in rec.id: + print(f"len={len(rec.seq)}") + print(f"prot[218]={rec.seq.translate(to_stop=False)[218]}") + print(rec.seq[:50]) +EOF +``` + +Expected: `len=2211`, `prot[218]=S` (which becomes `F` after the c.656C>T substitution). + +### 2. Confirm mutations exist in the COSMIC TSV + +Every test documents its source mutation. To verify a mutation is real and not fabricated: + +```bash +grep "EZH2" use-cases/cosmic_data/Cosmic_CompleteTargetedScreensMutant_v103_GRCh38.tsv | \ + grep "c.656C>T" | \ + cut -f1-10 +``` + +### 3. 
Manually compute expected protein output + +For the missense test, manually apply the substitution and translate: + +```python +from Bio.Seq import Seq + +cds = '''ATGGGCCAGACTGGGAAGAAATCTGAGAAGGGACCAGTTTGTTGGCGGAAGCGTGTAAAATCAGAGTACATGCGACTGAGACAGCTCAAGAGGTTCAGACGAGCTGATGAAGTAAAGAGTATGTTTAGTTCCAATCGTCAGAAAATT +TTGGAAAGAACGGAAATCTTAAACCAAGAATGGAAACAGCGAAGGATACAGCCTGTGCACATCCTGACTTCTTGTTCGGTGACCAGTGACTTGGATTTTCCAACACAAGTCATCCCATTAAAGACTCTGAATGCAGTTGCTTCAGTACCCATAATG +TATTCTTGGTCTCCCCTACAGCAGAATTTTATGGTGGAAGATGAAACTGTTTTACATAACATTCCTTATATGGGAGATGAAGTTTTAGATCAGGATGGTACTTTCATTGAAGAACTAATAAAAAATTATGATGGGAAAGTACACGGGGATAGAGAA +TGTGGGTTTATAAATGATGAAATTTTTGTGGAGTTGGTGAATGCCCTTGGTCAATATAATGATGATGACGATGATGATGATGGAGACGATCCTGAAGAAAGAGAAGAAAAGCAGAAAGATCTGGAGGATCACCGAGATGATAAAGAAAGCCGCCCA +CCTCGGAAATTTCCTTCTGATAAAATTTTTGAAGCCATTTCCTCAATGTTTCCAGATAAGGGCACAGCAGAAGAACTAAAGGAAAAATATAAAGAACTCACCGAACAGCAGCTCCCAGGCGCACTTCCTCCTGAATGTACCCCCAACATAGATGGA +CCAAATGCTAAATCTGTTCAGAGAGAGCAAAGCTTACACTCCTTTCATACGCTTTTCTGTAGGCGATGTTTTAAATATGACTGCTTCCTACATCCTTTTCATGCAACACCCAACACTTATAAGCGGAAGAACACAGAAACAGCTCTAGACAACAAA +CCTTGTGGACCACAGTGTTACCAGCATTTGGAGGGAGCAAAGGAGTTTGCTGCTGCTCTCACCGCTGAGCGGATAAAGACCCCACCAAAACGTCCAGGAGGCCGCAGAAGAGGACGGCTTCCCAATAACAGTAGCAGGCCCAGCACCCCCACCATT +AATGTGCTGGAATCAAAGGATACAGACAGTGATAGGGAAGCAGGGACTGAAACGGGGGGAGAGAACAATGATAAAGAAGAAGAAGAGAAGAAAGATGAAACTTCGAGCTCCTCTGAAGCAAATTCTCGGTGTCAAACACCAATAAAGATGAAGCCA +AATATTGAACCTCCTGAGAATGTGGAGTGGAGTGGTGCTGAAGCCTCAATGTTTAGAGTCCTCATTGGCACTTACTATGACAATTTCTGTGCCATTGCTAGGTTAATTGGGACCAAAACATGTAGACAGGTGTATGAGTTTAGAGTCAAAGAATCT +AGCATCATAGCTCCAGCTCCCGCTGAGGATGTGGATACTCCTCCAAGGAAAAAGAAGAGGAAACACCGGTTGTGGGCTGCACACTGCAGAAAGATACAGCTGAAAAAGGACGGCTCCTCTAACCATGTTTACAACTATCAACCCTGTGATCATCCA +CGGCAGCCTTGTGACAGTTCGTGCCCTTGTGTGATAGCACAAAATTTTTGTGAAAAGTTTTGTCAATGTAGTTCAGAGTGTCAAAACCGCTTTCCGGGATGCCGCTGCAAAGCACAGTGCAACACCAAGCAGTGCCCGTGCTACCTGGCTGTCCGA 
+GAGTGTGACCCTGACCTCTGTCTTACTTGTGGAGCCGCTGACCATTGGGACAGTAAAAATGTGTCCTGCAAGAACTGCAGTATTCAGCGGGGCTCCAAAAAGCATCTATTGCTGGCACCATCTGACGTGGCAGGCTGGGGGATTTTTATCAAAGAT +CCTGTGCAGAAAAATGAATTCATCTCAGAATACTGTGGAGAGATTATTTCTCAAGATGAAGCTGACAGAAGAGGGAAAGTGTATGATAAATACATGTGCAGCTTTCTGTTCAACTTGAACAATGATTTTGTGGTGGATGCAACCCGCAAGGGTAAC +AAAATTCGTTTTGCAAATCATTCGGTAAATCCAAACTGCTATGCAAAAGTTATGATGGTTAACGGTGATCACAGGATAGGTATTTTTGCCAAGAGAGCCATCCAGACTGGCGAAGAGCTGTTTTTTGATTACAGATACAGCCAGGCTGATGCCCTG +AAGTATGTCGGCATCGAAAGAGAAATGGAAATCCCTTGA'''.replace('\n','') # EZH2 CDS string + +seq = Seq(cds) +wt_prot = seq.translate(to_stop=False) +print(wt_prot[218]) # 'S' — wild-type Serine at position 219 + +# Apply c.656C>T: position 656 is 0-indexed 655 +mut_seq = seq[:655] + "T" + seq[656:] +mut_prot = mut_seq.translate(to_stop=False) +print(mut_prot[218]) # 'F' — mutant Phenylalanine +``` + +--- + +## 1.3 Core Unit Tests (`test_cosmic_core.py`) + +14 Unit tests are implemented to verify the internal logic of `get_mut_pro_seq()` using minimal synthetic sequences (`Seq("ATGAAATTT")` etc.). + +| Class | Coverage | +|---|---| +| `TestGetMutProSeqDnaBranch` | DNA substitution, insertion, single-base deletion, two-position deletion, ambiguous `c.?` routing to protein branch | +| `TestGetMutProSeqProteinBranch` | Missense, nonsense truncation, in-frame insertion, two-position in-frame deletion, single-position in-frame deletion | +| `TestGetMutProSeqEdgeCases` | Mutation beyond sequence length, ambiguous `p.?` returns `""`, non-alpha last char in aa_mut returns `""`, DNA ref-allele mismatch returns `""` | + +These tests are fast to run and isolate individual code paths without requiring reference files. + +--- + +# 2. 
COSMIC Integration Pipeline Tests + +## 2.1 COSMIC v103 Schema and Data Model + +COSMIC v103 introduced two breaking changes from earlier releases: + +**Column name changes** — all columns are now `UPPERCASE_UNDERSCORE`: + +| Old (v2/v98) | New (v103) | +|---|---| +| `Gene name` | `GENE_SYMBOL` | +| `Accession Number` | `TRANSCRIPT_ACCESSION` | +| `CDS mutation` | `MUTATION_CDS` | +| `AA Mutation` | `MUTATION_AA` | +| `Type` | `MUTATION_DESCRIPTION` | +| `Primary site` | *(moved to Classification file)* | + +**Tissue-type data architecture** — `PRIMARY_SITE` (tissue of origin) is no longer a column in the mutation export. It lives in a separate **Classification file** (`Cosmic_Classification_Tsv_v103_GRCh38.tar`) and is joined via `COSMIC_PHENOTYPE_ID`: + +``` +mutation_file.COSMIC_PHENOTYPE_ID (e.g. COSO1001) + → Cosmic_Classification_v103_GRCh38.tsv.COSMIC_PHENOTYPE_ID + → PRIMARY_SITE (e.g. upper_aerodigestive_tract) +``` + +`COSMIC_PHENOTYPE_ID` is present at column 6 in both `Cosmic_CompleteTargetedScreensMutant` and `Cosmic_GenomeScreensMutant` files. No hop through the Sample file is needed. + +> **Note**: The Sample file (`Cosmic_Sample_Tsv_v103_GRCh38.tsv`) contains 34 columns covering sequencing metadata (`SAMPLE_NAME`, `COSMIC_SAMPLE_ID`, `TUMOUR_ID`, etc.) but does **not** contain `PRIMARY_SITE`. The Classification file is the authoritative source for tissue and histology. 
+ +--- + +## 2.2 Integration Test (`test_cosmic_to_proteindb`) + +**Test file**: `pgatk/tests/pgatk_tests.py::PgatkRunnerTests::test_cosmic_to_proteindb` + +The test runs the full `cosmic-to-proteindb` pipeline end-to-end using small testdata files that mirror the real v103 schema: + +| File | Purpose | +|---|---| +| `testdata/test_cosmic_mutations.tsv` | 12 HRAS mutation rows in v103 27-column format; covers `missense_variant`, `stop_gained`, `inframe_deletion`, `synonymous_variant` (filtered), `frameshift_variant` (`p.?`, skipped) | +| `testdata/test_cosmic_genes.fa` | HRAS CDS FASTA from `Cosmic_Genes_v103_GRCh38.fasta` | +| `testdata/test_cosmic_classification.tsv` | 8 rows mapping `COSMIC_PHENOTYPE_ID` (COSO1001–COSO1008) to `PRIMARY_SITE`; covers `upper_aerodigestive_tract`, `skin`, `bone`, `thyroid`, `liver`, `haematopoietic_and_lymphoid_tissue` | + +CLI invocation used in the test: + +``` +cosmic-to-proteindb + --config_file config/cosmic_config.yaml + --input_mutation testdata/test_cosmic_mutations.tsv + --input_genes testdata/test_cosmic_genes.fa + --output_db testdata/test_cosmic_mutations_proteindb.fa + --clinical_sample_file testdata/test_cosmic_classification.tsv + --filter_column PRIMARY_SITE + --split_by_filter_column + --accepted_values all +``` + +Expected output files (split per `PRIMARY_SITE`): + +``` +testdata/test_cosmic_mutations_proteindb_bone.fa +testdata/test_cosmic_mutations_proteindb_liver.fa +testdata/test_cosmic_mutations_proteindb_skin.fa +testdata/test_cosmic_mutations_proteindb_thyroid.fa +testdata/test_cosmic_mutations_proteindb_upperaerodigestivetract.fa +``` + +The `synonymous_variant` row (COSO1006) is pre-filtered and produces no output. The `frameshift_variant` row carrying `p.?` (COSO1008) is skipped by the `p.?` guard. The consolidated `testdata/test_cosmic_mutations_proteindb.fa` contains all non-filtered mutations regardless of tissue type. 
+ +--- + +## 2.3 `--clinical_sample_file` CLI Option + +The `--clinical_sample_file` / `-cl` option accepts any TSV file that maps a sample/phenotype ID column to the `--filter_column` value. For COSMIC, this is the Classification file: + +| Argument | COSMIC v103 value | +|---|---| +| `--clinical_sample_file` | `Cosmic_Classification_v103_GRCh38.tsv` (or `.gz`) | +| `--filter_column` | `PRIMARY_SITE` | +| Join key used internally | `COSMIC_PHENOTYPE_ID` (auto-detected for COSMIC) | + +For **cBioportal**, the equivalent is the clinical sample file with `SAMPLE_ID` as the join key — the `--clinical_sample_file` option uses `SAMPLE_ID` by default and switches to `COSMIC_PHENOTYPE_ID` only when invoked via `cosmic-to-proteindb`. + +--- + +## 2.4 Independent Validation with Real COSMIC Files + +After downloading COSMIC v103 files into `use-cases/use-case2/cosmic_data/`, run the pipeline against a subset of real mutations: + +```bash +# Extract 500 actionable mutations (skip intronic/UTR/synonymous) +zcat use-cases/use-case2/cosmic_data/Cosmic_CompleteTargetedScreensMutant_v103_GRCh38.tsv.gz \ + | head -1 > /tmp/cosmic_test.tsv +zcat use-cases/use-case2/cosmic_data/Cosmic_CompleteTargetedScreensMutant_v103_GRCh38.tsv.gz \ + | grep -E "missense_variant|stop_gained|inframe_deletion|inframe_insertion|frameshift" \ + | head -500 >> /tmp/cosmic_test.tsv + +pgatk cosmic-to-proteindb \ + --config_file pgatk/config/cosmic_config.yaml \ + --input_mutation /tmp/cosmic_test.tsv \ + --input_genes <(zcat use-cases/use-case2/cosmic_data/Cosmic_Genes_v103_GRCh38.fasta.gz) \ + --output_db /tmp/cosmic_out.fa \ + --clinical_sample_file <(zcat use-cases/use-case2/cosmic_data/Cosmic_Classification_v103_GRCh38.tsv.gz) \ + --filter_column PRIMARY_SITE \ + --split_by_filter_column \ + --accepted_values all +``` + +Expected: ~27 per-tissue output files, including canonical COSMIC tissue types (`lung`, `breast`, `large_intestine`, `skin`, `haematopoietic_and_lymphoid_tissue`, etc.). 
FASTA headers follow the format: + +``` +>COSMIC:GENE_SYMBOL:p.MutAA:SO_term +``` + +To verify the join is working (i.e. phenotype IDs are resolving to tissue names): + +```bash +# Count sequences per tissue file +for f in /tmp/cosmic_out_*.fa; do + echo "$f: $(grep -c '^>' $f) sequences" +done +``` + +A file named `cosmic_out_NS.fa` is expected for phenotypes where tissue is not specified in the Classification file — these are retained rather than silently dropped. diff --git a/pgatk/cgenomes/cgenomes_proteindb.py b/pgatk/cgenomes/cgenomes_proteindb.py index f42d7b8..1484728 100644 --- a/pgatk/cgenomes/cgenomes_proteindb.py +++ b/pgatk/cgenomes/cgenomes_proteindb.py @@ -140,19 +140,28 @@ def get_mut_pro_seq(snp: SNP, seq: Seq) -> Optional[str]: ref_dna = re.sub("[^A-Z]+", "", tmplist[0]) mut_dna = re.sub("[^A-Z]+", "", tmplist[1]) index = int(positions[0]) - 1 - if ref_dna == str(seq[index]).upper() and mut_dna in nucleotide: # + if ref_dna == str(seq[index]).upper() and mut_dna in nucleotide: seq_mut = seq[:index] + mut_dna + seq[index + 1:] mut_pro_seq = str(seq_mut.translate(to_stop=False)) elif "delins" in snp.dna_mut: - # Deletion-insertion: delete range then insert new bases - insert_dna = snp.dna_mut.split("delins")[1] - if insert_dna.isalpha() and len(positions) >= 2: - del_index1 = int(positions[0]) - 1 - del_index2 = int(positions[1]) + # HGVS delins: one or more nucleotides replaced by one or more other nucleotides. + coord_part, insert_raw = snp.dna_mut.split("delins", 1) + insert_dna = insert_raw.upper() + if re.search(r'\d[+-]|\*|-\d', coord_part): + # Intronic (c.N+X, c.N-X), 5'UTR (c.-N) or 3'UTR (c.*N) offsets + # cannot be mapped onto a CDS-only FASTA sequence; skip. + pass + elif not insert_dna.isalpha(): + # Unknown ('?'), N[n], or conversion-notation insertions cannot be resolved. + pass + elif len(positions) >= 2: + # Range delins: delete positions[0]..positions[1] (1-indexed, inclusive). 
+ del_index1 = int(positions[0]) - 1 # 0-indexed start + del_index2 = int(positions[1]) # 0-indexed exclusive end seq_mut = seq[:del_index1] + insert_dna + seq[del_index2:] mut_pro_seq = str(seq_mut.translate(to_stop=False)) - elif insert_dna.isalpha() and len(positions) == 1: - # Single-position delins: replace one base + elif len(positions) == 1: + # Single-position delins: replace the one nucleotide at positions[0]. del_index1 = int(positions[0]) - 1 seq_mut = seq[:del_index1] + insert_dna + seq[del_index1 + 1:] mut_pro_seq = str(seq_mut.translate(to_stop=False)) @@ -165,6 +174,20 @@ def get_mut_pro_seq(snp: SNP, seq: Seq) -> Optional[str]: seq_mut = seq[:ins_index1] + insert_dna + seq[ins_index1:] mut_pro_seq = str(seq_mut.translate(to_stop=False)) + elif "dup" in snp.dna_mut: + # Tandem duplication: re-insert the duplicated range after its end position. + if len(positions) == 2: + dup_start = int(positions[0]) - 1 + dup_end = int(positions[1]) + dup_seq = str(seq[dup_start:dup_end]) + seq_mut = seq[:dup_end] + dup_seq + seq[dup_end:] + mut_pro_seq = str(seq_mut.translate(to_stop=False)) + elif len(positions) == 1: + dup_pos = int(positions[0]) + dup_seq = str(seq[dup_pos - 1:dup_pos]) + seq_mut = seq[:dup_pos] + dup_seq + seq[dup_pos:] + mut_pro_seq = str(seq_mut.translate(to_stop=False)) + elif "del" in snp.dna_mut: if len(positions) == 2: del_index1 = int(positions[0]) - 1 @@ -180,7 +203,7 @@ def get_mut_pro_seq(snp: SNP, seq: Seq) -> Optional[str]: positions = re.findall(r'\d+', snp.aa_mut) protein_seq = str(seq.translate(to_stop=False)) - if "Missense" in snp.mutation_type: + if "Missense" in snp.mutation_type or "missense_variant" in snp.mutation_type: # Extract the mutant residue from HGVS like p.V600E (1-letter) # or p.Val600Glu (3-letter). For 1-letter the last char is the # AA; for 3-letter we need to convert the last triplet. 
@@ -190,20 +213,27 @@ def get_mut_pro_seq(snp: SNP, seq: Seq) -> Optional[str]: return '' index = int(positions[0]) - 1 mut_pro_seq = protein_seq[:index] + mut_aa + protein_seq[index + 1:] - elif "Nonsense" in snp.mutation_type: + elif "Nonsense" in snp.mutation_type or "stop_gained" in snp.mutation_type: index = int(positions[0]) - 1 mut_pro_seq = protein_seq[:index] - elif "Insertion - In frame" in snp.mutation_type: - try: - index = snp.aa_mut.index("ins") - except ValueError: - return '' - insert_aa_raw = snp.aa_mut[index + 3:] - insert_aa = _three_to_one(insert_aa_raw) - if insert_aa.isalpha(): - ins_index1 = int(positions[0]) - mut_pro_seq = protein_seq[:ins_index1] + insert_aa + protein_seq[ins_index1:] - elif "Deletion - In frame" in snp.mutation_type: + elif "Insertion - In frame" in snp.mutation_type or "inframe_insertion" in snp.mutation_type: + if "dup" in snp.aa_mut: + # Protein-level tandem dup: re-insert the duplicated residues after the range. + dup_start = int(positions[0]) - 1 + dup_end = int(positions[-1]) + dup_aa = protein_seq[dup_start:dup_end] + mut_pro_seq = protein_seq[:dup_end] + dup_aa + protein_seq[dup_end:] + else: + try: + index = snp.aa_mut.index("ins") + except ValueError: + return '' + insert_aa_raw = snp.aa_mut[index + 3:] + insert_aa = _three_to_one(insert_aa_raw) + if insert_aa.isalpha(): + ins_index1 = int(positions[0]) + mut_pro_seq = protein_seq[:ins_index1] + insert_aa + protein_seq[ins_index1:] + elif "Deletion - In frame" in snp.mutation_type or "inframe_deletion" in snp.mutation_type: if len(positions) == 2: del_index1 = int(positions[0]) - 1 del_index2 = int(positions[1]) @@ -211,7 +241,8 @@ def get_mut_pro_seq(snp: SNP, seq: Seq) -> Optional[str]: elif len(positions) == 1: del_index1 = int(positions[0]) - 1 mut_pro_seq = protein_seq[:del_index1] + protein_seq[del_index1 + 1:] - elif "Complex" in snp.mutation_type and "frameshift" not in snp.mutation_type: + elif ("Complex" in snp.mutation_type or 
"protein_altering_variant" in snp.mutation_type) \ + and "frameshift" not in snp.mutation_type: try: index = snp.aa_mut.index(">") except ValueError: @@ -256,65 +287,107 @@ def cosmic_to_proteindb(self) -> None: except KeyError: COSMIC_CDS_DB[record.id] = [record] + # Build phenotype-to-group mapping from the classification file if provided. + # COSMIC_PHENOTYPE_ID is present directly in the mutation file (column 6) and + # is the join key to the Cosmic_Classification file where PRIMARY_SITE lives. + sample_groups_dict = {} + if self._local_clinical_sample_file and self._filter_column: + sample_groups_dict = self.get_value_per_sample( + self._local_clinical_sample_file, + self._filter_column, + sample_id_column='COSMIC_PHENOTYPE_ID', + ) + if not sample_groups_dict: + self.get_logger().warning( + "clinical_sample_file '%s' produced no phenotype mappings; " + "falling back to direct filter-column lookup in the mutation file.", + self._local_clinical_sample_file, + ) + regex = re.compile('[^a-zA-Z]') mutation_dic = {} groups_mutations_dict = {} self.get_logger().debug("Reading input CosmicMutantExport.tsv ...") line_counter = 1 - required_columns = ["Gene name", "Accession Number", "Mutation CDS", "Mutation AA", "Mutation Description"] + required_columns = ["GENE_SYMBOL", "TRANSCRIPT_ACCESSION", "MUTATION_CDS", "MUTATION_AA", "MUTATION_DESCRIPTION"] with _open_text(self._local_mutation_file, encoding="latin-1") as cosmic_input, \ open(self._local_output_file, 'w', encoding='utf-8') as output: - header = cosmic_input.readline().strip().split("\t") + col_header = cosmic_input.readline().strip().split("\t") try: - gene_col = header.index("Gene name") - enst_col = header.index("Accession Number") - cds_col = header.index("Mutation CDS") - aa_col = header.index("Mutation AA") - muttype_col = header.index("Mutation Description") + gene_col = col_header.index("GENE_SYMBOL") + enst_col = col_header.index("TRANSCRIPT_ACCESSION") + cds_col = col_header.index("MUTATION_CDS") + 
aa_col = col_header.index("MUTATION_AA") + muttype_col = col_header.index("MUTATION_DESCRIPTION") except ValueError as e: self.get_logger().error( "COSMIC file missing required columns. Expected %s, got: %s", - required_columns, header + required_columns, col_header ) raise ValueError( f"COSMIC mutation file missing required column: {e}. " f"Expected columns include: {required_columns}" ) from e + + # COSMIC_PHENOTYPE_ID is the direct join key to the classification file. + try: + phenotype_id_col = col_header.index("COSMIC_PHENOTYPE_ID") + except ValueError: + phenotype_id_col = None + if sample_groups_dict: + self.get_logger().warning( + "COSMIC_PHENOTYPE_ID not found in mutation file header; classification file join will be skipped." + ) + + # Fall back to a direct filter column in the mutation file when no classification file is provided. filter_col = None - if self._filter_column: + if not sample_groups_dict and self._filter_column: try: - filter_col = header.index(self._filter_column) + filter_col = col_header.index(self._filter_column) except ValueError: self.get_logger().warning( - "Filter column '%s' not found in COSMIC header: %s. Filtering disabled.", - self._filter_column, header + "Filter column '%s' not found in COSMIC header. 
Filtering disabled.", + self._filter_column, ) max_col = max(gene_col, enst_col, cds_col, aa_col, muttype_col) if filter_col is not None: max_col = max(max_col, filter_col) + if phenotype_id_col is not None and sample_groups_dict: + max_col = max(max_col, phenotype_id_col) for line in cosmic_input: if line_counter % 10000 == 0: - msg = "Number of lines finished -- '{}'".format(line_counter) - self.get_logger().debug(msg) + self.get_logger().debug("Number of lines finished -- '%s'", line_counter) line_counter += 1 row = line.strip().split("\t") if len(row) <= max_col: self.get_logger().debug("Skipping malformed row (insufficient columns) at line %s: %s", line_counter, row[:5]) continue - # filter out mutations from unspecified groups - if filter_col is not None: - if row[filter_col] not in self._accepted_values and self._accepted_values != ['all']: + + # Determine the group (e.g. primary site) for this mutation. + group = None + if sample_groups_dict and phenotype_id_col is not None: + group = sample_groups_dict.get(row[phenotype_id_col]) + if group is None and (self._accepted_values != ['all'] or self._split_by_filter_column): + self.get_logger().warning( + "No classification found for COSMIC_PHENOTYPE_ID '%s'; skipping row.", + row[phenotype_id_col], + ) continue + elif filter_col is not None: + group = row[filter_col] + + if group is not None and group not in self._accepted_values and self._accepted_values != ['all']: + continue - if "coding silent" in row[muttype_col]: + if "coding silent" in row[muttype_col] or "synonymous_variant" in row[muttype_col]: continue snp = SNP(gene=row[gene_col], mrna=row[enst_col], dna_mut=row[cds_col], aa_mut=row[aa_col], mutation_type=row[muttype_col]) - header = "COSMIC:%s:%s:%s" % (snp.gene, snp.aa_mut, snp.mutation_type.replace(" ", "")) + fasta_header = "COSMIC:%s:%s:%s" % (snp.gene, snp.aa_mut, snp.mutation_type.replace(" ", "")) try: this_gene_records = COSMIC_CDS_DB[snp.gene] seqs = [] @@ -334,16 +407,16 @@ def 
@staticmethod
def get_sample_headers(header_line: list, filter_column: str,
                       sample_id_column: str = 'SAMPLE_ID') -> tuple[Optional[int], Optional[int]]:
    """Locate the filter and sample-id columns in a clinical-file header row.

    :param header_line: header row already split into a list of column names
    :param filter_column: name of the filtering column (e.g. PRIMARY_SITE)
    :param sample_id_column: name of the sample identifier column
    :return: ``(filter_col_index, sample_id_col_index)``, or ``(None, None)``
             when either column is absent (a warning is logged for the first
             missing one).
    """
    _logger = logging.getLogger(__name__)
    resolved_indices = []
    for column_name in (filter_column, sample_id_column):
        if column_name not in header_line:
            _logger.warning('%s was not found in the header row: %s', column_name, header_line)
            return None, None
        resolved_indices.append(header_line.index(column_name))
    return resolved_indices[0], resolved_indices[1]

def get_value_per_sample(self, local_clinical_sample_file: str, filter_column: str,
                         sample_id_column: str = 'SAMPLE_ID') -> dict:
    """Map each sample id to its filter-column value from a clinical annotation file.

    The header row is detected on the fly (first non-comment row containing
    both column names); every subsequent data row contributes one
    ``sample_id -> value`` entry. Spaces inside the value are replaced with
    underscores so the value stays safe for fasta headers and filenames.
    Returns an empty dict when no file is given.
    """
    sample_to_value: dict = {}
    if not local_clinical_sample_file:
        return sample_to_value
    with _open_text(local_clinical_sample_file, encoding='utf-8') as clinical_fh:
        filter_idx, sample_idx = None, None
        for raw_line in clinical_fh:
            if raw_line.startswith('#'):
                continue
            cells = raw_line.strip().split('\t')
            if sample_id_column in cells and filter_column in cells:
                # Header row: (re-)resolve column indices; never treat it as data.
                filter_idx, sample_idx = self.get_sample_headers(
                    cells, filter_column, sample_id_column
                )
                continue
            if filter_idx is None or sample_idx is None:
                self.get_logger().warning("No column was found for %s, %s in %s", filter_column,
                                          sample_id_column, local_clinical_sample_file)
            elif sample_idx < len(cells) and filter_idx < len(cells):
                sample_to_value[cells[sample_idx]] = cells[filter_idx].strip().replace(' ', '_')
    return sample_to_value
set(header_cols.keys()).issubset(set(row)): header_cols = self.get_mut_header_cols(header_cols, row) continue - # check if any is none in header_cols then continue if None in header_cols.values(): self.get_logger().error("Incorrect header column is given") continue - # get filter value and check it group = None if self._accepted_values != ['all'] or self._split_by_filter_column: try: diff --git a/pgatk/cgenomes/cosmic_downloader.py b/pgatk/cgenomes/cosmic_downloader.py index 3332014..65a066b 100644 --- a/pgatk/cgenomes/cosmic_downloader.py +++ b/pgatk/cgenomes/cosmic_downloader.py @@ -1,7 +1,9 @@ import base64 -import gzip -import requests import os +import tarfile + +import requests +from tqdm import tqdm from pgatk.toolbox.exceptions import AppConfigException from pgatk.toolbox.general import ParameterConfiguration, check_create_folders @@ -12,14 +14,22 @@ class CosmicDownloadService(ParameterConfiguration): CONFIG_OUTPUT_DIRECTORY = 'output_directory' CONFIG_COSMIC_SERVER = 'cosmic_server' CONFIG_COSMIC_FTP_URL = 'cosmic_ftp' + CONFIG_API_ENDPOINT = 'api_endpoint' + CONFIG_BUCKET = 'bucket' CONFIG_COSMIC_FTP_USER = "cosmic_user" CONFIG_COSMIC_FTP_PASSWORD = "cosmic_password" - CONFIG_COSMIC_MUTATIONS_URL = "mutations_url" - CONFIG_COSMIC_MUTATIONS_FILE = "mutations_file" - CONFIG_COSMIC_CELLLINE_MUTATIONS_URL = "mutations_cellline_url" - CONFIG_COSMIC_CELLLINE_MUTATIONS_FILE = "mutations_cellline_file" - CONFIG_COSMIC_CDS_GENES_FILE = "all_cds_genes_file" - CONFIG_COSMIC_CELLLINES_GENES_FILE = "all_celllines_genes_file" + CONFIG_PRODUCTS = "products" + + # Built-in fallback list, used only if `products` is missing from config. + # Real default lives in pgatk/config/cosmic_config.yaml and is the source of truth. 
+ _DEFAULT_PRODUCTS = ( + 'grch38/cosmic/v103/Cosmic_GenomeScreensMutant_Tsv_v103_GRCh38.tar', + 'grch38/cosmic/v103/Cosmic_CompleteTargetedScreensMutant_Tsv_v103_GRCh38.tar', + 'grch38/cosmic/v103/Cosmic_Genes_Fasta_v103_GRCh38.tar', + 'grch38/cosmic/v103/Cosmic_Transcripts_Tsv_v103_GRCh38.tar', + 'grch38/cell_lines/v103/CellLinesProject_GenomeScreensMutant_Tsv_v103_GRCh38.tar', + 'grch38/cell_lines/v103/CellLinesProject_CompleteCNA_Tsv_v103_GRCh38.tar', + ) def __init__(self, config_file, pipeline_arguments): """ @@ -29,44 +39,41 @@ def __init__(self, config_file, pipeline_arguments): """ super(CosmicDownloadService, self).__init__(self.CONFIG_KEY_DATA_DOWNLOADER, config_file, pipeline_arguments) - self._local_path_cosmic = self.get_configuration_default_params(variable=self.CONFIG_OUTPUT_DIRECTORY, - default_value='./database_cosmic/') - self._cosmic_ftp_url = self.get_configuration_default_params(variable=self.CONFIG_COSMIC_FTP_URL, - default_value='https://cancer.sanger.ac.uk') - self._cosmic_user = self.get_configuration_default_params(variable=self.CONFIG_COSMIC_FTP_USER, - default_value='') - self._cosmic_password = self.get_configuration_default_params(variable=self.CONFIG_COSMIC_FTP_PASSWORD, - default_value='') - self._cosmic_mutation_url = self.get_configuration_default_params(variable=self.CONFIG_COSMIC_MUTATIONS_URL, - default_value='cosmic/file_download/GRCh38/cosmic/v94') - self._cosmic_mutations_file = self.get_configuration_default_params(variable=self.CONFIG_COSMIC_MUTATIONS_FILE, - default_value='CosmicMutantExport.tsv.gz') - self._cosmic_cellline_mutation_url = self.get_configuration_default_params( - variable=self.CONFIG_COSMIC_CELLLINE_MUTATIONS_URL, - default_value='cosmic/file_download/GRCh38/cell_lines/v94') - self._cosmic_cellline_mutation_file = self.get_configuration_default_params( - variable=self.CONFIG_COSMIC_CELLLINE_MUTATIONS_FILE, default_value='CosmicCLP_MutantExport.tsv.gz') - self._cosmic_cellline_mutation_gene = 
def get_configuration_default_params(self, variable: str, default_value):
    """Resolve a configuration value with the precedence:
    pipeline (CLI) arguments > top-level ``cosmic_data.<variable>`` >
    ``cosmic_data.cosmic_server.<variable>`` > the supplied default.

    :param variable: configuration key to look up
    :param default_value: value returned when the key is absent everywhere
    """
    if variable in self.get_pipeline_parameters():
        return self.get_pipeline_parameters()[variable]
    # A missing config file or an empty YAML section parses to None --
    # coalesce to {} so the membership tests below cannot TypeError.
    defaults = self.get_default_parameters() or {}
    if self.CONFIG_KEY_DATA_DOWNLOADER not in defaults:
        return default_value
    data = defaults[self.CONFIG_KEY_DATA_DOWNLOADER] or {}
    # `output_directory` lives at cosmic_data.<variable>; everything else is
    # nested under cosmic_data.cosmic_server.<variable>. The isinstance guard
    # stops the nested `cosmic_server` mapping itself from being returned.
    if variable in data and not isinstance(data[variable], dict):
        return data[variable]
    # An empty `cosmic_server:` section is None in YAML, not {} -- coalesce.
    server = data.get(self.CONFIG_COSMIC_SERVER) or {}
    if variable in server:
        return server[variable]
    return default_value

def build_api_url(self, product_path):
    """Build the COSMIC scripted-download API URL for a single product.

    :param product_path: the `path=` value for one product, e.g.
        ``grch38/cosmic/v103/Cosmic_Genes_Fasta_v103_GRCh38.tar``
    """
    return "{server}/{endpoint}?path={path}&bucket={bucket}".format(
        server=self._cosmic_ftp_url.rstrip('/'),
        endpoint=self._api_endpoint.strip('/'),
        path=product_path,
        bucket=self._bucket,
    )
def download_mutation_file(self, url_file_name=None):
    """
    Fetch every product listed in `self._products` through COSMIC's v103+
    scripted-download API. Each product entry is a `path=` value such as
    `grch38/cosmic/v103/Cosmic_GenomeScreensMutant_Tsv_v103_GRCh38.tar`.

    If url_file_name is provided, write the API URLs to that file instead of downloading.
    """
    product_paths = list(self._products)
    if not product_paths:
        self.get_logger().warning("No COSMIC products configured; nothing to download.")
        return

    destination_dir = self.get_local_path_root_cosmic_repo()
    # One (api_url, local_target) pair per configured product.
    targets = [
        (self.build_api_url(product_path),
         "{}/{}".format(destination_dir, os.path.basename(product_path)))
        for product_path in product_paths
    ]

    if url_file_name is not None:
        # Dry-run mode: emit <api_url>\t<local_file> rows and stop.
        with open(url_file_name, 'w', encoding='utf-8') as url_file:
            url_file.writelines("{}\t{}\n".format(api_url, local_target)
                                for api_url, local_target in targets)
        return

    auth_header = "Basic {}".format(self._cosmic_token)
    for api_url, local_target in targets:
        self.download_file_cosmic(api_url, local_target, auth_header)
def download_file_cosmic(self, api_url, local_file, token):
    """
    Two-step download from COSMIC's scripted-download API:
      1) GET api_url with Basic auth -> JSON body {"url": "<presigned S3 URL>"}
      2) GET the presigned URL (no auth, short TTL) -> stream to local_file

    If local_file ends in .tar, extract its contents to the same directory
    and delete the archive.

    :param api_url: COSMIC scripted-download endpoint URL for one product
    :param local_file: destination path for the downloaded archive
    :param token: value for the Authorization header ("Basic <b64credentials>")
    :raises AppConfigException: on API/S3 failure or a malformed API response
    """
    # `with` closes the response (and releases its pooled connection) on every
    # path -- the previous implementation never closed either response, which
    # with stream=True leaks one connection per call until GC.
    with requests.get(api_url, headers={'Authorization': token}, timeout=30) as api_response:
        if api_response.status_code != 200:
            msg = ("COSMIC API request failed: HTTP {} for {}. Body: {!r}"
                   .format(api_response.status_code, api_url, api_response.text[:200]))
            self.get_logger().error(msg)
            raise AppConfigException(msg)
        try:
            payload = api_response.json()
            download_url = payload['url']
        except (ValueError, KeyError) as exc:
            msg = ("COSMIC API returned 200 but the body is not a JSON object with a 'url' "
                   "key (got: {!r}). This usually means the `path=` value is wrong for the "
                   "release/assembly, or your account lacks access. URL was: {}. Error: {}"
                   .format(api_response.text[:200], api_url, exc))
            self.get_logger().error(msg)
            raise AppConfigException(msg)

    # The presigned URL embeds short-lived AWS credentials in its query string;
    # log only the path so the secret material doesn't leak into log aggregators.
    self.get_logger().debug("Downloading file from signed URL (path %s)", download_url.split('?', 1)[0])
    with requests.get(download_url, stream=True, timeout=30) as data_response:
        if data_response.status_code != 200:
            msg = ("COSMIC S3 download failed: HTTP {} for {}"
                   .format(data_response.status_code, download_url))
            self.get_logger().error(msg)
            raise AppConfigException(msg)

        total_size = int(data_response.headers.get('content-length') or 0)
        chunk_size = 1024 * 1024
        progress = tqdm(
            total=total_size if total_size > 0 else None,
            unit='B', unit_scale=True, unit_divisor=1024,
            desc=os.path.basename(local_file),
            leave=True,
        )
        try:
            with open(local_file, 'wb') as f:
                for chunk in data_response.iter_content(chunk_size=chunk_size):
                    if chunk:
                        f.write(chunk)
                        progress.update(len(chunk))
        finally:
            progress.close()
    self.get_logger().debug("Download finished: '{}'".format(local_file))

    if local_file.endswith('.tar'):
        extract_dir = os.path.dirname(local_file) or '.'
        print("Extracting {}...".format(os.path.basename(local_file)), flush=True)
        self._safe_extract_tar(local_file, extract_dir)
        os.remove(local_file)
        self.get_logger().debug("Extracted archive into '{}'".format(extract_dir))

@staticmethod
def _safe_extract_tar(tar_path, dest_dir):
    """Extract tar_path into dest_dir, rejecting unsafe members.

    Guards against path traversal (CVE-2007-4559) via absolute paths or
    ``..`` components in member names, and ensures any symlink/hardlink
    targets resolve within dest_dir. COSMIC archives are trusted in
    practice, but defence in depth costs nothing and silences static
    analysers.
    """
    dest_abs = os.path.realpath(dest_dir)
    with tarfile.open(tar_path, 'r') as tar:
        safe_members: list[tarfile.TarInfo] = []
        for member in tar.getmembers():
            member_path = os.path.realpath(os.path.join(dest_abs, member.name))
            if os.path.commonpath([dest_abs, member_path]) != dest_abs:
                raise AppConfigException(
                    "Refusing to extract tar member {!r}: would escape target dir {!r}"
                    .format(member.name, dest_abs)
                )
            if member.issym() or member.islnk():
                link_target = member.linkname
                if os.path.isabs(link_target):
                    raise AppConfigException(
                        "Refusing to extract tar member {!r}: absolute symlink/hardlink target {!r}"
                        .format(member.name, link_target)
                    )
                resolved = os.path.realpath(os.path.join(dest_abs, os.path.dirname(member.name), link_target))
                if os.path.commonpath([dest_abs, resolved]) != dest_abs:
                    raise AppConfigException(
                        "Refusing to extract tar member {!r}: symlink/hardlink would escape target dir"
                        .format(member.name)
                    )
            safe_members.append(member)
        # All members in safe_members were validated against dest_abs above:
        # absolute paths, .. traversal, and unsafe symlink targets all rejected.
        tar.extractall(path=dest_abs, members=safe_members)  # nosec B202
Find overlapping transcripts via BedTools (from DataFrame) overlap_map = self._build_overlap_map(vcf_df, self._gtf_file, chrom_mapper) logger.info("Found %d variants with transcript overlaps", len(overlap_map)) @@ -411,7 +406,6 @@ def run(self) -> str: for _, record in vcf_df.iterrows(): stats["variants_processed"] += 1 - # --- Validate alleles --- ref = str(record.REF) if any(c not in "ACGT" for c in ref): continue @@ -423,26 +417,22 @@ def run(self) -> str: if not alts: continue - # --- CLNSIG filter --- info = str(record.INFO) clnsig = self._get_info_field(info, "CLNSIG") if not self.passes_clnsig_filter(clnsig, self._clnsig_exclude): stats["variants_filtered_clnsig"] += 1 continue - # --- MC consequence filter --- mc_field = self._get_info_field(info, "MC") if not self.passes_mc_filter(mc_field, self._include_consequences): stats["variants_filtered_mc"] += 1 continue - # --- Parse gene symbol and CLNSIG for description --- gene_symbol, _ = self.parse_geneinfo( self._get_info_field(info, "GENEINFO") ) desc_str = f"{clnsig}|{gene_symbol}" if gene_symbol else clnsig - # --- Find overlapping transcripts --- chrom = str(record.CHROM) pos = int(record.POS) @@ -465,7 +455,6 @@ def run(self) -> str: continue processed_pairs.add(pair_key) - # Resolve transcript in FASTA tid = transcript_id if tid not in transcripts_dict: tid = transcript_id_mapping.get( @@ -479,7 +468,6 @@ def run(self) -> str: ) continue - # --- Biotype filter --- if self._include_biotypes != ["all"]: biotype = self._get_transcript_biotype(db, tid) if biotype and biotype not in self._include_biotypes: @@ -489,7 +477,6 @@ def run(self) -> str: ref_seq = fasta_record.seq desc = str(fasta_record.description) - # Determine CDS info and feature types cds_info: list[int] = [] feature_types = ["exon"] num_orfs = 3 @@ -512,19 +499,16 @@ def run(self) -> str: desc, ) - # Get features from GTF feat_chrom, strand, features_info = self._get_features( db, tid, feature_types ) if feat_chrom is None: continue - # 
Check overlap at feature level var_end = pos + len(ref) - 1 if not check_overlap(pos, var_end, features_info): continue - # Apply variant coding_ref_seq, coding_alt_seq = get_altseq( ref_seq, Seq(ref), @@ -538,7 +522,6 @@ def run(self) -> str: if coding_alt_seq == "": continue - # Translate ref_orfs, alt_orfs = get_orfs_vcf( coding_ref_seq, coding_alt_seq, @@ -546,7 +529,6 @@ def run(self) -> str: num_orfs, ) - # Build sequence ID record_id = "" if record.ID and str(record.ID) != ".": record_id = str(record.ID) diff --git a/pgatk/commands/cosmic_downloader.py b/pgatk/commands/cosmic_downloader.py index 5696097..fe35216 100644 --- a/pgatk/commands/cosmic_downloader.py +++ b/pgatk/commands/cosmic_downloader.py @@ -16,9 +16,19 @@ help="Username for cosmic database -- please if you don't have one register here (https://cancer.sanger.ac.uk/cosmic/register)") @click.option('-p', '--password', help="Password for cosmic database -- please if you don't have one register here (https://cancer.sanger.ac.uk/cosmic/register)") -@click.option("--url_file", help='Add the url to a downloaded file') +@click.option('-P', '--products', multiple=True, + help='COSMIC scripted-download `path=` value to fetch (e.g. ' + 'grch38/cosmic/v103/Cosmic_GenomeScreensMutant_Tsv_v103_GRCh38.tar). ' + 'May be repeated. Overrides the products list in the config file when provided. ' + 'Find available paths at https://cancer.sanger.ac.uk/cosmic/download/cosmic ' + '(open the "Scripted Download" panel for any product).') +@click.option("--url_file", + help='If set, do not download anything. Instead, write a TSV with one row per ' + 'product (`\\t`) so the downloads can be driven ' + 'externally (e.g. wget/curl loop). 
The api_url is the COSMIC scripted-download ' + 'endpoint URL — fetching it returns JSON containing the actual signed S3 URL.') @click.pass_context -def cosmic_downloader(ctx, config_file, output_directory, username, password, url_file): +def cosmic_downloader(ctx, config_file, output_directory, username, password, products, url_file): config_data = load_config("cosmic", config_file) @@ -31,5 +41,8 @@ def cosmic_downloader(ctx, config_file, output_directory, username, password, ur if password is not None: pipeline_arguments[CosmicDownloadService.CONFIG_COSMIC_FTP_PASSWORD] = password + if products: + pipeline_arguments[CosmicDownloadService.CONFIG_PRODUCTS] = list(products) + cosmic_downloader_service = CosmicDownloadService(config_data, pipeline_arguments) cosmic_downloader_service.download_mutation_file(url_file_name=url_file) diff --git a/pgatk/commands/cosmic_to_proteindb.py b/pgatk/commands/cosmic_to_proteindb.py index 5c24ca4..b4d0c27 100644 --- a/pgatk/commands/cosmic_to_proteindb.py +++ b/pgatk/commands/cosmic_to_proteindb.py @@ -22,9 +22,12 @@ @click.option('-s', '--split_by_filter_column', help='Use this flag to generate a proteinDB per group as specified in the filter_column, default is False', is_flag=True) +@click.option('-cl', '--clinical_sample_file', + help='COSMIC sample annotation file used to map ' + 'COSMIC_SAMPLE_ID to the filter column value (e.g. 
PRIMARY_SITE)') @click.pass_context def cosmic_to_proteindb(ctx, config_file, input_mutation, input_genes, output_db, - filter_column, accepted_values, split_by_filter_column): + filter_column, accepted_values, split_by_filter_column, clinical_sample_file): config_data = load_config("cosmic", config_file) @@ -39,10 +42,13 @@ def cosmic_to_proteindb(ctx, config_file, input_mutation, input_genes, output_db if output_db is not None: pipeline_arguments[CancerGenomesService.CONFIG_OUTPUT_FILE] = output_db + if clinical_sample_file is not None: + pipeline_arguments[CancerGenomesService.CLINICAL_SAMPLE_FILE] = clinical_sample_file + if filter_column is not None: pipeline_arguments[CancerGenomesService.FILTER_COLUMN] = filter_column elif config_data is None: - pipeline_arguments[CancerGenomesService.FILTER_COLUMN] = 'Primary site' + pipeline_arguments[CancerGenomesService.FILTER_COLUMN] = 'PRIMARY_SITE' if accepted_values is not None: pipeline_arguments[CancerGenomesService.ACCEPTED_VALUES] = accepted_values diff --git a/pgatk/commands/ensembl_downloader.py b/pgatk/commands/ensembl_downloader.py index 83adb88..0885cd7 100644 --- a/pgatk/commands/ensembl_downloader.py +++ b/pgatk/commands/ensembl_downloader.py @@ -38,7 +38,6 @@ def ensembl_downloader(ctx, config_file, output_directory, taxonomy, folder_pref if taxonomy is None and ensembl_name is None: raise click.UsageError("Either --taxonomy or --ensembl_name is required.") - # Parse pipelines parameters. pipeline_arguments = {} if output_directory is not None: diff --git a/pgatk/commands/vcf_to_proteindb.py b/pgatk/commands/vcf_to_proteindb.py index da2d474..7cb2faa 100644 --- a/pgatk/commands/vcf_to_proteindb.py +++ b/pgatk/commands/vcf_to_proteindb.py @@ -38,6 +38,8 @@ help="enabling this option causes or variants to be parsed. By default only variants that have not failed any filters will be processed (FILTER column is PASS, None, .) or if the filters are subset of the accepted filters. 
(default is False)", is_flag=True) @click.option('--accepted_filters', help="Accepted filters for variant parsing") +@click.option('-w', '--workers', type=int, default=None, + help="Number of worker processes to fan out across (default: 1, sequential). Per-chromosome split.") @click.pass_context def vcf_to_proteindb(ctx, config_file, input_fasta, vcf, gene_annotations_gtf, translation_table, mito_translation_table, @@ -45,7 +47,7 @@ def vcf_to_proteindb(ctx, config_file, input_fasta, vcf, gene_annotations_gtf, t af_field, af_threshold, transcript_str, biotype_str, exclude_biotypes, include_biotypes, consequence_str, exclude_consequences, skip_including_all_cds, include_consequences, - ignore_filters, accepted_filters): + ignore_filters, accepted_filters, workers): config_data = load_config("ensembl_config", config_file) @@ -105,5 +107,8 @@ def vcf_to_proteindb(ctx, config_file, input_fasta, vcf, gene_annotations_gtf, t if accepted_filters is not None: pipeline_arguments[EnsemblDataService.ACCEPTED_FILTERS] = accepted_filters + if workers is not None: + pipeline_arguments[EnsemblDataService.WORKERS] = workers + ensembl_data_service = EnsemblDataService(config_data, pipeline_arguments) - ensembl_data_service.vcf_to_proteindb(vcf, input_fasta, gene_annotations_gtf) + ensembl_data_service.vcf_to_proteindb(vcf, input_fasta, gene_annotations_gtf, workers=workers) diff --git a/pgatk/config/cosmic_config.yaml b/pgatk/config/cosmic_config.yaml index 6b6da9b..0555af5 100644 --- a/pgatk/config/cosmic_config.yaml +++ b/pgatk/config/cosmic_config.yaml @@ -2,14 +2,32 @@ cosmic_data: output_directory: database_cosmic cosmic_server: cosmic_ftp: https://cancer.sanger.ac.uk - mutations_url: cosmic/file_download/GRCh38/cosmic/v94 - all_cds_genes_file: All_COSMIC_Genes.fasta.gz - mutations_file: CosmicMutantExport.tsv.gz - mutations_cellline_url: cosmic/file_download/GRCh38/cell_lines/v94 - mutations_cellline_file: CosmicCLP_MutantExport.tsv.gz - all_celllines_genes_file: 
All_CellLines_Genes.fasta.gz + api_endpoint: api/mono/products/v1/downloads/scripted + bucket: downloads cosmic_user: '' cosmic_password: '' + # List of COSMIC scripted-download paths to fetch. Each entry is the `path=` + # value from the curl command shown on the COSMIC UI's "Scripted Download" panel + # for a product (https://cancer.sanger.ac.uk/cosmic/download/cosmic). + # + # Path format: /// + # - is lowercase (`grch37`, `grch38`) + # - is `cosmic` for the main COSMIC project, `cell_lines` for the + # Cell Lines Project (other projects: `cancer_mutation_census`, `actionability`) + # - is the release (`v103`, etc.) + # - is the exact .tar archive name shown in the UI + # + # Add or remove entries to change what gets downloaded. Defaults below cover the + # three files the cosmic-to-proteindb pipeline expects (mutations TSV + gene FASTA + # for the main COSMIC project, plus cell-line mutations TSV). + products: + - grch38/cosmic/v103/Cosmic_GenomeScreensMutant_Tsv_v103_GRCh38.tar + - grch38/cosmic/v103/Cosmic_CompleteTargetedScreensMutant_Tsv_v103_GRCh38.tar + - grch38/cosmic/v103/Cosmic_Genes_Fasta_v103_GRCh38.tar + - grch38/cosmic/v103/Cosmic_Transcripts_Tsv_v103_GRCh38.tar + - grch38/cell_lines/v103/CellLinesProject_GenomeScreensMutant_Tsv_v103_GRCh38.tar + - grch38/cell_lines/v103/CellLinesProject_CompleteCNA_Tsv_v103_GRCh38.tar + - grch38/cosmic/v103/Cosmic_Classification_Tsv_v103_GRCh38.tar logger: formatters: DEBUG: "%(asctime)s [%(levelname)7s][%(name)48s][%(module)32s, %(lineno)4s] %(message)s" @@ -17,7 +35,7 @@ cosmic_data: loglevel: DEBUG proteindb: filter_info: - filter_column: 'Primary site' + filter_column: 'PRIMARY_SITE' accepted_values: "all" split_by_filter_column: False clinical_sample_file: '' diff --git a/pgatk/config/ensembl_config.yaml b/pgatk/config/ensembl_config.yaml index 1838c7c..ac679ed 100644 --- a/pgatk/config/ensembl_config.yaml +++ b/pgatk/config/ensembl_config.yaml @@ -11,7 +11,7 @@ ensembl_translation: exclude_biotypes: '' 
exclude_consequences: 'downstream_gene_variant, upstream_gene_variant, intergenic_variant, intron_variant, synonymous_variant, regulatory_region_variant' skip_including_all_cds: False - include_biotypes: 'protein_coding,polymorphic_pseudogene,non_stop_decay,nonsense_mediated_decay,IG_C_gene,IG_D_gene,IG_J_gene,IG_V_gene,TR_C_gene,TR_D_gene,TR_J_gene,TR_V_gene,TEC,mRNA' + include_biotypes: 'IG_C_gene,IG_D_gene,IG_J_gene,IG_V_gene,nonsense_mediated_decay,non_stop_decay,protein_coding,protein_coding_LoF,TR_C_gene,TR_D_gene,TR_J_gene,TR_V_gene' include_consequences: 'all' biotype_str: transcript_biotype transcript_description_sep: ';' diff --git a/pgatk/db/digest_mutant_protein.py b/pgatk/db/digest_mutant_protein.py index 7b49201..6651aaa 100644 --- a/pgatk/db/digest_mutant_protein.py +++ b/pgatk/db/digest_mutant_protein.py @@ -18,7 +18,7 @@ def trypsin_cleavage(proseq: str, miss_cleavage: int): except IndexError: pass - if aa in ['K', 'R'] and next_aa != 'P': # for trypsin peptides + if aa in ['K', 'R'] and next_aa != 'P': if len(peptide) > 0: peptides.append(peptide) peptide = '' @@ -63,7 +63,6 @@ def digest_mutant_proteins( :param max_length: Maximum peptide length to retain (default: 40). :param missed_cleavages: Number of missed cleavages for trypsin digestion (default: 0). 
""" - # Build canonical peptidome from reference FASTA handle1 = SeqIO.parse(fasta_file, 'fasta') peptidome = {} @@ -78,7 +77,6 @@ def digest_mutant_proteins( log.info("Known peptides number: %d", len(peptidome)) handle1.close() - # Parse mutant protein input files filelist = input_file.split(",") handle_list = [] for f in filelist: diff --git a/pgatk/db/map_peptide2genome.py b/pgatk/db/map_peptide2genome.py index a09b119..c573dc0 100644 --- a/pgatk/db/map_peptide2genome.py +++ b/pgatk/db/map_peptide2genome.py @@ -87,7 +87,7 @@ def parse_gtf(infile): dic = {} with open(infile, "r", encoding='utf-8') as infile_object: for line in infile_object: - if line[0] != "#": # skip lines commented out + if line[0] != "#": row = line.strip().split("\t") if row[2] == "CDS": attri_list = row[8].split(";") @@ -153,7 +153,6 @@ def map_peptides_to_genome( pep_dic = {} with open(input_file, 'r', encoding='utf-8') as input_stream: - # peptide table with two columns, peptide sequence in first column, protein ID in second column input_stream.readline() for line in input_stream: row = line.strip().split("\t") @@ -188,7 +187,6 @@ def map_peptides_to_genome( exons, pep_trans_start, pep_trans_end ) - # handle exceptions if pep_chr_start > pep_chr_end: non_mapped_pep += 1 continue diff --git a/pgatk/ensembl/data_downloader.py b/pgatk/ensembl/data_downloader.py index 1ad4812..0be653b 100644 --- a/pgatk/ensembl/data_downloader.py +++ b/pgatk/ensembl/data_downloader.py @@ -6,7 +6,6 @@ 2. Given a species ID, collect its GTF data, with the option of decompressing it or not. 
""" -# App imports from json import loads import re from pgatk.toolbox.general import ParameterConfiguration, check_create_folders, download_file @@ -432,7 +431,6 @@ def get_vcf_files(self, species: dict, url_file=None) -> list: file_name=self.get_local_path_root_ensembl_repo() + '/' + file_name, log=self.get_logger(), url_file=url_file) if downloaded_file is not None: - # if chr1 is downloaded then try all others files.append(downloaded_file) for chrN in range(2, 23): # chr2-22 file_name = '{}_incl_consequences-chr{}.vcf.gz'.format(species['name'], chrN) diff --git a/pgatk/ensembl/ensembl.py b/pgatk/ensembl/ensembl.py index e1d8b7e..49fe852 100644 --- a/pgatk/ensembl/ensembl.py +++ b/pgatk/ensembl/ensembl.py @@ -1,6 +1,8 @@ from __future__ import annotations import logging +import os +import re import sqlite3 from pathlib import Path from typing import Any, Optional @@ -19,6 +21,103 @@ ) +class _FeatureCache: + + """Per-run memoization of get_features() results, keyed by transcript id. + + Bounded by a soft maxsize to avoid unbounded growth on whole-genome runs. + Defaults to 50000, which comfortably covers human ENSEMBL (~250k transcripts + across all chromosomes; a single chr is ~10-20k). + """ + + def __init__(self, maxsize: int = 50000) -> None: + self._cache: dict[tuple, tuple] = {} + self._maxsize = maxsize + + def get(self, key: tuple) -> Optional[tuple]: + return self._cache.get(key) + + def put(self, key: tuple, value: tuple) -> None: + if len(self._cache) >= self._maxsize: + for k in list(self._cache.keys())[: self._maxsize // 5]: + del self._cache[k] + self._cache[key] = value + + +_MISSING = object() + + +def _safe_chrom(chrom: str) -> str: + """Sanitise a chromosome identifier into a filesystem-safe filename component.""" + return re.sub(r'[^A-Za-z0-9._-]', '_', str(chrom)) + + +def _ensure_fasta_index(input_fasta: str) -> str: + """Return the path to a `.idx` SQLite index for input_fasta, building it + if absent or stale. 
Stale means: the .idx exists but the FASTA's mtime + is newer than the .idx's. The index is keyed by `EnsemblDataService.get_key`. + """ + idx_path = input_fasta + ".idx" + if os.path.exists(idx_path): + if os.path.getmtime(idx_path) >= os.path.getmtime(input_fasta): + return idx_path + try: + os.remove(idx_path) + except OSError: + pass + # Build the index. SeqIO.index_db materialises the SQLite file on disk + # as a side effect; we don't need to keep the returned handle here. + SeqIO.index_db(idx_path, [input_fasta], "fasta", key_function=EnsemblDataService.get_key) + return idx_path + + +def _split_vcf_by_chrom(vcf_file: str, output_dir: str) -> dict[str, str]: + """Stream `vcf_file` once, writing per-chromosome VCFs into `output_dir`. + + Each output file is `{output_dir}/chunk_{chrom}.vcf` and contains the + full VCF header followed by only the data lines for that chromosome. + Preserves line ordering within each chromosome. + + Returns a mapping `{chrom: chunk_path}`. Constant memory — holds at most + one open file handle per chromosome seen so far. + """ + header: list[str] = [] + handles: dict[str, Any] = {} + chunk_paths: dict[str, str] = {} + try: + with open(vcf_file, 'r', encoding='utf-8') as f: + for line in f: + if line.startswith('#'): + header.append(line) + continue + if not line.strip(): + continue + chrom = line.split('\t', 1)[0] + handle = handles.get(chrom) + if handle is None: + chunk_path = os.path.join(output_dir, "chunk_" + _safe_chrom(chrom) + ".vcf") + handle = open(chunk_path, 'w', encoding='utf-8') + handle.writelines(header) + handles[chrom] = handle + chunk_paths[chrom] = chunk_path + handle.write(line) + finally: + for h in handles.values(): + h.close() + return chunk_paths + + +def _vcf_to_proteindb_worker(default_params, pipeline_args, vcf_file, + input_fasta, gene_annotations_gtf, output_path): + """Module-level worker for multiprocessing.Pool.starmap.
+ + Re-constructs an EnsemblDataService in the worker process from the + pickled config dict + pipeline args, then runs the chunk pipeline. + """ + svc = EnsemblDataService(default_params, pipeline_args) + svc._vcf_to_proteindb_chunk(vcf_file, input_fasta, gene_annotations_gtf, output_path) + + class EnsemblDataService(ParameterConfiguration): CONFIG_KEY_VCF = "ensembl_translation" INPUT_FASTA = "input_fasta" @@ -47,6 +146,7 @@ class EnsemblDataService(ParameterConfiguration): EXPRESSION_THRESH = "expression_thresh" IGNORE_FILTERS = "ignore_filters" ACCEPTED_FILTERS = "accepted_filters" + WORKERS = "workers" def __init__(self, config_file: dict, pipeline_arguments: dict) -> None: """ @@ -85,7 +185,7 @@ def __init__(self, config_file: dict, pipeline_arguments: dict) -> None: default_value=False) self._include_biotypes = self.get_multiple_options( self.get_translation_properties(variable=self.INCLUDE_BIOTYPES, - default_value='protein_coding,polymorphic_pseudogene,non_stop_decay,nonsense_mediated_decay,IG_C_gene,IG_D_gene,IG_J_gene,IG_V_gene,TR_C_gene,TR_D_gene,TR_J_gene,TR_V_gene,TEC,mRNA')) + default_value='protein_coding,protein_coding_CDS_not_defined,protein_coding_LoF,nonsense_mediated_decay,non_stop_decay,translated_processed_pseudogene,IG_C_gene,IG_D_gene,IG_J_gene,IG_V_gene,TR_C_gene,TR_D_gene,TR_J_gene,TR_V_gene,TEC')) self._include_consequences = self.get_multiple_options( self.get_translation_properties(variable=self.INCLUDE_CONSEQUENCES, default_value='all')) @@ -103,6 +203,12 @@ def __init__(self, config_file: dict, pipeline_arguments: dict) -> None: self._accepted_filters = self.get_multiple_options( self.get_translation_properties(variable=self.ACCEPTED_FILTERS, default_value='PASS')) + raw_workers = self.get_translation_properties(variable=self.WORKERS, default_value=1) + try: + self._workers = max(1, int(raw_workers)) + except (TypeError, ValueError): + self._workers = 1 + def get_translation_properties(self, variable: str, default_value: Any) -> 
Any: value_return = default_value if variable in self.get_pipeline_parameters(): @@ -237,10 +343,10 @@ def dnaseq_to_proteindb(self, input_fasta: str) -> str: with open(self._proteindb_output, 'w', encoding='utf-8') as prots_fn: for record_id in seq_dict.keys(): - ref_seq = seq_dict[record_id].seq # get the seq and desc for the record from the fasta of the gtf + ref_seq = seq_dict[record_id].seq desc = str(seq_dict[record_id].description) - key_values = {} # extract key=value in the desc into a dict + key_values = {} sep = self._transcript_description_sep desc = desc.replace(' ', sep) for value in desc.split(sep): @@ -264,7 +370,6 @@ def dnaseq_to_proteindb(self, input_fasta: str) -> str: record_id, desc) self.get_logger().debug(msg) - # only include features that have the specified biotypes or they have CDSs info if 'CDS' in key_values.keys() and ( not self._skip_including_all_cds or 'altORFs' in self._include_biotypes): pass @@ -273,7 +378,6 @@ def dnaseq_to_proteindb(self, input_fasta: str) -> str: 'all']))): continue - # check wether to filter on expression and if it passes if self._expression_str: try: if float(key_values[self._expression_str]) < self._expression_thresh: @@ -360,12 +464,10 @@ def annoate_vcf(vcf_file: str, gtf_file: str, muts_dict.setdefault(key, []).append(transcript_id) with open(f"{vcf_stem}_annotated.vcf", 'w', encoding='utf-8') as ann, open(vcf_file, 'r', encoding='utf-8') as v: - # write vcf headers to the output file for line in v.readlines(): if line.startswith('#'): ann.write(line) else: - # write the mutations and their overlapping transcript to output file sl = line.strip().split('\t') if len(sl) < 8: ann.write(line) @@ -414,7 +516,7 @@ def vcf_from_file(vcf_file: str) -> tuple[list, pd.DataFrame]: return metadata, vcf_df - def vcf_to_proteindb(self, vcf_file: str, input_fasta: str, gene_annotations_gtf: str) -> str: + def _vcf_to_proteindb_chunk(self, vcf_file: str, input_fasta: str, gene_annotations_gtf: str, output_path: str) -> 
str: """ Generate proteins for variants by modifying sequences of affected transcripts. In case of already annotated variants it only considers variants within @@ -425,13 +527,22 @@ def vcf_to_proteindb(self, vcf_file: str, input_fasta: str, gene_annotations_gtf :param vcf_file: :param input_fasta: :param gene_annotations_gtf: + :param output_path: path for writing the output FASTA :return: """ db = self.parse_gtf(gene_annotations_gtf, str(Path(gene_annotations_gtf).with_suffix('.db'))) - transcripts_dict = SeqIO.index(input_fasta, "fasta", key_function=self.get_key) + idx_path = _ensure_fasta_index(input_fasta) + transcripts_dict = SeqIO.index_db(idx_path, [input_fasta], "fasta", + key_function=self.get_key) # handle cases where the transcript has version in the GTF but not in the VCF - transcript_id_mapping = {k.split('.')[0]: k for k in transcripts_dict.keys()} + # Built lazily on the first KeyError to avoid iterating 207k keys up-front. + transcript_id_mapping: Optional[dict[str, str]] = None + feature_cache = _FeatureCache() + # Value is (ref_seq, desc) for a known transcript, or None for a transcript + # we've already looked up and confirmed isn't in the FASTA index (avoids re-trying + # the disk seek). _MISSING sentinel below distinguishes "not yet looked up". + seq_cache: dict[str, Optional[tuple]] = {} transcript_index, consequence_index, biotype_index = None, None, None if self._annotation_field_name: @@ -445,7 +556,6 @@ def vcf_to_proteindb(self, vcf_file: str, input_fasta: str, gene_annotations_gtf except IndexError: pass - # try to extract index of transcript ID and consequence from the VCF metadata in the header try: transcript_index = annotation_cols.index(self._transcript_str.upper()) except ValueError: @@ -465,6 +575,12 @@ def vcf_to_proteindb(self, vcf_file: str, input_fasta: str, gene_annotations_gtf annotation_cols, vcf_file) self.get_logger().debug(msg) + # Fall back to index 0 when no structured ##INFO header was found (e.g. 
+ # transcriptOverlaps-annotated VCFs produced by annoate_vcf, which write + # plain transcript IDs without a pipe-delimited FORMAT declaration). + if transcript_index is None: + transcript_index = 0 + else: # in case the given VCF is not annotated, annotate it by identifying the overlapping transcripts vcf_file = self.annoate_vcf(vcf_file, gene_annotations_gtf) @@ -480,13 +596,13 @@ def vcf_to_proteindb(self, vcf_file: str, input_fasta: str, gene_annotations_gtf self._accepted_filters = [x.upper() for x in self._accepted_filters] - with open(self._proteindb_output, 'w', encoding='utf-8') as prots_fn: - for _, record in vcf_reader.iterrows(): + with open(output_path, 'w', encoding='utf-8') as prots_fn: + for record in vcf_reader.itertuples(index=False, name='VCFRecord'): trans = False if [x for x in str(record.REF) if x not in 'ACGT']: - msg = "Invalid VCF record, skipping: {}".format(record) invalid_records['# variants with invalid record'] += 1 - self.get_logger().debug(msg) + if self.get_logger().isEnabledFor(logging.DEBUG): + self.get_logger().debug("Invalid VCF record, skipping: %s", record) continue alts = [] @@ -494,33 +610,35 @@ def vcf_to_proteindb(self, vcf_file: str, input_fasta: str, gene_annotations_gtf if alt is None: continue elif [x for x in str(alt) if x not in 'ACGT']: - # check if all alt alleles are nucleotides continue alts.append(alt) if not alts: - msg = "Invalid VCF record, skipping: {}".format(record) invalid_records['# variants with invalid record'] += 1 - self.get_logger().debug(msg) + if self.get_logger().isEnabledFor(logging.DEBUG): + self.get_logger().debug("Invalid VCF record, skipping: %s", record) continue + # Parse INFO once; avoids repeated split-and-search list-comprehensions per variant. + info_kv: dict[str, str] = {} + for entry in record.INFO.split(';'): + k, _, v = entry.partition('=') + info_kv[k] = v + if not self._ignore_filters and self._accepted_filters != ['ALL']: - if record.FILTER and record.FILTER != '.' 
and record.FILTER != 'NA' and record.FILTER != '': # if not PASS: None and empty means PASS + if record.FILTER and record.FILTER != '.' and record.FILTER != 'NA' and record.FILTER != '': # None and empty means PASS filters = set(record.FILTER.upper().split(',')) if ';' in record.FILTER and len(filters) <= 1: filters = set(record.FILTER.upper().split(';')) if not filters <= set(self._accepted_filters): invalid_records['# variants not passing Filter'] += 1 continue - # only process variants above a given allele frequency threshold if the AF string is not empty if self._af_field: - # get AF from the INFO field try: - af = float([x.split('=', 1)[1] for x in record.INFO.split(';') if x.split('=', 1)[0] == self._af_field][0]) - except (ValueError, IndexError): + af = float(info_kv[self._af_field]) + except (ValueError, KeyError): invalid_records['# variants with invalid record'] += 1 continue - # check if the AF passed the threshold if af < self._af_threshold: invalid_records['# variants not passing AF threshold'] += 1 continue @@ -530,15 +648,13 @@ def vcf_to_proteindb(self, vcf_file: str, input_fasta: str, gene_annotations_gtf trans_table = self._mito_translation_table processed_transcript_allele = set() - transcript_records = [] try: - transcript_records = \ - [x.split('=', 1)[1] for x in record.INFO.split(';') - if x.split('=', 1)[0] == self._annotation_field_name][0] - except IndexError: # no overlapping feature was found + transcript_records = info_kv[self._annotation_field_name] + except KeyError: invalid_records['# variants with invalid record'] += 1 - msg = "skipped record {}, no annotation feature was found".format(record) - self.get_logger().debug(msg) + if self.get_logger().isEnabledFor(logging.DEBUG): + self.get_logger().debug( + "skipped record %s, no annotation feature was found", record) continue for transcript_record in transcript_records.split(','): @@ -548,9 +664,10 @@ def vcf_to_proteindb(self, vcf_file: str, input_fasta: str, gene_annotations_gtf 
consequence = transcript_info[consequence_index] except IndexError: invalid_records['# variants with invalid record'] += 1 - msg = "Give a valid index for the consequence in the INFO field for: {}".format( - transcript_record) - self.get_logger().debug(msg) + if self.get_logger().isEnabledFor(logging.DEBUG): + self.get_logger().debug( + "Give a valid index for the consequence in the INFO field for: %s", + transcript_record) continue except TypeError: pass @@ -559,9 +676,10 @@ def vcf_to_proteindb(self, vcf_file: str, input_fasta: str, gene_annotations_gtf biotype = transcript_info[biotype_index] except IndexError: invalid_records['# variants with invalid record'] += 1 - msg = "Give a valid index for the biotype in the INFO field for: {}".format( - transcript_record) - self.get_logger().debug(msg) + if self.get_logger().isEnabledFor(logging.DEBUG): + self.get_logger().debug( + "Give a valid index for the biotype in the INFO field for: %s", + transcript_record) continue except TypeError: pass @@ -570,30 +688,48 @@ def vcf_to_proteindb(self, vcf_file: str, input_fasta: str, gene_annotations_gtf transcript_id = transcript_info[transcript_index] except IndexError: invalid_records['# variants with invalid record'] += 1 - msg = "Give a valid index for the Transcript IDs in the INFO field for: {}".format( - transcript_record) - self.get_logger().debug(msg) + if self.get_logger().isEnabledFor(logging.DEBUG): + self.get_logger().debug( + "Give a valid index for the Transcript IDs in the INFO field for: %s", + transcript_record) continue if transcript_id == "": continue try: - transcript_id_v = transcript_id_mapping[transcript_id] - except KeyError: - transcript_id_v = transcript_id + transcript_id_v = transcript_id_mapping[transcript_id] # type: ignore[index] + except (KeyError, TypeError): + if transcript_id_mapping is None: + transcript_id_mapping = {k.split('.')[0]: k for k in transcripts_dict.keys()} + try: + transcript_id_v = transcript_id_mapping[transcript_id] + 
except KeyError: + transcript_id_v = transcript_id + else: + transcript_id_v = transcript_id - try: - row = transcripts_dict[transcript_id_v] - ref_seq = row.seq # get the seq and desc for the transcript from the fasta of the gtf - desc = str(row.description) - except KeyError: - invalid_records['# feature IDs from VCF that are not found in the given FASTA file'] += 1 - msg = "Feature {} not found in fasta of the GTF file {}".format(transcript_id_v, record) - self.get_logger().debug(msg) + cached_row = seq_cache.get(transcript_id_v, _MISSING) + if cached_row is _MISSING: + try: + row = transcripts_dict[transcript_id_v] + ref_seq = row.seq + desc = str(row.description) + seq_cache[transcript_id_v] = (ref_seq, desc) + except KeyError: + invalid_records['# feature IDs from VCF that are not found in the given FASTA file'] += 1 + if self.get_logger().isEnabledFor(logging.DEBUG): + self.get_logger().debug( + "Feature %s not found in fasta of the GTF file %s", + transcript_id_v, record) + seq_cache[transcript_id_v] = None + continue + elif cached_row is None: + # already-known-missing; skip without re-counting in the stats continue + else: + ref_seq, desc = cached_row feature_types = ['exon'] - # check if cds info exists in the fasta header otherwise translate all exons cds_info = [] num_orfs = 3 if 'CDS=' in desc: @@ -608,28 +744,35 @@ def vcf_to_proteindb(self, vcf_file: str, input_fasta: str, gene_annotations_gtf feature_types = ['CDS', 'stop_codon'] num_orfs = 1 except (ValueError, IndexError): - msg = "Could not extract cds position from fasta header for: {}".format(desc) - self.get_logger().debug(msg) + if self.get_logger().isEnabledFor(logging.DEBUG): + self.get_logger().debug( + "Could not extract cds position from fasta header for: %s", desc) - chrom, strand, features_info = self.get_features(db, - transcript_id_v, - feature_types) - if chrom is None: # the record info was not found - continue - # skip transcripts with unwanted consequences + # Apply 
consequence/biotype filters before get_features(); both inputs + # come from transcript_info, so we can reject early without SQL. if consequence_index is not None: if (consequence in self._exclude_consequences or (consequence not in self._include_consequences and self._include_consequences != ['all'])): continue - # skip transcripts with unwanted biotypes if biotype_index is not None: if (biotype in self._exclude_biotypes or (biotype not in self._include_biotypes and self._include_biotypes != ['all'])): continue + cache_key = (transcript_id_v, tuple(feature_types)) + cached = feature_cache.get(cache_key) + if cached is None: + chrom, strand, features_info = self.get_features( + db, transcript_id_v, feature_types) + feature_cache.put(cache_key, (chrom, strand, features_info)) + else: + chrom, strand, features_info = cached + if chrom is None: + continue + for alt in alts: dedup_key = transcript_id + str(record.REF) + str(alt) if dedup_key in processed_transcript_allele: @@ -643,8 +786,8 @@ def vcf_to_proteindb(self, vcf_file: str, input_fasta: str, gene_annotations_gtf features_info) except TypeError: invalid_records['# variants with invalid record'] += 1 - msg = "Wrong VCF record in {}".format(record) - self.get_logger().debug(msg) + if self.get_logger().isEnabledFor(logging.DEBUG): + self.get_logger().debug("Wrong VCF record in %s", record) continue if (chrom.lstrip("chr") == str(record.CHROM).lstrip("chr") and @@ -680,6 +823,80 @@ def vcf_to_proteindb(self, vcf_file: str, input_fasta: str, gene_annotations_gtf '\n'.join([x + ":" + str(invalid_records[x]) for x in invalid_records.keys()])) self.get_logger().info(msg) + return output_path + + def vcf_to_proteindb(self, vcf_file: str, input_fasta: str, gene_annotations_gtf: str, workers=None) -> str: + """Generate proteins for variants by modifying sequences of affected transcripts. + + If workers is None, falls back to self._workers (config) which defaults + to 1 (sequential, backward-compatible). 
Pass workers > 1 to fan out per + chromosome via multiprocessing.Pool. + :param vcf_file: + :param input_fasta: + :param gene_annotations_gtf: + :param workers: number of parallel worker processes (None => use config default) + :return: path to the output proteindb FASTA + """ + if workers is None: + workers = self._workers if self._workers else 1 + + # Fast path: sequential — single call, identical behaviour to the original implementation. + # For sequential runs we do NOT pre-annotate here; _vcf_to_proteindb_chunk handles that + # in its else-branch so that transcript_index=0 is set correctly for unannotated VCFs. + if workers <= 1: + return self._vcf_to_proteindb_chunk(vcf_file, input_fasta, gene_annotations_gtf, self._proteindb_output) + + # Parallel path: pre-annotate unannotated VCFs in the main process. This avoids each + # worker racing on the same bedtools-output bed file (which annoate_vcf writes to cwd) + # and amortises the bedtools intersect across workers. + if not self._annotation_field_name: + vcf_file = self.annoate_vcf(vcf_file, gene_annotations_gtf) + self._annotation_field_name = 'transcriptOverlaps' + + # Build the FASTA index ONCE in the main process so all workers share it + # instead of each re-scanning the FASTA (~14 s × N workers wasted). + _ensure_fasta_index(input_fasta) + + # Parallel — split into per-chrom temp VCFs, fan out to a Pool. + import multiprocessing as mp + import shutil + import tempfile + + with tempfile.TemporaryDirectory(prefix='pgatk_v2p_') as tmpdir: + # Stream-split VCF by chromosome directly to per-chrom files (constant memory). + chunk_paths = _split_vcf_by_chrom(vcf_file, tmpdir) + + if len(chunk_paths) <= 1: + # Only one chromosome — run sequentially on the original file. 
+ return self._vcf_to_proteindb_chunk(vcf_file, input_fasta, gene_annotations_gtf, + self._proteindb_output) + + tasks = [] + for chrom in sorted(chunk_paths.keys()): + chunk_vcf = chunk_paths[chrom] + chunk_out = os.path.join(tmpdir, f"out_{_safe_chrom(chrom)}.fa") + pa = dict(self.get_pipeline_parameters()) + pa[EnsemblDataService.PROTEIN_DB_OUTPUT] = chunk_out + # Force annotated mode for workers (annoate_vcf already ran in main): + pa[EnsemblDataService.ANNOTATION_FIELD_NAME] = self._annotation_field_name + tasks.append((self.get_default_parameters(), pa, chunk_vcf, + input_fasta, gene_annotations_gtf, chunk_out)) + + self.get_logger().info( + "vcf-to-proteindb: dispatching %d chromosome chunk(s) across %d worker(s)", + len(tasks), min(workers, len(tasks))) + + with mp.get_context('spawn').Pool(min(workers, len(tasks))) as pool: + pool.starmap(_vcf_to_proteindb_worker, tasks) + + # Concatenate the per-chunk FASTAs into the final output. + with open(self._proteindb_output, 'wb') as out: + for task in tasks: + chunk_out = task[5] + if os.path.exists(chunk_out): + with open(chunk_out, 'rb') as f: + shutil.copyfileobj(f, out) + return self._proteindb_output @staticmethod diff --git a/pgatk/proteogenomics/spectrumai.py b/pgatk/proteogenomics/spectrumai.py index 0fb16b3..612e309 100644 --- a/pgatk/proteogenomics/spectrumai.py +++ b/pgatk/proteogenomics/spectrumai.py @@ -281,7 +281,6 @@ def __init__(self, config_data: dict, pipeline_arguments: dict) -> None: self._mzml_files = self.get_validate_parameters(variable=self.CONFIG_MZML_FILES, default_value=False) self._ions_tolerance = self.get_validate_parameters(variable=self.CONFIG_IONS_TOLERANCE, default_value=0.02) - ## check if ions_tolerance is string, convert to float if isinstance(self._ions_tolerance, str): self._ions_tolerance = float(self._ions_tolerance) diff --git a/pgatk/proteomics/db/protein_database_decoy.py b/pgatk/proteomics/db/protein_database_decoy.py index 5f47e72..ee22853 100644 --- 
a/pgatk/proteomics/db/protein_database_decoy.py +++ b/pgatk/proteomics/db/protein_database_decoy.py @@ -221,21 +221,14 @@ def generate_decoypyrat_database(self) -> None: :return: """ - # Create empty sets to add all target and decoy peptides upeps = set() dpeps = set() - # Counter for number of decoy sequences dcount = 0 - # Open FASTA file using first cmd line argument - # fasta = SeqIO.parse(self._input_fasta, 'fasta') - with open(self._input_fasta) as handle: - # open temporary decoy FASTA file with open(self._temp_file, 'w') as outfa: - # loop each seq in the file for value in SimpleFastaParser(handle): seq = value[1] description = value[0] @@ -244,53 +237,42 @@ def generate_decoypyrat_database(self) -> None: if not self._isobaric: seq = seq.replace('I', 'L') - # digest sequence add peptides to set upeps.update( cleave(sequence=seq, rule=PGATK_ENZYMES.enzymes[self._enzyme]['cleavage rule'], missed_cleavages=0, min_length=self._min_peptide_length)) - # reverse and switch protein sequence decoyseq = self.revswitch(seq, self._no_switch, PGATK_ENZYMES.enzymes[self._enzyme]['cleavage sites']) # do not store decoy peptide set in reduced memory mode if not self._memory_save: - # update decoy peptide set dpeps.update( cleave(sequence=decoyseq, rule=PGATK_ENZYMES.enzymes[self._enzyme]['cleavage rule'], missed_cleavages=0, min_length=self._min_peptide_length)) - # write decoy protein accession and sequence to file outfa.write('>' + self._decoy_prefix + description + '\n') outfa.write(decoyseq + '\n') - # Summarise the numbers of target and decoy peptides and their intersection nonDecoys = set() self.get_logger().info("proteins: %s", dcount) self.get_logger().info("target peptides: %s", len(upeps)) # Reloop decoy file in reduced memory mode to store only intersecting decoys if self._memory_save: - # open temp decoys with open(self._temp_file, "rt") as fin: for line in fin: - # if line is not accession if line[0] != '>': - # digest protein for p in 
cleave(sequence=line.rstrip(), rule=PGATK_ENZYMES.enzymes[self._enzyme]['cleavage rule'], missed_cleavages=0, min_length=self._min_peptide_length): - # check if in target peptides if true then add to nonDecoys if p in upeps: nonDecoys.add(p) fin.close() self.get_logger().info("decoy peptides: !Memory Saving Made!") else: - # can only report total number in normal memory mode self.get_logger().info("decoy peptides: %s", len(dpeps)) - # find intersecting peptides nonDecoys = upeps.intersection(dpeps) self.get_logger().info("#intersection: %s", len(nonDecoys)) @@ -298,37 +280,27 @@ def generate_decoypyrat_database(self) -> None: # if there are decoy peptides that are in the target peptide set if len(nonDecoys) > 0 and self._no_suffle == False: - # create empty dictionary with bad decoys as keys dAlternative = dict.fromkeys(nonDecoys, '') noAlternative = list() - # loop bad decoys / dictionary keys for dPep in dAlternative: i = 0 aPep = dPep - # shuffle until aPep is not in target set (maximum of 10 iterations) while aPep in upeps and i < self._max_iterations: - - # increment iteration counter i += 1 - - # shuffle peptide aPep = self.shuffle(dPep) # check if shuffling has an effect if not end iterations if aPep == dPep: i = self._max_iterations - # update dictionary with alternative shuffled peptide dAlternative[dPep] = aPep - # warn if peptide has no suitable alternative, add to removal list if i == self._max_iterations: noAlternative.append(dPep) self.get_logger().info('%s have no alternative peptide', len(noAlternative)) - # remove peptides with no alternative for p in noAlternative: del dAlternative[p] @@ -336,11 +308,8 @@ def generate_decoypyrat_database(self) -> None: upeps.clear() dpeps.clear() - # open second decoy file with open(self._output_file, "wt") as fout: - # Attach the target sequences to the database - # fasta = SeqIO.parse(self._input_fasta, 'fasta') with open(self._input_fasta) as handle: for value in SimpleFastaParser(handle): description = 
value[0] @@ -348,20 +317,15 @@ def generate_decoypyrat_database(self) -> None: fout.write('>' + description + '\n') fout.write(seq + '\n') - # open original decoy file with open(self._temp_file, "rt") as fin: - # loop each line of original decoy fasta for line in fin: # if line is not accession replace peptides in dictionary with alternatives if line[0] != '>': - # digest decoy sequence for p in cleave(sequence=line.rstrip(), rule=PGATK_ENZYMES.enzymes[self._enzyme]['cleavage rule'], missed_cleavages=0, min_length=self._min_peptide_length): - # store decoy peptide for final count dpeps.add(p) - # if decoy peptide is in dictionary replace with alternative if p in dAlternative: line = line.replace(p, dAlternative[p]) @@ -391,24 +355,19 @@ def pgatk_decoy_database(self) -> None: :return: """ - # Create empty sets to add all target and decoy peptides upeps = set() noAlternative = set() - # Open FASTA file using first cmd line argument fasta = SeqIO.parse(self._input_fasta, 'fasta') - # loop each seq in the file for record in fasta: seq = str(record.seq) if not self._isobaric: seq = seq.replace('I', 'L') - # digest sequence add peptides to the target set upeps.update( cleave(sequence=seq, rule=PGATK_ENZYMES.enzymes[self._enzyme]['cleavage rule'], missed_cleavages=self._max_missed_cleavages, min_length=self._min_peptide_length)) - # open orary decoy FASTA file with open(self._output_file, 'w') as outfa: fasta = SeqIO.parse(self._input_fasta, 'fasta') targets = [] @@ -418,7 +377,6 @@ def pgatk_decoy_database(self) -> None: targets.append(protseq) revprotseq = [] - # output target protein seq = str(record.seq) id_protein = record.id description = record.description @@ -431,7 +389,6 @@ def pgatk_decoy_database(self) -> None: if not self._isobaric: seq = seq.replace('I', 'L') - # reverse and switch protein sequence decoyseq = self.revswitch(seq, self._no_switch, PGATK_ENZYMES.enzymes[self._enzyme]['cleavage sites']) @@ -455,29 +412,23 @@ def pgatk_decoy_database(self) -> 
None: if found_in_target and not self._no_suffle and decoy_pep not in noAlternative: aPep = decoy_pep - # shuffle until aPep is not in target set (maximum of 10 iterations) i = 0 while aPep in upeps and i < self._max_iterations: - # increment iteration counter i += 1 - # shuffle peptide aPep = self.shuffle(aPep) # check if shuffling has an effect if not end iterations if aPep == decoy_pep: i = self._max_iterations - # warn if peptide has no suitable alternative, add to removal list if i == self._max_iterations: noAlternative.add(decoy_pep) aPep = '' - # if decoy is generated then add to the list of peptides if aPep: checked_decoy_peps.append(aPep) else: if self._keep_target_hits: checked_decoy_peps.append(decoy_pep) - # finally join the peptides to generate protein decoy if checked_decoy_peps: revprotseq.append(''.join(checked_decoy_peps)) diff --git a/pgatk/testdata/proteindb_from_custom_VCF.fa b/pgatk/testdata/proteindb_from_custom_VCF.fa index 5325f01..22f87e0 100644 --- a/pgatk/testdata/proteindb_from_custom_VCF.fa +++ b/pgatk/testdata/proteindb_from_custom_VCF.fa @@ -1,11 +1,11 @@ ->varsample_rs78350717_22.15528913.C.A_ENST00000252835 -MCPLTLQVTGLMNVSEPNSSFAFVNEFILQGFSCEWTIQIFLFSLFTTTYALTITGNGAIAFVLWCDRRLHTPMYMFLGNFSFLEIWYVSSTVPKMLVNFLSEKKNISFAGCFLQFYFFFSLGTSECLLLTVMAFDQYLAICRPLLYPNIMTGHLYAKLVILCWVCGFLWFLIPIVLISQMPFCGPNIIDHVVCDPGPRFALDCVSAPRIQLFCYTLSSLVIFGNFLFIIGSYTLVLKAMLGMPSSTGRHKDFSTCGSHLAVVSLCYSSLMVMYVSPGLGHSTGMQKIETLFYAMVTPLFNPLIYSLQNKEIKAALRKVLGSSNII* ->varsample_rs78350717_22.15528913.C.T_ENST00000252835 -MCPLTLQVTGLMNVSEPNSSFAFVNEFILQGFSCEWTIQIFLFSLFTTTYALTITGNGAIAFVLWCDRRLHTPMYMFLGNFSFLEIWYVSSTVPKMLVNFLSEKKNISFAGCFLQFYFFFSLGTSECLLLTVMAFDQYLAICRPLLYPNIMTGHLYAKLVILCWVCGFLWFLIPIVLISQMPFCGPNIIDHVVCDPGPRFALDCVSAPRIQLFCYTLSSLVIFGNFLFIIGSYTLVLKAMLGMPSSTGRHKVFSTCGSHLAVVSLCYSSLMVMYVSPGLGHSTGMQKIETLFYAMVTPLFNPLIYSLQNKEIKAALRKVLGSSNII* >varsample_rs78350717_22.15528913.C.A_ENST00000643195 
MNVSEPNSSFAFVNEFILQGFSCEWTIQIFLFSLFTTTYALTITGNGAIAFVLWCDRRLHTPMYMFLGNFSFLEIWYVSSTVPKMLVNFLSEKKNISFAGCFLQFYFFFSLGTSECLLLTVMAFDQYLAICRPLLYPNIMTGHLYAKLVILCWVCGFLWFLIPIVLISQMPFCGPNIIDHVVCDPGPRFALDCVSAPRIQLFCYTLSSLVIFGNFLFIIGSYTLVLKAMLGMPSSTGRHKDFSTCGSHLAVVSLCYSSLMVMYVSPGLGHSTGMQKIETLFYAMVTPLFNPLIYSLQNKEIKAALRKVLGSSNII* >varsample_rs78350717_22.15528913.C.T_ENST00000643195 MNVSEPNSSFAFVNEFILQGFSCEWTIQIFLFSLFTTTYALTITGNGAIAFVLWCDRRLHTPMYMFLGNFSFLEIWYVSSTVPKMLVNFLSEKKNISFAGCFLQFYFFFSLGTSECLLLTVMAFDQYLAICRPLLYPNIMTGHLYAKLVILCWVCGFLWFLIPIVLISQMPFCGPNIIDHVVCDPGPRFALDCVSAPRIQLFCYTLSSLVIFGNFLFIIGSYTLVLKAMLGMPSSTGRHKVFSTCGSHLAVVSLCYSSLMVMYVSPGLGHSTGMQKIETLFYAMVTPLFNPLIYSLQNKEIKAALRKVLGSSNII* +>varsample_rs78350717_22.15528913.C.A_ENST00000252835 +MCPLTLQVTGLMNVSEPNSSFAFVNEFILQGFSCEWTIQIFLFSLFTTTYALTITGNGAIAFVLWCDRRLHTPMYMFLGNFSFLEIWYVSSTVPKMLVNFLSEKKNISFAGCFLQFYFFFSLGTSECLLLTVMAFDQYLAICRPLLYPNIMTGHLYAKLVILCWVCGFLWFLIPIVLISQMPFCGPNIIDHVVCDPGPRFALDCVSAPRIQLFCYTLSSLVIFGNFLFIIGSYTLVLKAMLGMPSSTGRHKDFSTCGSHLAVVSLCYSSLMVMYVSPGLGHSTGMQKIETLFYAMVTPLFNPLIYSLQNKEIKAALRKVLGSSNII* +>varsample_rs78350717_22.15528913.C.T_ENST00000252835 +MCPLTLQVTGLMNVSEPNSSFAFVNEFILQGFSCEWTIQIFLFSLFTTTYALTITGNGAIAFVLWCDRRLHTPMYMFLGNFSFLEIWYVSSTVPKMLVNFLSEKKNISFAGCFLQFYFFFSLGTSECLLLTVMAFDQYLAICRPLLYPNIMTGHLYAKLVILCWVCGFLWFLIPIVLISQMPFCGPNIIDHVVCDPGPRFALDCVSAPRIQLFCYTLSSLVIFGNFLFIIGSYTLVLKAMLGMPSSTGRHKVFSTCGSHLAVVSLCYSSLMVMYVSPGLGHSTGMQKIETLFYAMVTPLFNPLIYSLQNKEIKAALRKVLGSSNII* >varsample_rs147461488_22.17740102.GGCCACGCTCAACT.G_ENST00000342111 MDCEVNNGSSLRDECITNLLVFGFLQSCSDNSFRRELDALGHELPVLAPQWEGYDELQTDGNRSSHSRLGRIEAGASDNNTASAEEETEAAGSVAFMELRQ*F*KSRRHHPEYCQAPRPGRGQHGP*HPSGPG >varsample_rs147461488_22.17740102.GGCCACGCTCAACT.G_ENST00000617586 diff --git a/pgatk/testdata/test_cosmic_classification.tsv b/pgatk/testdata/test_cosmic_classification.tsv new file mode 100644 index 0000000..e5a9f9c --- /dev/null +++ b/pgatk/testdata/test_cosmic_classification.tsv @@ -0,0 +1,9 @@ +COSMIC_PHENOTYPE_ID PRIMARY_SITE 
PRIMARY_HISTOLOGY SITE_SUBTYPE_1 HISTOLOGY_SUBTYPE_1 +COSO1001 upper_aerodigestive_tract carcinoma mouth squamous_cell_carcinoma +COSO1002 skin carcinoma upper_leg squamous_cell_carcinoma +COSO1003 bone other femur giant_cell_tumour +COSO1004 thyroid adenoma-nodule-goitre NS NS +COSO1005 liver other NS neoplasm +COSO1006 haematopoietic_and_lymphoid_tissue haematopoietic_neoplasm NS acute_myeloid_leukaemia +COSO1007 bone carcinoma NS NS +COSO1008 skin carcinoma NS NS diff --git a/pgatk/testdata/test_cosmic_mutations.tsv b/pgatk/testdata/test_cosmic_mutations.tsv index f4a9ba4..254cef5 100644 --- a/pgatk/testdata/test_cosmic_mutations.tsv +++ b/pgatk/testdata/test_cosmic_mutations.tsv @@ -1,13 +1,13 @@ -Gene name Accession Number Gene CDS length HGNC ID Sample name ID_sample ID_tumour Primary site Site subtype 1 Site subtype 2 Site subtype 3 Primary histology Histology subtype 1 Histology subtype 2 Histology subtype 3 Genome-wide screen Mutation ID Mutation CDS Mutation AA Mutation Description Mutation zygosity LOH GRCh Mutation genome position Mutation strand SNP Resistance Mutation FATHMM prediction FATHMM score Mutation somatic status Pubmed_PMID ID_STUDY Sample Type Tumour origin Age -HRAS ENST00000397596 570 5173 HN12PT 1542873 1465176 upper_aerodigestive_tract mouth NS NS carcinoma squamous_cell_carcinoma NS NS y COSM486 c.37G>C p.G13R Substitution - Missense u 38 11:534286-534286 - n - PATHOGENIC .99415 Confirmed somatic variant 21798897 surgery fresh/frozen primary -HRAS ENST00000397596 570 5173 V19 2688774 2547756 skin upper_leg NS NS carcinoma squamous_cell_carcinoma NS NS n COSM486 c.37G>C p.G13R Substitution - Missense u 38 11:534286-534286 - n - PATHOGENIC .99415 Confirmed somatic variant 24662767 surgery fresh/frozen NS -HRAS ENST00000397596 570 5173 S23372 702100 629222 bone femur NS NS other giant_cell_tumour NS NS n COSM487 c.37G>A p.G13S Substitution - Missense y 38 11:534286-534286 - n - PATHOGENIC .99324 Reported in another cancer sample as 
somatic 11903582 surgery-fixed recurrent 42 -HRAS ENST00000397596 570 5173 pel-102 720047 644990 upper_aerodigestive_tract mouth lip NS carcinoma squamous_cell_carcinoma NS NS n COSM484 c.35G>A p.G12D Substitution - Missense n 38 11:534288-534288 - n - PATHOGENIC .99322 Reported in another cancer sample as somatic 8253528 surgery fresh/frozen primary -HRAS ENST00000397596 570 5173 1965078 1965078 1851573 skin chest NS NS benign_melanocytic_nevus NS NS NS n COSM498 c.182A>T p.Q61L Substitution - Missense u 38 11:533874-533874 - n - PATHOGENIC .97511 Reported in another cancer sample as somatic 23599145 NS NS 30 -HRAS ENST00000397596 570 5173 1571189 1571189 1492737 skin NS NS NS other atypical_Spitzoid_tumour NS NS n COSM498 c.182A>T p.Q61L Substitution - Missense het u 38 11:533874-533874 - n - PATHOGENIC .97511 Reported in another cancer sample as somatic 21496703 surgery-fixed NS -HRAS ENST00000397596 570 5173 S61945 741791 664521 thyroid NS NS NS adenoma-nodule-goitre NS NS NS n COSM499 c.182A>G p.Q61R Substitution - Missense y 38 11:533874-533874 - n - PATHOGENIC .97179 Reported in another cancer sample as somatic 12727991 NS primary -HRAS ENST00000397596 570 5173 49T 2745889 2604592 liver NS NS NS other neoplasm NS NS y COSM1729837 c.403C>T p.R135* Substitution - Nonsense u 38 11:533500-533500 - n - PATHOGENIC .97257 Confirmed somatic variant 660 NS primary 67 -HRAS ENST00000397596 570 5173 DM3T4S 2762863 2621122 thyroid NS NS NS carcinoma follicular_carcinoma NS NS n COSM499 c.182A>G p.Q61R Substitution - Missense u 38 11:533874-533874 - n - PATHOGENIC .97179 Reported in another cancer sample as somatic 29615459 NS metastasis 58 -HRAS ENST00000397596 570 5173 TESTSAMPLE 9999991 9999981 haematopoietic_and_lymphoid_tissue NS NS NS haematopoietic_neoplasm acute_myeloid_leukaemia NS NS n COSM_TEST1 c.? 
p.G13(H^K) Substitution - Missense u 38 11:534286-534286 - n - PATHOGENIC .99415 Confirmed somatic variant 25858894 blood-bone marrow primary -HRAS ENST00000397596 570 5173 TESTSAMPLE 9999991 9999981 haematopoietic_and_lymphoid_tissue NS NS NS haematopoietic_neoplasm acute_myeloid_leukaemia NS NS n COSM_TEST2 c.? p.13_14>21 Complex - insertion inframe u 38 11:534286-534286 - n - PATHOGENIC .99415 Confirmed somatic variant 25858894 blood-bone marrow primary -HRAS ENST00000397596 570 5173 TESTSAMPLE 9999991 9999981 haematopoietic_and_lymphoid_tissue NS NS NS haematopoietic_neoplasm acute_myeloid_leukaemia NS NS n COSM_TEST3 c.? p.G13_S14>GS Insertion - In frame u 38 11:534286-534286 - n - PATHOGENIC .99415 Confirmed somatic variant 25858894 blood-bone marrow primary +GENE_SYMBOL COSMIC_GENE_ID TRANSCRIPT_ACCESSION COSMIC_SAMPLE_ID SAMPLE_NAME COSMIC_PHENOTYPE_ID GENOMIC_MUTATION_ID LEGACY_MUTATION_ID MUTATION_ID MUTATION_CDS MUTATION_AA MUTATION_DESCRIPTION MUTATION_ZYGOSITY LOH CHROMOSOME GENOME_START GENOME_STOP STRAND PUBMED_PMID COSMIC_STUDY_ID HGVSP HGVSC HGVSG GENOMIC_WT_ALLELE GENOMIC_MUT_ALLELE MUTATION_SOMATIC_STATUS POSITIVE_SCREEN +HRAS COSG11180 ENST00000397596 COSS1542873 HN12PT COSO1001 COSV56056559 COSM486 COSM486 c.37G>C p.G13R missense_variant u 11 534286 534286 - 21798897 p.Gly13Arg c.37G>C G C Confirmed somatic variant y +HRAS COSG11180 ENST00000397596 COSS2688774 V19 COSO1002 COSV56056559 COSM486 COSM486 c.37G>C p.G13R missense_variant u 11 534286 534286 - 24662767 p.Gly13Arg c.37G>C G C Confirmed somatic variant n +HRAS COSG11180 ENST00000397596 COSS702100 S23372 COSO1003 COSV56056558 COSM487 COSM487 c.37G>A p.G13S missense_variant y 11 534286 534286 - 11903582 p.Gly13Ser c.37G>A G A Reported in another cancer sample as somatic n +HRAS COSG11180 ENST00000397596 COSS720047 pel-102 COSO1001 COSV56056557 COSM484 COSM484 c.35G>A p.G12D missense_variant n 11 534288 534288 - 8253528 p.Gly12Asp c.35G>A G A Reported in another cancer sample as somatic n 
+HRAS COSG11180 ENST00000397596 COSS1965078 1965078 COSO1002 COSV56056584 COSM498 COSM498 c.182A>T p.Q61L missense_variant u 11 533874 533874 - 23599145 p.Gln61Leu c.182A>T A T Reported in another cancer sample as somatic n +HRAS COSG11180 ENST00000397596 COSS1571189 1571189 COSO1002 COSV56056584 COSM498 COSM498 c.182A>T p.Q61L missense_variant het u 11 533874 533874 - 21496703 p.Gln61Leu c.182A>T A T Reported in another cancer sample as somatic n +HRAS COSG11180 ENST00000397596 COSS741791 S61945 COSO1004 COSV56056585 COSM499 COSM499 c.182A>G p.Q61R missense_variant y 11 533874 533874 - 12727991 p.Gln61Arg c.182A>G A G Reported in another cancer sample as somatic n +HRAS COSG11180 ENST00000397596 COSS2745889 49T COSO1005 COSV56070001 COSM1729837 COSM1729837 c.403C>T p.R135* stop_gained u 11 533500 533500 - 660 p.Arg135Ter c.403C>T C T Confirmed somatic variant y +HRAS COSG11180 ENST00000397596 COSS2762863 DM3T4S COSO1004 COSV56056585 COSM499 COSM499 c.182A>G p.Q61R missense_variant u 11 533874 533874 - 29615459 p.Gln61Arg c.182A>G A G Reported in another cancer sample as somatic n +HRAS COSG11180 ENST00000397596 COSS9999991 TESTSAMPLE COSO1006 COSV_TEST1 COSM_TEST1 COSM_TEST1 c.36C>T p.G12= synonymous_variant u 11 534287 534287 - p.Gly12Gly c.36C>T C T Confirmed somatic variant n +HRAS COSG11180 ENST00000397596 COSS9999992 TESTSAMPLE2 COSO1007 COSV_TEST2 COSM_TEST2 COSM_TEST2 c.34_36del p.G12del inframe_deletion u 11 534285 534287 - p.Gly12del c.34_36del GGC - Confirmed somatic variant n +HRAS COSG11180 ENST00000397596 COSS9999993 TESTSAMPLE3 COSO1008 COSV_TEST3 COSM_TEST3 COSM_TEST3 c.? p.? frameshift_variant u 11 534286 534287 - p.? c.? . . 
Confirmed somatic variant n diff --git a/pgatk/testdata/test_cosmic_mutations_proteindb.fa b/pgatk/testdata/test_cosmic_mutations_proteindb.fa index 9516b34..e2fd977 100644 --- a/pgatk/testdata/test_cosmic_mutations_proteindb.fa +++ b/pgatk/testdata/test_cosmic_mutations_proteindb.fa @@ -1,12 +1,14 @@ ->COSMIC:HRAS:p.G13R:Substitution-Missense +>COSMIC:HRAS:p.G13R:missense_variant MTEYKLVVVGAGRVGKSALTIQLIQNHFVDEYDPTIEDSYRKQVVIDGETCLLDILDTAGQEEYSAMRDQYMRTGEGFLCVFAINNTKSFEDIHQYREQIKRVKDSDDVPMVLVGNKCDLAARTVESRQAQDLARSYGIPYIETSAKTRQGVEDAFYTLVREIRQHKLRKLNPPDESGPGCMSCKCVLS* ->COSMIC:HRAS:p.G13S:Substitution-Missense +>COSMIC:HRAS:p.G13S:missense_variant MTEYKLVVVGAGSVGKSALTIQLIQNHFVDEYDPTIEDSYRKQVVIDGETCLLDILDTAGQEEYSAMRDQYMRTGEGFLCVFAINNTKSFEDIHQYREQIKRVKDSDDVPMVLVGNKCDLAARTVESRQAQDLARSYGIPYIETSAKTRQGVEDAFYTLVREIRQHKLRKLNPPDESGPGCMSCKCVLS* ->COSMIC:HRAS:p.G12D:Substitution-Missense +>COSMIC:HRAS:p.G12D:missense_variant MTEYKLVVVGADGVGKSALTIQLIQNHFVDEYDPTIEDSYRKQVVIDGETCLLDILDTAGQEEYSAMRDQYMRTGEGFLCVFAINNTKSFEDIHQYREQIKRVKDSDDVPMVLVGNKCDLAARTVESRQAQDLARSYGIPYIETSAKTRQGVEDAFYTLVREIRQHKLRKLNPPDESGPGCMSCKCVLS* ->COSMIC:HRAS:p.Q61L:Substitution-Missense +>COSMIC:HRAS:p.Q61L:missense_variant MTEYKLVVVGAGGVGKSALTIQLIQNHFVDEYDPTIEDSYRKQVVIDGETCLLDILDTAGLEEYSAMRDQYMRTGEGFLCVFAINNTKSFEDIHQYREQIKRVKDSDDVPMVLVGNKCDLAARTVESRQAQDLARSYGIPYIETSAKTRQGVEDAFYTLVREIRQHKLRKLNPPDESGPGCMSCKCVLS* ->COSMIC:HRAS:p.Q61R:Substitution-Missense +>COSMIC:HRAS:p.Q61R:missense_variant MTEYKLVVVGAGGVGKSALTIQLIQNHFVDEYDPTIEDSYRKQVVIDGETCLLDILDTAGREEYSAMRDQYMRTGEGFLCVFAINNTKSFEDIHQYREQIKRVKDSDDVPMVLVGNKCDLAARTVESRQAQDLARSYGIPYIETSAKTRQGVEDAFYTLVREIRQHKLRKLNPPDESGPGCMSCKCVLS* ->COSMIC:HRAS:p.R135*:Substitution-Nonsense +>COSMIC:HRAS:p.R135*:stop_gained MTEYKLVVVGAGGVGKSALTIQLIQNHFVDEYDPTIEDSYRKQVVIDGETCLLDILDTAGQEEYSAMRDQYMRTGEGFLCVFAINNTKSFEDIHQYREQIKRVKDSDDVPMVLVGNKCDLAARTVESRQAQDLA*SYGIPYIETSAKTRQGVEDAFYTLVREIRQHKLRKLNPPDESGPGCMSCKCVLS* +>COSMIC:HRAS:p.G12del:inframe_deletion 
+MTEYKLVVVGAGVGKSALTIQLIQNHFVDEYDPTIEDSYRKQVVIDGETCLLDILDTAGQEEYSAMRDQYMRTGEGFLCVFAINNTKSFEDIHQYREQIKRVKDSDDVPMVLVGNKCDLAARTVESRQAQDLARSYGIPYIETSAKTRQGVEDAFYTLVREIRQHKLRKLNPPDESGPGCMSCKCVLS* diff --git a/pgatk/testdata/test_cosmic_mutations_proteindb_bone.fa b/pgatk/testdata/test_cosmic_mutations_proteindb_bone.fa index 3be6bf8..f96859a 100644 --- a/pgatk/testdata/test_cosmic_mutations_proteindb_bone.fa +++ b/pgatk/testdata/test_cosmic_mutations_proteindb_bone.fa @@ -1,2 +1,4 @@ ->COSMIC:HRAS:p.G13S:Substitution-Missense +>COSMIC:HRAS:p.G13S:missense_variant MTEYKLVVVGAGSVGKSALTIQLIQNHFVDEYDPTIEDSYRKQVVIDGETCLLDILDTAGQEEYSAMRDQYMRTGEGFLCVFAINNTKSFEDIHQYREQIKRVKDSDDVPMVLVGNKCDLAARTVESRQAQDLARSYGIPYIETSAKTRQGVEDAFYTLVREIRQHKLRKLNPPDESGPGCMSCKCVLS* +>COSMIC:HRAS:p.G12del:inframe_deletion +MTEYKLVVVGAGVGKSALTIQLIQNHFVDEYDPTIEDSYRKQVVIDGETCLLDILDTAGQEEYSAMRDQYMRTGEGFLCVFAINNTKSFEDIHQYREQIKRVKDSDDVPMVLVGNKCDLAARTVESRQAQDLARSYGIPYIETSAKTRQGVEDAFYTLVREIRQHKLRKLNPPDESGPGCMSCKCVLS* diff --git a/pgatk/testdata/test_cosmic_mutations_proteindb_liver.fa b/pgatk/testdata/test_cosmic_mutations_proteindb_liver.fa index fc6d70d..62dafeb 100644 --- a/pgatk/testdata/test_cosmic_mutations_proteindb_liver.fa +++ b/pgatk/testdata/test_cosmic_mutations_proteindb_liver.fa @@ -1,2 +1,2 @@ ->COSMIC:HRAS:p.R135*:Substitution-Nonsense +>COSMIC:HRAS:p.R135*:stop_gained MTEYKLVVVGAGGVGKSALTIQLIQNHFVDEYDPTIEDSYRKQVVIDGETCLLDILDTAGQEEYSAMRDQYMRTGEGFLCVFAINNTKSFEDIHQYREQIKRVKDSDDVPMVLVGNKCDLAARTVESRQAQDLA*SYGIPYIETSAKTRQGVEDAFYTLVREIRQHKLRKLNPPDESGPGCMSCKCVLS* diff --git a/pgatk/testdata/test_cosmic_mutations_proteindb_skin.fa b/pgatk/testdata/test_cosmic_mutations_proteindb_skin.fa index b365210..6d8f4ab 100644 --- a/pgatk/testdata/test_cosmic_mutations_proteindb_skin.fa +++ b/pgatk/testdata/test_cosmic_mutations_proteindb_skin.fa @@ -1,4 +1,4 @@ ->COSMIC:HRAS:p.G13R:Substitution-Missense +>COSMIC:HRAS:p.G13R:missense_variant 
MTEYKLVVVGAGRVGKSALTIQLIQNHFVDEYDPTIEDSYRKQVVIDGETCLLDILDTAGQEEYSAMRDQYMRTGEGFLCVFAINNTKSFEDIHQYREQIKRVKDSDDVPMVLVGNKCDLAARTVESRQAQDLARSYGIPYIETSAKTRQGVEDAFYTLVREIRQHKLRKLNPPDESGPGCMSCKCVLS* ->COSMIC:HRAS:p.Q61L:Substitution-Missense +>COSMIC:HRAS:p.Q61L:missense_variant MTEYKLVVVGAGGVGKSALTIQLIQNHFVDEYDPTIEDSYRKQVVIDGETCLLDILDTAGLEEYSAMRDQYMRTGEGFLCVFAINNTKSFEDIHQYREQIKRVKDSDDVPMVLVGNKCDLAARTVESRQAQDLARSYGIPYIETSAKTRQGVEDAFYTLVREIRQHKLRKLNPPDESGPGCMSCKCVLS* diff --git a/pgatk/testdata/test_cosmic_mutations_proteindb_thyroid.fa b/pgatk/testdata/test_cosmic_mutations_proteindb_thyroid.fa index dc17c3d..b9ead29 100644 --- a/pgatk/testdata/test_cosmic_mutations_proteindb_thyroid.fa +++ b/pgatk/testdata/test_cosmic_mutations_proteindb_thyroid.fa @@ -1,2 +1,2 @@ ->COSMIC:HRAS:p.Q61R:Substitution-Missense +>COSMIC:HRAS:p.Q61R:missense_variant MTEYKLVVVGAGGVGKSALTIQLIQNHFVDEYDPTIEDSYRKQVVIDGETCLLDILDTAGREEYSAMRDQYMRTGEGFLCVFAINNTKSFEDIHQYREQIKRVKDSDDVPMVLVGNKCDLAARTVESRQAQDLARSYGIPYIETSAKTRQGVEDAFYTLVREIRQHKLRKLNPPDESGPGCMSCKCVLS* diff --git a/pgatk/testdata/test_cosmic_mutations_proteindb_upperaerodigestivetract.fa b/pgatk/testdata/test_cosmic_mutations_proteindb_upperaerodigestivetract.fa index f3118ae..9b3cf5c 100644 --- a/pgatk/testdata/test_cosmic_mutations_proteindb_upperaerodigestivetract.fa +++ b/pgatk/testdata/test_cosmic_mutations_proteindb_upperaerodigestivetract.fa @@ -1,4 +1,4 @@ ->COSMIC:HRAS:p.G13R:Substitution-Missense +>COSMIC:HRAS:p.G13R:missense_variant MTEYKLVVVGAGRVGKSALTIQLIQNHFVDEYDPTIEDSYRKQVVIDGETCLLDILDTAGQEEYSAMRDQYMRTGEGFLCVFAINNTKSFEDIHQYREQIKRVKDSDDVPMVLVGNKCDLAARTVESRQAQDLARSYGIPYIETSAKTRQGVEDAFYTLVREIRQHKLRKLNPPDESGPGCMSCKCVLS* ->COSMIC:HRAS:p.G12D:Substitution-Missense +>COSMIC:HRAS:p.G12D:missense_variant MTEYKLVVVGADGVGKSALTIQLIQNHFVDEYDPTIEDSYRKQVVIDGETCLLDILDTAGQEEYSAMRDQYMRTGEGFLCVFAINNTKSFEDIHQYREQIKRVKDSDDVPMVLVGNKCDLAARTVESRQAQDLARSYGIPYIETSAKTRQGVEDAFYTLVREIRQHKLRKLNPPDESGPGCMSCKCVLS* diff --git a/pgatk/tests/pgatk_tests.py 
b/pgatk/tests/pgatk_tests.py index 825a642..a738bdf 100644 --- a/pgatk/tests/pgatk_tests.py +++ b/pgatk/tests/pgatk_tests.py @@ -178,7 +178,8 @@ def test_cosmic_to_proteindb(self): '--input_mutation', 'testdata/test_cosmic_mutations.tsv', '--input_genes', 'testdata/test_cosmic_genes.fa', '--output_db', 'testdata/test_cosmic_mutations_proteindb.fa', - '--filter_column', 'Primary site', + '--clinical_sample_file', 'testdata/test_cosmic_classification.tsv', + '--filter_column', 'PRIMARY_SITE', '--split_by_filter_column', '--accepted_values', 'all']) if result.exit_code != 0: print(result.exception) diff --git a/pgatk/tests/test_cosmic_downloader.py b/pgatk/tests/test_cosmic_downloader.py new file mode 100644 index 0000000..5564495 --- /dev/null +++ b/pgatk/tests/test_cosmic_downloader.py @@ -0,0 +1,46 @@ +"""Unit tests for the COSMIC downloader URL construction and auth token. + +No network. Two tests: + * `test_build_api_url_pattern` — verifies build_api_url() composes the + scripted-download endpoint correctly from server, api_endpoint, path + and bucket. + * `test_basic_auth_token_format` — verifies the base64(user:password) + token has no trailing newline (a common pitfall when echo'ing). 
+""" +from pgatk.cgenomes.cosmic_downloader import CosmicDownloadService +from pgatk.config.registry import load_config + + +def test_build_api_url_pattern(tmp_path): + """The API URL should be {server}/{endpoint}?path={path}&bucket={bucket}.""" + config_data = load_config("cosmic", None) + pipeline_arguments = { + CosmicDownloadService.CONFIG_OUTPUT_DIRECTORY: str(tmp_path), + CosmicDownloadService.CONFIG_COSMIC_FTP_USER: "user@example.com", + CosmicDownloadService.CONFIG_COSMIC_FTP_PASSWORD: "secret", + } + svc = CosmicDownloadService(config_data, pipeline_arguments) + + url = svc.build_api_url("grch38/cosmic/v103/Cosmic_MutantCensus_Tsv_v103_GRCh38.tar") + + assert url == ( + "https://cancer.sanger.ac.uk/api/mono/products/v1/downloads/scripted" + "?path=grch38/cosmic/v103/Cosmic_MutantCensus_Tsv_v103_GRCh38.tar" + "&bucket=downloads" + ) + + +def test_basic_auth_token_format(tmp_path): + """Auth token is base64(user:password), no trailing newline.""" + import base64 + + config_data = load_config("cosmic", None) + pipeline_arguments = { + CosmicDownloadService.CONFIG_OUTPUT_DIRECTORY: str(tmp_path), + CosmicDownloadService.CONFIG_COSMIC_FTP_USER: "user@example.com", + CosmicDownloadService.CONFIG_COSMIC_FTP_PASSWORD: "secret", + } + svc = CosmicDownloadService(config_data, pipeline_arguments) + + expected = base64.b64encode(b"user@example.com:secret").decode("utf-8") + assert svc._cosmic_token == expected diff --git a/pgatk/tests/test_variant_types_hgvs.py b/pgatk/tests/test_variant_types_hgvs.py new file mode 100644 index 0000000..f699088 --- /dev/null +++ b/pgatk/tests/test_variant_types_hgvs.py @@ -0,0 +1,449 @@ +"""HGVS compliance tests for get_mut_pro_seq across all SO variant types. + +Each test uses a real mutation from Cosmic_CompleteTargetedScreensMutant_v103_GRCh38.tsv +paired with the matching CDS from Cosmic_Genes_v103_GRCh38.fasta. 
+ +HGVS rules verified per type: + missense_variant – single AA substitution, length unchanged + stop_gained – stop (*) introduced at predicted position + inframe_deletion – protein shortened by exact number of deleted codons + inframe_insertion – protein lengthened by exact number of inserted codons (dup) + protein_altering_variant– delins produces a non-empty altered sequence + frameshift_variant – out-of-frame deletion produces a non-empty translated sequence + start_lost – start codon destroyed; first AA is no longer Met + stop_lost – stop codon removed; protein extends past original terminus + stop_retained_variant – stop codon changed but still a stop; length unchanged + synonymous_variant – filtered before get_mut_pro_seq (pre-processing step) + intron_variant – p.? guard returns empty string + 3_prime_UTR_variant – p.? guard returns empty string + 5_prime_UTR_variant – p.? guard returns empty string + coding_sequence_variant – p.? guard returns empty string + splice_acceptor_variant – p.? guard returns empty string + splice_donor_variant – intronic offset guard returns empty string + splice_region_variant – treated as missense when combined type contains missense_variant + incomplete_terminal_codon_variant – not present in v103 targeted screens; p.? 
returns "" +""" +from Bio.Seq import Seq + +from pgatk.cgenomes.cgenomes_proteindb import CancerGenomesService +from pgatk.cgenomes.models import SNP + +# --------------------------------------------------------------------------- +# CDS sequences from Cosmic_Genes_v103_GRCh38.fasta +# Key: gene symbol Value: CDS nucleotide string (ENST accession in comment) +# --------------------------------------------------------------------------- +_CDS = { + # ENST00000483967.5 – used for missense_variant + "EZH2": ( + "ATGGGCCAGACTGGGAAGAAATCTGAGAAGGGACCAGTTTGTTGGCGGAAGCGTGTAAAATCAGAGTACATGCGACTGAGACAGCTCAAGAGGTTCAGACGAGCTGATGAAGTAAAGAGTATGTTTAGTTCCAATCGTCAGAAAATTTTGGAAAGAACGGAAATCTTAAACCAAGAATGGAAACAGCGAAGGATACAGCCTGTGCACATCCTGACTTCTTGTTCGGTGACCAGTGACTTGGATTTTCCAACACAAGTCATCCCATTAAAGACTCTGAATGCAGTTGCTTCAGTACCCATAATGTATTCTTGGTCTCCCCTACAGCAGAATTTTATGGTGGAAGATGAAACTGTTTTACATAACATTCCTTATATGGGAGATGAAGTTTTAGATCAGGATGGTACTTTCATTGAAGAACTAATAAAAAATTATGATGGGAAAGTACACGGGGATAGAGAATGTGGGTTTATAAATGATGAAATTTTTGTGGAGTTGGTGAATGCCCTTGGTCAATATAATGATGATGACGATGATGATGATGGAGACGATCCTGAAGAAAGAGAAGAAAAGCAGAAAGATCTGGAGGATCACCGAGATGATAAAGAAAGCCGCCCACCTCGGAAATTTCCTTCTGATAAAATTTTTGAAGCCATTTCCTCAATGTTTCCAGATAAGGGCACAGCAGAAGAACTAAAGGAAAAATATAAAGAACTCACCGAACAGCAGCTCCCAGGCGCACTTCCTCCTGAATGTACCCCCAACATAGATGGACCAAATGCTAAATCTGTTCAGAGAGAGCAAAGCTTACACTCCTTTCATACGCTTTTCTGTAGGCGATGTTTTAAATATGACTGCTTCCTACATCCTTTTCATGCAACACCCAACACTTATAAGCGGAAGAACACAGAAACAGCTCTAGACAACAAACCTTGTGGACCACAGTGTTACCAGCATTTGGAGGGAGCAAAGGAGTTTGCTGCTGCTCTCACCGCTGAGCGGATAAAGACCCCACCAAAACGTCCAGGAGGCCGCAGAAGAGGACGGCTTCCCAATAACAGTAGCAGGCCCAGCACCCCCACCATTAATGTGCTGGAATCAAAGGATACAGACAGTGATAGGGAAGCAGGGACTGAAACGGGGGGAGAGAACAATGATAAAGAAGAAGAAGAGAAGAAAGATGAAACTTCGAGCTCCTCTGAAGCAAATTCTCGGTGTCAAACACCAATAAAGATGAAGCCAAATATTGAACCTCCTGAGAATGTGGAGTGGAGTGGTGCTGAAGCCTCAATGTTTAGAGTCCTCATTGGCACTTACTATGACAATTTCTGTGCCATTGCTAGGTTAATTGGGACCAAAACATGTAGACAGGTGTATGAGTTTAGAGTCAAAGAATCTAGCATCATAGCTCCAGCTCCCGCTGAGGATGTGGATACTCCTCCAAGGAAAAAGAAGAGGAAACACCGGTTGTGGGCTGCACACTGCAGA
AAGATACAGCTGAAAAAGGACGGCTCCTCTAACCATGTTTACAACTATCAACCCTGTGATCATCCACGGCAGCCTTGTGACAGTTCGTGCCCTTGTGTGATAGCACAAAATTTTTGTGAAAAGTTTTGTCAATGTAGTTCAGAGTGTCAAAACCGCTTTCCGGGATGCCGCTGCAAAGCACAGTGCAACACCAAGCAGTGCCCGTGCTACCTGGCTGTCCGAGAGTGTGACCCTGACCTCTGTCTTACTTGTGGAGCCGCTGACCATTGGGACAGTAAAAATGTGTCCTGCAAGAACTGCAGTATTCAGCGGGGCTCCAAAAAGCATCTATTGCTGGCACCATCTGACGTGGCAGGCTGGGGGATTTTTATCAAAGATCCTGTGCAGAAAAATGAATTCATCTCAGAATACTGTGGAGAGATTATTTCTCAAGATGAAGCTGACAGAAGAGGGAAAGTGTATGATAAATACATGTGCAGCTTTCTGTTCAACTTGAACAATGATTTTGTGGTGGATGCAACCCGCAAGGGTAACAAAATTCGTTTTGCAAATCATTCGGTAAATCCAAACTGCTATGCAAAAGTTATGATGGTTAACGGTGATCACAGGATAGGTATTTTTGCCAAGAGAGCCATCCAGACTGGCGAAGAGCTGTTTTTTGATTACAGATACAGCCAGGCTGATGCCCTGAAGTATGTCGGCATCGAAAGAGAAATGGAAATCCCTTGA" + ), + # ENST00000613418.4 – used for stop_gained + "FCGR3B": ( + "ATGCGGACTGAAGATCTCCCAAAGGCTGTGGTGTTCCTGGAGCCTCAATGGTACAGCGTGCTTGAGAAGGACAGTGTGACTCTGAAGTGCCAGGGAGCC" + "TACTCCCCTGAGGACAATTCCACACAGTGGTTTCACAATGAGAACCTCATCTCAAGCCAGGCCTCGAGCTACTTCATTGACGCTGCCACAGTCAACGAC" + "AGTGGAGAGTACAGGTGCCAGACAAACCTCTCCACCCTCAGTGACCCGGTGCAGCTAGAAGTCCATATCGGCTGGCTGTTGCTCCAGGCCCCTCGGTGGG" + "TGTTCAAGGAGGAAGACCCTATTCACCTGAGGTGTCACAGCTGGAAGAACACTGCTCTGCATAAGGTCACATATTTACAGAATGGCAAAGACAGGAAGTAT" + "TTTCATCATAATTCTGACTTCCACATTCCAAAAGCCACACTCAAAGATAGCGGCTCCTACTTCTGCAGGGGGCTTGTTGGGAGTAAAAATGTGTCTTCAGA" + "GACTGTGAACATCACCATCACTCAAGGTTTGGCAGTGTCAACCATCTCATCATTCTCTCCACCTGGGTACCAAGTCTCTTTCTGCTTGGTGATGGTACTCC" + "TTTTTGCAGTGGACACAGGACTATATTTCTCTGTGAAGACAAACATTTGA" + ), + # ENST00000646174.1 – used for inframe_deletion + "CTNNB1": ( + "ATGGAGTTGGACATGGCCATGGAACCAGACAGAAAAGCGGCTGTTAGTCACTGGCAGCAACAGTCTTACCTGGACTCTGGAATCCATTCTGGTGCCACTAC" + "CACAGCTCCTTCTCTGAGTGGTAAAGGCAATCCTGAGGAAGAGGATGTGGATACCTCCCAAGTCCTGTATGAGTGGGAACAGGGATTTTCTCAGTCCTTCA" + "CTCAAGAACAAGTAGCTGATATTGATGGACAGTATGCAATGACTCGAGCTCAGAGGGTACGAGCTGCTATGTTCCCTGAGACATTAGATGAGGGCATGCAGA" + "TCCCATCTACACAGTTTGATGCTGCTCATCCCACTAATGTCCAGCGTTTGGCTGAACCATCACAGATGCTGAAACATGCAGTTGTAAACTTGATTAACTATCA" + 
"AGATGATGCAGAACTTGCCACACGTGCAATCCCTGAACTGACAAAACTGCTAAATGACGAGGACCAGGTGGTGGTTAATAAGGCTGCAGTTATGGTCCATCAG" + "CTTTCTAAAAAGGAAGCTTCCAGACACGCTATCATGCGTTCTCCTCAGATGGTGTCTGCTATTGTACGTACCATGCAGAATACAAATGATGTAGAAACAGCTCG" + "TTGTACCGCTGGGACCTTGCATAACCTTTCCCATCATCGTGAGGGCTTACTGGCCATCTTTAAGTCTGGAGGCATTCCTGCCCTGGTGAAAATGCTTGGTTCAC" + "CAGTGGATTCTGTGTTGTTTTATGCCATTACAACTCTCCACAACCTTTTATTACATCAAGAAGGAGCTAAAATGGCAGTGCGTTTAGCTGGTGGGCTGCAGAAAA" + "TGGTTGCCTTGCTCAACAAAACAAATGTTAAATTCTTGGCTATTACGACAGACTGCCTTCAAATTTTAGCTTATGGCAACCAAGAAAGCAAGCTCATCATACTGG" + "CTAGTGGTGGACCCCAAGCTTTAGTAAATATAATGAGGACCTATACTTACGAAAAACTACTGTGGACCACAAGCAGAGTGCTGAAGGTGCTATCTGTCTGCTCTA" + "GTAATAAGCCGGCTATTGTAGAAGCTGGTGGAATGCAAGCTTTAGGACTTCACCTGACAGATCCAAGTCAACGTCTTGTTCAGAACTGTCTTTGGACTCTCAGGA" + "ATCTTTCAGATGCTGCAACTAAACAGGAAGGGATGGAAGGTCTCCTTGGGACTCTTGTTCAGCTTCTGGGTTCAGATGATATAAATGTGGTCACCTGTGCAGCTG" + "GAATTCTTTCTAACCTCACTTGCAATAATTATAAGAACAAGATGATGGTCTGCCAAGTGGGTGGTATAGAGGCTCTTGTGCGTACTGTCCTTCGGGCTGGTGACAG" + "GGAAGACATCACTGAGCCTGCCATCTGTGCTCTTCGTCATCTGACCAGCCGACACCAAGAAGCAGAGATGGCCCAGAATGCAGTTCGCCTTCACTATGGACTACCA" + "GTTGTGGTTAAGCTCTTACACCCACCATCCCACTGGCCTCTGATAAAGGCTACTGTTGGATTGATTCGAAATCTTGCCCTTTGTCCCGCAAATCATGCACCTTTGC" + "GTGAGCAGGGTGCCATTCCACGACTAGTTCAGTTGCTTGTTCGTGCACATCAGGATACCCAGCGCCGTACGTCCATGGGTGGGACACAGCAGCAATTTGTGGAGGG" + "GGTCCGCATGGAAGAAATAGTTGAAGGTTGTACCGGAGCCCTTCACATCCTAGCTCGGGATGTTCACAACCGAATTGTTATCAGAGGACTAAATACCATTCCATTG" + "TTTGTGCAGCTGCTTTATTCTCCCATTGAAAACATCCAAAGAGTAGCTGCAGGGGTCCTCTGTGAACTTGCTCAGGACAAGGAAGCTGCAGAAGCTATTGAAGCTG" + "AGGGAGCCACAGCTCCTCTGACAGAGTTACTTCACTCTAGGAATGAAGGTGTGGCGACATATGCAGCTGCTGTTTTGTTCCGAATGTCTGAGGACAAGCCACAAGAT" + "TACAAGAAACGGCTTTCAGTTGAGCTGACCAGCTCTCTCTTCAGAACAGAGCCAATGGCTTGGAATGAGACTGCTGATCTTGGACTTGATATTGGTGCCCAGGGAGAA" + "CCCCTTGGATATCGCCAGGATGATCCTAGCTATCGTTCTTTTCACTCTGGTGGATATGGCCAGGATGCCTTGGGTATGGACCCCATGATGGAACATGAGATGGGTGGC" + "CACCACCCTGGTGCTGACTATCCAGTTGATGGGCTGCCAGATCTGGGGCATGCCCAGGACCTCATGGATGGGCTGCCTCCAGGTGACAGCAATCAGCTGGCCTGGTTT" + 
"GATACTGACCTGTAA" + ), + # ENST00000370452.7 – used for inframe_insertion (dup) + "SMAP1": ( + "ATGGCGACGCGCTCCTGTCGGGAGAAGGCTCAGAAGCTGAACGAGCAGCACCAGCTCATCCTATCCAAGCTTCTGAGGGAGGAGGACAACAAGTACTGCGCC" + "GACTGCGAGGCCAAAGGTCCTCGATGGGCTTCCTGGAATATTGGTGTGTTTATTTGCATCAGATGTGCTGGAATTCATAGAAATCTTGGGGTTCATATATCC" + "AGGGTCAAATCAGTCAACCTAGACCAATGGACAGCAGAACAGATACAGTGCATGCAAGATATGGGAAATACTAAAGCAAGACTACTCTATGAAGCCAATCTTCC" + "AGAGAACTTTCGAAGACCACAGACAGATCAAGCAGTGGAATTTTTCATCAGAGATAAATATGAAAAGAAGAAATACTACGATAAAAATGCCATAGCTATTACAAAT" + "AAAGAAAAGGAAGAAAAAAAAGGAAGAGAAAAAGAGAGAAAAGGAGCCAGAAAAGCCGGCAAAACCACTTACAGCTGAAAAGCTGCAGAAGAAAGATCAGCAACTGG" + "AGCCTAAAAAAAGTACCAGCCCTAAAAAAGCTGCGGAGCCCACTGTGGATCTTTTAGGACTTGATGGCCCTGCTGTGGCACCAGTGACCAACGGGAACACAACGGTG" + "CCACCCCTGAACGATGATCTGGACATCTTTGGACCGATGATTTCTAATCCCTTACCTGCAACTGTCATGCCCCCAGCTCAGGGGACACCCTCTGCACCAGCAGCTGCA" + "ACCCTGTCTACAGTAACATCTGGGGATCTAGATTTATTCACTGAGCAAACTACAAAATCAGAAGAAGTGGCAAAGAAACAACTTTCCAAAGACTCCATCTTATCTCTG" + "TATGGCACAGGAACCATTCAACAGCAAAGTACTCCTGGTGTATTTATGGGACCCACAAATATACCATTTACCTCACAAGCACCAGCTGCATTTCAGGGCTTTCCATCG" + "ATGGGCGTGCCTGTGCCTGCAGCTCCTGGCCTTATAGGAAATGTGATGGGACAGAGTCCAAGCATGATGGTGGGCATGCCCATGCCCAATGGGTTTATGGGAAATGCA" + "CAAACTGGTGTGATGCCACTTCCTCAGAACGTTGTTGGCCCCCAAGGAGGAATGGTGGGACAAATGGGTGCACCCCAGAGTAAGTTTGGCCTGCCGCAAGCTCAGCAG" + "CCCCAGTGGAGCCTCTCACAGATAATGCAGAAGGGTGATGCTGTTCTCCAGCACTCCATCAGTGCAATCTACTGGCCAATGACAAGGTGGTTAAAATGTCCTTTAGT" + "AGATGAATCAGCAGATGGCTGGCATGAGTATCAGTAG" + ), + # ENST00000361676.8 – used for protein_altering_variant + "NF2": ( + "ATGGCCGGGGCCATCGCTTCCCGCATGAGCTTCAGCTCTCTCAAGAGGAAGCAACCCAAGACGTTCACCGTGAGGATCGTCACCATGGACGCCGAGATGGAG" + "TTCAATTGCGAGGTACTGGATCATGATGTTTCAAAGGAAGAACCAGTCACCTTTCACTTCTTGGCCAAATTTTATCCTGAGAATGCTGAAGAGGAGCTGGTT" + "CAGGAGATCACACAACATTTATTCTTCTTACAGGTAAAGAAGCAGATTTTAGATGAAAAGATCTACTGCCCTCCTGAGGCTTCTGTGCTCCTGGCTTCTTACG" + "CCGTCCAGGCCAAGTATGGTGACTACGACCCCAGTGTTCACAAGCGGGGATTTTTGGCCCAAGAGGAATTGCTTCCAAAAAGGGTAATAAATCTGTATCAGAT" + 
"GACTCCGGAAATGTGGGAGGAGAGAATTACTGCTTGGTACGCAGAGCACCGAGGCCGAGCCAGGGATGAAGCTGAAATGGAATATCTGAAGATAGCTCAGGAC" + "CTGGAGATGTACGGTGTGAACTACTTTGCAATCCGGAATAAAAAGGGCACAGAGCTGCTGCTTGGAGTGGATGCCCTGGGGCTTCACATTTATGACCCTGAGA" + "ACAGACTGACCCCCAAGATCTCCTTCCCGTGGAATGAAATCCGAAACATCTCGTACAGTGACAAGGAGTTTACTATTAAACCACTGGATAAGAAAATTGATGTC" + "TTCAAGTTTAACTCCTCAAAGCTTCGTGTTAATAAGCTGATTCTCCAGCTATGTATCGGGAACCATGATCTATTTATGAGGAGAAGGAAAGCCGATTCTTTGGAA" + "GTTCAGCAGATGAAAGCCCAGGCCAGGGAGGAGAAGGCTAGAAAGCAGATGGAGCGGCAGCGCCTCGCTCGAGAGAAGCAGATGAGGGAGGAGGCTGAACGCACG" + "AGGGATGAGTTGGAGAGGAGGCTGCTGCAGATGAAAGAAGAAGCAACAATGGCCAACGAAGCACTGATGCGGTCTGAGGAGACAGCTGACCTGTTGGCTGAAAAG" + "GCCCAGATCACCGAGGAGGAGGCAAAACTTCTGGCCCAGAAGGCCGCAGAGGCTGAGCAGGAAATGCAGCGCATCAAGGCCACAGCGATTCGCACGGAGGAGGAG" + "AAGCGCCTGATGGAGCAGAAGGTGCTGGAAGCCGAGGTGCTGGCACTGAAGATGGCTGAGGAGTCAGAGAGGAGGGCCAAAGAGGCAGATCAGCTGAAGCAGGAC" + "CTGCAGGAAGCACGCGAGGCGGAGCGAAGAGCCAAGCAGAAGCTCCTGGAGATTGCCACCAAGCCCACGTACCCGCCCATGAACCCAATTCCAGCACCGTTGCCTC" + "CTGACATACCAAGCTTCAACCTCATTGGTGACAGCCTGTCTTTCGACTTCAAAGATACTGACATGAAGCGGCTTTCCATGGAGATAGAGAAAGAAAAAGTGGAATAC" + "ATGGAAAAGAGCAAGCATCTGCAGGAGCAGCTCAATGAACTCAAGACAGAAATCGAGGCCTTGAAACTGAAAGAGAGGGAGACAGCTCTGGATATTCTGCACAATGA" + "GAACTCCGACAGGGGTGGCAGCAGCAAGCACAATACCATTAAAAAGCCTCAAGCCCAAGGCAGAAGACCTATCTGCATTTGA" + ), + # ENST00000394374.6 – used for frameshift_variant + "MEN1": ( + "ATGGGGCTGAAGGCCGCCCAGAAGACGCTGTTCCCGCTGCGCTCCATCGACGACGTGGTGCGCCTGTTTGCTGCCGAGCTGGGCCGAGAGGAGCCGGACCTG" + "GTGCTCCTTTCCTTGGTGCTGGGCTTCGTGGAGCATTTTCTGGCTGTCAACCGCGTCATCCCTACCAACGTTCCCGAGCTCACCTTCCAGCCCAGCCCCGCCC" + "CCGACCCGCCTGGCGGCCTCACCTACTTTCCCGTGGCCGACCTGTCTATCATCGCCGCCCTCTATGCCCGCTTCACCGCCCAGATCCGAGGCGCCGTCGACCTG" + "TCCCTCTATCCTCGAGAAGGGGGTGTCTCCAGCCGTGAGCTGGTGAAGAAGGTCTCCGATGTCATATGGAACAGCCTCAGCCGCTCCTACTTCAAGGATCGGGCC" + "CACATCCAGTCCCTCTTCAGCTTCATCACAGGTTGGAGCCCAGTAGGCACCAAATTGGACAGCTCCGGTGTGGCCTTTGCTGTGGTTGGGGCCTGCCAGGCCCTGG" + 
"GTCTCCGGGATGTCCACCTCGCCCTGTCTGAGGATCATGCCTGGGTAGTGTTTGGGCCCAATGGGGAGCAGACAGCTGAGGTCACCTGGCACGGCAAGGGCAACGA" + "GGACCGCAGGGGCCAGACAGTCAATGCCGGTGTGGCTGAGCGGAGCTGGCTGTACCTGAAAGGATCATACATGCGCTGTGACCGCAAGATGGAGGTGGCGTTCATG" + "GTGTGTGCCATCAACCCTTCCATTGACCTGCACACCGACTCGCTGGAGCTTCTGCAGCTGCAGCAGAAGCTGCTCTGGCTGCTCTATGACCTGGGACATCTGGAAAG" + "GTACCCCATGGCCTTAGGGAACCTGGCAGATCTAGAGGAGCTGGAGCCCACCCCTGGCCGGCCAGACCCACTCACCCTCTACCACAAGGGCATTGCCTCAGCCAAGAC" + "CTACTATCGGGATGAACACATCTACCCCTACATGTACCTGGCTGGCTACCACTGTCGCAACCGCAATGTGCGGGAAGCCCTGCAGGCCTGGGCGGACACGGCCACTGT" + "CATCCAGGACTACAACTACTGCCGGGAAGACGAGGAGATCTACAAGGAGTTCTTTGAAGTAGCCAATGATGTCATCCCCAACCTGCTGAAGGAGGCAGCCAGCTTGCT" + "GGAGGCGGGCGAGGAGCGGCCGGGGGAGCAAAGCCAGGGCACCCAGAGCCAAGGTTCCGCCCTCCAGGACCCTGAGTGCTTCGCCCACCTGCTGCGATTCTACGACGG" + "CATCTGCAAATGGGAGGAGGGCAGTCCCACGCCTGTGCTGCATGTGGGCTGGGCCACCTTTCTTGTGCAGTCCCTAGGCCGTTTTGAGGGACAGGTGCGGCAGAAGGT" + "GCGCATAGTGAGCCGAGAGGCCGAGGCGGCCGAGGCCGAGGAGCCGTGGGGCGAGGAAGCCCGGGAAGGCCGGCGGCGGGGCCCACGGCGGGAGTCCAAGCCAGAGGAG" + "CCCCCGCCGCCCAAGAAGCCAGCACTGGACAAGGGCCTGGGCACCGGCCAGGGTGCAGTGTCAGGACCCCCCCGGAAGCCTCCTGGGACTGTCGCTGGCACAGCCCGAGG" + "CCCTGAAGGTGGCAGCACGGCTCAGGTGCCAGCACCCACAGCATCACCACCGCCGGAGGGTCCAGTGCTCACTTTCCAGAGTGAGAAGATGAAGGGCATGAAGGAGCTGC" + "TGGTGGCCACCAAGATCAACTCGAGCGCCATCAAGCTGCAACTCACGGCACAGTCGCAAGTGCAGATGAAGAAGCAGAAAGTGTCCACCCCTAGTGACTACACTCTGTCT" + "TTCCTCAAGCGGCAGCGCAAAGGCCTCTGA" + ), + # ENST00000513000.5 – used for start_lost + "INPP4B": ( + "ATGGAAATTAAAGAGGAAGGGGCATCAGAAGAAGGGCAGCACTTTCTTCCTACAGCCCAGGCCAATGATCCCGGGGACTGTCAGTTCACAAGTATCCAGAAGAC" + "TCCAAATGAACCGCAGTTGGAATTCATCCTTGCATGCAAGGATCTCGTGGCTCCTGTCCGTGATCGTAAACTGAATACACTGGTGCAGATCTCCGTAATCCACC" + "CCGTGGAGCAGAGTCTGACAAGATACTCCAGCACCGAAATTGTGGAGGGAACAAGGGACCCACTGTTTTTGACTGGTGTCACATTCCCATCTGAGTATCCCATCT" + "ATGAGGAGACCAAAATAAAACTAACAGTCTATGATGTCAAGGATAAGTCTCATGACACCGTTCGAACCAGTGTCCTACCAGAACATAAGGATCCCCCGCCAGAAGT" + 
"TGGGCGAAGTTTCTTGGGCTATGCCAGTTTTAAAGTGGGAGAGCTGCTGAAGTCAAAGGAGCAATTGCTGGTCCTGAGCCTGAGAACTTCAGATGGTGGCAAAGTG" + "GTTGGCACCATAGAAGTCAGTGTCGTGAAGATGGGGGAGATTGAGGATGGGGAAGCCGACCACATCACCACAGATGTACAGGGACAAAAGTGTGCCCTGGTATGTGA" + "ATGTACAGCCCCGGAAAGTGTGAGCGGAAAAGATAACTTACCTTTTTTGAATTCAGTGTTAAAGAACCCAGTATGTAAATTATATAGATTTCCCACATCTGACAATAAG" + "TGGATGCGAATTCGAGAGCAGATGTCAGAGAGCATTCTTTCCTTTCATATTCCTAAGGAATTGATTTCCCTTCACATTAAAGAAGATTTGTGCAGAAACCAGGAGATA" + "AAAGAACTTGGTGAGCTTTCTCCACATTGGGACAATCTGCGAAAAAATGTCCTTACTCACTGTGATCAAATGGTGAATATGTACCAAGACATTCTGACAGAACTTAGCA" + "AGGAAACAGGGTCCTCTTTCAAATCAAGCAGCAGCAAAGGAGAGAAAACATTAGAATTTGTTCCAATAAATCTACATCTGCAAAGAATGCAGGTACACAGCCCTCACTTG" + "AAAGATGCTCTCTACGATGTCATCACTGTGGGAGCCCCAGCTGCCCATTTTCAGGGATTTAAGAATGGTGGTCTTCGGAAGCTACTCCATAGATTTGAAACAGAAAGAAG" + "AAATACCGGATACCAGTTTATTTACTATTCACCTGAAAACACAGCCAAAGCAAAGGAAGTTCTCAGCAACATCAATCAACTACAACCTCTTATAGCAACCCATGCAGACC" + "TACTGCTTAATTCTGCAAGCCAGCATTCTCCAGACAGCTTGAAGAATTCTTTAAAGATGCTTTCAGAAAAAACAGAGCTTTTTGTACATGCCTTCAAGGATCAACTTGTC" + "AGGAGTGCTCTTTTAGCACTCTACACTGCAAGGCCAGGAGGCATTCTTAAGAAGCCACCCTCTCCTAAGAGCAGCACAGAGGAGAGCAGTCCCCAAGACCAACCCCCAGT" + "GATGAGAGGGCAGGACTCCATACCACATCATTCAGACTATGATGAGGAAGAGTGGGACAGGGTGTGGGCCAATGTGGGGAAGAGCCTGAACTGCATTATTGCTATGGTGG" + "ACAAACTGATTGAAAGAGATGGTGGCAGTGAAGGCAGTGGCGGCAACAATGATGGAGAAAAGGAACCTTCATTAACAGATGCCATTCCCTCTCACCCAAGAGAGGACTGG" + "TATGAACAGTTGTATCCCCTCATCCTTACCCTGAAGGACTGCATGGGAGAAGTGGTGAACCGAGCCAAGCAGTCCCTGACATTTGTGCTCCTTCAGGAACTTGCGTACAGC" + "TTGCCCCAGTGTCTGATGCTGACGCTAAGAAGAGACATCGTCTTCAGCCAAGCACTTGCTGGATTGGTTTGTGGTTTTATCATCAAATTACAGACAAGTCTGTATGACCC" + "AGGCTTCCTACAGCAGCTTCACACAGTGGGGTTGATAGTACAATATGAAGGACTGCTAAGTACATACAGCGATGAAATTGGAATGCTAGAGGACATGGCCGTTGGCATTTC" + "CGATTTAAAGAAAGTTGCATTTAAAATAATTGAAGCCAAATCCAATGATGTGTTGCCAGTTATAACAGGAAGACGAGAACATTACGTGGTAGAGGTCAAGCTTCCAGCCAG" + "AATGTTTGAGTCACTACCTCTACAGATTAAAGAAGGACAGTTGCTTCATGTGTATCCAGTACTTTTTAATGTTGGAATCAATGAACAGCAAACTCTGGCTGAAAGGTTTGG" + 
"AGATGTCTCTTTGCAAGAAAGTATTAATCAGGAAAACTTCGAACTTCTACAAGAATATTACAAGATATTTATGGAAAAGATGCCTCCTGATTATATTTCACATTTTCAGGAA" + "CAAAATGATTTAAAAGCATTGCTAGAAAATCTCCTTCAAAATATCCAATCCAAAAAAAGAAAGAATGTAGAAATTATGTGGCTGGCTGCAACGATTTGCCGCAAACTGAATG" + "GTATTCGTTTCACCTGTTGTAAAAGTGCCAAAGACAGGACATCGATGTCAGTGACACTTGAACAATGCTCAATCTTGAGAGATGAGCACCAGTTACACAAGGACTTCTTTAT" + "CCGAGCGCTGGATTGCATGAGAAGAGAAGGATGCCGCATAGAGAATGTACTGAAGAATATCAAATGCAGAAAGTATGCTTTCAACATGCTACAGCTGATGGCTTTCCCCAAG" + "TACTACAGACCTCCAGAGGGGACTTATGGAAAAGCTGACACCTAA" + ), + # ENST00000443433.6 – used for stop_lost + "MYD88": ( + "ATGCGACCCGACCGCGCTGAGGCTCCAGGACCGCCCGCCATGGCTGCAGGAGGTCCCGGCGCGGGGTCTGCGGCCCCGGTCTCCTCCACATCCTCCCTTCCCC" + "TGGCTGCTCTCAACATGCGAGTGCGGCGCCGCCTGTCTCTGTTCTTGAACGTGCGGACACAGGTGGCGGCCGACTGGACCGCGCTGGCGGAGGAGATGGACTTT" + "GAGTACTTGGAGATCCGGCAACTGGAGACACAAGCGGACCCCACTGGCAGGCTGCTGGACGCCTGGCAGGGACGCCCTGGCGCCTCTGTAGGCCGACTGCTCGAG" + "CTGCTTACCAAGCTGGGCCGCGACGACGTGCTGCTGGAGCTGGGACCCAGCATTGAGGAGGATTGCCAAAAGTATATCTTGAAGCAGCAGCAGGAGGAGGCTGAG" + "AAGCCTTTACAGGTGGCCGCTGTAGACAGCAGTGTCCCACGGACAGCAGAGCTGGCGGGCATCACCACACTTGATGACCCCCTGGGTGCCGCCGGATGGTGGTGGT" + "TGTCTCTGATGATTACCTGCAGAGCAAGGAATGTGACTTCCAGACCAAATTTGCACTCAGCCTCTCTCCAGGTGCCCATCAGAAGCGACTGA" + ), + # ENST00000494014.1 – used for stop_retained_variant + "EPHA3": ( + "ATGGATTGTCAGCTCTCCATCCTCCTCCTTCTCAGCTGCTCTGTTCTCGACAGCTTCGGGGAACTGATTCCGCAGCCTTCCAATGAAGTCAATCTACTGGATTC" + "AAAAACAATTCAAGGGGAGCTGGGCTGGATCTCTTATCCATCACATGGGTGGGAAGAGATCAGTGGTGTGGATGAACATTACACACCCATCAGGACTTACCAGGTG" + "TGCAATGTCATGGACCACAGTCAAAACAATTGGCTGAGAACAAACTGGGTCCCCAGGAACTCAGCTCAGAAGATTTATGTGGAGCTCAAGTTCACTCTACGAGACTG" + "CAATAGCATTCCATTGGTTTTAGGAACTTGCAAGGAGACATTCAACCTGTACTACATGGAGTCTGATGATGATCATGGGGTGAAATTTCGAGAGCATCAGTTTACAAA" + "GATTGACACCATTGCAGCTGATGAAAGTTTCACTCAAATGGATCTTGGGGACCGTATTCTGAAGCTCAACACTGAGATTAGAGAAGTAGGTCCTGTCAACAAGAAGGG" + "ATTTTATTTGGCATTTCAAGATGTTGGTGCTTGTGTTGCCTTGGTGTCTGTGAGAGTATACTTCAAAAAGTGCCCATTTACAGTGAAGAATCTGGCTATGTTTCCAGA" + 
"CACGGTACCCATGGACTCCCAGTCCCTGGTGGAGGTTAGAGGGTCTTGTGTCAACAATTCTAAGGAGGAAGATCCTCCAAGGATGTACTGCAGTACAGAAGGCGAATGG" + "CTTGTACCCATTGGCAAGTGTTCCTGCAATGCTGGCTATGAAGAAAGAGGTTTTATGTGCCAAGCTTGTCGACCAGGTTTCTACAAGGCATTGGATGGTAATATGAAGT" + "GTGCTAAGTGCCCGCCTCACAGTTCTACTCAGGAAGATGGTTCAATGAACTGCAGGTGTGAGAATAATTACTTCCGGGCAGACAAAGACCCTCCATCCATGGCTTGTACC" + "CGACCTCCATCTTCACCAAGAAATGTTATCTCTAATATAAACGAGACCTCAGTTATCCTGGACTGGAGTTGGCCCCTGGACACAGGAGGCCGGAAAGATGTTACCTTCAAC" + "ATCATATGTAAAAAATGTGGGTGGAATATAAAACAGTGTGAGCCATGCAGCCCAAATGTCCGCTTCCTCCCTCGACAGTTTGGACTCACCAACACCACGGTGACAGTGACAG" + "ACCTTCTGGCACATACTAACTACACCTTTGAGATTGATGCCGTTAATGGGGTGTCAGAGCTGAGCTCCCCACCAAGACAGTTTGCTGCGGTCAGCATCACAACTAATCAGGC" + "TGCTCCATCACCTGTCCTGACGATTAAGAAAGATCGGACCTCCAGAAATAGCATCTCTTTGTCCTGGCAAGAACCTGAACATCCTAATGGGATCATATTGGACTACGAGGTC" + "AAATACTATGAAAAGCAGGAACAAGAAACAAGTTATACCATTCTGAGGGCAAGAGGCACAAATGTTACCATCAGTAGCCTCAAGCCTGACACTATATACGTATTCCAAATCC" + "GAGCCCGAACAGCCGCTGGATATGGGACGAACAGCCGCAAGTTTGAGTTTGAAACTAGTCCAGACTCTTTCTCCATCTCTGGTGAAAGTAGCCAAGTGGTCATGATCGCCAT" + "TTCAGCGGCAGTAGCAATTATTCTCCTCACTGTTGTCATCTATGTTTTGATTGGGAGGTTCTGTGGCTATAAGTCAAAACATGGGGCAGATGAAAAAAGACTTCATTTTGGC" + "AATGGGCATTTAAAACTTCCAGGTCTCAGGACTTATGTTGACCCACATACATATGAAGACCCTACCCAAGCTGTTCATGAGTTTGCCAAGGAATTGGATGCCACCAACATATC" + "CATTGATAAAGTTGTTGGAGCAGGTGAATTTGGAGAGGTGTGCAGTGGTCGCTTAAAACTTCCTTCAAAAAAAGAGATTTCAGTGGCCATTAAGACCCTGAAAGTTGGCTAC" + "ACAGAAAAGCAGAGGAGAGACTTCCTGGGAGAAGCAAGCATTATGGGACAGTTTGACCACCCCAATATCATTCGACTGGAAGGAGTTGTTACCAAAAGTAAGCCAGTTATGA" + "TTGTCACAGAATACATGGAGAATGGTTCCTTGGATAGTTTCCTACGTAAACACGATGCCCAGTTTACTGTCATTCAGCTAGTGGGGATGCTTCGAGGGATAGCATCTGGCAT" + "GAAGTACCTGTCAGACATGGGCTATGTTCACCGAGACCTCGCTGCTCGGAACATCTTGATCAACAGTAACTTGGTGTGTAAGGTTTCTGATTTCGGACTTTCGCGTGTCCTG" + "GAGGATGACCCAGAAGCTGCTTATACAACAAGAGGAGGGAAGATCCCAATCAGGTGGACATCACCAGAAGCTATAGCCTACCGCAAGTTCACGTCAGCCAGCGATGTATGGAG" + "TTATGGGATTGTTCTCTGGGAGGTGATGTCTTATGGAGAGAGACCATACTGGGAGATGTCCAATCAGGATGTAATTAAAGCTGTAGATGAGGGCTATCGACTGCCACCCCCC" + 
"ATGGACTGCCCAGCTGCCTTGTATCAGCTGATGCTGGACTGCTGGCAGAAAGACAGGAACAACAGACCCAAGTTTGAGCAGATTGTTAGTATTCTGGACAAGCTTATCCGGA" + "ATCCCGGCAGCCTGAAGATCATCACCAGTGCAGCCGCAAGGCCATCAAACCTTCTTCTGGACCAAAGCAATGTGGATATCACTACCTTCCGCACAACAGTGACATGA" + ), +} + + +def _seq(gene: str) -> Seq: + return Seq("".join(_CDS[gene].split())) + + +def _wt_protein(gene: str) -> str: + return str(_seq(gene).translate(to_stop=False)) + + +def _call(gene, dna_mut, aa_mut, mutation_type) -> str: + snp = SNP(gene=gene, mrna="", dna_mut=dna_mut, aa_mut=aa_mut, mutation_type=mutation_type) + return CancerGenomesService.get_mut_pro_seq(snp, _seq(gene)) + + +# --------------------------------------------------------------------------- +# Actionable types — should produce a non-empty mutant protein +# --------------------------------------------------------------------------- + +class TestActionableVariantTypes: + + def test_missense_variant(self): + """HGVS rule: single AA substitution leaves total protein length unchanged. + + Source: EZH2 c.656C>T p.S219F (missense_variant) + DNA path: substitution at position 656 (C→T) changes codon 219 from Ser to Phe. + """ + wt = _wt_protein("EZH2") + result = _call("EZH2", "c.656C>T", "p.S219F", "missense_variant") + + assert result, "missense_variant must produce a non-empty protein" + assert len(result) == len(wt), ( + f"Missense must preserve protein length: wt={len(wt)}, mut={len(result)}" + ) + assert result[218] == "F", ( + f"Position 219 (index 218) must change to Phe; got '{result[218]}'" + ) + assert result[:218] == wt[:218], "Residues before the substitution must be unchanged" + assert result[219:] == wt[219:], "Residues after the substitution must be unchanged" + + def test_stop_gained(self): + """HGVS rule: stop codon (*) introduced at the predicted position. + + Source: FCGR3B c.394A>T p.K132* (stop_gained) + DNA path: substitution at position 394 (A→T) converts codon 132 to a stop. 
+ """ + result = _call("FCGR3B", "c.394A>T", "p.K132*", "stop_gained") + + assert result, "stop_gained must produce a non-empty translated sequence" + assert result[131] == "*", ( + f"Stop codon must be introduced at position 132 (index 131); got '{result[131]}'" + ) + + def test_inframe_deletion(self): + """HGVS rule: in-frame deletion shortens the protein by exactly the number of deleted codons. + + Source: CTNNB1 c.104_124del p.T35_G41del (inframe_deletion) + DNA path: deletes 21 nucleotides (positions 104–124) = 7 codons = 7 amino acids. + """ + wt = _wt_protein("CTNNB1") + result = _call("CTNNB1", "c.104_124del", "p.T35_G41del", "inframe_deletion") + + deleted_nt = 124 - 104 + 1 # 21 nt = 7 codons + deleted_aa = deleted_nt // 3 + + assert result, "inframe_deletion must produce a non-empty protein" + assert len(result) == len(wt) - deleted_aa, ( + f"Inframe deletion of {deleted_aa} AAs: expected len {len(wt) - deleted_aa}, " + f"got {len(result)}" + ) + assert result[:34] == wt[:34], "Residues before the deletion must be unchanged" + + def test_inframe_insertion_dup(self): + """HGVS rule: in-frame duplication lengthens the protein by exactly the number of duplicated codons. + + Source: SMAP1 c.1226_1228dup p.I409dup (inframe_insertion) + DNA path: duplicates 3 nucleotides (positions 1226–1228) = 1 codon = 1 amino acid inserted. 
+ """ + wt = _wt_protein("SMAP1") + result = _call("SMAP1", "c.1226_1228dup", "p.I409dup", "inframe_insertion") + + inserted_nt = 1228 - 1226 + 1 # 3 nt = 1 codon + inserted_aa = inserted_nt // 3 + + assert result, "inframe_insertion (dup) must produce a non-empty protein" + assert len(result) == len(wt) + inserted_aa, ( + f"Duplication of {inserted_aa} AA: expected len {len(wt) + inserted_aa}, " + f"got {len(result)}" + ) + assert result[:409] == wt[:409], "Residues before the duplication site must be unchanged" + + def test_protein_altering_variant(self): + """HGVS rule: delins replaces a nucleotide range; result is non-empty and differs from WT. + + Source: NF2 c.252_262delinsCT p.L85_K88delins* (protein_altering_variant) + DNA path: deletes positions 252–262 (11 nt) and inserts CT (2 nt). + The AA annotation (delins*) predicts a premature stop in the altered sequence. + """ + wt = _wt_protein("NF2") + result = _call("NF2", "c.252_262delinsCT", "p.L85_K88delins*", "protein_altering_variant") + + assert result, "protein_altering_variant must produce a non-empty sequence" + assert result != wt, "protein_altering_variant must differ from the wild-type sequence" + + def test_frameshift_variant(self): + """HGVS rule: out-of-frame deletion disrupts the reading frame; the code translates the + resulting sequence as-is (to_stop=False) producing a non-empty aberrant protein. + + Source: MEN1 c.292del p.R98Efs*21 (frameshift_variant) + DNA path: single-nucleotide deletion at position 292 shifts the reading frame. + """ + result = _call("MEN1", "c.292del", "p.R98Efs*21", "frameshift_variant") + + assert result, "frameshift_variant must produce a non-empty translated sequence" + + def test_start_lost(self): + """HGVS rule: start codon destroyed; the processed sequence no longer begins with Met. + + Source: INPP4B c.3G>T p.M1? (start_lost) + The annotation is p.M1? (ambiguous protein consequence) but dna_mut is unambiguous, + so the DNA substitution path is taken. 
ATG (Met) → ATT (Ile) at codon 1. + """ + result = _call("INPP4B", "c.3G>T", "p.M1?", "start_lost") + + assert result, "start_lost must produce a non-empty translated sequence via the DNA path" + assert result[0] != "M", "Start codon is destroyed; first AA must no longer be Met" + assert result[0] == "I", "ATG→ATT encodes Ile; first AA must be Ile" + + def test_stop_lost(self): + """HGVS rule: stop codon removed; the protein extends past the original terminus. + + Source: MYD88 c.611_613delinsGCGGCCCCC p.D204_*205delinsGGPR (protein_altering_variant,stop_lost) + DNA path: delins at 611–613 removes the original stop codon; the protein extends. + """ + wt = _wt_protein("MYD88") + result = _call( + "MYD88", + "c.611_613delinsGCGGCCCCC", + "p.D204_*205delinsGGPR", + "protein_altering_variant,stop_lost", + ) + + assert result, "stop_lost must produce a non-empty protein" + assert len(result) > len(wt), ( + f"Stop-lost protein must be longer than WT: wt={len(wt)}, mut={len(result)}" + ) + + def test_stop_retained_variant(self): + """HGVS rule: synonymous change at the stop codon; stop is preserved; length unchanged. + + Source: EPHA3 c.2756G>A p.*919= (stop_retained_variant) + DNA path: TGA (stop) → TAA (stop); both are stop codons, protein length is unchanged. 
+ """ + wt = _wt_protein("EPHA3") + result = _call("EPHA3", "c.2756G>A", "p.*919=", "stop_retained_variant") + + assert result, "stop_retained_variant must produce a non-empty sequence" + assert len(result) == len(wt), ( + f"Stop-retained must preserve protein length: wt={len(wt)}, mut={len(result)}" + ) + assert result[-1] == "*", "Stop codon must still be present at the terminus" + + +# --------------------------------------------------------------------------- +# Filtered / skipped types — get_mut_pro_seq must return an empty string +# --------------------------------------------------------------------------- + +class TestSkippedVariantTypes: + + def test_synonymous_variant_is_prefiltered(self): + """synonymous_variant rows are discarded in cosmic_to_proteindb before get_mut_pro_seq + is ever called. Verify the filter expression matches the v103 SO term. + """ + assert "synonymous_variant" in "synonymous_variant" + assert "synonymous_variant" in "splice_region_variant,synonymous_variant" + assert "synonymous_variant" not in "missense_variant" + + def test_intron_variant_returns_empty(self): + """intron_variant always carries p.? — the p.? guard returns an empty string. + + Source: RAD51C c.145+706C>T p.? (intron_variant) + """ + result = _call("EZH2", "c.145+706C>T", "p.?", "intron_variant") + assert result == "", f"intron_variant with p.? must return ''; got {repr(result)}" + + def test_3_prime_utr_variant_returns_empty(self): + """3_prime_UTR_variant always carries p.? — the p.? guard returns an empty string. + + Source: SUFU c.*2816G>A p.? (3_prime_UTR_variant) + The c.*N notation signals a 3'UTR position; the outer if-condition fails on p.?. + """ + result = _call("EZH2", "c.*2816G>A", "p.?", "3_prime_UTR_variant") + assert result == "", f"3_prime_UTR_variant with p.? must return ''; got {repr(result)}" + + def test_5_prime_utr_variant_returns_empty(self): + """5_prime_UTR_variant always carries p.? — the p.? guard returns an empty string. 
+ + Source: PSMD13 c.-20C>T p.? (5_prime_UTR_variant) + """ + result = _call("EZH2", "c.-20C>T", "p.?", "5_prime_UTR_variant") + assert result == "", f"5_prime_UTR_variant with p.? must return ''; got {repr(result)}" + + def test_coding_sequence_variant_returns_empty(self): + """coding_sequence_variant in v103 uses intronic-offset positions with p.?. + + Source: OAS2 c.2157+2A>G p.? (coding_sequence_variant) + Both the intronic offset guard ('+' in coord) and the p.? guard ensure ''. + """ + result = _call("EZH2", "c.2157+2A>G", "p.?", "coding_sequence_variant") + assert result == "", f"coding_sequence_variant with p.? must return ''; got {repr(result)}" + + def test_splice_acceptor_variant_returns_empty(self): + """splice_acceptor_variant uses intronic-offset position notation with p.?. + + Source: TSHR c.171-1G>A p.? (splice_acceptor_variant) + The '-' offset guard in the delins branch (and p.? in the substitution guard) return ''. + """ + result = _call("EZH2", "c.171-1G>A", "p.?", "splice_acceptor_variant") + assert result == "", f"splice_acceptor_variant with p.? must return ''; got {repr(result)}" + + def test_splice_donor_variant_returns_empty(self): + """splice_donor_variant uses downstream-offset position notation with p.?. + + Source: PTCH1 c.3306+2T>G p.? (splice_donor_variant) + """ + result = _call("EZH2", "c.3306+2T>G", "p.?", "splice_donor_variant") + assert result == "", f"splice_donor_variant with p.? must return ''; got {repr(result)}" + + def test_splice_region_variant_composite_treated_as_primary_type(self): + """splice_region_variant only appears in v103 as part of a composite type string. + When combined with missense_variant, the missense guard fires first and the + mutation is processed as a missense — same-length result with a single AA change. 
+ + Source: EZH2 c.656C>T p.S219F (missense_variant — re-used with composite type) + """ + wt = _wt_protein("EZH2") + result = _call( + "EZH2", "c.656C>T", "p.S219F", "missense_variant,splice_region_variant" + ) + assert result, "composite type containing missense_variant must produce a result" + assert len(result) == len(wt), "missense-handled result must preserve protein length" + assert result[218] == "F", "Substitution at codon 219 must produce Phe" + + def test_incomplete_terminal_codon_variant_returns_empty(self): + """incomplete_terminal_codon_variant is absent from v103 targeted-screen data. + By convention it carries p.?, so get_mut_pro_seq returns an empty string. + """ + result = _call("EZH2", "c.2213_2214insA", "p.?", "incomplete_terminal_codon_variant") + assert result == "", ( + f"incomplete_terminal_codon_variant with p.? must return ''; got {repr(result)}" + ) diff --git a/pgatk/tests/test_vcf_to_proteindb_parallel.py b/pgatk/tests/test_vcf_to_proteindb_parallel.py new file mode 100644 index 0000000..c9183ff --- /dev/null +++ b/pgatk/tests/test_vcf_to_proteindb_parallel.py @@ -0,0 +1,64 @@ +"""Equivalence test: vcf_to_proteindb with workers=2 produces the same sequence +content as workers=1 on a small fixture.""" +import os +from pathlib import Path + +from click.testing import CliRunner + +from pgatk.cli import cli + + +def _sequence_set(fasta_path) -> set: + seqs = [] + current = [] + with open(fasta_path, 'r', encoding='utf-8') as f: + for line in f: + if line.startswith('>'): + if current: + seqs.append(''.join(current)) + current = [] + else: + current.append(line.strip()) + if current: + seqs.append(''.join(current)) + return set(seqs) + + +def test_parallel_matches_sequential(tmp_path): + # pgatk package root: .../pgatk/pgatk/ + pkg_root = Path(__file__).resolve().parents[1] + testdata = pkg_root / 'testdata' + config = pkg_root / 'config' / 'ensembl_config.yaml' + out_seq = tmp_path / 'seq.fa' + out_par = tmp_path / 'par.fa' + + common_args 
= [ + 'vcf-to-proteindb', + '--config_file', str(config), + '--vcf', str(testdata / 'test.vcf'), + '--input_fasta', str(testdata / 'test.fa'), + '--gene_annotations_gtf', str(testdata / 'test.gtf'), + '--protein_prefix', 'ensvar', + '--af_field', 'MAF', + '--annotation_field_name', 'CSQ', + '--biotype_str', 'feature_type', + '--include_biotypes', 'mRNA,ncRNA', + ] + + runner = CliRunner() + orig_dir = os.getcwd() + try: + # Run from the package root so the .db file is created next to the .gtf. + os.chdir(str(pkg_root)) + + r1 = runner.invoke(cli, common_args + ['--output_proteindb', str(out_seq)]) + assert r1.exit_code == 0, r1.output + (str(r1.exception) if r1.exception else '') + + r2 = runner.invoke(cli, common_args + ['--output_proteindb', str(out_par), '--workers', '2']) + assert r2.exit_code == 0, r2.output + (str(r2.exception) if r2.exception else '') + finally: + os.chdir(orig_dir) + + # FASTA header order may differ across runs (per-worker ordering); compare + # sequence content as a set. + assert _sequence_set(out_seq) == _sequence_set(out_par) diff --git a/pgatk/toolbox/general.py b/pgatk/toolbox/general.py index 15f7603..897cd71 100644 --- a/pgatk/toolbox/general.py +++ b/pgatk/toolbox/general.py @@ -17,7 +17,6 @@ import yaml import gzip -# Logging defaults from requests import HTTPError from pgatk.toolbox.exceptions import ToolBoxException @@ -61,7 +60,6 @@ def __init__(self, root_config_name: str, yaml_configuration: dict, pipeline_par else: self._default_params = {} - # Prepare Logging subsystem if self._default_params is not None and self._ROOT_CONFIG_NAME in self._default_params: if self._CONFIG_LOGGER in self._default_params[self._ROOT_CONFIG_NAME]: if self._CONFIG_LOGGER_LEVEL in self._default_params[self._ROOT_CONFIG_NAME][self._CONFIG_LOGGER]: @@ -96,10 +94,8 @@ def get_config_value(self, key: str, default: Any = None) -> Any: Checks pipeline_parameters first (flat lookup), then default_params[_ROOT_CONFIG_NAME][key], then returns default. 
""" - # Check pipeline params first (flat lookup by key) if key in self._pipeline_parameters: return self._pipeline_parameters[key] - # Then check default config params under root config name root = self._ROOT_CONFIG_NAME if (self._default_params is not None and root in self._default_params @@ -117,12 +113,10 @@ def get_log_handlers(self) -> list: return self._log_handlers def get_logger(self) -> logging.Logger: - # Get own logger return self._logger def get_session_log_files(self) -> list: log_files = [] - # Add the application logs log_files.extend(self._log_files) return log_files @@ -267,14 +261,12 @@ def check_create_folders_overwrite(folders: List[str]) -> None: if not os.path.isdir(folder): invalid_folders.append(folder) if invalid_folders: - # If there's any invalid folder, we don't make any change, and we report the situation by raising an exception raise ToolBoxException("The following folders ARE NOT FOLDERS - '{}'" .format(invalid_folders)) for folder in folders: try: shutil.rmtree(folder) except FileNotFoundError: - # It is find if the folder is not there pass check_create_folders(folders) @@ -347,7 +339,6 @@ def gunzip_files(files: List[str]) -> list[tuple[str, str]]: (stdout, stderr) = gunzip_subprocess.communicate(timeout=timeout) if gunzip_subprocess.poll() is not None: if gunzip_subprocess.returncode != 0: - # ERROR - Report this err_msg = "ERROR uncompressing file '{}' output from subprocess STDOUT: {}\nSTDERR: {}" \ .format(file, stdout.decode('utf8'), stderr.decode('utf8')) files_with_error.append((file, err_msg)) diff --git a/pgatk/toolbox/vcf_utils.py b/pgatk/toolbox/vcf_utils.py index 61fe4db..d784361 100644 --- a/pgatk/toolbox/vcf_utils.py +++ b/pgatk/toolbox/vcf_utils.py @@ -30,7 +30,6 @@ def check_overlap(var_start: int, var_end: int, features_info: Optional[list] = features_info = [[0, 1, 'type']] if var_start == -1: return True - # check if the var overlaps any of the features for feature_pos in features_info: pep_start = feature_pos[0] 
pep_end = feature_pos[1] diff --git a/pyproject.toml b/pyproject.toml index 610da6c..1d4b6e2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "pgatk" -version = "0.0.26" +version = "0.0.27" description = "Python tools for proteogenomics" readme = "README.md" requires-python = ">=3.9" @@ -71,3 +71,10 @@ exclude = ["tests*", "*.tests*", "*.tests.*", "tests.*"] [tool.setuptools.package-data] pgatk = ["config/*.yaml", "config/*.json"] + +[tool.pydocstyle] +# D211 conflicts with D203 (which we follow: one blank line before class docstring). +# D213 conflicts with D212 (which we follow: multi-line summary on the first line). +# pydocstyle errors when both rules in each pair are enabled; explicitly disable +# the ones we don't follow. +add-ignore = ["D211", "D213"] diff --git a/scripts/benchmark_vcf_to_proteindb.py b/scripts/benchmark_vcf_to_proteindb.py new file mode 100755 index 0000000..8901565 --- /dev/null +++ b/scripts/benchmark_vcf_to_proteindb.py @@ -0,0 +1,110 @@ +#!/usr/bin/env python3 +"""One-shot benchmark / profiler for ``pgatk vcf-to-proteindb``. + +Times one full run end-to-end. With ``--profile-out PATH``, also wraps the +run in cProfile and writes a ``.prof`` file; print the top-N hotspots with +``--print-top``. Not invoked by CI. 
+ +Basic usage: + python scripts/benchmark_vcf_to_proteindb.py \ + --vcf /path/to/sample.vcf \ + --fasta /path/to/transcripts.fa \ + --gtf /path/to/annotation.gtf \ + --output /tmp/benchmark_proteindb.fa + +Profile a chr22 run and print the 30 hottest functions: + python scripts/benchmark_vcf_to_proteindb.py \ + --vcf chr22.vcf --fasta tx.fa --gtf chr22.gtf \ + --output /tmp/chr22.fa \ + --profile-out /tmp/chr22.prof --print-top 30 +""" +import argparse +import cProfile +import io +import pstats +import time +from pathlib import Path + +from pgatk.ensembl.ensembl import EnsemblDataService +from pgatk.config.registry import load_config + + +def main() -> None: + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument("--vcf", required=True, help="Input VCF (annotated or raw)") + parser.add_argument("--fasta", required=True, help="Transcript FASTA matching the GTF") + parser.add_argument("--gtf", required=True, help="Gene annotations GTF") + parser.add_argument("--output", required=True, help="Output protein DB FASTA") + parser.add_argument( + "--annotation-field-name", + default="CSQ", + help="VCF INFO field that carries per-transcript annotation (default: CSQ). " + "Use empty string to force re-annotation via bedtools intersect.", + ) + parser.add_argument( + "--profile-out", + help="If set, wrap the run in cProfile and write the .prof file here.", + ) + parser.add_argument( + "--print-top", + type=int, + default=0, + help="With --profile-out, also print the top-N functions by cumulative time " + "(default: 0, no printing). 
Recommended N=30 for a first look.", + ) + args = parser.parse_args() + + config_data = load_config("ensembl_config", None) + pipeline_arguments = { + EnsemblDataService.PROTEIN_DB_OUTPUT: args.output, + EnsemblDataService.ANNOTATION_FIELD_NAME: args.annotation_field_name, + } + svc = EnsemblDataService(config_data, pipeline_arguments) + + profiler = cProfile.Profile() if args.profile_out else None + start = time.perf_counter() + if profiler is not None: + profiler.enable() + try: + svc.vcf_to_proteindb(args.vcf, args.fasta, args.gtf) + finally: + if profiler is not None: + profiler.disable() + elapsed = time.perf_counter() - start + + if profiler is not None: + profiler.dump_stats(args.profile_out) + + output_path = Path(args.output) + output_size = output_path.stat().st_size if output_path.exists() else 0 + seq_count = 0 + if output_path.exists(): + with open(output_path, "r", encoding="utf-8") as fh: + for line in fh: + if line.startswith(">"): + seq_count += 1 + + print() + print("=== BENCHMARK RESULT ===") + print(f" VCF: {args.vcf}") + print(f" Elapsed: {elapsed:.2f} s ({elapsed / 60:.2f} min)") + print(f" Output: {args.output} ({output_size} bytes, {seq_count} sequences)") + if args.profile_out: + print(f" Profile: {args.profile_out}") + + if args.profile_out and args.print_top > 0: + print() + print(f"=== TOP {args.print_top} BY CUMULATIVE TIME ===") + buf = io.StringIO() + stats = pstats.Stats(args.profile_out, stream=buf) + stats.strip_dirs().sort_stats('cumulative').print_stats(args.print_top) + print(buf.getvalue()) + print(f"=== TOP {args.print_top} BY OWN (tottime) TIME ===") + buf = io.StringIO() + stats = pstats.Stats(args.profile_out, stream=buf) + stats.strip_dirs().sort_stats('tottime').print_stats(args.print_top) + print(buf.getvalue()) + + +if __name__ == "__main__": + main() diff --git a/setup.cfg b/setup.cfg new file mode 100644 index 0000000..2baaabd --- /dev/null +++ b/setup.cfg @@ -0,0 +1,15 @@ +[pydocstyle] +# D211 conflicts with D203 (we 
follow D203 — one blank line before class docstring). +# D213 conflicts with D212 (we follow D212 — multi-line summary on the first line). +# Both pairs of rules cannot be satisfied simultaneously; pick one side per pair. +add-ignore = D211,D213 + +[bandit] +# B101 fires on every `assert` used in pytest tests. Pytest depends on assert +# rewriting; the bytecode-stripping concern doesn't apply to test files. +# Exclude test directories entirely so pytest assertions are not flagged. +exclude_dirs = ./pgatk/tests,./tests +# B202 (tarfile.extractall) — _safe_extract_tar in pgatk/cgenomes/cosmic_downloader.py +# performs explicit member-by-member validation before calling extractall(members=…) +# with the validated list. Annotated with `# nosec B202` at the call site. +skips = B101