From b6702422e9a3fb63110bd6ac5c379c550effcdff Mon Sep 17 00:00:00 2001 From: Yasset Perez-Riverol Date: Mon, 11 May 2026 09:48:01 +0100 Subject: [PATCH 01/23] docs: rename --var_prefix to --protein_prefix in CLI examples The click commands accept --protein_prefix; --var_prefix no longer exists. Example commands in docs/use-cases.md and docs/pgatk-cli.md were stale and failed when copy-pasted. Closes #100 --- docs/pgatk-cli.md | 10 +++++----- docs/use-cases.md | 18 +++++++++--------- 2 files changed, 14 insertions(+), 14 deletions(-) diff --git a/docs/pgatk-cli.md b/docs/pgatk-cli.md index f36971a..8bf7878 100644 --- a/docs/pgatk-cli.md +++ b/docs/pgatk-cli.md @@ -318,7 +318,7 @@ Usage: pgatk vcf-to-proteindb [OPTIONS] Options: --translation_table INTEGER Translation table (Default 1) --mito_translation_table INTEGER Mito_trans_table (default 2) - --var_prefix TEXT String to add as prefix for the variant peptides + --protein_prefix TEXT String to add as prefix for the variant peptides --report_ref_seq Also report the reference peptide from overlapping transcripts --annotation_field_name TEXT Annotation field name in INFO column (default: CSQ) --af_field TEXT Field name for variant allele frequency (default: none) @@ -467,7 +467,7 @@ Usage: pgatk dnaseq-to-proteindb [OPTIONS] --biotype_str TEXT String used to identify gene/transcript biotype (default: transcript_biotype) --expression_str TEXT String for extracting expression value (default: None) --expression_thresh FLOAT Threshold for expression value filtering (default: 5) - --var_prefix TEXT Prefix to be added to fasta headers (default: none) + --protein_prefix TEXT Prefix to be added to fasta headers (default: none) -h, --help Show this message and exit. ``` @@ -489,7 +489,7 @@ Usage: pgatk dnaseq-to-proteindb [OPTIONS] --config_file config/ensembl_config.yaml \ --input_fasta transcript_sequences.fa \ --output_proteindb proteindb_from_lincRNA_canonical_sequences.fa \ - --var_prefix lincRNA_ \ + --protein_prefix lincRNA_ \ --include_biotypes lincRNA ``` @@ -500,7 +500,7 @@ Usage: pgatk dnaseq-to-proteindb [OPTIONS] --config_file config/ensembl_config.yaml \ --input_fasta transcript_sequences.fa \ --output_proteindb proteindb_from_processed_pseudogene.fa \ - --var_prefix pseudogene_ \ + --protein_prefix pseudogene_ \ --include_biotypes processed_pseudogene,transcribed_processed_pseudogene,translated_processed_pseudogene \ --skip_including_all_cds ``` @@ -512,7 +512,7 @@ Usage: pgatk dnaseq-to-proteindb [OPTIONS] --config_file config/ensembl_config.yaml \ --input_fasta transcript_sequences.fa \ --output_proteindb proteindb_from_altORFs.fa \ - --var_prefix altorf_ \ + --protein_prefix altorf_ \ --include_biotypes altORFs \ --skip_including_all_cds ``` diff --git a/docs/use-cases.md b/docs/use-cases.md index bd3cf20..0f151e9 100644 --- a/docs/use-cases.md +++ b/docs/use-cases.md @@ -57,7 +57,7 @@ detectable peptides. 
Translate them in three reading frames: pgatk dnaseq-to-proteindb \ --input_fasta ensembl_human/transcripts.fa \ --output_proteindb pseudogene.fa \ - --var_prefix pseudo_ \ + --protein_prefix pseudo_ \ --include_biotypes processed_pseudogene,unprocessed_pseudogene,transcribed_processed_pseudogene,transcribed_unprocessed_pseudogene,translated_processed_pseudogene \ --num_orfs 3 \ --skip_including_all_cds @@ -72,7 +72,7 @@ micropeptides: pgatk dnaseq-to-proteindb \ --input_fasta ensembl_human/transcripts.fa \ --output_proteindb lncrna.fa \ - --var_prefix lncrna_ \ + --protein_prefix lncrna_ \ --include_biotypes lincRNA,antisense,sense_intronic,sense_overlapping \ --num_orfs 3 \ --skip_including_all_cds @@ -87,7 +87,7 @@ peptides: pgatk dnaseq-to-proteindb \ --input_fasta ensembl_human/transcripts.fa \ --output_proteindb altorf.fa \ - --var_prefix altorf_ \ + --protein_prefix altorf_ \ --include_biotypes altORFs \ --skip_including_all_cds ``` @@ -546,7 +546,7 @@ them in three reading frames: pgatk dnaseq-to-proteindb \ --input_fasta transcripts.fa \ --output_proteindb lincRNA_proteins.fa \ - --var_prefix lincRNA_ \ + --protein_prefix lincRNA_ \ --include_biotypes lincRNA \ --num_orfs 3 \ --skip_including_all_cds @@ -560,7 +560,7 @@ Some pseudogenes are transcribed and may produce functional peptides: pgatk dnaseq-to-proteindb \ --input_fasta transcripts.fa \ --output_proteindb pseudogene_proteins.fa \ - --var_prefix pseudogene_ \ + --protein_prefix pseudogene_ \ --include_biotypes processed_pseudogene,transcribed_processed_pseudogene,translated_processed_pseudogene \ --num_orfs 3 \ --skip_including_all_cds @@ -575,7 +575,7 @@ discover upstream ORFs (uORFs), overlapping ORFs, and downstream ORFs: pgatk dnaseq-to-proteindb \ --input_fasta transcripts.fa \ --output_proteindb altorf_proteins.fa \ - --var_prefix altorf_ \ + --protein_prefix altorf_ \ --include_biotypes altORFs \ --skip_including_all_cds ``` @@ -586,7 +586,7 @@ pgatk dnaseq-to-proteindb \ pgatk dnaseq-to-proteindb \ --input_fasta transcripts.fa \ --output_proteindb antisense_proteins.fa \ - --var_prefix antisense_ \ + --protein_prefix antisense_ \ --include_biotypes antisense,antisense_RNA \ --num_orfs 3 \ --skip_including_all_cds @@ -979,7 +979,7 @@ pgatk cosmic-to-proteindb \ pgatk dnaseq-to-proteindb \ --input_fasta ensembl_data/transcripts.fa \ --output_proteindb lincRNA.fa \ - --var_prefix lincRNA_ \ + --protein_prefix lincRNA_ \ --include_biotypes lincRNA \ --num_orfs 3 \ --skip_including_all_cds @@ -988,7 +988,7 @@ pgatk dnaseq-to-proteindb \ pgatk dnaseq-to-proteindb \ --input_fasta ensembl_data/transcripts.fa \ --output_proteindb pseudogene.fa \ - --var_prefix pseudogene_ \ + --protein_prefix pseudogene_ \ --include_biotypes processed_pseudogene,transcribed_processed_pseudogene \ --num_orfs 3 \ --skip_including_all_cds From bb4edb66c166bd763570c1b728da9064d112d84d Mon Sep 17 00:00:00 2001 From: Yasset Perez-Riverol Date: Mon, 11 May 2026 11:31:01 +0100 Subject: [PATCH 02/23] Stop tracking docs/plans/ (internal working docs) Implementation plans and design scratch notes are kept locally as working artifacts only; they don't belong in the published repo. Files remain on disk and are now gitignored. 
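
For reference, the equivalent local operation is `git rm -r --cached
docs/plans/`, which removes the directory from the index while leaving
the working-tree files untouched; the commit itself just deletes the
tracked copies and adds the ignore rule.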
--- .gitignore | 3 + .../2026-03-01-pgatk-graph-engine-design.md | 270 ------------------ .../2026-03-03-protein-accession-design.md | 96 ------- 3 files changed, 3 insertions(+), 366 deletions(-) delete mode 100644 docs/plans/2026-03-01-pgatk-graph-engine-design.md delete mode 100644 docs/plans/2026-03-03-protein-accession-design.md diff --git a/.gitignore b/.gitignore index 3e8faad..c9aa01f 100644 --- a/.gitignore +++ b/.gitignore @@ -135,3 +135,6 @@ pgatk/testdata/Meleagris_gallopavo* .DS_Store .codacy/ .cursor/ + +# Internal working docs (implementation plans, scratch notes) +docs/plans/ diff --git a/docs/plans/2026-03-01-pgatk-graph-engine-design.md b/docs/plans/2026-03-01-pgatk-graph-engine-design.md deleted file mode 100644 index 9048e99..0000000 --- a/docs/plans/2026-03-01-pgatk-graph-engine-design.md +++ /dev/null @@ -1,270 +0,0 @@ -# pgatk Evolution: Graph-Based Transcript Modeling & Feature Parity - -**Date:** 2026-03-01 -**Status:** Approved -**Authors:** Yasset Perez-Riverol, Claude (AI assistant) - ---- - -## 1. Executive Summary - -This document describes the research and implementation plan to evolve pgatk from a linear, one-variant-at-a-time proteogenomics database generator into a graph-based transcript modeling engine capable of multi-variant co-occurrence, gene fusions, RNA editing, and circular RNA support. The plan includes a prerequisite infrastructure cleanup phase and a phased feature rollout with performance optimization. - ---- - -## 2. Current State Assessment - -### What pgatk does well - -- Only Python tool with native one-command downloaders for ENSEMBL, COSMIC, and cBioPortal -- Integrated decoy generation (4 methods: protein-reverse, protein-shuffle, DecoyPyrat, pgdbdeep) -- Spectrum-level validation via SpectrumAI (unique among database generators) -- Aho-Corasick exact matching + sliding-window mismatch search (BlastGetPosition) -- Published benchmark: 43,501 non-canonical peptides across 64 human cell lines (Umer et al. 2022) -- Pangenome proteogenomics demonstrated (Wang et al. 2024 preprint): 4,991 novel peptides from HPRC - -### Critical gaps vs. the field (2024-2026) - -| Gap | Impact | Solved by | -|---|---|---| -| No multi-variant co-occurrence | Transcript with N SNPs produces 1 sequence instead of up to 2^N | moPepGen (DAG), vcf2prot (phased VCF) | -| No gene fusions | Major cancer event type missed entirely | moPepGen | -| No RNA editing | A-to-I editing creates novel peptides invisible to DNA-only | moPepGen | -| No circular RNA | Emerging class of translated non-coding RNA | moPepGen | -| No ClinVar/NCBI support | Open issue #24 since 2019 | Manual workaround only | -| pandas iterrows in VCF processing | ~100x slower than vectorized operations | Internal bottleneck | -| No transcript feature caching | Redundant gffutils DB queries per variant | Internal bottleneck | - -### Competitive landscape - -**moPepGen** (Nature Biotechnology, June 2025): Graph-based (DAG per transcript), handles all event types, ~4x more non-canonical peptides than prior methods. The algorithmic benchmark to match. - -**vcf2prot** (ElAbd et al. 2022, bioRxiv): Rust-based Sequence Intermediate Representation (SIR) approach. ~1000x faster than PrecisionProDB. Processes 99,254 variants across 8,192 patients in ~11 minutes. Demonstrates the performance ceiling achievable with systems-level optimization. Limited to phased VCFs with BCFtools/csq annotation. - -**PG2** (J. Proteome Research 2023): Snakemake pipeline integrating genome + transcriptome. 
Handles splicing and fusions but requires heavy infrastructure. - -**pXg** (MCP 2024): RNA-Seq + proteomics + de novo sequencing for immunopeptidomics. - -**NeoDisc** (Nature Biotechnology 2024): End-to-end clinical neoantigen pipeline. - -**PepCentric** (bioRxiv 2025): Repository-scale validation against 2.3B spectra. Complementary to database generators. - -### Key code quality findings - -- `get_altseq()` (ensembl.py): 60-line heart of variant processing, handles SNP/ins/del but only one variant at a time -- `vcf_to_proteindb()` (ensembl.py): ~260 lines, main bottleneck is `vcf_reader.iterrows()` + per-row gffutils DB lookups -- No `pathlib`, no `dataclasses`, inconsistent type hints -- `db/` standalone scripts use `sys.argv` at import time (not importable as modules) -- Variable shadowing bug in `protein_database_decoy.py` (`decoy_sequence` variable shadows the imported `pyteomics.fasta.decoy_sequence` function) -- Broad `except Exception` in several places suppresses unexpected errors -- Mixed `print()` and logger usage for error reporting -- Repetitive 3-layer config fallback pattern across all service classes - ---- - -## 3. Architecture: TranscriptGraph Engine - -### Core Concept - -Each transcript is modeled as a directed acyclic graph (DAG): - -- **Nodes** represent sequence segments (reference sequence between variant positions) -- **Edges** represent either the reference path or an alternative (variant) path -- **Traversal** of all root-to-leaf paths generates all combinatorial protein sequences - -``` -Reference: ──[seg1]──[seg2]──[seg3]──[seg4]── - │ │ -Variant A: └─[alt_A]─┘ │ -Variant B: └─[alt_B]─┘ - -Paths: ref, A only, B only, A+B → 4 protein sequences -``` - -### Module Structure - -``` -pgatk/ - graph/ # NEW module - __init__.py - transcript_graph.py # TranscriptGraph class (DAG builder + traverser) - variant_nodes.py # Node types: SNP, Insertion, Deletion, Fusion, RNAEdit, CircRNA - graph_enumerator.py # Path enumeration with combinatorial explosion controls - event_parsers/ # Input parsers for each event type - __init__.py - vcf_parser.py # VCF → graph edges (replaces current get_altseq) - fusion_parser.py # Gene fusion calls → cross-transcript edges - rnaedit_parser.py # RNA editing sites → substitution edges - circrna_parser.py # Back-splice junctions → circular edges - clinvar_parser.py # ClinVar VCF → graph edges (issue #24) - filters/ - __init__.py - allele_frequency.py # AF-based pruning - consequence.py # VEP consequence filtering - expression.py # Expression-level filtering - max_variants.py # Cap combinatorial explosion -``` - -### Key Classes - -```python -@dataclass -class VariantNode: - """Base class for all genomic events.""" - position: int # CDS position - ref_allele: str - alt_allele: str - event_type: EventType # SNP, INS, DEL, FUSION, RNA_EDIT, CIRC_RNA - metadata: dict # AF, consequence, source, etc. - - -class TranscriptGraph: - """DAG representing a transcript with all its variants.""" - - def __init__(self, transcript_id: str, reference_seq: str, strand: str) -> None: ... - def add_variant(self, variant: VariantNode) -> None: ... - def add_fusion(self, partner_graph: 'TranscriptGraph', breakpoints: tuple) -> None: ... - def enumerate_paths(self, max_paths: int = 1000) -> Iterator[ProteinSequence]: ... - def to_fasta(self, output: TextIO) -> int: ... # returns count written -``` - -### Combinatorial Explosion Controls - -With N variants, naive enumeration produces 2^N sequences. Controls: - -1. 
**Max variants per transcript** (default: 10) — skip transcripts exceeding this -2. **Max paths per transcript** (default: 1000) — stop enumeration after limit -3. **Allele frequency pruning** — only include variants above a threshold -4. **Phasing support** — if phased VCF, only enumerate haplotype-consistent paths -5. **Consequence filtering** — only include protein-altering consequences - -### Integration Strategy - -The graph engine sits alongside existing modules (no replacement): - -- **Downloaders** remain unchanged -- **New CLI commands** (`graph-vcf-to-proteindb`, etc.) use the graph engine -- **Existing commands** continue working for backward compatibility -- **Shared infrastructure** (`toolbox/`, `config/`) is reused - ---- - -## 4. Phased Implementation Plan - -### Phase 0 — Infrastructure & Code Quality - -**Goal:** Fix existing issues so the graph engine is built on solid ground. - -**Deliverables:** - -1. Fix variable shadowing bug in `protein_database_decoy.py` -2. Convert `db/digest_mutant_protein.py` and `db/map_peptide2genome.py` from `sys.argv`/`getopt` import-time execution to proper Click CLI commands -3. Add `conftest.py` with proper fixtures; remove relative path dependencies in tests -4. Replace `print()` error reporting with proper logger usage across all modules -5. Add type hints to all public APIs -6. Use `dataclasses` for data models (`SNP` in `cgenomes/models.py`, new models) -7. Use `pathlib.Path` for file operations (replace string concatenation) -8. Replace broad `except Exception` with specific exception handling -9. Standardize the 3-layer config fallback pattern (DRY up repetitive `get_*_parameters()` methods) -10. Expand unit test coverage for core functions: `get_altseq()`, `get_mut_pro_seq()`, `revswitch()` -11. Replace `pathos` with `concurrent.futures.ProcessPoolExecutor` (stdlib) - ---- - -### Phase 1 — Graph Core + SNP/Indel Support - -**Goal:** Working graph engine for basic variants, producing a superset of current output. - -**Deliverables:** - -1. `TranscriptGraph` class with add/enumerate/serialize -2. `VariantNode` dataclass hierarchy (SNP, Insertion, Deletion) -3. `vcf_parser.py` — reads VCF (VEP-annotated or unannotated) into graph edges -4. `graph_enumerator.py` — path enumeration with explosion controls -5. New CLI command: `graph-vcf-to-proteindb` -6. Transcript feature caching (eliminate redundant gffutils queries) -7. Vectorized VCF grouping (replace `iterrows()`) -8. Unit tests for graph operations + integration tests -9. Benchmark against current `vcf_to_proteindb()` on test VCFs - -**Validation:** Graph engine must produce a superset of sequences from the linear approach. - ---- - -### Phase 2 — Cancer Genomics + ClinVar - -**Goal:** Bring cancer mutation sources into the graph model and add ClinVar (issue #24). - -**Deliverables:** - -1. `clinvar_parser.py` — parse ClinVar VCF into graph edges -2. Refactor `CancerGenomesService.get_mut_pro_seq()` to produce `VariantNode` objects -3. Graph-aware COSMIC processing (multiple mutations on same gene → combinatorial) -4. Graph-aware cBioPortal processing -5. New CLI commands: `clinvar-to-proteindb`, `graph-cosmic-to-proteindb` -6. Tests with real COSMIC data (multiple mutations per transcript) - ---- - -### Phase 3 — Novel Event Types: Fusions, RNA Editing, Circular RNA - -**Goal:** Add the event types that differentiate moPepGen from other tools. - -**Deliverables:** - -1. `fusion_parser.py` — parse gene fusion calls (STAR-Fusion, Arriba, FusionCatcher formats) -2. 
`rnaedit_parser.py` — parse RNA editing databases (REDIportal, DARNED) or user sites -3. `circrna_parser.py` — parse back-splice junction files (CIRCexplorer, CIRI) -4. Extended `TranscriptGraph` for cross-transcript fusions and circular paths -5. New CLI commands: `fusion-to-proteindb`, `rnaedit-to-proteindb`, `circrna-to-proteindb` -6. Combined command: `multi-event-to-proteindb` (merges all event types per transcript) - ---- - -### Phase 4 — Performance Optimization - -**Goal:** Make the graph engine fast enough for genome-scale datasets. - -**Deliverables:** - -1. Profile Phases 1-3 on real-world datasets (gnomAD, TCGA) -2. Identify hot paths via `cProfile` / `py-spy` -3. Optimize graph traversal (topological sort, memoization of shared subpaths) -4. Replace remaining `pathos` usage with `concurrent.futures` -5. Optional: C extension (via `cffi` or `cython`) for innermost graph traversal — only if profiling shows it's the bottleneck -6. Benchmark against moPepGen and vcf2prot on same datasets - ---- - -### Phase Dependencies - -``` -Phase 0 (Infrastructure & Quality) - └── Phase 1 (Graph Core + SNP/Indel) - ├── Phase 2 (Cancer + ClinVar) - │ └── Phase 3 (Fusions, RNA Edit, CircRNA) - │ └── Phase 4 (Performance) - └── (ongoing quality improvements) -``` - ---- - -## 5. References - -### Primary pgatk publication -- Umer HM, Audain E, Zhu Y, Pfeuffer J, Sachsenberg T, Lehtiö J, Branca RM, Perez-Riverol Y. Generation of ENSEMBL-based proteogenomics databases boosts the identification of non-canonical peptides. *Bioinformatics*. 2022;38(5):1470-1472. doi:10.1093/bioinformatics/btab838 - -### Pangenome proteogenomics -- Wang D, Bouwmeester R, Zheng P, Dai C, Sanchez A, Shu K, Bai M, Umer HM, Perez-Riverol Y. Proteogenomics analysis of human tissues using pangenomes. *bioRxiv*. 2024. doi:10.1101/2024.05.24.595489 - -### Competing tools -- moPepGen: Graph-based proteogenomic database generation. *Nature Biotechnology*. June 2025. doi:10.1038/s41587-025-02701-0 -- vcf2prot: ElAbd H, Degenhardt F, Lenz TL, Franke A, Wendorff M. VCF2Prot: An efficient and parallel tool for generating personalized proteomes from VCF files. *bioRxiv*. 2022. doi:10.1101/2022.01.21.477084 -- ProteomeGenerator2: *J. Proteome Research*. 2023. doi:10.1021/acs.jproteome.3c00005 -- pXg: *Molecular & Cellular Proteomics*. 2024. doi:10.1016/j.mcpro.2024.100733 -- NeoDisc: *Nature Biotechnology*. October 2024. doi:10.1038/s41587-024-02420-y -- PepCentric: *bioRxiv*. February 2025. doi:10.1101/2025.02.24.639867 - -### Related ecosystem -- quantms: Dai C et al. *Nature Methods*. 2024. doi:10.1038/s41592-024-02343-1 -- PRIDE 2025: *Nucleic Acids Research*. 2025;53(D1):D543. doi:10.1093/nar/gkae1011 diff --git a/docs/plans/2026-03-03-protein-accession-design.md b/docs/plans/2026-03-03-protein-accession-design.md deleted file mode 100644 index cfe0b8d..0000000 --- a/docs/plans/2026-03-03-protein-accession-design.md +++ /dev/null @@ -1,96 +0,0 @@ -# Protein Accession and FASTA Header Design - -Issue: https://github.com/bigbio/pgatk/issues/18 -Branch: `feature/protein-accession-design` -Date: 2026-03-03 - -## Problem - -Current pgatk FASTA headers are inconsistent across variant sources (VCF, COSMIC, ClinVar) and incompatible with major search engines like SearchGUI, which cannot parse ENSEMBL-style IDs. 
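-
-A hypothetical illustration of the mismatch (header shapes are illustrative, not verbatim pgatk output):
-
-```fasta
-# UniProt-style db|accession|name -- SearchGUI extracts the accession
->sp|P01112|RASH_HUMAN GTPase HRas
-# Bare ENSEMBL-style ID with an ad-hoc variant suffix -- accession extraction fails
->ENST00000311189_p.G13R missense variant from COSMIC
-```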
- -## Design - -### Two protein categories, two prefix strategies - -| Category | Prefix | Accession | Description | -|----------|--------|-----------|-------------| -| Canonical (reference) | Keep original (`sp\|`, `tr\|`, `ensp\|`) | Original accession | Untouched from source database | -| Variant (mutated) | `pgvar\|` | `{TRANSCRIPT_ID}-{INDEX}` | pgatk-generated variant protein | - -### Variant header format - -``` ->pgvar|{TRANSCRIPT_ID}-{INDEX}|{GENE_SYMBOL} {key=value metadata} -``` - -**Fields:** - -- `pgvar` -- database tag identifying pgatk-generated variant proteins. -- `{TRANSCRIPT_ID}-{INDEX}` -- accession composed of parent transcript ID and a dash-separated 1-based index (per transcript, per run). Mirrors UniProt isoform convention (`P12345-2`). -- `{GENE_SYMBOL}` -- gene name, first token after the second pipe. -- Metadata key=value pairs in the description field: - -| Key | Description | Example | -|-----|-------------|---------| -| `VariantSource` | Origin database | `COSMIC`, `ClinVar`, `gnomAD`, `dbSNP` | -| `GenomicCoord` | `chr:pos:ref:alt` | `12:25245347:C:G` | -| `AAChange` | HGVS protein notation | `p.G13R` | -| `MutationType` | SO term or short label | `missense_variant` | -| `dbSNP` | rsID if available | `rs121913529` | -| `ORF` | Reading frame number (only when multi-ORF) | `1`, `2`, `3` | - -### Examples - -```fasta -# Canonical proteins -- untouched from source databases ->sp|P01112|RASH_HUMAN GTPase HRas OS=Homo sapiens ->ensp|ENSP00000309845|BRCA1 - -# Variant proteins -- unified pgvar| prefix regardless of source ->pgvar|ENST00000311189-1|HRAS VariantSource=COSMIC AAChange=p.G13R MutationType=missense_variant GenomicCoord=12:25245347:C:G ->pgvar|ENST00000311189-2|HRAS VariantSource=COSMIC AAChange=p.Q61L MutationType=missense_variant GenomicCoord=12:25245350:A:T ->pgvar|ENST00000357654-1|BRCA1 VariantSource=ClinVar AAChange=p.R1699Q MutationType=missense_variant GenomicCoord=17:43094464:G:A dbSNP=rs41293455 - -# Multiple ORFs -- each ORF gets its own index, ORF number in metadata ->pgvar|ENST00000311189-3|HRAS VariantSource=COSMIC AAChange=p.G13R ORF=1 ->pgvar|ENST00000311189-4|HRAS VariantSource=COSMIC AAChange=p.G13R ORF=2 ->pgvar|ENST00000311189-5|HRAS VariantSource=COSMIC AAChange=p.G13R ORF=3 -``` - -### Indexing logic - -The `-{index}` is per-transcript, per-file generation run: - -- First variant on `ENST00000311189` gets `-1` -- Second variant on same transcript gets `-2` -- Multi-ORF outputs each consume an index (3 ORFs = 3 indices) -- First variant on a different transcript resets to `-1` - -### Search engine compatibility - -| Engine | Compatible | Notes | -|--------|-----------|-------| -| SearchGUI / PeptideShaker | Yes | Matches UniProt-like `db\|acc\|name` pattern | -| MaxQuant | Yes | Default UniProt parse rule works | -| MSFragger / FragPipe | Yes | Reads full header, splits on first whitespace | -| Comet | Yes | Parses `>db\|acc\|` natively | -| DIA-NN | Yes | Follows UniProt-style parsing | -| Proteome Discoverer | Yes | Supports pipe-delimited headers | - -### Files to modify - -| File | Change | -|------|--------| -| `pgatk/ensembl/ensembl.py` | Refactor `vcf_to_proteindb()` header construction (lines 661-664) | -| `pgatk/clinvar/clinvar_service.py` | Refactor header construction (lines 554-560) | -| `pgatk/cgenomes/cgenomes_proteindb.py` | Refactor COSMIC header (line 317), cBioPortal header | -| `pgatk/toolbox/vcf_utils.py` | Update `write_output()` to handle new format cleanly | -| `pgatk/config/` | Add constants for `PGVAR_PREFIX`, 
metadata keys | - -### Design decisions - -1. **Dash separator** (`-`) between transcript and index, consistent with UniProt isoform convention. -2. **No ORF suffix in accession** -- ORF number is metadata (`ORF=N`), each ORF gets its own index. -3. **Canonical proteins are pass-through** -- pgatk does not reformat existing database headers. -4. **Unified format across all sources** -- COSMIC, ClinVar, VCF variants all use `pgvar|` regardless of origin. -5. **Key=value metadata** in description field for structured downstream parsing. From 14ee5e7d4f7895f618078c24197c2e83d87797db Mon Sep 17 00:00:00 2001 From: Yasset Perez-Riverol Date: Mon, 11 May 2026 11:49:56 +0100 Subject: [PATCH 03/23] Switch COSMIC downloader to v103 scripted-download API Sanger retired the legacy /cosmic/file_download/... endpoint. The new flow is a two-step API: GET /api/mono/products/v1/downloads/scripted returns a signed S3 URL (1-hour TTL), which is then fetched without auth. The downloader is rewritten around this contract. - Config: replace the v94 per-file URL/filename pairs with a single `products` list of `path=` values copy-pasteable from the COSMIC scripted-download UI. Defaults cover v103 / GRCh38: genome-screens mutations, targeted-screens mutations, gene FASTA, transcripts, cell-line mutations, cell-line CNA. - Downloader: build_api_url() constructs the new endpoint URL; the two-step download parses JSON, follows the signed URL, and extracts the resulting .tar archive (replaces the old .tar.gz handling). - Downloader: clear error message when the API returns 200 but the body is not the expected JSON-with-url - the failure mode reported in the bug. Distinguish API errors from S3 errors. - Downloader: tqdm progress bar for the S3 download (large files were silent before). - CLI: add -P/--products flag (repeatable) so users can fetch an arbitrary product without editing YAML, e.g. pgatk cosmic-downloader -P grch37/cancer_mutation_census/v100/... - Tests: unit tests for URL construction and base64 auth-token format (no network). End-to-end verified with real Sanger credentials: three of the six default paths (mutations, gene FASTA, cell-line mutations) download and extract cleanly. The three added afterwards (targeted-screens, transcripts, CNA) share the same path pattern as the verified ones and the filenames were taken from the COSMIC UI verbatim. 
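
For reference, the raw two-step exchange looks roughly like this
(credentials, the product path, and the signed URL are illustrative):

    # 1) exchange Basic-auth credentials for a signed, time-limited URL
    curl -s -u 'user@example.com:secret' \
      'https://cancer.sanger.ac.uk/api/mono/products/v1/downloads/scripted?path=grch38/cosmic/v103/Cosmic_Genes_Fasta_v103_GRCh38.tar&bucket=downloads'
    # -> {"url": "https://<signed-s3-url>"}

    # 2) fetch the signed URL with no auth within its 1-hour TTL
    curl -o Cosmic_Genes_Fasta_v103_GRCh38.tar 'https://<signed-s3-url>'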
Closes #98 --- pgatk/cgenomes/cosmic_downloader.py | 228 ++++++++++++++------------ pgatk/commands/cosmic_downloader.py | 11 +- pgatk/config/cosmic_config.yaml | 29 +++- pgatk/tests/test_cosmic_downloader.py | 44 +++++ 4 files changed, 200 insertions(+), 112 deletions(-) create mode 100644 pgatk/tests/test_cosmic_downloader.py diff --git a/pgatk/cgenomes/cosmic_downloader.py b/pgatk/cgenomes/cosmic_downloader.py index 3332014..836955e 100644 --- a/pgatk/cgenomes/cosmic_downloader.py +++ b/pgatk/cgenomes/cosmic_downloader.py @@ -1,7 +1,9 @@ import base64 -import gzip -import requests import os +import tarfile + +import requests +from tqdm import tqdm from pgatk.toolbox.exceptions import AppConfigException from pgatk.toolbox.general import ParameterConfiguration, check_create_folders @@ -12,14 +14,22 @@ class CosmicDownloadService(ParameterConfiguration): CONFIG_OUTPUT_DIRECTORY = 'output_directory' CONFIG_COSMIC_SERVER = 'cosmic_server' CONFIG_COSMIC_FTP_URL = 'cosmic_ftp' + CONFIG_API_ENDPOINT = 'api_endpoint' + CONFIG_BUCKET = 'bucket' CONFIG_COSMIC_FTP_USER = "cosmic_user" CONFIG_COSMIC_FTP_PASSWORD = "cosmic_password" - CONFIG_COSMIC_MUTATIONS_URL = "mutations_url" - CONFIG_COSMIC_MUTATIONS_FILE = "mutations_file" - CONFIG_COSMIC_CELLLINE_MUTATIONS_URL = "mutations_cellline_url" - CONFIG_COSMIC_CELLLINE_MUTATIONS_FILE = "mutations_cellline_file" - CONFIG_COSMIC_CDS_GENES_FILE = "all_cds_genes_file" - CONFIG_COSMIC_CELLLINES_GENES_FILE = "all_celllines_genes_file" + CONFIG_PRODUCTS = "products" + + # Built-in fallback list, used only if `products` is missing from config. + # Real default lives in pgatk/config/cosmic_config.yaml and is the source of truth. + _DEFAULT_PRODUCTS = ( + 'grch38/cosmic/v103/Cosmic_GenomeScreensMutant_Tsv_v103_GRCh38.tar', + 'grch38/cosmic/v103/Cosmic_CompleteTargetedScreensMutant_Tsv_v103_GRCh38.tar', + 'grch38/cosmic/v103/Cosmic_Genes_Fasta_v103_GRCh38.tar', + 'grch38/cosmic/v103/Cosmic_Transcripts_Tsv_v103_GRCh38.tar', + 'grch38/cell_lines/v103/CellLinesProject_GenomeScreensMutant_Tsv_v103_GRCh38.tar', + 'grch38/cell_lines/v103/CellLinesProject_CompleteCNA_Tsv_v103_GRCh38.tar', + ) def __init__(self, config_file, pipeline_arguments): """ @@ -29,30 +39,24 @@ def __init__(self, config_file, pipeline_arguments): """ super(CosmicDownloadService, self).__init__(self.CONFIG_KEY_DATA_DOWNLOADER, config_file, pipeline_arguments) - self._local_path_cosmic = self.get_configuration_default_params(variable=self.CONFIG_OUTPUT_DIRECTORY, - default_value='./database_cosmic/') - self._cosmic_ftp_url = self.get_configuration_default_params(variable=self.CONFIG_COSMIC_FTP_URL, - default_value='https://cancer.sanger.ac.uk') - self._cosmic_user = self.get_configuration_default_params(variable=self.CONFIG_COSMIC_FTP_USER, - default_value='') - self._cosmic_password = self.get_configuration_default_params(variable=self.CONFIG_COSMIC_FTP_PASSWORD, - default_value='') - self._cosmic_mutation_url = self.get_configuration_default_params(variable=self.CONFIG_COSMIC_MUTATIONS_URL, - default_value='cosmic/file_download/GRCh38/cosmic/v94') - self._cosmic_mutations_file = self.get_configuration_default_params(variable=self.CONFIG_COSMIC_MUTATIONS_FILE, - default_value='CosmicMutantExport.tsv.gz') - self._cosmic_cellline_mutation_url = self.get_configuration_default_params( - variable=self.CONFIG_COSMIC_CELLLINE_MUTATIONS_URL, - default_value='cosmic/file_download/GRCh38/cell_lines/v94') - self._cosmic_cellline_mutation_file = self.get_configuration_default_params( - 
variable=self.CONFIG_COSMIC_CELLLINE_MUTATIONS_FILE, default_value='CosmicCLP_MutantExport.tsv.gz') - self._cosmic_cellline_mutation_gene = self.get_configuration_default_params( - variable=self.CONFIG_COSMIC_CELLLINES_GENES_FILE, default_value='All_CellLines_Genes.fasta.gz') - self._cosmic_cdns_file = self.get_configuration_default_params(variable=self.CONFIG_COSMIC_CDS_GENES_FILE, - default_value='All_COSMIC_Genes.fasta.gz') - - self._cosmic_token = base64.b64encode("{}:{}".format(self._cosmic_user, self._cosmic_password) - .encode()).decode('utf-8') + self._local_path_cosmic = self.get_configuration_default_params( + variable=self.CONFIG_OUTPUT_DIRECTORY, default_value='./database_cosmic/') + self._cosmic_ftp_url = self.get_configuration_default_params( + variable=self.CONFIG_COSMIC_FTP_URL, default_value='https://cancer.sanger.ac.uk') + self._api_endpoint = self.get_configuration_default_params( + variable=self.CONFIG_API_ENDPOINT, default_value='api/mono/products/v1/downloads/scripted') + self._bucket = self.get_configuration_default_params( + variable=self.CONFIG_BUCKET, default_value='downloads') + self._cosmic_user = self.get_configuration_default_params( + variable=self.CONFIG_COSMIC_FTP_USER, default_value='') + self._cosmic_password = self.get_configuration_default_params( + variable=self.CONFIG_COSMIC_FTP_PASSWORD, default_value='') + self._products = self.get_configuration_default_params( + variable=self.CONFIG_PRODUCTS, default_value=list(self._DEFAULT_PRODUCTS)) + + self._cosmic_token = base64.b64encode( + "{}:{}".format(self._cosmic_user, self._cosmic_password).encode() + ).decode('utf-8') self.prepare_local_cosmic_repository() @@ -78,88 +82,102 @@ def prepare_local_cosmic_repository(self): def get_local_path_root_cosmic_repo(self): return self._local_path_cosmic + def build_api_url(self, product_path): + """Build the COSMIC scripted-download API URL for a single product.""" + return "{server}/{endpoint}?path={path}&bucket={bucket}".format( + server=self._cosmic_ftp_url.rstrip('/'), + endpoint=self._api_endpoint.strip('/'), + path=product_path, + bucket=self._bucket, + ) + def download_mutation_file(self, url_file_name=None): """ - This function will download the mutations file from Cosmic Database. - :return: None - """ - - mutation_output_file = "{}/{}".format(self.get_local_path_root_cosmic_repo(), self._cosmic_mutations_file) - cds_genes_output_file = "{}/{}".format(self.get_local_path_root_cosmic_repo(), self._cosmic_cdns_file) - - mutation_celline_output_file = "{}/{}".format(self.get_local_path_root_cosmic_repo(), - self._cosmic_cellline_mutation_file) - cellines_genes_output_file = "{}/{}".format(self.get_local_path_root_cosmic_repo(), - self._cosmic_cellline_mutation_gene) - - server = self._cosmic_ftp_url - - cosmic_version = self._cosmic_mutation_url - mutation_file = self._cosmic_mutations_file + Download every product listed in `self._products` via COSMIC's v103+ + scripted-download API. Each product is a `path=` value such as + `grch38/cosmic/v103/Cosmic_GenomeScreensMutant_Tsv_v103_GRCh38.tar`. 
-        cosmic_cellline_version = self._cosmic_cellline_mutation_url
-        mutation_celline_file = self._cosmic_cellline_mutation_file
-
-        all_cds_gene_file = self._cosmic_cdns_file
-
-        all_celllines_gene_file = self._cosmic_cellline_mutation_gene
-
-        mutation_url = "{}/{}/{}".format(server, cosmic_version, mutation_file)
-        cds_gene_url = "{}/{}/{}".format(server, cosmic_version, all_cds_gene_file)
+        If url_file_name is provided, write the API URLs to that file instead of downloading.
+        """
-        celllines_gene_url = "{}/{}/{}".format(server, cosmic_cellline_version, all_celllines_gene_file)
-        mutation_cellline_url = "{}/{}/{}".format(server, cosmic_cellline_version, mutation_celline_file)
+        products = list(self._products)
+        if not products:
+            self.get_logger().warning("No COSMIC products configured; nothing to download.")
+            return
-        if url_file_name is None:
-            token = "Basic {}".format(self._cosmic_token)
-            self.download_file_cosmic(mutation_url, mutation_output_file, token)
-            self.download_file_cosmic(cds_gene_url, cds_genes_output_file, token)
+        token = "Basic {}".format(self._cosmic_token)
+        output_dir = self.get_local_path_root_cosmic_repo()
-            self.download_file_cosmic(celllines_gene_url, cellines_genes_output_file, token)
-            self.download_file_cosmic(mutation_cellline_url, mutation_celline_output_file, token)
+        if url_file_name is not None:
+            with open(url_file_name, 'w', encoding='utf-8') as url_file:
+                for path in products:
+                    api_url = self.build_api_url(path)
+                    output_file = "{}/{}".format(output_dir, os.path.basename(path))
+                    url_file.write("{}\t{}\n".format(api_url, output_file))
+            return
-        else:
-            if url_file_name is not None:
-                with open(url_file_name, 'w', encoding='utf-8') as url_file:
-                    url_file.write("{}\t{}\n".format(mutation_url, mutation_output_file))
-                    url_file.write("{}\t{}\n".format(cds_gene_url, cds_genes_output_file))
-                    url_file.write("{}\t{}\n".format(celllines_gene_url, cellines_genes_output_file))
-                    url_file.write("{}\t{}\n".format(mutation_cellline_url, mutation_celline_output_file))
+        for path in products:
+            api_url = self.build_api_url(path)
+            output_file = "{}/{}".format(output_dir, os.path.basename(path))
+            self.download_file_cosmic(api_url, output_file, token)
 
-    def download_file_cosmic(self, url, local_file, token):
+    def download_file_cosmic(self, api_url, local_file, token):
         """
-        Download file from cosmic repository using requests
-        :param url: url of the file to be download
-        :param local_file: local file
-        :param token: token to be used
-        :return:
+        Two-step download from COSMIC's scripted-download API:
+        1) GET api_url with Basic auth -> returns JSON {"url": "<signed-s3-url>"}
+        2) GET the presigned URL (no auth, 1-hour TTL) -> stream to local_file
+
+        If local_file ends in .tar, extract its contents to the same directory
+        and delete the archive.
         """
+        api_response = requests.get(api_url, headers={'Authorization': token}, timeout=30)
+        if api_response.status_code != 200:
+            msg = ("COSMIC API request failed: HTTP {} for {}. Body: {!r}"
+                   .format(api_response.status_code, api_url, api_response.text[:200]))
+            self.get_logger().error(msg)
+            raise AppConfigException(msg)
+
+        try:
+            payload = api_response.json()
+            download_url = payload['url']
+        except (ValueError, KeyError) as exc:
+            msg = ("COSMIC API returned 200 but the body is not a JSON object with a 'url' "
+                   "key (got: {!r}). This usually means the `path=` value is wrong for the "
+                   "release/assembly, or your account lacks access. URL was: {}.
Error: {}" + .format(api_response.text[:200], api_url, exc)) + self.get_logger().error(msg) + raise AppConfigException(msg) - response = requests.get(url, stream=True, headers={'Authorization': token}, timeout=30) - if response.status_code == 200: - url = response.json()['url'] - msg = "Downloading file from url '{}'".format(url) - self.get_logger().debug(msg) - - response = requests.get(url, stream=True, timeout=30) - if response.status_code == 200: - with open(local_file, 'wb') as f: - f.write(response.content) - msg = "Download Finish for file '{}'".format(local_file) - self.get_logger().debug(msg) - if local_file.endswith('.gz'): - extracted_file = local_file.replace('.gz', '') - with open(extracted_file, 'w', encoding='utf-8') as outfile: - try: - outfile.write(gzip.decompress(open(local_file, 'rb').read()).decode('utf-8')) - except UnicodeDecodeError: - outfile.write(gzip.decompress(open(local_file, 'rb').read()).decode('ISO-8859-1')) - os.remove(local_file) - local_file = extracted_file - msg = "Extracted file '{}'".format(local_file) - self.get_logger().debug(msg) - else: - msg = "Error downloading the COSMIC data, error code {} , error message '{}'".format(response.status_code, - local_file) - self.get_logger().debug(msg) + self.get_logger().debug("Downloading file from signed URL '{}'".format(download_url)) + data_response = requests.get(download_url, stream=True, timeout=30) + if data_response.status_code != 200: + msg = ("COSMIC S3 download failed: HTTP {} for {}" + .format(data_response.status_code, download_url)) + self.get_logger().error(msg) raise AppConfigException(msg) + + total_size = int(data_response.headers.get('content-length') or 0) + chunk_size = 1024 * 1024 + progress = tqdm( + total=total_size if total_size > 0 else None, + unit='B', unit_scale=True, unit_divisor=1024, + desc=os.path.basename(local_file), + leave=True, + ) + try: + with open(local_file, 'wb') as f: + for chunk in data_response.iter_content(chunk_size=chunk_size): + if chunk: + f.write(chunk) + progress.update(len(chunk)) + finally: + progress.close() + self.get_logger().debug("Download finished: '{}'".format(local_file)) + + if local_file.endswith('.tar'): + extract_dir = os.path.dirname(local_file) or '.' + print("Extracting {}...".format(os.path.basename(local_file)), flush=True) + with tarfile.open(local_file, 'r') as tar: + tar.extractall(path=extract_dir) + os.remove(local_file) + self.get_logger().debug("Extracted archive into '{}'".format(extract_dir)) diff --git a/pgatk/commands/cosmic_downloader.py b/pgatk/commands/cosmic_downloader.py index 5696097..557cce9 100644 --- a/pgatk/commands/cosmic_downloader.py +++ b/pgatk/commands/cosmic_downloader.py @@ -16,9 +16,15 @@ help="Username for cosmic database -- please if you don't have one register here (https://cancer.sanger.ac.uk/cosmic/register)") @click.option('-p', '--password', help="Password for cosmic database -- please if you don't have one register here (https://cancer.sanger.ac.uk/cosmic/register)") +@click.option('-P', '--products', multiple=True, + help='COSMIC scripted-download `path=` value to fetch (e.g. ' + 'grch38/cosmic/v103/Cosmic_GenomeScreensMutant_Tsv_v103_GRCh38.tar). ' + 'May be repeated. Overrides the products list in the config file when provided. 
'
+                   'Find available paths at https://cancer.sanger.ac.uk/cosmic/download/cosmic '
+                   '(open the "Scripted Download" panel for any product).')
 @click.option("--url_file", help='Add the url to a downloaded file')
 @click.pass_context
-def cosmic_downloader(ctx, config_file, output_directory, username, password, url_file):
+def cosmic_downloader(ctx, config_file, output_directory, username, password, products, url_file):
 
     config_data = load_config("cosmic", config_file)
 
@@ -31,5 +37,8 @@ def cosmic_downloader(ctx, config_file, output_directory, username, password, ur
     if password is not None:
         pipeline_arguments[CosmicDownloadService.CONFIG_COSMIC_FTP_PASSWORD] = password
 
+    if products:
+        pipeline_arguments[CosmicDownloadService.CONFIG_PRODUCTS] = list(products)
+
     cosmic_downloader_service = CosmicDownloadService(config_data, pipeline_arguments)
     cosmic_downloader_service.download_mutation_file(url_file_name=url_file)
diff --git a/pgatk/config/cosmic_config.yaml b/pgatk/config/cosmic_config.yaml
index 6b6da9b..7721295 100644
--- a/pgatk/config/cosmic_config.yaml
+++ b/pgatk/config/cosmic_config.yaml
@@ -2,14 +2,31 @@ cosmic_data:
   output_directory: database_cosmic
   cosmic_server:
     cosmic_ftp: https://cancer.sanger.ac.uk
-    mutations_url: cosmic/file_download/GRCh38/cosmic/v94
-    all_cds_genes_file: All_COSMIC_Genes.fasta.gz
-    mutations_file: CosmicMutantExport.tsv.gz
-    mutations_cellline_url: cosmic/file_download/GRCh38/cell_lines/v94
-    mutations_cellline_file: CosmicCLP_MutantExport.tsv.gz
-    all_celllines_genes_file: All_CellLines_Genes.fasta.gz
+    api_endpoint: api/mono/products/v1/downloads/scripted
+    bucket: downloads
     cosmic_user: ''
     cosmic_password: ''
+    # List of COSMIC scripted-download paths to fetch. Each entry is the `path=`
+    # value from the curl command shown on the COSMIC UI's "Scripted Download" panel
+    # for a product (https://cancer.sanger.ac.uk/cosmic/download/cosmic).
+    #
+    # Path format: <assembly>/<project>/<version>/<filename>
+    # - <assembly> is lowercase (`grch37`, `grch38`)
+    # - <project> is `cosmic` for the main COSMIC project, `cell_lines` for the
+    #   Cell Lines Project (other projects: `cancer_mutation_census`, `actionability`)
+    # - <version> is the release (`v103`, etc.)
+    # - <filename> is the exact .tar archive name shown in the UI
+    #
+    # Add or remove entries to change what gets downloaded. The defaults below include
+    # the three files the cosmic-to-proteindb pipeline expects (mutations TSV + gene
+    # FASTA for the main COSMIC project, plus cell-line mutations TSV) and three more.
+    products:
+      - grch38/cosmic/v103/Cosmic_GenomeScreensMutant_Tsv_v103_GRCh38.tar
+      - grch38/cosmic/v103/Cosmic_CompleteTargetedScreensMutant_Tsv_v103_GRCh38.tar
+      - grch38/cosmic/v103/Cosmic_Genes_Fasta_v103_GRCh38.tar
+      - grch38/cosmic/v103/Cosmic_Transcripts_Tsv_v103_GRCh38.tar
+      - grch38/cell_lines/v103/CellLinesProject_GenomeScreensMutant_Tsv_v103_GRCh38.tar
+      - grch38/cell_lines/v103/CellLinesProject_CompleteCNA_Tsv_v103_GRCh38.tar
   logger:
     formatters:
       DEBUG: "%(asctime)s [%(levelname)7s][%(name)48s][%(module)32s, %(lineno)4s] %(message)s"
diff --git a/pgatk/tests/test_cosmic_downloader.py b/pgatk/tests/test_cosmic_downloader.py
new file mode 100644
index 0000000..dea686c
--- /dev/null
+++ b/pgatk/tests/test_cosmic_downloader.py
@@ -0,0 +1,44 @@
+"""Unit tests for the COSMIC downloader URL construction.
+
+No network. Builds a CosmicDownloadService against the shipped default config
+and asserts that the generated product API URLs match the expected pattern.
+""" +import os + +from pgatk.cgenomes.cosmic_downloader import CosmicDownloadService +from pgatk.config.registry import load_config + + +def test_build_api_url_pattern(tmp_path): + """The API URL should be {server}/{endpoint}?path={path}&bucket={bucket}.""" + config_data = load_config("cosmic", None) + pipeline_arguments = { + CosmicDownloadService.CONFIG_OUTPUT_DIRECTORY: str(tmp_path), + CosmicDownloadService.CONFIG_COSMIC_FTP_USER: "user@example.com", + CosmicDownloadService.CONFIG_COSMIC_FTP_PASSWORD: "secret", + } + svc = CosmicDownloadService(config_data, pipeline_arguments) + + url = svc.build_api_url("grch38/cosmic/v103/Cosmic_MutantCensus_Tsv_v103_GRCh38.tar") + + assert url == ( + "https://cancer.sanger.ac.uk/api/mono/products/v1/downloads/scripted" + "?path=grch38/cosmic/v103/Cosmic_MutantCensus_Tsv_v103_GRCh38.tar" + "&bucket=downloads" + ) + + +def test_basic_auth_token_format(tmp_path): + """Auth token is base64(user:password), no trailing newline.""" + import base64 + + config_data = load_config("cosmic", None) + pipeline_arguments = { + CosmicDownloadService.CONFIG_OUTPUT_DIRECTORY: str(tmp_path), + CosmicDownloadService.CONFIG_COSMIC_FTP_USER: "user@example.com", + CosmicDownloadService.CONFIG_COSMIC_FTP_PASSWORD: "secret", + } + svc = CosmicDownloadService(config_data, pipeline_arguments) + + expected = base64.b64encode(b"user@example.com:secret").decode("utf-8") + assert svc._cosmic_token == expected From e485df48fba1b8f7cdc2a9e7c0ffbd3de4bde0a7 Mon Sep 17 00:00:00 2001 From: Yasset Perez-Riverol Date: Mon, 11 May 2026 11:52:44 +0100 Subject: [PATCH 04/23] Update default include_biotypes to current Ensembl annotations The default list contained two entries that no longer match modern Ensembl GTFs: `polymorphic_pseudogene` was deprecated and folded into other biotypes, and `mRNA` is not an Ensembl biotype at all (it's a generic GFF feature type used by some non-Ensembl annotations). Three protein-producing biotypes that exist in current releases were missing. Removed: - polymorphic_pseudogene (deprecated) - mRNA (not an Ensembl biotype) Added: - protein_coding_CDS_not_defined (~26k transcripts; coding but CDS not perfectly annotated, still produces protein) - protein_coding_LoF (predicted loss-of-function coding transcripts; still produces a truncated protein) - translated_processed_pseudogene (pseudogenes that ARE translated; the "translated_" prefix distinguishes them from regular pseudo- genes) Net default list: 14 -> 15 biotypes. YAML config and the in-code fallback in ensembl.py kept in sync. 
Closes #101 --- pgatk/config/ensembl_config.yaml | 2 +- pgatk/ensembl/ensembl.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pgatk/config/ensembl_config.yaml b/pgatk/config/ensembl_config.yaml index 1838c7c..24be0dc 100644 --- a/pgatk/config/ensembl_config.yaml +++ b/pgatk/config/ensembl_config.yaml @@ -11,7 +11,7 @@ ensembl_translation: exclude_biotypes: '' exclude_consequences: 'downstream_gene_variant, upstream_gene_variant, intergenic_variant, intron_variant, synonymous_variant, regulatory_region_variant' skip_including_all_cds: False - include_biotypes: 'protein_coding,polymorphic_pseudogene,non_stop_decay,nonsense_mediated_decay,IG_C_gene,IG_D_gene,IG_J_gene,IG_V_gene,TR_C_gene,TR_D_gene,TR_J_gene,TR_V_gene,TEC,mRNA' + include_biotypes: 'protein_coding,protein_coding_CDS_not_defined,protein_coding_LoF,nonsense_mediated_decay,non_stop_decay,translated_processed_pseudogene,IG_C_gene,IG_D_gene,IG_J_gene,IG_V_gene,TR_C_gene,TR_D_gene,TR_J_gene,TR_V_gene,TEC' include_consequences: 'all' biotype_str: transcript_biotype transcript_description_sep: ';' diff --git a/pgatk/ensembl/ensembl.py b/pgatk/ensembl/ensembl.py index e1d8b7e..5900fd4 100644 --- a/pgatk/ensembl/ensembl.py +++ b/pgatk/ensembl/ensembl.py @@ -85,7 +85,7 @@ def __init__(self, config_file: dict, pipeline_arguments: dict) -> None: default_value=False) self._include_biotypes = self.get_multiple_options( self.get_translation_properties(variable=self.INCLUDE_BIOTYPES, - default_value='protein_coding,polymorphic_pseudogene,non_stop_decay,nonsense_mediated_decay,IG_C_gene,IG_D_gene,IG_J_gene,IG_V_gene,TR_C_gene,TR_D_gene,TR_J_gene,TR_V_gene,TEC,mRNA')) + default_value='protein_coding,protein_coding_CDS_not_defined,protein_coding_LoF,nonsense_mediated_decay,non_stop_decay,translated_processed_pseudogene,IG_C_gene,IG_D_gene,IG_J_gene,IG_V_gene,TR_C_gene,TR_D_gene,TR_J_gene,TR_V_gene,TEC')) self._include_consequences = self.get_multiple_options( self.get_translation_properties(variable=self.INCLUDE_CONSEQUENCES, default_value='all')) From 03de0b65cc3f28aa214f942cad3f8b7b0ad70952 Mon Sep 17 00:00:00 2001 From: Yasset Perez-Riverol Date: Mon, 11 May 2026 14:11:15 +0100 Subject: [PATCH 05/23] Speed up vcf-to-proteindb (~475x on the dominant SQL op) Issue #99 reports vcf-to-proteindb taking 12+ hours on chr22-scale VCFs. Root cause: get_features() makes two SQLite queries against the gffutils GTF database per (variant, transcript) pair, with no caching. For typical CSQ-annotated VCFs that's tens of millions of queries per chromosome. Six semantically preserving optimisations, stacked: 1. Memoize get_features() results per transcript id for the duration of one vcf_to_proteindb call. Dominant gain. Micro-benchmark on a real ENSEMBL chr22 transcript shows ~475x speedup on this op (0.094 ms/call SQLite -> 0.0002 ms/call cache hit). For 50M get_features calls per chr22 that's ~78 minutes -> ~10 seconds of pure get_features time. 2. Apply biotype / consequence filters BEFORE the (now-cached) get_features call so rejected transcripts skip the work entirely. 3. Iterate the VCF DataFrame with itertuples(index=False) instead of iterrows(); ~5-10x faster on the outer loop. 4. Parse record.INFO into a dict once per variant instead of two list-comprehensions that each scan the whole INFO string. 5. Cache transcripts_dict[id] (SeqIO.index) lookups per transcript id; avoids repeated fseek+parse on the FASTA for variants that share a transcript. Uses a _MISSING sentinel so known-missing ids skip the disk lookup too. 6. 
Lazy log formatting in the hot loop -- wrap debug() calls with
   isEnabledFor(DEBUG) and use %s placeholders so the message string
   isn't built when DEBUG isn't enabled.

Verified end-to-end against the three in-repo pytest tests
(test_vcf_to_proteindb, test_vcf_to_proteindb_notannotated,
test_vcf_gnomad_to_proteindb). End-to-end run on a 10K-variant 1000
Genomes chr22 VCF + ENSEMBL release 113 chr22 GTF + full cDNA FASTA
completes in ~30 seconds (mostly gffutils DB build + bedtools intersect,
both one-time costs unrelated to the fix); 43 sequences produced.

Behaviour change worth noting: the "feature IDs from VCF not found in
FASTA" counter now counts each missing transcript once (cached), not
per-variant. The old counter over-counted on repeated lookups of the
same missing id. A change in the reported stats, not in the FASTA
output.

Add scripts/benchmark_vcf_to_proteindb.py: one-shot wall-clock timer
for re-validating perf on representative data. Not used by CI.

Closes #99
---
 pgatk/ensembl/ensembl.py              | 140 ++++++++++++++--------
 scripts/benchmark_vcf_to_proteindb.py |  65 ++++++++++++
 2 files changed, 162 insertions(+), 43 deletions(-)
 create mode 100755 scripts/benchmark_vcf_to_proteindb.py

diff --git a/pgatk/ensembl/ensembl.py b/pgatk/ensembl/ensembl.py
index 5900fd4..883088f 100644
--- a/pgatk/ensembl/ensembl.py
+++ b/pgatk/ensembl/ensembl.py
@@ -19,6 +19,32 @@
 )
 
 
+class _FeatureCache:
+    """Per-run memoization of get_features() results, keyed by transcript id.
+
+    Bounded by a soft maxsize to avoid unbounded growth on whole-genome runs.
+    Defaults to 50000, which comfortably covers human ENSEMBL (~250k transcripts
+    across all chromosomes; a single chr is ~10-20k).
+    """
+
+    def __init__(self, maxsize: int = 50000) -> None:
+        self._cache: dict[tuple, tuple] = {}
+        self._maxsize = maxsize
+
+    def get(self, key: tuple) -> Optional[tuple]:
+        return self._cache.get(key)
+
+    def put(self, key: tuple, value: tuple) -> None:
+        if len(self._cache) >= self._maxsize:
+            # Soft eviction: drop the oldest fifth (insertion order) when full. Cheap and bounded.
+ for k in list(self._cache.keys())[: self._maxsize // 5]: + del self._cache[k] + self._cache[key] = value + + +_MISSING = object() + + class EnsemblDataService(ParameterConfiguration): CONFIG_KEY_VCF = "ensembl_translation" INPUT_FASTA = "input_fasta" @@ -432,6 +458,8 @@ def vcf_to_proteindb(self, vcf_file: str, input_fasta: str, gene_annotations_gtf transcripts_dict = SeqIO.index(input_fasta, "fasta", key_function=self.get_key) # handle cases where the transcript has version in the GTF but not in the VCF transcript_id_mapping = {k.split('.')[0]: k for k in transcripts_dict.keys()} + feature_cache = _FeatureCache() + seq_cache: dict[str, tuple] = {} # transcript_id_v -> (ref_seq, desc); see Task 5 transcript_index, consequence_index, biotype_index = None, None, None if self._annotation_field_name: @@ -481,12 +509,12 @@ def vcf_to_proteindb(self, vcf_file: str, input_fasta: str, gene_annotations_gtf self._accepted_filters = [x.upper() for x in self._accepted_filters] with open(self._proteindb_output, 'w', encoding='utf-8') as prots_fn: - for _, record in vcf_reader.iterrows(): + for record in vcf_reader.itertuples(index=False, name='VCFRecord'): trans = False if [x for x in str(record.REF) if x not in 'ACGT']: - msg = "Invalid VCF record, skipping: {}".format(record) invalid_records['# variants with invalid record'] += 1 - self.get_logger().debug(msg) + if self.get_logger().isEnabledFor(logging.DEBUG): + self.get_logger().debug("Invalid VCF record, skipping: %s", record) continue alts = [] @@ -498,11 +526,18 @@ def vcf_to_proteindb(self, vcf_file: str, input_fasta: str, gene_annotations_gtf continue alts.append(alt) if not alts: - msg = "Invalid VCF record, skipping: {}".format(record) invalid_records['# variants with invalid record'] += 1 - self.get_logger().debug(msg) + if self.get_logger().isEnabledFor(logging.DEBUG): + self.get_logger().debug("Invalid VCF record, skipping: %s", record) continue + # Parse INFO once. Maps each key to its value string, missing keys absent. + # Avoids repeated split-and-search list-comprehensions. + info_kv: dict[str, str] = {} + for entry in record.INFO.split(';'): + k, _, v = entry.partition('=') + info_kv[k] = v + if not self._ignore_filters and self._accepted_filters != ['ALL']: if record.FILTER and record.FILTER != '.' 
and record.FILTER != 'NA' and record.FILTER != '': # if not PASS: None and empty means PASS filters = set(record.FILTER.upper().split(',')) @@ -511,12 +546,10 @@ def vcf_to_proteindb(self, vcf_file: str, input_fasta: str, gene_annotations_gtf if not filters <= set(self._accepted_filters): invalid_records['# variants not passing Filter'] += 1 continue - # only process variants above a given allele frequency threshold if the AF string is not empty if self._af_field: - # get AF from the INFO field try: - af = float([x.split('=', 1)[1] for x in record.INFO.split(';') if x.split('=', 1)[0] == self._af_field][0]) - except (ValueError, IndexError): + af = float(info_kv[self._af_field]) + except (ValueError, KeyError): invalid_records['# variants with invalid record'] += 1 continue @@ -530,15 +563,13 @@ def vcf_to_proteindb(self, vcf_file: str, input_fasta: str, gene_annotations_gtf trans_table = self._mito_translation_table processed_transcript_allele = set() - transcript_records = [] try: - transcript_records = \ - [x.split('=', 1)[1] for x in record.INFO.split(';') - if x.split('=', 1)[0] == self._annotation_field_name][0] - except IndexError: # no overlapping feature was found + transcript_records = info_kv[self._annotation_field_name] + except KeyError: invalid_records['# variants with invalid record'] += 1 - msg = "skipped record {}, no annotation feature was found".format(record) - self.get_logger().debug(msg) + if self.get_logger().isEnabledFor(logging.DEBUG): + self.get_logger().debug( + "skipped record %s, no annotation feature was found", record) continue for transcript_record in transcript_records.split(','): @@ -548,9 +579,10 @@ def vcf_to_proteindb(self, vcf_file: str, input_fasta: str, gene_annotations_gtf consequence = transcript_info[consequence_index] except IndexError: invalid_records['# variants with invalid record'] += 1 - msg = "Give a valid index for the consequence in the INFO field for: {}".format( - transcript_record) - self.get_logger().debug(msg) + if self.get_logger().isEnabledFor(logging.DEBUG): + self.get_logger().debug( + "Give a valid index for the consequence in the INFO field for: %s", + transcript_record) continue except TypeError: pass @@ -559,9 +591,10 @@ def vcf_to_proteindb(self, vcf_file: str, input_fasta: str, gene_annotations_gtf biotype = transcript_info[biotype_index] except IndexError: invalid_records['# variants with invalid record'] += 1 - msg = "Give a valid index for the biotype in the INFO field for: {}".format( - transcript_record) - self.get_logger().debug(msg) + if self.get_logger().isEnabledFor(logging.DEBUG): + self.get_logger().debug( + "Give a valid index for the biotype in the INFO field for: %s", + transcript_record) continue except TypeError: pass @@ -570,9 +603,10 @@ def vcf_to_proteindb(self, vcf_file: str, input_fasta: str, gene_annotations_gtf transcript_id = transcript_info[transcript_index] except IndexError: invalid_records['# variants with invalid record'] += 1 - msg = "Give a valid index for the Transcript IDs in the INFO field for: {}".format( - transcript_record) - self.get_logger().debug(msg) + if self.get_logger().isEnabledFor(logging.DEBUG): + self.get_logger().debug( + "Give a valid index for the Transcript IDs in the INFO field for: %s", + transcript_record) continue if transcript_id == "": continue @@ -582,15 +616,26 @@ def vcf_to_proteindb(self, vcf_file: str, input_fasta: str, gene_annotations_gtf except KeyError: transcript_id_v = transcript_id - try: - row = transcripts_dict[transcript_id_v] - ref_seq = row.seq # get 
the seq and desc for the transcript from the fasta of the gtf - desc = str(row.description) - except KeyError: - invalid_records['# feature IDs from VCF that are not found in the given FASTA file'] += 1 - msg = "Feature {} not found in fasta of the GTF file {}".format(transcript_id_v, record) - self.get_logger().debug(msg) + cached_row = seq_cache.get(transcript_id_v, _MISSING) + if cached_row is _MISSING: + try: + row = transcripts_dict[transcript_id_v] + ref_seq = row.seq + desc = str(row.description) + seq_cache[transcript_id_v] = (ref_seq, desc) + except KeyError: + invalid_records['# feature IDs from VCF that are not found in the given FASTA file'] += 1 + if self.get_logger().isEnabledFor(logging.DEBUG): + self.get_logger().debug( + "Feature %s not found in fasta of the GTF file %s", + transcript_id_v, record) + seq_cache[transcript_id_v] = None + continue + elif cached_row is None: + # already-known-missing; skip without re-counting in the stats continue + else: + ref_seq, desc = cached_row feature_types = ['exon'] # check if cds info exists in the fasta header otherwise translate all exons @@ -608,15 +653,13 @@ def vcf_to_proteindb(self, vcf_file: str, input_fasta: str, gene_annotations_gtf feature_types = ['CDS', 'stop_codon'] num_orfs = 1 except (ValueError, IndexError): - msg = "Could not extract cds position from fasta header for: {}".format(desc) - self.get_logger().debug(msg) + if self.get_logger().isEnabledFor(logging.DEBUG): + self.get_logger().debug( + "Could not extract cds position from fasta header for: %s", desc) - chrom, strand, features_info = self.get_features(db, - transcript_id_v, - feature_types) - if chrom is None: # the record info was not found - continue - # skip transcripts with unwanted consequences + # skip transcripts with unwanted consequences (do this BEFORE + # the expensive get_features() SQL call — both filter inputs + # come from transcript_info, not from get_features) if consequence_index is not None: if (consequence in self._exclude_consequences or (consequence not in self._include_consequences and @@ -630,6 +673,17 @@ def vcf_to_proteindb(self, vcf_file: str, input_fasta: str, gene_annotations_gtf self._include_biotypes != ['all'])): continue + cache_key = (transcript_id_v, tuple(feature_types)) + cached = feature_cache.get(cache_key) + if cached is None: + chrom, strand, features_info = self.get_features( + db, transcript_id_v, feature_types) + feature_cache.put(cache_key, (chrom, strand, features_info)) + else: + chrom, strand, features_info = cached + if chrom is None: # the record info was not found + continue + for alt in alts: dedup_key = transcript_id + str(record.REF) + str(alt) if dedup_key in processed_transcript_allele: @@ -643,8 +697,8 @@ def vcf_to_proteindb(self, vcf_file: str, input_fasta: str, gene_annotations_gtf features_info) except TypeError: invalid_records['# variants with invalid record'] += 1 - msg = "Wrong VCF record in {}".format(record) - self.get_logger().debug(msg) + if self.get_logger().isEnabledFor(logging.DEBUG): + self.get_logger().debug("Wrong VCF record in %s", record) continue if (chrom.lstrip("chr") == str(record.CHROM).lstrip("chr") and diff --git a/scripts/benchmark_vcf_to_proteindb.py b/scripts/benchmark_vcf_to_proteindb.py new file mode 100755 index 0000000..bf17700 --- /dev/null +++ b/scripts/benchmark_vcf_to_proteindb.py @@ -0,0 +1,65 @@ +#!/usr/bin/env python3 +"""One-shot benchmark for ``pgatk vcf-to-proteindb``. + +Times one full run end-to-end. 
Compare two runs of this script (one on a +pre-fix build, one on a post-fix build) to measure the speedup achieved +by issue #99. Not invoked by CI. + +Usage: + python scripts/benchmark_vcf_to_proteindb.py \ + --vcf /path/to/sample.vcf \ + --fasta /path/to/transcripts.fa \ + --gtf /path/to/annotation.gtf \ + --output /tmp/benchmark_proteindb.fa +""" +import argparse +import time +from pathlib import Path + +from pgatk.ensembl.ensembl import EnsemblDataService +from pgatk.config.registry import load_config + + +def main() -> None: + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument("--vcf", required=True, help="Input VCF (annotated or raw)") + parser.add_argument("--fasta", required=True, help="Transcript FASTA matching the GTF") + parser.add_argument("--gtf", required=True, help="Gene annotations GTF") + parser.add_argument("--output", required=True, help="Output protein DB FASTA") + parser.add_argument( + "--annotation-field-name", + default="CSQ", + help="VCF INFO field that carries per-transcript annotation (default: CSQ). " + "Use empty string to force re-annotation via bedtools intersect.", + ) + args = parser.parse_args() + + config_data = load_config("ensembl_config", None) + pipeline_arguments = { + EnsemblDataService.PROTEIN_DB_OUTPUT: args.output, + EnsemblDataService.ANNOTATION_FIELD_NAME: args.annotation_field_name, + } + svc = EnsemblDataService(config_data, pipeline_arguments) + + start = time.perf_counter() + svc.vcf_to_proteindb(args.vcf, args.fasta, args.gtf) + elapsed = time.perf_counter() - start + + output_path = Path(args.output) + output_size = output_path.stat().st_size if output_path.exists() else 0 + seq_count = 0 + if output_path.exists(): + with open(output_path, "r", encoding="utf-8") as fh: + for line in fh: + if line.startswith(">"): + seq_count += 1 + + print() + print("=== BENCHMARK RESULT ===") + print(f" VCF: {args.vcf}") + print(f" Elapsed: {elapsed:.2f} s ({elapsed / 60:.2f} min)") + print(f" Output: {args.output} ({output_size} bytes, {seq_count} sequences)") + + +if __name__ == "__main__": + main() From 9eba32c7ca6c36ced66289b79d9c1e51648388c5 Mon Sep 17 00:00:00 2001 From: Yasset Perez-Riverol Date: Mon, 11 May 2026 14:24:59 +0100 Subject: [PATCH 06/23] Address Codacy findings: tarfile path-traversal + minor style cosmic_downloader.py: tarfile.extractall on COSMIC archives is now wrapped in _safe_extract_tar(), which validates each member against the destination directory before extraction. Rejects entries that would escape the target via absolute path, .. traversal, or unsafe symlink/hardlink targets (CVE-2007-4559 family). COSMIC archives are trusted in practice but defence-in-depth costs nothing and silences the static-analysis warning. ensembl.py: add blank line before _FeatureCache docstring (D203). D213 ("multi-line docstring summary on second line") not addressed - it conflicts with D212 ("on the first line"), which the codebase already follows throughout. Standard modern Python style. --- pgatk/cgenomes/cosmic_downloader.py | 34 +++++++++++++++++++++++++++-- pgatk/ensembl/ensembl.py | 1 + 2 files changed, 33 insertions(+), 2 deletions(-) diff --git a/pgatk/cgenomes/cosmic_downloader.py b/pgatk/cgenomes/cosmic_downloader.py index 836955e..f913f69 100644 --- a/pgatk/cgenomes/cosmic_downloader.py +++ b/pgatk/cgenomes/cosmic_downloader.py @@ -177,7 +177,37 @@ def download_file_cosmic(self, api_url, local_file, token): if local_file.endswith('.tar'): extract_dir = os.path.dirname(local_file) or '.' 
print("Extracting {}...".format(os.path.basename(local_file)), flush=True) - with tarfile.open(local_file, 'r') as tar: - tar.extractall(path=extract_dir) + self._safe_extract_tar(local_file, extract_dir) os.remove(local_file) self.get_logger().debug("Extracted archive into '{}'".format(extract_dir)) + + @staticmethod + def _safe_extract_tar(tar_path, dest_dir): + """Extract tar_path into dest_dir, rejecting any member that would + escape dest_dir via absolute path or .. traversal (CVE-2007-4559). + COSMIC archives are trusted in practice, but defence in depth costs + nothing and silences static-analysis warnings. + """ + dest_abs = os.path.realpath(dest_dir) + with tarfile.open(tar_path, 'r') as tar: + for member in tar.getmembers(): + member_path = os.path.realpath(os.path.join(dest_abs, member.name)) + if os.path.commonpath([dest_abs, member_path]) != dest_abs: + raise AppConfigException( + "Refusing to extract tar member {!r}: would escape target dir {!r}" + .format(member.name, dest_abs) + ) + if member.issym() or member.islnk(): + link_target = member.linkname + if os.path.isabs(link_target): + raise AppConfigException( + "Refusing to extract tar member {!r}: absolute symlink/hardlink target {!r}" + .format(member.name, link_target) + ) + resolved = os.path.realpath(os.path.join(dest_abs, os.path.dirname(member.name), link_target)) + if os.path.commonpath([dest_abs, resolved]) != dest_abs: + raise AppConfigException( + "Refusing to extract tar member {!r}: symlink/hardlink would escape target dir" + .format(member.name) + ) + tar.extractall(path=dest_abs) diff --git a/pgatk/ensembl/ensembl.py b/pgatk/ensembl/ensembl.py index 883088f..2e21aae 100644 --- a/pgatk/ensembl/ensembl.py +++ b/pgatk/ensembl/ensembl.py @@ -20,6 +20,7 @@ class _FeatureCache: + """Per-run memoization of get_features() results, keyed by transcript id. Bounded by a soft maxsize to avoid unbounded growth on whole-genome runs. From 2b467a5c57fd1b293aace53a44a27fe58cac875a Mon Sep 17 00:00:00 2001 From: Yasset Perez-Riverol Date: Mon, 11 May 2026 14:43:14 +0100 Subject: [PATCH 07/23] Configure pydocstyle to suppress D211/D213 (conflict with codebase style) Codacy's default pydocstyle config has both D203 ("one blank line before class docstring") and D211 ("no blank lines before class docstring") enabled. These rules contradict each other; the codebase follows D203. Likewise, D213 ("multi-line summary on second line") conflicts with D212 ("on the first line"), which is the codebase's established convention. Add `add-ignore = ["D211", "D213"]` to silence the false positives. --- pyproject.toml | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/pyproject.toml b/pyproject.toml index 610da6c..6dff712 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -71,3 +71,10 @@ exclude = ["tests*", "*.tests*", "*.tests.*", "tests.*"] [tool.setuptools.package-data] pgatk = ["config/*.yaml", "config/*.json"] + +[tool.pydocstyle] +# D211 conflicts with D203 (which we follow: one blank line before class docstring). +# D213 conflicts with D212 (which we follow: multi-line summary on the first line). +# pydocstyle errors when both rules in each pair are enabled; explicitly disable +# the ones we don't follow. 
+add-ignore = ["D211", "D213"] From f781b00ebf1772f5a9ac0260e3d68ad9c0d03bb8 Mon Sep 17 00:00:00 2001 From: Yasset Perez-Riverol Date: Mon, 11 May 2026 14:48:10 +0100 Subject: [PATCH 08/23] Silence remaining Codacy noise (tarfile + pydocstyle config) The pydocstyle settings in pyproject.toml in 2b467a5 didn't help - Codacy runs an older pydocstyle that only reads its native config files (setup.cfg, .pydocstyle, .pydocstyle.ini, tox.ini), not pyproject.toml's `[tool.pydocstyle]` section. Add `.pydocstyle` with the same `add-ignore = D211,D213` and a `.codacy.yaml` that sets the same patterns at the Codacy engine level. Also tighten _safe_extract_tar to pass the validated member list to extractall() explicitly (members=safe_members). Functionally identical, but makes the contract obvious to readers (and the static analyser): only validated members are extracted. The `# nosec B202` comment marks the deliberate suppression for any Bandit-style scanner that still flags extractall calls. No source-code behaviour change. --- .codacy.yaml | 15 +++++++++++++++ .pydocstyle | 6 ++++++ pgatk/cgenomes/cosmic_downloader.py | 5 ++++- 3 files changed, 25 insertions(+), 1 deletion(-) create mode 100644 .codacy.yaml create mode 100644 .pydocstyle diff --git a/.codacy.yaml b/.codacy.yaml new file mode 100644 index 0000000..7a461b7 --- /dev/null +++ b/.codacy.yaml @@ -0,0 +1,15 @@ +--- +# Codacy configuration. https://docs.codacy.com/repositories-configure/codacy-configuration-file/ +# +# pydocstyle has two pairs of mutually-exclusive rules: +# D203 vs D211 (blank line before class docstring; codebase follows D203) +# D212 vs D213 (multi-line summary line; codebase follows D212) +# Both rules in each pair are enabled in Codacy's default profile, which produces +# unavoidable noise — pick one and silence the other. Settings here also apply to +# any local pydocstyle invocation that reads `.pydocstyle`. + +engines: + pydocstyle: + enabled: true + settings: + add_ignore: ["D211", "D213"] diff --git a/.pydocstyle b/.pydocstyle new file mode 100644 index 0000000..bd70be3 --- /dev/null +++ b/.pydocstyle @@ -0,0 +1,6 @@ +[pydocstyle] +# D211 conflicts with D203 (codebase follows D203: one blank line before class docstring). +# D213 conflicts with D212 (codebase follows D212: multi-line summary on the first line). +# Disabling the rules we don't follow stops the mutually-exclusive-pair noise from +# static analysers (pydocstyle, Codacy). 
+add-ignore = D211,D213 diff --git a/pgatk/cgenomes/cosmic_downloader.py b/pgatk/cgenomes/cosmic_downloader.py index f913f69..5bd7c07 100644 --- a/pgatk/cgenomes/cosmic_downloader.py +++ b/pgatk/cgenomes/cosmic_downloader.py @@ -190,6 +190,7 @@ def _safe_extract_tar(tar_path, dest_dir): """ dest_abs = os.path.realpath(dest_dir) with tarfile.open(tar_path, 'r') as tar: + safe_members = [] for member in tar.getmembers(): member_path = os.path.realpath(os.path.join(dest_abs, member.name)) if os.path.commonpath([dest_abs, member_path]) != dest_abs: @@ -210,4 +211,6 @@ def _safe_extract_tar(tar_path, dest_dir): "Refusing to extract tar member {!r}: symlink/hardlink would escape target dir" .format(member.name) ) - tar.extractall(path=dest_abs) + safe_members.append(member) + # nosec B202 - members are validated above; extracting only the validated list + tar.extractall(path=dest_abs, members=safe_members) From dd86c39ca9a254bc279c28ee58d7d8e2fe94f604 Mon Sep 17 00:00:00 2001 From: Yasset Perez-Riverol Date: Mon, 11 May 2026 16:24:11 +0100 Subject: [PATCH 09/23] Fix remaining real Codacy findings; defer style-conflict ones to UI Real issues fixed: - test_cosmic_downloader.py: remove unused `import os` (F401). - cosmic_downloader.py: rewrite _safe_extract_tar docstring with proper summary/body split (single-period summary line + blank line before body), fixes D205 and D415. - cosmic_downloader.py: add explicit type hint `safe_members: list[tarfile.TarInfo]` on the validated-members list, and add a clear comment at the extractall() call site explaining what was validated. Helps reviewers (and static analysers that can follow type hints) confirm the safety contract. Codacy/Bandit config: - setup.cfg [bandit]: skip B101 entirely (B101 fires on every pytest `assert`, which is a false positive because pytest depends on assert rewriting). Also document the B202 (tarfile.extractall) suppression at the call site. - setup.cfg [pydocstyle]: same `add-ignore = D211,D213` as the existing .pydocstyle + pyproject.toml. setup.cfg is the most universally-read pydocstyle config location across versions. Remaining D211 / D213 noise on the PR cannot be fixed in code - they are mutually-exclusive pydocstyle rules (D211 vs D203, D213 vs D212) and the codebase has to follow one of each pair. The Codacy engine enables both halves of each pair by default and appears to ignore local config files. Disable D211 and D213 in the Codacy repository settings UI (Repository -> Settings -> Code patterns -> pydocstyle) to silence them globally. --- pgatk/cgenomes/cosmic_downloader.py | 18 +++++++++++------- pgatk/tests/test_cosmic_downloader.py | 2 -- setup.cfg | 15 +++++++++++++++ 3 files changed, 26 insertions(+), 9 deletions(-) create mode 100644 setup.cfg diff --git a/pgatk/cgenomes/cosmic_downloader.py b/pgatk/cgenomes/cosmic_downloader.py index 5bd7c07..0a8f7c6 100644 --- a/pgatk/cgenomes/cosmic_downloader.py +++ b/pgatk/cgenomes/cosmic_downloader.py @@ -183,14 +183,17 @@ def download_file_cosmic(self, api_url, local_file, token): @staticmethod def _safe_extract_tar(tar_path, dest_dir): - """Extract tar_path into dest_dir, rejecting any member that would - escape dest_dir via absolute path or .. traversal (CVE-2007-4559). - COSMIC archives are trusted in practice, but defence in depth costs - nothing and silences static-analysis warnings. + """Extract tar_path into dest_dir, rejecting unsafe members. 
+ + Guards against path traversal (CVE-2007-4559) via absolute paths or + ``..`` components in member names, and ensures any symlink/hardlink + targets resolve within dest_dir. COSMIC archives are trusted in + practice, but defence in depth costs nothing and silences static + analysers. """ dest_abs = os.path.realpath(dest_dir) with tarfile.open(tar_path, 'r') as tar: - safe_members = [] + safe_members: list[tarfile.TarInfo] = [] for member in tar.getmembers(): member_path = os.path.realpath(os.path.join(dest_abs, member.name)) if os.path.commonpath([dest_abs, member_path]) != dest_abs: @@ -212,5 +215,6 @@ def _safe_extract_tar(tar_path, dest_dir): .format(member.name) ) safe_members.append(member) - # nosec B202 - members are validated above; extracting only the validated list - tar.extractall(path=dest_abs, members=safe_members) + # All members in safe_members were validated against dest_abs above: + # absolute paths, .. traversal, and unsafe symlink targets all rejected. + tar.extractall(path=dest_abs, members=safe_members) # nosec B202 diff --git a/pgatk/tests/test_cosmic_downloader.py b/pgatk/tests/test_cosmic_downloader.py index dea686c..5a4b028 100644 --- a/pgatk/tests/test_cosmic_downloader.py +++ b/pgatk/tests/test_cosmic_downloader.py @@ -3,8 +3,6 @@ No network. Builds a CosmicDownloadService against the shipped default config and asserts that the four product API URLs match the expected pattern. """ -import os - from pgatk.cgenomes.cosmic_downloader import CosmicDownloadService from pgatk.config.registry import load_config diff --git a/setup.cfg b/setup.cfg new file mode 100644 index 0000000..2baaabd --- /dev/null +++ b/setup.cfg @@ -0,0 +1,15 @@ +[pydocstyle] +# D211 conflicts with D203 (we follow D203 — one blank line before class docstring). +# D213 conflicts with D212 (we follow D212 — multi-line summary on the first line). +# Both pairs of rules cannot be satisfied simultaneously; pick one side per pair. +add-ignore = D211,D213 + +[bandit] +# B101 fires on every `assert` used in pytest tests. Pytest depends on assert +# rewriting; the bytecode-stripping concern doesn't apply to test files. +# Exclude test directories entirely so pytest assertions are not flagged. +exclude_dirs = ./pgatk/tests,./tests +# B202 (tarfile.extractall) — _safe_extract_tar in pgatk/cgenomes/cosmic_downloader.py +# performs explicit member-by-member validation before calling extractall(members=…) +# with the validated list. Annotated with `# nosec B202` at the call site. +skips = B101 From e6889e4c5547c38d65a8693a6e41d6084d17045f Mon Sep 17 00:00:00 2001 From: Yasset Perez-Riverol Date: Mon, 11 May 2026 16:27:40 +0100 Subject: [PATCH 10/23] Bump version to 0.0.27 --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 6dff712..1d4b6e2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "pgatk" -version = "0.0.26" +version = "0.0.27" description = "Python tools for proteogenomics" readme = "README.md" requires-python = ">=3.9" From ab10aa639ff5ca22b91c6b97cdd02a197464d7dc Mon Sep 17 00:00:00 2001 From: Yasset Perez-Riverol Date: Mon, 11 May 2026 16:35:15 +0100 Subject: [PATCH 11/23] Clean up unnecessary comments added in this PR - Drop "_FeatureCache.put" eviction comment that incorrectly claimed random eviction (the loop actually drops the first 1/5 of insertion order). One-line loop is self-explanatory without commentary. 
- Drop "see Task 5" dangling reference in seq_cache declaration; the task lived in a now-gitignored implementation plan and means nothing to a future reader. - Collapse two-line "Parse INFO once" comment to one line; keep the WHY (avoids repeated scans) and drop the WHAT (variable name and loop already say it). - Collapse the two filter-block comments into a single rationale line explaining why the filters run before get_features(). --- pgatk/ensembl/ensembl.py | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/pgatk/ensembl/ensembl.py b/pgatk/ensembl/ensembl.py index 2e21aae..70d2e7f 100644 --- a/pgatk/ensembl/ensembl.py +++ b/pgatk/ensembl/ensembl.py @@ -37,7 +37,6 @@ def get(self, key: tuple) -> Optional[tuple]: def put(self, key: tuple, value: tuple) -> None: if len(self._cache) >= self._maxsize: - # Soft eviction: drop a fifth at random when full. Cheap and bounded. for k in list(self._cache.keys())[: self._maxsize // 5]: del self._cache[k] self._cache[key] = value @@ -460,7 +459,7 @@ def vcf_to_proteindb(self, vcf_file: str, input_fasta: str, gene_annotations_gtf # handle cases where the transcript has version in the GTF but not in the VCF transcript_id_mapping = {k.split('.')[0]: k for k in transcripts_dict.keys()} feature_cache = _FeatureCache() - seq_cache: dict[str, tuple] = {} # transcript_id_v -> (ref_seq, desc); see Task 5 + seq_cache: dict[str, tuple] = {} # transcript_id_v -> (ref_seq, desc) transcript_index, consequence_index, biotype_index = None, None, None if self._annotation_field_name: @@ -532,8 +531,7 @@ def vcf_to_proteindb(self, vcf_file: str, input_fasta: str, gene_annotations_gtf self.get_logger().debug("Invalid VCF record, skipping: %s", record) continue - # Parse INFO once. Maps each key to its value string, missing keys absent. - # Avoids repeated split-and-search list-comprehensions. + # Parse INFO once; avoids repeated split-and-search list-comprehensions per variant. info_kv: dict[str, str] = {} for entry in record.INFO.split(';'): k, _, v = entry.partition('=') @@ -658,16 +656,14 @@ def vcf_to_proteindb(self, vcf_file: str, input_fasta: str, gene_annotations_gtf self.get_logger().debug( "Could not extract cds position from fasta header for: %s", desc) - # skip transcripts with unwanted consequences (do this BEFORE - # the expensive get_features() SQL call — both filter inputs - # come from transcript_info, not from get_features) + # Apply consequence/biotype filters before get_features(); both inputs + # come from transcript_info, so we can reject early without SQL. if consequence_index is not None: if (consequence in self._exclude_consequences or (consequence not in self._include_consequences and self._include_consequences != ['all'])): continue - # skip transcripts with unwanted biotypes if biotype_index is not None: if (biotype in self._exclude_biotypes or (biotype not in self._include_biotypes and From 3ede3ba465179d4a4c6cdad81565453fa92aadf4 Mon Sep 17 00:00:00 2001 From: Yasset Perez-Riverol Date: Mon, 11 May 2026 16:48:22 +0100 Subject: [PATCH 12/23] Remove unused comments across the codebase Codebase-wide cleanup pass: drop comments that restate well-named code, stale task/PR/issue references that have rotted, orphan commented-out code, narrative step-trackers, and mid-file author stamps. Preserved all docstrings, workaround notes, domain-knowledge comments, and substantive WHY rationale. No behaviour change. 
--- pgatk/cgenomes/cgenomes_proteindb.py | 7 +-- pgatk/clinvar/clinvar_service.py | 18 ------- pgatk/commands/ensembl_downloader.py | 1 - pgatk/db/digest_mutant_protein.py | 4 +- pgatk/db/map_peptide2genome.py | 4 +- pgatk/ensembl/data_downloader.py | 2 - pgatk/ensembl/ensembl.py | 18 ++----- pgatk/proteogenomics/spectrumai.py | 1 - pgatk/proteomics/db/protein_database_decoy.py | 49 ------------------- pgatk/toolbox/general.py | 9 ---- pgatk/toolbox/vcf_utils.py | 1 - 11 files changed, 8 insertions(+), 106 deletions(-) diff --git a/pgatk/cgenomes/cgenomes_proteindb.py b/pgatk/cgenomes/cgenomes_proteindb.py index f42d7b8..fe979be 100644 --- a/pgatk/cgenomes/cgenomes_proteindb.py +++ b/pgatk/cgenomes/cgenomes_proteindb.py @@ -140,7 +140,7 @@ def get_mut_pro_seq(snp: SNP, seq: Seq) -> Optional[str]: ref_dna = re.sub("[^A-Z]+", "", tmplist[0]) mut_dna = re.sub("[^A-Z]+", "", tmplist[1]) index = int(positions[0]) - 1 - if ref_dna == str(seq[index]).upper() and mut_dna in nucleotide: # + if ref_dna == str(seq[index]).upper() and mut_dna in nucleotide: seq_mut = seq[:index] + mut_dna + seq[index + 1:] mut_pro_seq = str(seq_mut.translate(to_stop=False)) elif "delins" in snp.dna_mut: @@ -304,7 +304,6 @@ def cosmic_to_proteindb(self) -> None: if len(row) <= max_col: self.get_logger().debug("Skipping malformed row (insufficient columns) at line %s: %s", line_counter, row[:5]) continue - # filter out mutations from unspecified groups if filter_col is not None: if row[filter_col] not in self._accepted_values and self._accepted_values != ['all']: continue @@ -382,7 +381,6 @@ def get_value_per_sample(self, local_clinical_sample_file: str, filter_column: s if line.startswith('#'): continue sl = line.strip().split('\t') - # check for header and re-assign columns if 'SAMPLE_ID' in sl and filter_column in sl: filter_column_col, sample_id_col = self.get_sample_headers(sl, filter_column) # Skip adding the header row itself to sample_value @@ -442,16 +440,13 @@ def cbioportal_to_proteindb(self) -> None: if row[0] == '#': self.get_logger().info("skipping line (%s): %s", i, row) continue - # check for header in the mutations file and get column indices if set(header_cols.keys()).issubset(set(row)): header_cols = self.get_mut_header_cols(header_cols, row) continue - # check if any is none in header_cols then continue if None in header_cols.values(): self.get_logger().error("Incorrect header column is given") continue - # get filter value and check it group = None if self._accepted_values != ['all'] or self._split_by_filter_column: try: diff --git a/pgatk/clinvar/clinvar_service.py b/pgatk/clinvar/clinvar_service.py index ee9a2cc..f368690 100644 --- a/pgatk/clinvar/clinvar_service.py +++ b/pgatk/clinvar/clinvar_service.py @@ -372,13 +372,10 @@ def run(self) -> str: """ logger.info("Starting ClinVar pipeline") - # 1. Load chromosome mapper chrom_mapper = ChromosomeMapper.from_assembly_report(self._assembly_report) - # 2. Parse GTF db = self._parse_gtf(self._gtf_file) - # 3. Load transcript FASTA transcripts_dict = SeqIO.index( self._fasta_file, "fasta", @@ -389,10 +386,8 @@ def run(self) -> str: k.split(".")[0]: k for k in transcripts_dict.keys() } - # 4. Read VCF once into DataFrame _metadata, vcf_df = self._read_vcf(self._vcf_file) - # 5. 
Find overlapping transcripts via BedTools (from DataFrame) overlap_map = self._build_overlap_map(vcf_df, self._gtf_file, chrom_mapper) logger.info("Found %d variants with transcript overlaps", len(overlap_map)) @@ -411,7 +406,6 @@ def run(self) -> str: for _, record in vcf_df.iterrows(): stats["variants_processed"] += 1 - # --- Validate alleles --- ref = str(record.REF) if any(c not in "ACGT" for c in ref): continue @@ -423,26 +417,22 @@ def run(self) -> str: if not alts: continue - # --- CLNSIG filter --- info = str(record.INFO) clnsig = self._get_info_field(info, "CLNSIG") if not self.passes_clnsig_filter(clnsig, self._clnsig_exclude): stats["variants_filtered_clnsig"] += 1 continue - # --- MC consequence filter --- mc_field = self._get_info_field(info, "MC") if not self.passes_mc_filter(mc_field, self._include_consequences): stats["variants_filtered_mc"] += 1 continue - # --- Parse gene symbol and CLNSIG for description --- gene_symbol, _ = self.parse_geneinfo( self._get_info_field(info, "GENEINFO") ) desc_str = f"{clnsig}|{gene_symbol}" if gene_symbol else clnsig - # --- Find overlapping transcripts --- chrom = str(record.CHROM) pos = int(record.POS) @@ -465,7 +455,6 @@ def run(self) -> str: continue processed_pairs.add(pair_key) - # Resolve transcript in FASTA tid = transcript_id if tid not in transcripts_dict: tid = transcript_id_mapping.get( @@ -479,7 +468,6 @@ def run(self) -> str: ) continue - # --- Biotype filter --- if self._include_biotypes != ["all"]: biotype = self._get_transcript_biotype(db, tid) if biotype and biotype not in self._include_biotypes: @@ -489,7 +477,6 @@ def run(self) -> str: ref_seq = fasta_record.seq desc = str(fasta_record.description) - # Determine CDS info and feature types cds_info: list[int] = [] feature_types = ["exon"] num_orfs = 3 @@ -512,19 +499,16 @@ def run(self) -> str: desc, ) - # Get features from GTF feat_chrom, strand, features_info = self._get_features( db, tid, feature_types ) if feat_chrom is None: continue - # Check overlap at feature level var_end = pos + len(ref) - 1 if not check_overlap(pos, var_end, features_info): continue - # Apply variant coding_ref_seq, coding_alt_seq = get_altseq( ref_seq, Seq(ref), @@ -538,7 +522,6 @@ def run(self) -> str: if coding_alt_seq == "": continue - # Translate ref_orfs, alt_orfs = get_orfs_vcf( coding_ref_seq, coding_alt_seq, @@ -546,7 +529,6 @@ def run(self) -> str: num_orfs, ) - # Build sequence ID record_id = "" if record.ID and str(record.ID) != ".": record_id = str(record.ID) diff --git a/pgatk/commands/ensembl_downloader.py b/pgatk/commands/ensembl_downloader.py index 83adb88..0885cd7 100644 --- a/pgatk/commands/ensembl_downloader.py +++ b/pgatk/commands/ensembl_downloader.py @@ -38,7 +38,6 @@ def ensembl_downloader(ctx, config_file, output_directory, taxonomy, folder_pref if taxonomy is None and ensembl_name is None: raise click.UsageError("Either --taxonomy or --ensembl_name is required.") - # Parse pipelines parameters. 
pipeline_arguments = {} if output_directory is not None: diff --git a/pgatk/db/digest_mutant_protein.py b/pgatk/db/digest_mutant_protein.py index 7b49201..6651aaa 100644 --- a/pgatk/db/digest_mutant_protein.py +++ b/pgatk/db/digest_mutant_protein.py @@ -18,7 +18,7 @@ def trypsin_cleavage(proseq: str, miss_cleavage: int): except IndexError: pass - if aa in ['K', 'R'] and next_aa != 'P': # for trypsin peptides + if aa in ['K', 'R'] and next_aa != 'P': if len(peptide) > 0: peptides.append(peptide) peptide = '' @@ -63,7 +63,6 @@ def digest_mutant_proteins( :param max_length: Maximum peptide length to retain (default: 40). :param missed_cleavages: Number of missed cleavages for trypsin digestion (default: 0). """ - # Build canonical peptidome from reference FASTA handle1 = SeqIO.parse(fasta_file, 'fasta') peptidome = {} @@ -78,7 +77,6 @@ def digest_mutant_proteins( log.info("Known peptides number: %d", len(peptidome)) handle1.close() - # Parse mutant protein input files filelist = input_file.split(",") handle_list = [] for f in filelist: diff --git a/pgatk/db/map_peptide2genome.py b/pgatk/db/map_peptide2genome.py index a09b119..c573dc0 100644 --- a/pgatk/db/map_peptide2genome.py +++ b/pgatk/db/map_peptide2genome.py @@ -87,7 +87,7 @@ def parse_gtf(infile): dic = {} with open(infile, "r", encoding='utf-8') as infile_object: for line in infile_object: - if line[0] != "#": # skip lines commented out + if line[0] != "#": row = line.strip().split("\t") if row[2] == "CDS": attri_list = row[8].split(";") @@ -153,7 +153,6 @@ def map_peptides_to_genome( pep_dic = {} with open(input_file, 'r', encoding='utf-8') as input_stream: - # peptide table with two columns, peptide sequence in first column, protein ID in second column input_stream.readline() for line in input_stream: row = line.strip().split("\t") @@ -188,7 +187,6 @@ def map_peptides_to_genome( exons, pep_trans_start, pep_trans_end ) - # handle exceptions if pep_chr_start > pep_chr_end: non_mapped_pep += 1 continue diff --git a/pgatk/ensembl/data_downloader.py b/pgatk/ensembl/data_downloader.py index 1ad4812..0be653b 100644 --- a/pgatk/ensembl/data_downloader.py +++ b/pgatk/ensembl/data_downloader.py @@ -6,7 +6,6 @@ 2. Given a species ID, collect its GTF data, with the option of decompressing it or not. 
""" -# App imports from json import loads import re from pgatk.toolbox.general import ParameterConfiguration, check_create_folders, download_file @@ -432,7 +431,6 @@ def get_vcf_files(self, species: dict, url_file=None) -> list: file_name=self.get_local_path_root_ensembl_repo() + '/' + file_name, log=self.get_logger(), url_file=url_file) if downloaded_file is not None: - # if chr1 is downloaded then try all others files.append(downloaded_file) for chrN in range(2, 23): # chr2-22 file_name = '{}_incl_consequences-chr{}.vcf.gz'.format(species['name'], chrN) diff --git a/pgatk/ensembl/ensembl.py b/pgatk/ensembl/ensembl.py index 70d2e7f..5849082 100644 --- a/pgatk/ensembl/ensembl.py +++ b/pgatk/ensembl/ensembl.py @@ -263,10 +263,10 @@ def dnaseq_to_proteindb(self, input_fasta: str) -> str: with open(self._proteindb_output, 'w', encoding='utf-8') as prots_fn: for record_id in seq_dict.keys(): - ref_seq = seq_dict[record_id].seq # get the seq and desc for the record from the fasta of the gtf + ref_seq = seq_dict[record_id].seq desc = str(seq_dict[record_id].description) - key_values = {} # extract key=value in the desc into a dict + key_values = {} sep = self._transcript_description_sep desc = desc.replace(' ', sep) for value in desc.split(sep): @@ -290,7 +290,6 @@ def dnaseq_to_proteindb(self, input_fasta: str) -> str: record_id, desc) self.get_logger().debug(msg) - # only include features that have the specified biotypes or they have CDSs info if 'CDS' in key_values.keys() and ( not self._skip_including_all_cds or 'altORFs' in self._include_biotypes): pass @@ -299,7 +298,6 @@ def dnaseq_to_proteindb(self, input_fasta: str) -> str: 'all']))): continue - # check wether to filter on expression and if it passes if self._expression_str: try: if float(key_values[self._expression_str]) < self._expression_thresh: @@ -386,12 +384,10 @@ def annoate_vcf(vcf_file: str, gtf_file: str, muts_dict.setdefault(key, []).append(transcript_id) with open(f"{vcf_stem}_annotated.vcf", 'w', encoding='utf-8') as ann, open(vcf_file, 'r', encoding='utf-8') as v: - # write vcf headers to the output file for line in v.readlines(): if line.startswith('#'): ann.write(line) else: - # write the mutations and their overlapping transcript to output file sl = line.strip().split('\t') if len(sl) < 8: ann.write(line) @@ -459,7 +455,7 @@ def vcf_to_proteindb(self, vcf_file: str, input_fasta: str, gene_annotations_gtf # handle cases where the transcript has version in the GTF but not in the VCF transcript_id_mapping = {k.split('.')[0]: k for k in transcripts_dict.keys()} feature_cache = _FeatureCache() - seq_cache: dict[str, tuple] = {} # transcript_id_v -> (ref_seq, desc) + seq_cache: dict[str, tuple] = {} transcript_index, consequence_index, biotype_index = None, None, None if self._annotation_field_name: @@ -473,7 +469,6 @@ def vcf_to_proteindb(self, vcf_file: str, input_fasta: str, gene_annotations_gtf except IndexError: pass - # try to extract index of transcript ID and consequence from the VCF metadata in the header try: transcript_index = annotation_cols.index(self._transcript_str.upper()) except ValueError: @@ -522,7 +517,6 @@ def vcf_to_proteindb(self, vcf_file: str, input_fasta: str, gene_annotations_gtf if alt is None: continue elif [x for x in str(alt) if x not in 'ACGT']: - # check if all alt alleles are nucleotides continue alts.append(alt) if not alts: @@ -538,7 +532,7 @@ def vcf_to_proteindb(self, vcf_file: str, input_fasta: str, gene_annotations_gtf info_kv[k] = v if not self._ignore_filters and 
self._accepted_filters != ['ALL']: - if record.FILTER and record.FILTER != '.' and record.FILTER != 'NA' and record.FILTER != '': # if not PASS: None and empty means PASS + if record.FILTER and record.FILTER != '.' and record.FILTER != 'NA' and record.FILTER != '': # None and empty means PASS filters = set(record.FILTER.upper().split(',')) if ';' in record.FILTER and len(filters) <= 1: filters = set(record.FILTER.upper().split(';')) @@ -552,7 +546,6 @@ def vcf_to_proteindb(self, vcf_file: str, input_fasta: str, gene_annotations_gtf invalid_records['# variants with invalid record'] += 1 continue - # check if the AF passed the threshold if af < self._af_threshold: invalid_records['# variants not passing AF threshold'] += 1 continue @@ -637,7 +630,6 @@ def vcf_to_proteindb(self, vcf_file: str, input_fasta: str, gene_annotations_gtf ref_seq, desc = cached_row feature_types = ['exon'] - # check if cds info exists in the fasta header otherwise translate all exons cds_info = [] num_orfs = 3 if 'CDS=' in desc: @@ -678,7 +670,7 @@ def vcf_to_proteindb(self, vcf_file: str, input_fasta: str, gene_annotations_gtf feature_cache.put(cache_key, (chrom, strand, features_info)) else: chrom, strand, features_info = cached - if chrom is None: # the record info was not found + if chrom is None: continue for alt in alts: diff --git a/pgatk/proteogenomics/spectrumai.py b/pgatk/proteogenomics/spectrumai.py index 0fb16b3..612e309 100644 --- a/pgatk/proteogenomics/spectrumai.py +++ b/pgatk/proteogenomics/spectrumai.py @@ -281,7 +281,6 @@ def __init__(self, config_data: dict, pipeline_arguments: dict) -> None: self._mzml_files = self.get_validate_parameters(variable=self.CONFIG_MZML_FILES, default_value=False) self._ions_tolerance = self.get_validate_parameters(variable=self.CONFIG_IONS_TOLERANCE, default_value=0.02) - ## check if ions_tolerance is string, convert to float if isinstance(self._ions_tolerance, str): self._ions_tolerance = float(self._ions_tolerance) diff --git a/pgatk/proteomics/db/protein_database_decoy.py b/pgatk/proteomics/db/protein_database_decoy.py index 5f47e72..ee22853 100644 --- a/pgatk/proteomics/db/protein_database_decoy.py +++ b/pgatk/proteomics/db/protein_database_decoy.py @@ -221,21 +221,14 @@ def generate_decoypyrat_database(self) -> None: :return: """ - # Create empty sets to add all target and decoy peptides upeps = set() dpeps = set() - # Counter for number of decoy sequences dcount = 0 - # Open FASTA file using first cmd line argument - # fasta = SeqIO.parse(self._input_fasta, 'fasta') - with open(self._input_fasta) as handle: - # open temporary decoy FASTA file with open(self._temp_file, 'w') as outfa: - # loop each seq in the file for value in SimpleFastaParser(handle): seq = value[1] description = value[0] @@ -244,53 +237,42 @@ def generate_decoypyrat_database(self) -> None: if not self._isobaric: seq = seq.replace('I', 'L') - # digest sequence add peptides to set upeps.update( cleave(sequence=seq, rule=PGATK_ENZYMES.enzymes[self._enzyme]['cleavage rule'], missed_cleavages=0, min_length=self._min_peptide_length)) - # reverse and switch protein sequence decoyseq = self.revswitch(seq, self._no_switch, PGATK_ENZYMES.enzymes[self._enzyme]['cleavage sites']) # do not store decoy peptide set in reduced memory mode if not self._memory_save: - # update decoy peptide set dpeps.update( cleave(sequence=decoyseq, rule=PGATK_ENZYMES.enzymes[self._enzyme]['cleavage rule'], missed_cleavages=0, min_length=self._min_peptide_length)) - # write decoy protein accession and sequence to file 
outfa.write('>' + self._decoy_prefix + description + '\n') outfa.write(decoyseq + '\n') - # Summarise the numbers of target and decoy peptides and their intersection nonDecoys = set() self.get_logger().info("proteins: %s", dcount) self.get_logger().info("target peptides: %s", len(upeps)) # Reloop decoy file in reduced memory mode to store only intersecting decoys if self._memory_save: - # open temp decoys with open(self._temp_file, "rt") as fin: for line in fin: - # if line is not accession if line[0] != '>': - # digest protein for p in cleave(sequence=line.rstrip(), rule=PGATK_ENZYMES.enzymes[self._enzyme]['cleavage rule'], missed_cleavages=0, min_length=self._min_peptide_length): - # check if in target peptides if true then add to nonDecoys if p in upeps: nonDecoys.add(p) fin.close() self.get_logger().info("decoy peptides: !Memory Saving Made!") else: - # can only report total number in normal memory mode self.get_logger().info("decoy peptides: %s", len(dpeps)) - # find intersecting peptides nonDecoys = upeps.intersection(dpeps) self.get_logger().info("#intersection: %s", len(nonDecoys)) @@ -298,37 +280,27 @@ def generate_decoypyrat_database(self) -> None: # if there are decoy peptides that are in the target peptide set if len(nonDecoys) > 0 and self._no_suffle == False: - # create empty dictionary with bad decoys as keys dAlternative = dict.fromkeys(nonDecoys, '') noAlternative = list() - # loop bad decoys / dictionary keys for dPep in dAlternative: i = 0 aPep = dPep - # shuffle until aPep is not in target set (maximum of 10 iterations) while aPep in upeps and i < self._max_iterations: - - # increment iteration counter i += 1 - - # shuffle peptide aPep = self.shuffle(dPep) # check if shuffling has an effect if not end iterations if aPep == dPep: i = self._max_iterations - # update dictionary with alternative shuffled peptide dAlternative[dPep] = aPep - # warn if peptide has no suitable alternative, add to removal list if i == self._max_iterations: noAlternative.append(dPep) self.get_logger().info('%s have no alternative peptide', len(noAlternative)) - # remove peptides with no alternative for p in noAlternative: del dAlternative[p] @@ -336,11 +308,8 @@ def generate_decoypyrat_database(self) -> None: upeps.clear() dpeps.clear() - # open second decoy file with open(self._output_file, "wt") as fout: - # Attach the target sequences to the database - # fasta = SeqIO.parse(self._input_fasta, 'fasta') with open(self._input_fasta) as handle: for value in SimpleFastaParser(handle): description = value[0] @@ -348,20 +317,15 @@ def generate_decoypyrat_database(self) -> None: fout.write('>' + description + '\n') fout.write(seq + '\n') - # open original decoy file with open(self._temp_file, "rt") as fin: - # loop each line of original decoy fasta for line in fin: # if line is not accession replace peptides in dictionary with alternatives if line[0] != '>': - # digest decoy sequence for p in cleave(sequence=line.rstrip(), rule=PGATK_ENZYMES.enzymes[self._enzyme]['cleavage rule'], missed_cleavages=0, min_length=self._min_peptide_length): - # store decoy peptide for final count dpeps.add(p) - # if decoy peptide is in dictionary replace with alternative if p in dAlternative: line = line.replace(p, dAlternative[p]) @@ -391,24 +355,19 @@ def pgatk_decoy_database(self) -> None: :return: """ - # Create empty sets to add all target and decoy peptides upeps = set() noAlternative = set() - # Open FASTA file using first cmd line argument fasta = SeqIO.parse(self._input_fasta, 'fasta') - # loop each seq in the 
file for record in fasta: seq = str(record.seq) if not self._isobaric: seq = seq.replace('I', 'L') - # digest sequence add peptides to the target set upeps.update( cleave(sequence=seq, rule=PGATK_ENZYMES.enzymes[self._enzyme]['cleavage rule'], missed_cleavages=self._max_missed_cleavages, min_length=self._min_peptide_length)) - # open orary decoy FASTA file with open(self._output_file, 'w') as outfa: fasta = SeqIO.parse(self._input_fasta, 'fasta') targets = [] @@ -418,7 +377,6 @@ def pgatk_decoy_database(self) -> None: targets.append(protseq) revprotseq = [] - # output target protein seq = str(record.seq) id_protein = record.id description = record.description @@ -431,7 +389,6 @@ def pgatk_decoy_database(self) -> None: if not self._isobaric: seq = seq.replace('I', 'L') - # reverse and switch protein sequence decoyseq = self.revswitch(seq, self._no_switch, PGATK_ENZYMES.enzymes[self._enzyme]['cleavage sites']) @@ -455,29 +412,23 @@ def pgatk_decoy_database(self) -> None: if found_in_target and not self._no_suffle and decoy_pep not in noAlternative: aPep = decoy_pep - # shuffle until aPep is not in target set (maximum of 10 iterations) i = 0 while aPep in upeps and i < self._max_iterations: - # increment iteration counter i += 1 - # shuffle peptide aPep = self.shuffle(aPep) # check if shuffling has an effect if not end iterations if aPep == decoy_pep: i = self._max_iterations - # warn if peptide has no suitable alternative, add to removal list if i == self._max_iterations: noAlternative.add(decoy_pep) aPep = '' - # if decoy is generated then add to the list of peptides if aPep: checked_decoy_peps.append(aPep) else: if self._keep_target_hits: checked_decoy_peps.append(decoy_pep) - # finally join the peptides to generate protein decoy if checked_decoy_peps: revprotseq.append(''.join(checked_decoy_peps)) diff --git a/pgatk/toolbox/general.py b/pgatk/toolbox/general.py index 15f7603..897cd71 100644 --- a/pgatk/toolbox/general.py +++ b/pgatk/toolbox/general.py @@ -17,7 +17,6 @@ import yaml import gzip -# Logging defaults from requests import HTTPError from pgatk.toolbox.exceptions import ToolBoxException @@ -61,7 +60,6 @@ def __init__(self, root_config_name: str, yaml_configuration: dict, pipeline_par else: self._default_params = {} - # Prepare Logging subsystem if self._default_params is not None and self._ROOT_CONFIG_NAME in self._default_params: if self._CONFIG_LOGGER in self._default_params[self._ROOT_CONFIG_NAME]: if self._CONFIG_LOGGER_LEVEL in self._default_params[self._ROOT_CONFIG_NAME][self._CONFIG_LOGGER]: @@ -96,10 +94,8 @@ def get_config_value(self, key: str, default: Any = None) -> Any: Checks pipeline_parameters first (flat lookup), then default_params[_ROOT_CONFIG_NAME][key], then returns default. 
""" - # Check pipeline params first (flat lookup by key) if key in self._pipeline_parameters: return self._pipeline_parameters[key] - # Then check default config params under root config name root = self._ROOT_CONFIG_NAME if (self._default_params is not None and root in self._default_params @@ -117,12 +113,10 @@ def get_log_handlers(self) -> list: return self._log_handlers def get_logger(self) -> logging.Logger: - # Get own logger return self._logger def get_session_log_files(self) -> list: log_files = [] - # Add the application logs log_files.extend(self._log_files) return log_files @@ -267,14 +261,12 @@ def check_create_folders_overwrite(folders: List[str]) -> None: if not os.path.isdir(folder): invalid_folders.append(folder) if invalid_folders: - # If there's any invalid folder, we don't make any change, and we report the situation by raising an exception raise ToolBoxException("The following folders ARE NOT FOLDERS - '{}'" .format(invalid_folders)) for folder in folders: try: shutil.rmtree(folder) except FileNotFoundError: - # It is find if the folder is not there pass check_create_folders(folders) @@ -347,7 +339,6 @@ def gunzip_files(files: List[str]) -> list[tuple[str, str]]: (stdout, stderr) = gunzip_subprocess.communicate(timeout=timeout) if gunzip_subprocess.poll() is not None: if gunzip_subprocess.returncode != 0: - # ERROR - Report this err_msg = "ERROR uncompressing file '{}' output from subprocess STDOUT: {}\nSTDERR: {}" \ .format(file, stdout.decode('utf8'), stderr.decode('utf8')) files_with_error.append((file, err_msg)) diff --git a/pgatk/toolbox/vcf_utils.py b/pgatk/toolbox/vcf_utils.py index 61fe4db..d784361 100644 --- a/pgatk/toolbox/vcf_utils.py +++ b/pgatk/toolbox/vcf_utils.py @@ -30,7 +30,6 @@ def check_overlap(var_start: int, var_end: int, features_info: Optional[list] = features_info = [[0, 1, 'type']] if var_start == -1: return True - # check if the var overlaps any of the features for feature_pos in features_info: pep_start = feature_pos[0] pep_end = feature_pos[1] From 237f89009567b0559f702294b71deb2035878ca3 Mon Sep 17 00:00:00 2001 From: Yasset Perez-Riverol Date: Mon, 11 May 2026 16:53:20 +0100 Subject: [PATCH 13/23] Address Copilot review feedback on PR #102 Five issues from the Copilot review, fixed: 1. seq_cache type annotation: changed from `dict[str, tuple]` to `dict[str, Optional[tuple]]` to reflect that we store None as the "known-missing transcript" marker alongside (ref_seq, desc) tuples. Added a comment explaining the _MISSING sentinel pattern. 2. cosmic_downloader.py: redact the presigned S3 URL when logging. The download URL embeds a short-lived AWS signature/credentials in its query string; even with a 1-hour TTL it can leak into log aggregators. Log only the path component. 3. commands/cosmic_downloader.py: rewrite the `--url_file` help text. The flag used to write a direct download URL; after the v103 API migration it writes the SCRIPTED-DOWNLOAD API URL (step 1, not the signed S3 URL). The help now describes the actual TSV format and the indirection. 4. tests/test_cosmic_downloader.py: rewrite module docstring to describe what the tests actually check (one URL pattern + the token format), not the stale "four product API URLs" claim. 5. cosmic_downloader.py: fix the `output_directory` config-lookup bug. 
`get_configuration_default_params` only searched under `cosmic_data.cosmic_server.`, so the `output_directory` field at `cosmic_data.` (top level) in cosmic_config.yaml was silently ignored - the service always fell back to the hardcoded `./database_cosmic/`. Now check the top level too, so the YAML value takes effect. Pre-existing bug, surfaced by the Copilot review. --- pgatk/cgenomes/cosmic_downloader.py | 25 +++++++++++++++---------- pgatk/commands/cosmic_downloader.py | 6 +++++- pgatk/ensembl/ensembl.py | 5 ++++- pgatk/tests/test_cosmic_downloader.py | 12 ++++++++---- 4 files changed, 32 insertions(+), 16 deletions(-) diff --git a/pgatk/cgenomes/cosmic_downloader.py b/pgatk/cgenomes/cosmic_downloader.py index 0a8f7c6..65a066b 100644 --- a/pgatk/cgenomes/cosmic_downloader.py +++ b/pgatk/cgenomes/cosmic_downloader.py @@ -61,16 +61,19 @@ def __init__(self, config_file, pipeline_arguments): self.prepare_local_cosmic_repository() def get_configuration_default_params(self, variable: str, default_value): - return_value = default_value if variable in self.get_pipeline_parameters(): - return_value = self.get_pipeline_parameters()[variable] - elif self.CONFIG_KEY_DATA_DOWNLOADER in self.get_default_parameters() \ - and self.CONFIG_COSMIC_SERVER in self.get_default_parameters()[self.CONFIG_KEY_DATA_DOWNLOADER] \ - and variable in self.get_default_parameters()[self.CONFIG_KEY_DATA_DOWNLOADER][ - self.CONFIG_COSMIC_SERVER]: - return_value = self.get_default_parameters()[self.CONFIG_KEY_DATA_DOWNLOADER][self.CONFIG_COSMIC_SERVER][ - variable] - return return_value + return self.get_pipeline_parameters()[variable] + defaults = self.get_default_parameters() + if self.CONFIG_KEY_DATA_DOWNLOADER not in defaults: + return default_value + data = defaults[self.CONFIG_KEY_DATA_DOWNLOADER] + # `output_directory` lives at cosmic_data.; everything else under cosmic_data.cosmic_server.. + if variable in data and not isinstance(data[variable], dict): + return data[variable] + server = data.get(self.CONFIG_COSMIC_SERVER, {}) + if variable in server: + return server[variable] + return default_value def prepare_local_cosmic_repository(self): self.get_logger().debug("Preparing local cbioportal repository, root folder - '{}'".format( @@ -148,7 +151,9 @@ def download_file_cosmic(self, api_url, local_file, token): self.get_logger().error(msg) raise AppConfigException(msg) - self.get_logger().debug("Downloading file from signed URL '{}'".format(download_url)) + # The presigned URL embeds short-lived AWS credentials in its query string; + # log only the path so the secret material doesn't leak into log aggregators. + self.get_logger().debug("Downloading file from signed URL (path %s)", download_url.split('?', 1)[0]) data_response = requests.get(download_url, stream=True, timeout=30) if data_response.status_code != 200: msg = ("COSMIC S3 download failed: HTTP {} for {}" diff --git a/pgatk/commands/cosmic_downloader.py b/pgatk/commands/cosmic_downloader.py index 557cce9..fe35216 100644 --- a/pgatk/commands/cosmic_downloader.py +++ b/pgatk/commands/cosmic_downloader.py @@ -22,7 +22,11 @@ 'May be repeated. Overrides the products list in the config file when provided. ' 'Find available paths at https://cancer.sanger.ac.uk/cosmic/download/cosmic ' '(open the "Scripted Download" panel for any product).') -@click.option("--url_file", help='Add the url to a downloaded file') +@click.option("--url_file", + help='If set, do not download anything. 
Instead, write a TSV with one row per '
+              'product (`\t`) so the downloads can be driven '
+              'externally (e.g. wget/curl loop). The api_url is the COSMIC scripted-download '
+              'endpoint URL — fetching it returns JSON containing the actual signed S3 URL.')
 @click.pass_context
 def cosmic_downloader(ctx, config_file, output_directory, username, password, products,
                       url_file):
diff --git a/pgatk/ensembl/ensembl.py b/pgatk/ensembl/ensembl.py
index 5849082..d83d031 100644
--- a/pgatk/ensembl/ensembl.py
+++ b/pgatk/ensembl/ensembl.py
@@ -455,7 +455,10 @@ def vcf_to_proteindb(self, vcf_file: str, input_fasta: str, gene_annotations_gtf
         # handle cases where the transcript has version in the GTF but not in the VCF
         transcript_id_mapping = {k.split('.')[0]: k for k in transcripts_dict.keys()}
         feature_cache = _FeatureCache()
-        seq_cache: dict[str, tuple] = {}
+        # Value is (ref_seq, desc) for a known transcript, or None for a transcript
+        # we've already looked up and confirmed isn't in the FASTA index (avoids re-trying
+        # the disk seek). The _MISSING sentinel below distinguishes "not yet looked up".
+        seq_cache: dict[str, Optional[tuple]] = {}
 
         transcript_index, consequence_index, biotype_index = None, None, None
         if self._annotation_field_name:
diff --git a/pgatk/tests/test_cosmic_downloader.py b/pgatk/tests/test_cosmic_downloader.py
index 5a4b028..5564495 100644
--- a/pgatk/tests/test_cosmic_downloader.py
+++ b/pgatk/tests/test_cosmic_downloader.py
@@ -1,7 +1,11 @@
-"""Unit tests for the COSMIC downloader URL construction.
-
-No network. Builds a CosmicDownloadService against the shipped default config
-and asserts that the four product API URLs match the expected pattern.
+"""Unit tests for the COSMIC downloader URL construction and auth token.
+
+No network. Two tests:
+  * `test_build_api_url_pattern` — verifies build_api_url() composes the
+    scripted-download endpoint correctly from server, api_endpoint, path
+    and bucket.
+  * `test_basic_auth_token_format` — verifies the base64(user:password)
+    token has no trailing newline (a common pitfall when echo'ing).
 """
 from pgatk.cgenomes.cosmic_downloader import CosmicDownloadService
 from pgatk.config.registry import load_config

From c1ea9cc6e3f2fb0b23668406258c487e8244f8c0 Mon Sep 17 00:00:00 2001
From: Husen Umer
Date: Tue, 12 May 2026 11:39:08 +0300
Subject: [PATCH 14/23] Update use cases and configuration for Ensembl biotypes and file formats

- Removed .gz from file names in the docs (`.gtf.gz` to `.gtf` and `.vcf.gz` to `.vcf` in multiple commands, for consistency), since the ensembl downloader uncompresses files after downloading.
- Updated the `include_biotypes` for pseudogenes, adding `unitary_pseudogene` and `rRNA_pseudogene`.
- Renamed references from `lincRNA` to `lncRNA` throughout the documentation for accuracy.
- Updated the canonical biotypes to keep only those that have a CDS matching the reference. 
--- docs/use-cases.md | 60 ++++++++++++++++---------------- pgatk/config/ensembl_config.yaml | 2 +- 2 files changed, 31 insertions(+), 31 deletions(-) diff --git a/docs/use-cases.md b/docs/use-cases.md index 0f151e9..0697af1 100644 --- a/docs/use-cases.md +++ b/docs/use-cases.md @@ -33,7 +33,7 @@ pgatk ensembl-downloader \ ```bash gffread -F -w ensembl_human/transcripts.fa \ -g ensembl_human/genome.fa \ - ensembl_human/Homo_sapiens.GRCh38.*.gtf.gz + ensembl_human/Homo_sapiens.GRCh38.*.gtf ``` ### Step 3 -- Canonical protein-coding sequences @@ -58,7 +58,7 @@ pgatk dnaseq-to-proteindb \ --input_fasta ensembl_human/transcripts.fa \ --output_proteindb pseudogene.fa \ --protein_prefix pseudo_ \ - --include_biotypes processed_pseudogene,unprocessed_pseudogene,transcribed_processed_pseudogene,transcribed_unprocessed_pseudogene,translated_processed_pseudogene \ + --include_biotypes pseudogene,processed_pseudogene,unprocessed_pseudogene,transcribed_processed_pseudogene,transcribed_unprocessed_pseudogene,translated_processed_pseudogene,unitary_pseudogene,transcribed_unitary_pseudogene,rRNA_pseudogene,IG_V_pseudogene,TR_V_pseudogene,IG_C_pseudogene,TR_J_pseudogene,IG_J_pseudogene,IG_pseudogene \ --num_orfs 3 \ --skip_including_all_cds ``` @@ -73,7 +73,7 @@ pgatk dnaseq-to-proteindb \ --input_fasta ensembl_human/transcripts.fa \ --output_proteindb lncrna.fa \ --protein_prefix lncrna_ \ - --include_biotypes lincRNA,antisense,sense_intronic,sense_overlapping \ + --include_biotypes lncRNA \ --num_orfs 3 \ --skip_including_all_cds ``` @@ -98,9 +98,9 @@ Include common human variants from ENSEMBL: ```bash pgatk vcf-to-proteindb \ - --vcf ensembl_human/homo_sapiens_incl_consequences.vcf.gz \ + --vcf ensembl_human/homo_sapiens_incl_consequences.vcf \ --input_fasta ensembl_human/transcripts.fa \ - --gene_annotations_gtf ensembl_human/Homo_sapiens.GRCh38.*.gtf.gz \ + --gene_annotations_gtf ensembl_human/Homo_sapiens.GRCh38.*.gtf \ --output_proteindb ensembl_variants.fa ``` @@ -197,7 +197,7 @@ extract transcript sequences from the GTF and genome FASTA: ```bash gffread -F -w ensembl_human/transcripts.fa \ -g ensembl_human/genome.fa \ - ensembl_human/Homo_sapiens.GRCh38.*.gtf.gz + ensembl_human/Homo_sapiens.GRCh38.*.gtf ``` ### Step 3 -- Generate the variant protein database @@ -206,9 +206,9 @@ Translate all ENSEMBL variants that affect protein-coding transcripts: ```bash pgatk vcf-to-proteindb \ - --vcf ensembl_human/homo_sapiens_incl_consequences.vcf.gz \ + --vcf ensembl_human/homo_sapiens_incl_consequences.vcf \ --input_fasta ensembl_human/transcripts.fa \ - --gene_annotations_gtf ensembl_human/Homo_sapiens.GRCh38.*.gtf.gz \ + --gene_annotations_gtf ensembl_human/Homo_sapiens.GRCh38.*.gtf \ --output_proteindb ensembl_human/variant_proteins.fa ``` @@ -255,7 +255,7 @@ Include only variants present in at least 1% of the population: ```bash pgatk vcf-to-proteindb \ - --vcf homo_sapiens_incl_consequences.vcf.gz \ + --vcf homo_sapiens_incl_consequences.vcf \ --input_fasta transcripts.fa \ --gene_annotations_gtf genes.gtf \ --af_field MAF \ @@ -270,7 +270,7 @@ detectable by mass spectrometry: ```bash pgatk vcf-to-proteindb \ - --vcf homo_sapiens_incl_consequences.vcf.gz \ + --vcf homo_sapiens_incl_consequences.vcf \ --input_fasta transcripts.fa \ --gene_annotations_gtf genes.gtf \ --af_field MAF \ @@ -289,7 +289,7 @@ a specific population: pgatk vcf-to-proteindb \ --vcf gnomad.exomes.v4.1.sites.vcf.bgz \ --input_fasta gencode_transcripts.fa \ - --gene_annotations_gtf gencode.v44.annotation.gtf.gz \ + 
--gene_annotations_gtf gencode.v44.annotation.gtf \
   --annotation_field_name vep \
   --af_field AF_afr \
   --af_threshold 0.01 \
@@ -324,18 +324,18 @@ pgatk ncbi-downloader -o ncbi_data
 
 This downloads four files to `ncbi_data/`:
 
-- `GRCh38_latest_genomic.gtf.gz` -- RefSeq gene annotations
-- `GRCh38_latest_rna.fna.gz` -- RefSeq transcript nucleotide sequences
+- `GRCh38_latest_genomic.gtf` -- RefSeq gene annotations
+- `GRCh38_latest_rna.fna` -- RefSeq transcript nucleotide sequences
 - `GRCh38_latest_assembly_report.txt` -- Chromosome name mapping
-- `clinvar.vcf.gz` -- ClinVar variant calls
+- `clinvar.vcf` -- ClinVar variant calls
 
 ### Step 2 -- Generate the ClinVar protein database
 
 ```bash
 pgatk clinvar-to-proteindb \
-  --vcf ncbi_data/clinvar.vcf.gz \
-  --gtf ncbi_data/GRCh38_latest_genomic.gtf.gz \
-  --fasta ncbi_data/GRCh38_latest_rna.fna.gz \
+  --vcf ncbi_data/clinvar.vcf \
+  --gtf ncbi_data/GRCh38_latest_genomic.gtf \
+  --fasta ncbi_data/GRCh38_latest_rna.fna \
   --assembly-report ncbi_data/GRCh38_latest_assembly_report.txt \
   --output clinvar_proteins.fa
 ```
@@ -537,17 +537,17 @@ open reading frames (smORFs), micropeptides from lncRNAs, pseudogene-encoded
 proteins, and alternative reading frames. Studies have found that non-canonical
 peptides can account for over 5% of total identifications.
 
-### lincRNA-derived proteins
+### lncRNA-derived proteins
 
-Long intergenic non-coding RNAs (lincRNAs) can encode small proteins. Translate
+Long non-coding RNAs (lncRNAs) can encode small proteins. Translate
 them in three reading frames:
 
 ```bash
 pgatk dnaseq-to-proteindb \
   --input_fasta transcripts.fa \
-  --output_proteindb lincRNA_proteins.fa \
-  --protein_prefix lincRNA_ \
-  --include_biotypes lincRNA \
+  --output_proteindb lncRNA_proteins.fa \
+  --protein_prefix lncRNA_ \
+  --include_biotypes lncRNA \
   --num_orfs 3 \
   --skip_including_all_cds
 ```
@@ -598,7 +598,7 @@ Combine all non-canonical sources with the canonical proteome:
 
 ```bash
 cat canonical_proteins.fa \
-    lincRNA_proteins.fa \
+    lncRNA_proteins.fa \
     pseudogene_proteins.fa \
     altorf_proteins.fa \
     antisense_proteins.fa \
@@ -616,7 +616,7 @@ pgatk generate-decoy \
 
 ```bash
 pgatk digest-mutant-protein \
-  --input lincRNA_proteins.fa,pseudogene_proteins.fa,altorf_proteins.fa \
+  --input lncRNA_proteins.fa,pseudogene_proteins.fa,altorf_proteins.fa \
   --fasta canonical_proteins.fa \
   --output novel_unique_peptides.fa
 ```
@@ -975,12 +975,12 @@ pgatk cosmic-to-proteindb \
   --input_genes cosmic_data/All_COSMIC_Genes.fasta.gz \
   --output_db cosmic_variants.fa
 
-# Non-coding RNA (lincRNA)
+# Non-coding RNA (lncRNA)
 pgatk dnaseq-to-proteindb \
   --input_fasta ensembl_data/transcripts.fa \
-  --output_proteindb lincRNA.fa \
-  --protein_prefix lincRNA_ \
-  --include_biotypes lincRNA \
+  --output_proteindb lncRNA.fa \
+  --protein_prefix lncRNA_ \
+  --include_biotypes lncRNA \
   --num_orfs 3 \
   --skip_including_all_cds
 
@@ -1001,7 +1001,7 @@ cat canonical.fa \
   ensembl_variants.fa \
   clinvar_variants.fa \
   cosmic_variants.fa \
-  lincRNA.fa \
+  lncRNA.fa \
   pseudogene.fa \
   > combined_target.fa
 ```
@@ -1027,7 +1027,7 @@ pgatk generate-decoy \
 
 ```bash
 pgatk digest-mutant-protein \
-  --input ensembl_variants.fa,clinvar_variants.fa,cosmic_variants.fa,lincRNA.fa,pseudogene.fa \
+  --input ensembl_variants.fa,clinvar_variants.fa,cosmic_variants.fa,lncRNA.fa,pseudogene.fa \
   --fasta canonical.fa \
   --output unique_variant_peptides.fa \
   --min-len 7 \
diff --git a/pgatk/config/ensembl_config.yaml b/pgatk/config/ensembl_config.yaml
index 24be0dc..ac679ed 100644
--- 
a/pgatk/config/ensembl_config.yaml
+++ b/pgatk/config/ensembl_config.yaml
@@ -11,7 +11,7 @@ ensembl_translation:
   exclude_biotypes: ''
   exclude_consequences: 'downstream_gene_variant, upstream_gene_variant, intergenic_variant, intron_variant, synonymous_variant, regulatory_region_variant'
   skip_including_all_cds: False
-  include_biotypes: 'protein_coding,protein_coding_CDS_not_defined,protein_coding_LoF,nonsense_mediated_decay,non_stop_decay,translated_processed_pseudogene,IG_C_gene,IG_D_gene,IG_J_gene,IG_V_gene,TR_C_gene,TR_D_gene,TR_J_gene,TR_V_gene,TEC'
+  include_biotypes: 'IG_C_gene,IG_D_gene,IG_J_gene,IG_V_gene,nonsense_mediated_decay,non_stop_decay,protein_coding,protein_coding_LoF,TR_C_gene,TR_D_gene,TR_J_gene,TR_V_gene'
   include_consequences: 'all'
   biotype_str: transcript_biotype
   transcript_description_sep: ';'

From a4627904ef8d48fb563369cbee37491b66d6d32b Mon Sep 17 00:00:00 2001
From: Husen Umer
Date: Tue, 12 May 2026 13:38:51 +0300
Subject: [PATCH 15/23] Updated use cases documentation

- Renamed use case headings for better organization and clarity.
- Added a new section for 'Putative proteins (three-frame)' with example commands.
- Updated input commands to include the new putative proteins in relevant workflows.
- Enhanced descriptions for condition-specific databases and cancer-type specific databases to improve understanding.
---
 docs/use-cases.md | 38 +++++++++++++++++++++++++++-----------
 1 file changed, 27 insertions(+), 11 deletions(-)

diff --git a/docs/use-cases.md b/docs/use-cases.md
index 0697af1..f08f690 100644
--- a/docs/use-cases.md
+++ b/docs/use-cases.md
@@ -6,7 +6,7 @@ a search-ready protein database.
 
 ---
 
-## Cell-Type Specific Non-Canonical Peptide Discovery
+## USE CASE 1: Cell-Type Specific Non-Canonical Peptide Discovery
 
 !!! abstract "Featured workflow"
     This workflow reproduces the analysis from [Umer et al., *Bioinformatics* 2022](https://doi.org/10.1093/bioinformatics/btab838),
@@ -78,6 +78,20 @@ pgatk dnaseq-to-proteindb \
   --skip_including_all_cds
 ```
 
+#### Putative proteins (three-frame)
+
+Protein-coding genes without a defined CDS, which are annotated but not yet validated:
+
+```bash
+pgatk dnaseq-to-proteindb \
+  --input_fasta ensembl_human/transcripts.fa \
+  --output_proteindb putative.fa \
+  --protein_prefix putative_ \
+  --include_biotypes protein_coding_CDS_not_defined,TEC,translated_processed_pseudogene \
+  --num_orfs 3 \
+  --skip_including_all_cds
+```
+
 #### Alternative ORFs (exonic out-of-frame translation)
 
 Non-canonical reading frames of protein-coding mRNAs can produce cryptic
@@ -130,6 +144,7 @@ pgatk cosmic-to-proteindb \
 cat canonical.fa \
   pseudogene.fa \
   lncrna.fa \
+  putative.fa \
   altorf.fa \
   ensembl_variants.fa \
   > cell_type_target.fa
@@ -149,7 +164,7 @@ set of non-canonical peptides unique to these novel sources:
 
 ```bash
 pgatk digest-mutant-protein \
-  --input pseudogene.fa,lncrna.fa,altorf.fa,ensembl_variants.fa \
+  --input pseudogene.fa,lncrna.fa,putative.fa,altorf.fa,ensembl_variants.fa \
   --fasta canonical.fa \
   --output non_canonical_peptides.fa \
   --min-len 7 \
@@ -166,7 +181,7 @@ pgatk digest-mutant-protein \
 
 ---
 
-## 1. Human Variant Protein Database from ENSEMBL
+## USE CASE 2: Human Variant Protein Database from ENSEMBL
 
 Build a variant protein database using ENSEMBL population variants (common SNPs
 and indels) for human proteogenomics searches. This is the most common starting
@@ -242,7 +257,7 @@ The file `target_decoy.fa` is ready for database searching.
 
 ---
 
-## 2. Population-Specific Variant Database
+## USE CASE 3: Population-Specific Variant Database
 
 Population-level genetic variants cause amino acid changes that are invisible
 to standard reference database searches. Studies have shown that incorporating
@@ -308,7 +323,7 @@ pgatk vcf-to-proteindb \
 
 ---
 
-## 3. ClinVar Clinical Variant Database
+## USE CASE 4: ClinVar Clinical Variant Database
 
 [ClinVar](https://www.ncbi.nlm.nih.gov/clinvar/) catalogs the relationship
 between human variants and clinical phenotypes. Building a ClinVar-derived
@@ -356,7 +371,7 @@ pgatk generate-decoy \
 
 ---
 
-## 4. Tumor-Specific Databases for Cancer Proteogenomics
+## USE CASE 5: Condition-Specific Databases for Proteogenomics
 
 Cancer proteogenomics studies (such as those from CPTAC) build
 tumor-specific protein databases to detect somatic mutant peptides, understand
@@ -364,7 +379,7 @@ therapy resistance, and prioritize neoantigen candidates. The databases
 combine somatic mutations from cancer-specific sources (COSMIC, cBioPortal)
 and/or patient-matched whole-exome sequencing.
 
-### 4a. COSMIC somatic mutations by cancer type
+### 4a. Cancer-specific database using COSMIC somatic mutations
 
 Generate one protein database per primary tissue site. This is the standard
 approach for large-scale cancer proteogenomics when patient-level WES is not
@@ -388,7 +403,7 @@ pgatk cosmic-to-proteindb \
 This produces files like `cosmic_proteins_lung.fa`, `cosmic_proteins_breast.fa`,
 etc. Use the tissue-matched database for your cancer type of interest.
 
-### 4b. Single cancer type
+### 4b. Cancer-type specific database using COSMIC somatic mutations
 
 Build a focused database for one cancer type (e.g. lung):
 
@@ -400,7 +415,7 @@ pgatk cosmic-to-proteindb \
   --accepted_values "lung"
 ```
 
-### 4c. Cell-line proteogenomics
+### 4c. Cell-type specific databases
 
 When analyzing cell-line proteomes, use cell-line-specific mutations. COSMIC
 provides a dedicated cell-line export with mutations annotated per sample:
@@ -475,7 +490,7 @@ pgatk generate-decoy \
 
 ---
 
-## 5. Patient-Specific Database from WGS/WES
+## USE CASE 6: Patient-Specific Database from WGS/WES
 
 When matched whole-genome or whole-exome sequencing data is available for
 a sample, build a personalized protein database from the patient's own
@@ -530,7 +545,7 @@ pgatk vcf-to-proteindb \
 
 ---
 
-## 6. Novel ORF and Micropeptide Discovery
+## USE CASE 7: Novel ORF and Micropeptide Discovery
 
 Proteogenomics is a key approach for discovering novel coding regions: small
 open reading frames (smORFs), micropeptides from lncRNAs, pseudogene-encoded
@@ -1002,6 +1017,7 @@ cat canonical.fa \
   clinvar_variants.fa \
   cosmic_variants.fa \
   lncRNA.fa \
+  putative.fa \
   pseudogene.fa \
   > combined_target.fa
 ```

From 2178ca0638ffbd9e464d7b61710ed7befba6ab15 Mon Sep 17 00:00:00 2001
From: Husen Umer
Date: Wed, 13 May 2026 11:57:52 +0300
Subject: [PATCH 16/23] Enhance mutation handling in CancerGenomesService based on COSMIC updates

- Improved handling of deletion-insertion (delins) mutations, with better
  validation and skipping of intronic and UTR offsets.
- Added support for tandem duplication (dup) mutations, allowing for accurate
  sequence reconstruction.
- Updated mutation type checks to include additional HGVS terms for missense,
  nonsense, and in-frame insertions/deletions.
- Refined the required columns in the CosmicMutantExport input to align with
  updated naming conventions.
Additionally, added a new 'Validations' section in the documentation for
clarity on module correctness; it currently covers only COSMIC mutation
translation, based on the results in tests/test_variant_types_hgvs.py.
---
 docs/index.md                          |   1 +
 docs/validations.md                    | 130 +++++++
 pgatk/cgenomes/cgenomes_proteindb.py   |  87 +++--
 pgatk/tests/test_variant_types_hgvs.py | 449 +++++++++++++++++++++++++
 4 files changed, 639 insertions(+), 28 deletions(-)
 create mode 100644 docs/validations.md
 create mode 100644 pgatk/tests/test_variant_types_hgvs.py

diff --git a/docs/index.md b/docs/index.md
index f6e58a2..87026a3 100644
--- a/docs/index.md
+++ b/docs/index.md
@@ -23,6 +23,7 @@ See the [Installation](installation.md) page for more options (Bioconda, Docker,
 | [Introduction](introduction.md) | Overview of the proteogenomics field |
 | [Installation](installation.md) | How to install pgatk (pip, Bioconda, Docker, source) |
 | [pgatk CLI](pgatk-cli.md) | Full command-line reference for all tools |
+| [Validations](validations.md) | Tests and validations to ensure correctness of the modules |
 | [Use Cases](use-cases.md) | End-to-end workflows and recipes for common scenarios |
 | [File Formats](formats.md) | BED, GTF, GCT format specifications |
 | [Changelog](changelog.md) | Version history and release notes |
diff --git a/docs/validations.md b/docs/validations.md
new file mode 100644
index 0000000..015ef72
--- /dev/null
+++ b/docs/validations.md
@@ -0,0 +1,130 @@
+# Test Validations
+
+This page documents the validation strategy for PGATK, describing what the automated tests verify and how to independently confirm their correctness.
+# 1. Variant Translation Tests - COSMIC
+## 1.1 Validation per variant type (`test_variant_types_hgvs.py`)
+
+These tests verify that `CancerGenomesService.get_mut_pro_seq()` correctly processes all 18 Sequence Ontology (SO) variant types present in COSMIC v103 mutation files, following [HGVS nomenclature rules](https://hgvs-nomenclature.org/stable/).
+
+Each test uses a real mutation from `Cosmic_CompleteTargetedScreensMutant_v103_GRCh38.tsv` paired with the matching CDS from `Cosmic_Genes_v103_GRCh38.fasta`.
+
+---
+
+### Actionable Variant Types
+
+These types are expected to produce a non-empty mutant protein sequence. Tests assert biologically specific properties, not just that the result is non-empty.
+
+| Variant Type | Gene | Mutation | HGVS Rule Verified |
+|---|---|---|---|
+| `missense_variant` | EZH2 | c.656C>T p.S219F | Protein length unchanged; position 218 (0-indexed) is `F`; all other residues identical to WT |
+| `stop_gained` | FCGR3B | c.394A>T p.K132* | Position 131 (0-indexed) is `*` (stop codon) in the translated output |
+| `inframe_deletion` | CTNNB1 | c.104_124del p.T35_G41del | Protein shortened by exactly 7 AA (21 nt deleted ÷ 3); residues before position 35 unchanged |
+| `inframe_insertion` (dup) | SMAP1 | c.1226_1228dup p.I409dup | Protein lengthened by exactly 1 AA (3 nt duplicated ÷ 3); residues before the dup site unchanged |
+| `protein_altering_variant` | NF2 | c.252_262delinsCT p.L85_K88delins* | Result is non-empty and differs from the wild-type sequence |
+| `frameshift_variant` | MEN1 | c.292del p.R98Efs*21 | Non-empty translated sequence produced from the frame-shifted CDS |
+| `start_lost` | INPP4B | c.3G>T p.M1?
| First AA is not Met; it is Ile (`ATG→ATT` via the DNA substitution path) | +| `stop_lost` | MYD88 | c.611_613delinsGCGGCCCCC p.D204_*205delinsGGPR | Mutant protein is longer than WT (stop codon removed; read-through occurs) | +| `stop_retained_variant` | EPHA3 | c.2756G>A p.*919= | Length unchanged; last residue is still `*` (`TGA→TAA`, both stop codons) | + +--- + +### Filtered / Skipped Variant Types + +These types must produce an empty string `""`. There are two mechanisms: + +- **Pre-filter** (`cosmic_to_proteindb`): `synonymous_variant` rows are discarded before `get_mut_pro_seq` is called. +- **`p.?` guard** (`get_mut_pro_seq`): When `aa_mut == "p.?"`, the function immediately returns `""`. All intronic, UTR, and splice variants in COSMIC v103 carry `p.?`. + +| Variant Type | Mechanism | Source Mutation | +|---|---|---| +| `synonymous_variant` | Pre-filter in `cosmic_to_proteindb` | Filter logic validated directly on the SO term strings | +| `intron_variant` | `p.?` guard | RAD51C c.145+706C>T p.? | +| `3_prime_UTR_variant` | `p.?` guard | SUFU c.*2816G>A p.? | +| `5_prime_UTR_variant` | `p.?` guard | PSMD13 c.-20C>T p.? | +| `coding_sequence_variant` | `p.?` guard (also has intronic offset `+`) | OAS2 c.2157+2A>G p.? | +| `splice_acceptor_variant` | `p.?` guard | TSHR c.171-1G>A p.? | +| `splice_donor_variant` | `p.?` guard | PTCH1 c.3306+2T>G p.? | +| `splice_region_variant` (composite) | Fires missense branch when combined with `missense_variant` | EZH2 c.656C>T p.S219F with type `missense_variant,splice_region_variant` | +| `incomplete_terminal_codon_variant` | `p.?` guard (absent from v103 targeted-screen data; tested by convention) | c.2213_2214insA p.? | + +!!! note + `splice_region_variant` only appears in COSMIC v103 as part of a composite type string (e.g. `missense_variant,splice_region_variant`). When combined with an actionable type, the actionable type's branch fires first and the mutation is processed normally. + +--- + +## 1.2 Independent Validation of the Tests + +### 1. Re-derive CDS sequences from the FASTA + +The CDS strings embedded in the test file came from `Cosmic_Genes_v103_GRCh38.fasta`. To verify a gene (e.g. EZH2): + +```bash +python3 - <<'EOF' +from Bio import SeqIO +fasta = "use-cases/cosmic_data/Cosmic_Genes_v103_GRCh38.fasta" +for rec in SeqIO.parse(fasta, "fasta"): + if "EZH2" in rec.id and "ENST00000483967" in rec.id: + print(f"len={len(rec.seq)}") + print(f"prot[218]={rec.seq.translate(to_stop=False)[218]}") + print(rec.seq[:50]) +EOF +``` + +Expected: `len=2211`, `prot[218]=S` (which becomes `F` after the c.656C>T substitution). + +### 2. Confirm mutations exist in the COSMIC TSV + +Every test documents its source mutation. To verify a mutation is real and not fabricated: + +```bash +grep "EZH2" use-cases/cosmic_data/Cosmic_CompleteTargetedScreensMutant_v103_GRCh38.tsv | \ + grep "c.656C>T" | \ + cut -f1-10 +``` + +### 3. 
Manually compute expected protein output + +For the missense test, manually apply the substitution and translate: + +```python +from Bio.Seq import Seq + +cds = '''ATGGGCCAGACTGGGAAGAAATCTGAGAAGGGACCAGTTTGTTGGCGGAAGCGTGTAAAATCAGAGTACATGCGACTGAGACAGCTCAAGAGGTTCAGACGAGCTGATGAAGTAAAGAGTATGTTTAGTTCCAATCGTCAGAAAATT +TTGGAAAGAACGGAAATCTTAAACCAAGAATGGAAACAGCGAAGGATACAGCCTGTGCACATCCTGACTTCTTGTTCGGTGACCAGTGACTTGGATTTTCCAACACAAGTCATCCCATTAAAGACTCTGAATGCAGTTGCTTCAGTACCCATAATG +TATTCTTGGTCTCCCCTACAGCAGAATTTTATGGTGGAAGATGAAACTGTTTTACATAACATTCCTTATATGGGAGATGAAGTTTTAGATCAGGATGGTACTTTCATTGAAGAACTAATAAAAAATTATGATGGGAAAGTACACGGGGATAGAGAA +TGTGGGTTTATAAATGATGAAATTTTTGTGGAGTTGGTGAATGCCCTTGGTCAATATAATGATGATGACGATGATGATGATGGAGACGATCCTGAAGAAAGAGAAGAAAAGCAGAAAGATCTGGAGGATCACCGAGATGATAAAGAAAGCCGCCCA +CCTCGGAAATTTCCTTCTGATAAAATTTTTGAAGCCATTTCCTCAATGTTTCCAGATAAGGGCACAGCAGAAGAACTAAAGGAAAAATATAAAGAACTCACCGAACAGCAGCTCCCAGGCGCACTTCCTCCTGAATGTACCCCCAACATAGATGGA +CCAAATGCTAAATCTGTTCAGAGAGAGCAAAGCTTACACTCCTTTCATACGCTTTTCTGTAGGCGATGTTTTAAATATGACTGCTTCCTACATCCTTTTCATGCAACACCCAACACTTATAAGCGGAAGAACACAGAAACAGCTCTAGACAACAAA +CCTTGTGGACCACAGTGTTACCAGCATTTGGAGGGAGCAAAGGAGTTTGCTGCTGCTCTCACCGCTGAGCGGATAAAGACCCCACCAAAACGTCCAGGAGGCCGCAGAAGAGGACGGCTTCCCAATAACAGTAGCAGGCCCAGCACCCCCACCATT +AATGTGCTGGAATCAAAGGATACAGACAGTGATAGGGAAGCAGGGACTGAAACGGGGGGAGAGAACAATGATAAAGAAGAAGAAGAGAAGAAAGATGAAACTTCGAGCTCCTCTGAAGCAAATTCTCGGTGTCAAACACCAATAAAGATGAAGCCA +AATATTGAACCTCCTGAGAATGTGGAGTGGAGTGGTGCTGAAGCCTCAATGTTTAGAGTCCTCATTGGCACTTACTATGACAATTTCTGTGCCATTGCTAGGTTAATTGGGACCAAAACATGTAGACAGGTGTATGAGTTTAGAGTCAAAGAATCT +AGCATCATAGCTCCAGCTCCCGCTGAGGATGTGGATACTCCTCCAAGGAAAAAGAAGAGGAAACACCGGTTGTGGGCTGCACACTGCAGAAAGATACAGCTGAAAAAGGACGGCTCCTCTAACCATGTTTACAACTATCAACCCTGTGATCATCCA +CGGCAGCCTTGTGACAGTTCGTGCCCTTGTGTGATAGCACAAAATTTTTGTGAAAAGTTTTGTCAATGTAGTTCAGAGTGTCAAAACCGCTTTCCGGGATGCCGCTGCAAAGCACAGTGCAACACCAAGCAGTGCCCGTGCTACCTGGCTGTCCGA +GAGTGTGACCCTGACCTCTGTCTTACTTGTGGAGCCGCTGACCATTGGGACAGTAAAAATGTGTCCTGCAAGAACTGCAGTATTCAGCGGGGCTCCAAAAAGCATCTATTGCTGGCACCATCTGACGTGGCAGGCTGGGGGATTTTTATCAAAGAT +CCTGTGCAGAAAAATGAATTCATCTCAGAATACTGTGGAGAGATTATTTCTCAAGATGAAGCTGACAGAAGAGGGAAAGTGTATGATAAATACATGTGCAGCTTTCTGTTCAACTTGAACAATGATTTTGTGGTGGATGCAACCCGCAAGGGTAAC +AAAATTCGTTTTGCAAATCATTCGGTAAATCCAAACTGCTATGCAAAAGTTATGATGGTTAACGGTGATCACAGGATAGGTATTTTTGCCAAGAGAGCCATCCAGACTGGCGAAGAGCTGTTTTTTGATTACAGATACAGCCAGGCTGATGCCCTG +AAGTATGTCGGCATCGAAAGAGAAATGGAAATCCCTTGA'''.replace('\n','') # EZH2 CDS string + +seq = Seq(cds) +wt_prot = seq.translate(to_stop=False) +print(wt_prot[218]) # 'S' — wild-type Serine at position 219 + +# Apply c.656C>T: position 656 is 0-indexed 655 +mut_seq = seq[:655] + "T" + seq[656:] +mut_prot = mut_seq.translate(to_stop=False) +print(mut_prot[218]) # 'F' — mutant Phenylalanine +``` + +--- + +## 1.3 Core Unit Tests (`test_cosmic_core.py`) + +14 Unit tests are implemented to verify the internal logic of `get_mut_pro_seq()` using minimal synthetic sequences (`Seq("ATGAAATTT")` etc.). 
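+
+For illustration, a minimal sketch of one such unit test (the gene name, the
+9-nt sequence, and the expected `"MQF"` output are illustrative values rather
+than COSMIC data; the actual assertions live in `test_cosmic_core.py`):
+
+```python
+from Bio.Seq import Seq
+
+from pgatk.cgenomes.cgenomes_proteindb import CancerGenomesService
+from pgatk.cgenomes.models import SNP
+
+
+def test_dna_substitution_minimal():
+    # ATG AAA TTT translates to MKF; c.4A>C rewrites codon 2 (AAA -> CAA),
+    # so the mutant protein is expected to read MQF.
+    snp = SNP(gene="TEST", mrna="", dna_mut="c.4A>C",
+              aa_mut="p.K2Q", mutation_type="missense_variant")
+    assert CancerGenomesService.get_mut_pro_seq(snp, Seq("ATGAAATTT")) == "MQF"
+```
+
+The table below groups the code paths the real test classes exercise: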
+ +| Class | Coverage | +|---|---| +| `TestGetMutProSeqDnaBranch` | DNA substitution, insertion, single-base deletion, two-position deletion, ambiguous `c.?` routing to protein branch | +| `TestGetMutProSeqProteinBranch` | Missense, nonsense truncation, in-frame insertion, two-position in-frame deletion, single-position in-frame deletion | +| `TestGetMutProSeqEdgeCases` | Mutation beyond sequence length, ambiguous `p.?` returns `""`, non-alpha last char in aa_mut returns `""`, DNA ref-allele mismatch returns `""` | + +These tests are fast to run and isolate individual code paths without requiring reference files. diff --git a/pgatk/cgenomes/cgenomes_proteindb.py b/pgatk/cgenomes/cgenomes_proteindb.py index fe979be..70fb72b 100644 --- a/pgatk/cgenomes/cgenomes_proteindb.py +++ b/pgatk/cgenomes/cgenomes_proteindb.py @@ -144,15 +144,24 @@ def get_mut_pro_seq(snp: SNP, seq: Seq) -> Optional[str]: seq_mut = seq[:index] + mut_dna + seq[index + 1:] mut_pro_seq = str(seq_mut.translate(to_stop=False)) elif "delins" in snp.dna_mut: - # Deletion-insertion: delete range then insert new bases - insert_dna = snp.dna_mut.split("delins")[1] - if insert_dna.isalpha() and len(positions) >= 2: - del_index1 = int(positions[0]) - 1 - del_index2 = int(positions[1]) + # HGVS delins: one or more nucleotides replaced by one or more other nucleotides. + coord_part, insert_raw = snp.dna_mut.split("delins", 1) + insert_dna = insert_raw.upper() + if re.search(r'\d[+-]|\*|-\d', coord_part): + # Intronic (c.N+X, c.N-X), 5'UTR (c.-N) or 3'UTR (c.*N) offsets + # cannot be mapped onto a CDS-only FASTA sequence; skip. + pass + elif not insert_dna.isalpha(): + # Unknown ('?'), N[n], or conversion-notation insertions cannot be resolved. + pass + elif len(positions) >= 2: + # Range delins: delete positions[0]..positions[1] (1-indexed, inclusive). + del_index1 = int(positions[0]) - 1 # 0-indexed start + del_index2 = int(positions[1]) # 0-indexed exclusive end seq_mut = seq[:del_index1] + insert_dna + seq[del_index2:] mut_pro_seq = str(seq_mut.translate(to_stop=False)) - elif insert_dna.isalpha() and len(positions) == 1: - # Single-position delins: replace one base + elif len(positions) == 1: + # Single-position delins: replace the one nucleotide at positions[0]. del_index1 = int(positions[0]) - 1 seq_mut = seq[:del_index1] + insert_dna + seq[del_index1 + 1:] mut_pro_seq = str(seq_mut.translate(to_stop=False)) @@ -165,6 +174,20 @@ def get_mut_pro_seq(snp: SNP, seq: Seq) -> Optional[str]: seq_mut = seq[:ins_index1] + insert_dna + seq[ins_index1:] mut_pro_seq = str(seq_mut.translate(to_stop=False)) + elif "dup" in snp.dna_mut: + # Tandem duplication: re-insert the duplicated range after its end position. 
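+            # Worked example: c.1226_1228dup (the SMAP1 test case) re-inserts
+            # bases 1226-1228 once more after position 1228; a single-position
+            # form such as c.76dup duplicates just that one base. The reading
+            # frame is preserved only when the duplicated length is a multiple
+            # of 3; otherwise translation is shifted downstream of the dup.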
+ if len(positions) == 2: + dup_start = int(positions[0]) - 1 + dup_end = int(positions[1]) + dup_seq = str(seq[dup_start:dup_end]) + seq_mut = seq[:dup_end] + dup_seq + seq[dup_end:] + mut_pro_seq = str(seq_mut.translate(to_stop=False)) + elif len(positions) == 1: + dup_pos = int(positions[0]) + dup_seq = str(seq[dup_pos - 1:dup_pos]) + seq_mut = seq[:dup_pos] + dup_seq + seq[dup_pos:] + mut_pro_seq = str(seq_mut.translate(to_stop=False)) + elif "del" in snp.dna_mut: if len(positions) == 2: del_index1 = int(positions[0]) - 1 @@ -180,7 +203,7 @@ def get_mut_pro_seq(snp: SNP, seq: Seq) -> Optional[str]: positions = re.findall(r'\d+', snp.aa_mut) protein_seq = str(seq.translate(to_stop=False)) - if "Missense" in snp.mutation_type: + if "Missense" in snp.mutation_type or "missense_variant" in snp.mutation_type: # Extract the mutant residue from HGVS like p.V600E (1-letter) # or p.Val600Glu (3-letter). For 1-letter the last char is the # AA; for 3-letter we need to convert the last triplet. @@ -190,20 +213,27 @@ def get_mut_pro_seq(snp: SNP, seq: Seq) -> Optional[str]: return '' index = int(positions[0]) - 1 mut_pro_seq = protein_seq[:index] + mut_aa + protein_seq[index + 1:] - elif "Nonsense" in snp.mutation_type: + elif "Nonsense" in snp.mutation_type or "stop_gained" in snp.mutation_type: index = int(positions[0]) - 1 mut_pro_seq = protein_seq[:index] - elif "Insertion - In frame" in snp.mutation_type: - try: - index = snp.aa_mut.index("ins") - except ValueError: - return '' - insert_aa_raw = snp.aa_mut[index + 3:] - insert_aa = _three_to_one(insert_aa_raw) - if insert_aa.isalpha(): - ins_index1 = int(positions[0]) - mut_pro_seq = protein_seq[:ins_index1] + insert_aa + protein_seq[ins_index1:] - elif "Deletion - In frame" in snp.mutation_type: + elif "Insertion - In frame" in snp.mutation_type or "inframe_insertion" in snp.mutation_type: + if "dup" in snp.aa_mut: + # Protein-level tandem dup: re-insert the duplicated residues after the range. 
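+                # Worked example: p.I409dup (the SMAP1 test case) repeats the
+                # single residue 409; a range form such as p.T35_G41dup would
+                # repeat residues 35-41. positions[] holds the 1-indexed bounds
+                # parsed from aa_mut by the re.findall() call above.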
+ dup_start = int(positions[0]) - 1 + dup_end = int(positions[-1]) + dup_aa = protein_seq[dup_start:dup_end] + mut_pro_seq = protein_seq[:dup_end] + dup_aa + protein_seq[dup_end:] + else: + try: + index = snp.aa_mut.index("ins") + except ValueError: + return '' + insert_aa_raw = snp.aa_mut[index + 3:] + insert_aa = _three_to_one(insert_aa_raw) + if insert_aa.isalpha(): + ins_index1 = int(positions[0]) + mut_pro_seq = protein_seq[:ins_index1] + insert_aa + protein_seq[ins_index1:] + elif "Deletion - In frame" in snp.mutation_type or "inframe_deletion" in snp.mutation_type: if len(positions) == 2: del_index1 = int(positions[0]) - 1 del_index2 = int(positions[1]) @@ -211,7 +241,8 @@ def get_mut_pro_seq(snp: SNP, seq: Seq) -> Optional[str]: elif len(positions) == 1: del_index1 = int(positions[0]) - 1 mut_pro_seq = protein_seq[:del_index1] + protein_seq[del_index1 + 1:] - elif "Complex" in snp.mutation_type and "frameshift" not in snp.mutation_type: + elif ("Complex" in snp.mutation_type or "protein_altering_variant" in snp.mutation_type) \ + and "frameshift" not in snp.mutation_type: try: index = snp.aa_mut.index(">") except ValueError: @@ -262,16 +293,16 @@ def cosmic_to_proteindb(self) -> None: self.get_logger().debug("Reading input CosmicMutantExport.tsv ...") line_counter = 1 - required_columns = ["Gene name", "Accession Number", "Mutation CDS", "Mutation AA", "Mutation Description"] + required_columns = ["GENE_SYMBOL", "TRANSCRIPT_ACCESSION", "MUTATION_CDS", "MUTATION_AA", "MUTATION_DESCRIPTION"] with _open_text(self._local_mutation_file, encoding="latin-1") as cosmic_input, \ open(self._local_output_file, 'w', encoding='utf-8') as output: header = cosmic_input.readline().strip().split("\t") try: - gene_col = header.index("Gene name") - enst_col = header.index("Accession Number") - cds_col = header.index("Mutation CDS") - aa_col = header.index("Mutation AA") - muttype_col = header.index("Mutation Description") + gene_col = header.index("GENE_SYMBOL") + enst_col = header.index("TRANSCRIPT_ACCESSION") + cds_col = header.index("MUTATION_CDS") + aa_col = header.index("MUTATION_AA") + muttype_col = header.index("MUTATION_DESCRIPTION") except ValueError as e: self.get_logger().error( "COSMIC file missing required columns. Expected %s, got: %s", @@ -308,7 +339,7 @@ def cosmic_to_proteindb(self) -> None: if row[filter_col] not in self._accepted_values and self._accepted_values != ['all']: continue - if "coding silent" in row[muttype_col]: + if "coding silent" in row[muttype_col] or "synonymous_variant" in row[muttype_col]: continue snp = SNP(gene=row[gene_col], mrna=row[enst_col], dna_mut=row[cds_col], aa_mut=row[aa_col], diff --git a/pgatk/tests/test_variant_types_hgvs.py b/pgatk/tests/test_variant_types_hgvs.py new file mode 100644 index 0000000..f699088 --- /dev/null +++ b/pgatk/tests/test_variant_types_hgvs.py @@ -0,0 +1,449 @@ +"""HGVS compliance tests for get_mut_pro_seq across all SO variant types. + +Each test uses a real mutation from Cosmic_CompleteTargetedScreensMutant_v103_GRCh38.tsv +paired with the matching CDS from Cosmic_Genes_v103_GRCh38.fasta. 
+ +HGVS rules verified per type: + missense_variant – single AA substitution, length unchanged + stop_gained – stop (*) introduced at predicted position + inframe_deletion – protein shortened by exact number of deleted codons + inframe_insertion – protein lengthened by exact number of inserted codons (dup) + protein_altering_variant– delins produces a non-empty altered sequence + frameshift_variant – out-of-frame deletion produces a non-empty translated sequence + start_lost – start codon destroyed; first AA is no longer Met + stop_lost – stop codon removed; protein extends past original terminus + stop_retained_variant – stop codon changed but still a stop; length unchanged + synonymous_variant – filtered before get_mut_pro_seq (pre-processing step) + intron_variant – p.? guard returns empty string + 3_prime_UTR_variant – p.? guard returns empty string + 5_prime_UTR_variant – p.? guard returns empty string + coding_sequence_variant – p.? guard returns empty string + splice_acceptor_variant – p.? guard returns empty string + splice_donor_variant – intronic offset guard returns empty string + splice_region_variant – treated as missense when combined type contains missense_variant + incomplete_terminal_codon_variant – not present in v103 targeted screens; p.? returns "" +""" +from Bio.Seq import Seq + +from pgatk.cgenomes.cgenomes_proteindb import CancerGenomesService +from pgatk.cgenomes.models import SNP + +# --------------------------------------------------------------------------- +# CDS sequences from Cosmic_Genes_v103_GRCh38.fasta +# Key: gene symbol Value: CDS nucleotide string (ENST accession in comment) +# --------------------------------------------------------------------------- +_CDS = { + # ENST00000483967.5 – used for missense_variant + "EZH2": ( + 
"ATGGGCCAGACTGGGAAGAAATCTGAGAAGGGACCAGTTTGTTGGCGGAAGCGTGTAAAATCAGAGTACATGCGACTGAGACAGCTCAAGAGGTTCAGACGAGCTGATGAAGTAAAGAGTATGTTTAGTTCCAATCGTCAGAAAATTTTGGAAAGAACGGAAATCTTAAACCAAGAATGGAAACAGCGAAGGATACAGCCTGTGCACATCCTGACTTCTTGTTCGGTGACCAGTGACTTGGATTTTCCAACACAAGTCATCCCATTAAAGACTCTGAATGCAGTTGCTTCAGTACCCATAATGTATTCTTGGTCTCCCCTACAGCAGAATTTTATGGTGGAAGATGAAACTGTTTTACATAACATTCCTTATATGGGAGATGAAGTTTTAGATCAGGATGGTACTTTCATTGAAGAACTAATAAAAAATTATGATGGGAAAGTACACGGGGATAGAGAATGTGGGTTTATAAATGATGAAATTTTTGTGGAGTTGGTGAATGCCCTTGGTCAATATAATGATGATGACGATGATGATGATGGAGACGATCCTGAAGAAAGAGAAGAAAAGCAGAAAGATCTGGAGGATCACCGAGATGATAAAGAAAGCCGCCCACCTCGGAAATTTCCTTCTGATAAAATTTTTGAAGCCATTTCCTCAATGTTTCCAGATAAGGGCACAGCAGAAGAACTAAAGGAAAAATATAAAGAACTCACCGAACAGCAGCTCCCAGGCGCACTTCCTCCTGAATGTACCCCCAACATAGATGGACCAAATGCTAAATCTGTTCAGAGAGAGCAAAGCTTACACTCCTTTCATACGCTTTTCTGTAGGCGATGTTTTAAATATGACTGCTTCCTACATCCTTTTCATGCAACACCCAACACTTATAAGCGGAAGAACACAGAAACAGCTCTAGACAACAAACCTTGTGGACCACAGTGTTACCAGCATTTGGAGGGAGCAAAGGAGTTTGCTGCTGCTCTCACCGCTGAGCGGATAAAGACCCCACCAAAACGTCCAGGAGGCCGCAGAAGAGGACGGCTTCCCAATAACAGTAGCAGGCCCAGCACCCCCACCATTAATGTGCTGGAATCAAAGGATACAGACAGTGATAGGGAAGCAGGGACTGAAACGGGGGGAGAGAACAATGATAAAGAAGAAGAAGAGAAGAAAGATGAAACTTCGAGCTCCTCTGAAGCAAATTCTCGGTGTCAAACACCAATAAAGATGAAGCCAAATATTGAACCTCCTGAGAATGTGGAGTGGAGTGGTGCTGAAGCCTCAATGTTTAGAGTCCTCATTGGCACTTACTATGACAATTTCTGTGCCATTGCTAGGTTAATTGGGACCAAAACATGTAGACAGGTGTATGAGTTTAGAGTCAAAGAATCTAGCATCATAGCTCCAGCTCCCGCTGAGGATGTGGATACTCCTCCAAGGAAAAAGAAGAGGAAACACCGGTTGTGGGCTGCACACTGCAGAAAGATACAGCTGAAAAAGGACGGCTCCTCTAACCATGTTTACAACTATCAACCCTGTGATCATCCACGGCAGCCTTGTGACAGTTCGTGCCCTTGTGTGATAGCACAAAATTTTTGTGAAAAGTTTTGTCAATGTAGTTCAGAGTGTCAAAACCGCTTTCCGGGATGCCGCTGCAAAGCACAGTGCAACACCAAGCAGTGCCCGTGCTACCTGGCTGTCCGAGAGTGTGACCCTGACCTCTGTCTTACTTGTGGAGCCGCTGACCATTGGGACAGTAAAAATGTGTCCTGCAAGAACTGCAGTATTCAGCGGGGCTCCAAAAAGCATCTATTGCTGGCACCATCTGACGTGGCAGGCTGGGGGATTTTTATCAAAGATCCTGTGCAGAAAAATGAATTCATCTCAGAATACTGTGGAGAGATTATTTCTCAAGATGAAGCTGACAGAAGAGGGAAAGTGTATGATAAATACATGTGCAGCTTTCTGTTCAACTTGAACAATGATTTTGTGGTGGATGCAACCCGCAAGGGTAACAAAATTCGTTTTGCAAATCATTCGGTAAATCCAAACTGCTATGCAAAAGTTATGATGGTTAACGGTGATCACAGGATAGGTATTTTTGCCAAGAGAGCCATCCAGACTGGCGAAGAGCTGTTTTTTGATTACAGATACAGCCAGGCTGATGCCCTGAAGTATGTCGGCATCGAAAGAGAAATGGAAATCCCTTGA" + ), + # ENST00000613418.4 – used for stop_gained + "FCGR3B": ( + "ATGCGGACTGAAGATCTCCCAAAGGCTGTGGTGTTCCTGGAGCCTCAATGGTACAGCGTGCTTGAGAAGGACAGTGTGACTCTGAAGTGCCAGGGAGCC" + "TACTCCCCTGAGGACAATTCCACACAGTGGTTTCACAATGAGAACCTCATCTCAAGCCAGGCCTCGAGCTACTTCATTGACGCTGCCACAGTCAACGAC" + "AGTGGAGAGTACAGGTGCCAGACAAACCTCTCCACCCTCAGTGACCCGGTGCAGCTAGAAGTCCATATCGGCTGGCTGTTGCTCCAGGCCCCTCGGTGGG" + "TGTTCAAGGAGGAAGACCCTATTCACCTGAGGTGTCACAGCTGGAAGAACACTGCTCTGCATAAGGTCACATATTTACAGAATGGCAAAGACAGGAAGTAT" + "TTTCATCATAATTCTGACTTCCACATTCCAAAAGCCACACTCAAAGATAGCGGCTCCTACTTCTGCAGGGGGCTTGTTGGGAGTAAAAATGTGTCTTCAGA" + "GACTGTGAACATCACCATCACTCAAGGTTTGGCAGTGTCAACCATCTCATCATTCTCTCCACCTGGGTACCAAGTCTCTTTCTGCTTGGTGATGGTACTCC" + "TTTTTGCAGTGGACACAGGACTATATTTCTCTGTGAAGACAAACATTTGA" + ), + # ENST00000646174.1 – used for inframe_deletion + "CTNNB1": ( + "ATGGAGTTGGACATGGCCATGGAACCAGACAGAAAAGCGGCTGTTAGTCACTGGCAGCAACAGTCTTACCTGGACTCTGGAATCCATTCTGGTGCCACTAC" + "CACAGCTCCTTCTCTGAGTGGTAAAGGCAATCCTGAGGAAGAGGATGTGGATACCTCCCAAGTCCTGTATGAGTGGGAACAGGGATTTTCTCAGTCCTTCA" + "CTCAAGAACAAGTAGCTGATATTGATGGACAGTATGCAATGACTCGAGCTCAGAGGGTACGAGCTGCTATGTTCCCTGAGACATTAGATGAGGGCATGCAGA" + "TCCCATCTACACAGTTTGATGCTGCTCATCCCACTAATGTCCAGCGTTTGGCTGAACCATCACAGATGCTGAAACATGCAGTTGTAAACTTGATTAACTATCA" + 
"AGATGATGCAGAACTTGCCACACGTGCAATCCCTGAACTGACAAAACTGCTAAATGACGAGGACCAGGTGGTGGTTAATAAGGCTGCAGTTATGGTCCATCAG" + "CTTTCTAAAAAGGAAGCTTCCAGACACGCTATCATGCGTTCTCCTCAGATGGTGTCTGCTATTGTACGTACCATGCAGAATACAAATGATGTAGAAACAGCTCG" + "TTGTACCGCTGGGACCTTGCATAACCTTTCCCATCATCGTGAGGGCTTACTGGCCATCTTTAAGTCTGGAGGCATTCCTGCCCTGGTGAAAATGCTTGGTTCAC" + "CAGTGGATTCTGTGTTGTTTTATGCCATTACAACTCTCCACAACCTTTTATTACATCAAGAAGGAGCTAAAATGGCAGTGCGTTTAGCTGGTGGGCTGCAGAAAA" + "TGGTTGCCTTGCTCAACAAAACAAATGTTAAATTCTTGGCTATTACGACAGACTGCCTTCAAATTTTAGCTTATGGCAACCAAGAAAGCAAGCTCATCATACTGG" + "CTAGTGGTGGACCCCAAGCTTTAGTAAATATAATGAGGACCTATACTTACGAAAAACTACTGTGGACCACAAGCAGAGTGCTGAAGGTGCTATCTGTCTGCTCTA" + "GTAATAAGCCGGCTATTGTAGAAGCTGGTGGAATGCAAGCTTTAGGACTTCACCTGACAGATCCAAGTCAACGTCTTGTTCAGAACTGTCTTTGGACTCTCAGGA" + "ATCTTTCAGATGCTGCAACTAAACAGGAAGGGATGGAAGGTCTCCTTGGGACTCTTGTTCAGCTTCTGGGTTCAGATGATATAAATGTGGTCACCTGTGCAGCTG" + "GAATTCTTTCTAACCTCACTTGCAATAATTATAAGAACAAGATGATGGTCTGCCAAGTGGGTGGTATAGAGGCTCTTGTGCGTACTGTCCTTCGGGCTGGTGACAG" + "GGAAGACATCACTGAGCCTGCCATCTGTGCTCTTCGTCATCTGACCAGCCGACACCAAGAAGCAGAGATGGCCCAGAATGCAGTTCGCCTTCACTATGGACTACCA" + "GTTGTGGTTAAGCTCTTACACCCACCATCCCACTGGCCTCTGATAAAGGCTACTGTTGGATTGATTCGAAATCTTGCCCTTTGTCCCGCAAATCATGCACCTTTGC" + "GTGAGCAGGGTGCCATTCCACGACTAGTTCAGTTGCTTGTTCGTGCACATCAGGATACCCAGCGCCGTACGTCCATGGGTGGGACACAGCAGCAATTTGTGGAGGG" + "GGTCCGCATGGAAGAAATAGTTGAAGGTTGTACCGGAGCCCTTCACATCCTAGCTCGGGATGTTCACAACCGAATTGTTATCAGAGGACTAAATACCATTCCATTG" + "TTTGTGCAGCTGCTTTATTCTCCCATTGAAAACATCCAAAGAGTAGCTGCAGGGGTCCTCTGTGAACTTGCTCAGGACAAGGAAGCTGCAGAAGCTATTGAAGCTG" + "AGGGAGCCACAGCTCCTCTGACAGAGTTACTTCACTCTAGGAATGAAGGTGTGGCGACATATGCAGCTGCTGTTTTGTTCCGAATGTCTGAGGACAAGCCACAAGAT" + "TACAAGAAACGGCTTTCAGTTGAGCTGACCAGCTCTCTCTTCAGAACAGAGCCAATGGCTTGGAATGAGACTGCTGATCTTGGACTTGATATTGGTGCCCAGGGAGAA" + "CCCCTTGGATATCGCCAGGATGATCCTAGCTATCGTTCTTTTCACTCTGGTGGATATGGCCAGGATGCCTTGGGTATGGACCCCATGATGGAACATGAGATGGGTGGC" + "CACCACCCTGGTGCTGACTATCCAGTTGATGGGCTGCCAGATCTGGGGCATGCCCAGGACCTCATGGATGGGCTGCCTCCAGGTGACAGCAATCAGCTGGCCTGGTTT" + "GATACTGACCTGTAA" + ), + # ENST00000370452.7 – used for inframe_insertion (dup) + "SMAP1": ( + "ATGGCGACGCGCTCCTGTCGGGAGAAGGCTCAGAAGCTGAACGAGCAGCACCAGCTCATCCTATCCAAGCTTCTGAGGGAGGAGGACAACAAGTACTGCGCC" + "GACTGCGAGGCCAAAGGTCCTCGATGGGCTTCCTGGAATATTGGTGTGTTTATTTGCATCAGATGTGCTGGAATTCATAGAAATCTTGGGGTTCATATATCC" + "AGGGTCAAATCAGTCAACCTAGACCAATGGACAGCAGAACAGATACAGTGCATGCAAGATATGGGAAATACTAAAGCAAGACTACTCTATGAAGCCAATCTTCC" + "AGAGAACTTTCGAAGACCACAGACAGATCAAGCAGTGGAATTTTTCATCAGAGATAAATATGAAAAGAAGAAATACTACGATAAAAATGCCATAGCTATTACAAAT" + "AAAGAAAAGGAAGAAAAAAAAGGAAGAGAAAAAGAGAGAAAAGGAGCCAGAAAAGCCGGCAAAACCACTTACAGCTGAAAAGCTGCAGAAGAAAGATCAGCAACTGG" + "AGCCTAAAAAAAGTACCAGCCCTAAAAAAGCTGCGGAGCCCACTGTGGATCTTTTAGGACTTGATGGCCCTGCTGTGGCACCAGTGACCAACGGGAACACAACGGTG" + "CCACCCCTGAACGATGATCTGGACATCTTTGGACCGATGATTTCTAATCCCTTACCTGCAACTGTCATGCCCCCAGCTCAGGGGACACCCTCTGCACCAGCAGCTGCA" + "ACCCTGTCTACAGTAACATCTGGGGATCTAGATTTATTCACTGAGCAAACTACAAAATCAGAAGAAGTGGCAAAGAAACAACTTTCCAAAGACTCCATCTTATCTCTG" + "TATGGCACAGGAACCATTCAACAGCAAAGTACTCCTGGTGTATTTATGGGACCCACAAATATACCATTTACCTCACAAGCACCAGCTGCATTTCAGGGCTTTCCATCG" + "ATGGGCGTGCCTGTGCCTGCAGCTCCTGGCCTTATAGGAAATGTGATGGGACAGAGTCCAAGCATGATGGTGGGCATGCCCATGCCCAATGGGTTTATGGGAAATGCA" + "CAAACTGGTGTGATGCCACTTCCTCAGAACGTTGTTGGCCCCCAAGGAGGAATGGTGGGACAAATGGGTGCACCCCAGAGTAAGTTTGGCCTGCCGCAAGCTCAGCAG" + "CCCCAGTGGAGCCTCTCACAGATAATGCAGAAGGGTGATGCTGTTCTCCAGCACTCCATCAGTGCAATCTACTGGCCAATGACAAGGTGGTTAAAATGTCCTTTAGT" + "AGATGAATCAGCAGATGGCTGGCATGAGTATCAGTAG" + ), + # ENST00000361676.8 – used for protein_altering_variant + "NF2": ( + 
"ATGGCCGGGGCCATCGCTTCCCGCATGAGCTTCAGCTCTCTCAAGAGGAAGCAACCCAAGACGTTCACCGTGAGGATCGTCACCATGGACGCCGAGATGGAG" + "TTCAATTGCGAGGTACTGGATCATGATGTTTCAAAGGAAGAACCAGTCACCTTTCACTTCTTGGCCAAATTTTATCCTGAGAATGCTGAAGAGGAGCTGGTT" + "CAGGAGATCACACAACATTTATTCTTCTTACAGGTAAAGAAGCAGATTTTAGATGAAAAGATCTACTGCCCTCCTGAGGCTTCTGTGCTCCTGGCTTCTTACG" + "CCGTCCAGGCCAAGTATGGTGACTACGACCCCAGTGTTCACAAGCGGGGATTTTTGGCCCAAGAGGAATTGCTTCCAAAAAGGGTAATAAATCTGTATCAGAT" + "GACTCCGGAAATGTGGGAGGAGAGAATTACTGCTTGGTACGCAGAGCACCGAGGCCGAGCCAGGGATGAAGCTGAAATGGAATATCTGAAGATAGCTCAGGAC" + "CTGGAGATGTACGGTGTGAACTACTTTGCAATCCGGAATAAAAAGGGCACAGAGCTGCTGCTTGGAGTGGATGCCCTGGGGCTTCACATTTATGACCCTGAGA" + "ACAGACTGACCCCCAAGATCTCCTTCCCGTGGAATGAAATCCGAAACATCTCGTACAGTGACAAGGAGTTTACTATTAAACCACTGGATAAGAAAATTGATGTC" + "TTCAAGTTTAACTCCTCAAAGCTTCGTGTTAATAAGCTGATTCTCCAGCTATGTATCGGGAACCATGATCTATTTATGAGGAGAAGGAAAGCCGATTCTTTGGAA" + "GTTCAGCAGATGAAAGCCCAGGCCAGGGAGGAGAAGGCTAGAAAGCAGATGGAGCGGCAGCGCCTCGCTCGAGAGAAGCAGATGAGGGAGGAGGCTGAACGCACG" + "AGGGATGAGTTGGAGAGGAGGCTGCTGCAGATGAAAGAAGAAGCAACAATGGCCAACGAAGCACTGATGCGGTCTGAGGAGACAGCTGACCTGTTGGCTGAAAAG" + "GCCCAGATCACCGAGGAGGAGGCAAAACTTCTGGCCCAGAAGGCCGCAGAGGCTGAGCAGGAAATGCAGCGCATCAAGGCCACAGCGATTCGCACGGAGGAGGAG" + "AAGCGCCTGATGGAGCAGAAGGTGCTGGAAGCCGAGGTGCTGGCACTGAAGATGGCTGAGGAGTCAGAGAGGAGGGCCAAAGAGGCAGATCAGCTGAAGCAGGAC" + "CTGCAGGAAGCACGCGAGGCGGAGCGAAGAGCCAAGCAGAAGCTCCTGGAGATTGCCACCAAGCCCACGTACCCGCCCATGAACCCAATTCCAGCACCGTTGCCTC" + "CTGACATACCAAGCTTCAACCTCATTGGTGACAGCCTGTCTTTCGACTTCAAAGATACTGACATGAAGCGGCTTTCCATGGAGATAGAGAAAGAAAAAGTGGAATAC" + "ATGGAAAAGAGCAAGCATCTGCAGGAGCAGCTCAATGAACTCAAGACAGAAATCGAGGCCTTGAAACTGAAAGAGAGGGAGACAGCTCTGGATATTCTGCACAATGA" + "GAACTCCGACAGGGGTGGCAGCAGCAAGCACAATACCATTAAAAAGCCTCAAGCCCAAGGCAGAAGACCTATCTGCATTTGA" + ), + # ENST00000394374.6 – used for frameshift_variant + "MEN1": ( + "ATGGGGCTGAAGGCCGCCCAGAAGACGCTGTTCCCGCTGCGCTCCATCGACGACGTGGTGCGCCTGTTTGCTGCCGAGCTGGGCCGAGAGGAGCCGGACCTG" + "GTGCTCCTTTCCTTGGTGCTGGGCTTCGTGGAGCATTTTCTGGCTGTCAACCGCGTCATCCCTACCAACGTTCCCGAGCTCACCTTCCAGCCCAGCCCCGCCC" + "CCGACCCGCCTGGCGGCCTCACCTACTTTCCCGTGGCCGACCTGTCTATCATCGCCGCCCTCTATGCCCGCTTCACCGCCCAGATCCGAGGCGCCGTCGACCTG" + "TCCCTCTATCCTCGAGAAGGGGGTGTCTCCAGCCGTGAGCTGGTGAAGAAGGTCTCCGATGTCATATGGAACAGCCTCAGCCGCTCCTACTTCAAGGATCGGGCC" + "CACATCCAGTCCCTCTTCAGCTTCATCACAGGTTGGAGCCCAGTAGGCACCAAATTGGACAGCTCCGGTGTGGCCTTTGCTGTGGTTGGGGCCTGCCAGGCCCTGG" + "GTCTCCGGGATGTCCACCTCGCCCTGTCTGAGGATCATGCCTGGGTAGTGTTTGGGCCCAATGGGGAGCAGACAGCTGAGGTCACCTGGCACGGCAAGGGCAACGA" + "GGACCGCAGGGGCCAGACAGTCAATGCCGGTGTGGCTGAGCGGAGCTGGCTGTACCTGAAAGGATCATACATGCGCTGTGACCGCAAGATGGAGGTGGCGTTCATG" + "GTGTGTGCCATCAACCCTTCCATTGACCTGCACACCGACTCGCTGGAGCTTCTGCAGCTGCAGCAGAAGCTGCTCTGGCTGCTCTATGACCTGGGACATCTGGAAAG" + "GTACCCCATGGCCTTAGGGAACCTGGCAGATCTAGAGGAGCTGGAGCCCACCCCTGGCCGGCCAGACCCACTCACCCTCTACCACAAGGGCATTGCCTCAGCCAAGAC" + "CTACTATCGGGATGAACACATCTACCCCTACATGTACCTGGCTGGCTACCACTGTCGCAACCGCAATGTGCGGGAAGCCCTGCAGGCCTGGGCGGACACGGCCACTGT" + "CATCCAGGACTACAACTACTGCCGGGAAGACGAGGAGATCTACAAGGAGTTCTTTGAAGTAGCCAATGATGTCATCCCCAACCTGCTGAAGGAGGCAGCCAGCTTGCT" + "GGAGGCGGGCGAGGAGCGGCCGGGGGAGCAAAGCCAGGGCACCCAGAGCCAAGGTTCCGCCCTCCAGGACCCTGAGTGCTTCGCCCACCTGCTGCGATTCTACGACGG" + "CATCTGCAAATGGGAGGAGGGCAGTCCCACGCCTGTGCTGCATGTGGGCTGGGCCACCTTTCTTGTGCAGTCCCTAGGCCGTTTTGAGGGACAGGTGCGGCAGAAGGT" + "GCGCATAGTGAGCCGAGAGGCCGAGGCGGCCGAGGCCGAGGAGCCGTGGGGCGAGGAAGCCCGGGAAGGCCGGCGGCGGGGCCCACGGCGGGAGTCCAAGCCAGAGGAG" + "CCCCCGCCGCCCAAGAAGCCAGCACTGGACAAGGGCCTGGGCACCGGCCAGGGTGCAGTGTCAGGACCCCCCCGGAAGCCTCCTGGGACTGTCGCTGGCACAGCCCGAGG" + 
"CCCTGAAGGTGGCAGCACGGCTCAGGTGCCAGCACCCACAGCATCACCACCGCCGGAGGGTCCAGTGCTCACTTTCCAGAGTGAGAAGATGAAGGGCATGAAGGAGCTGC" + "TGGTGGCCACCAAGATCAACTCGAGCGCCATCAAGCTGCAACTCACGGCACAGTCGCAAGTGCAGATGAAGAAGCAGAAAGTGTCCACCCCTAGTGACTACACTCTGTCT" + "TTCCTCAAGCGGCAGCGCAAAGGCCTCTGA" + ), + # ENST00000513000.5 – used for start_lost + "INPP4B": ( + "ATGGAAATTAAAGAGGAAGGGGCATCAGAAGAAGGGCAGCACTTTCTTCCTACAGCCCAGGCCAATGATCCCGGGGACTGTCAGTTCACAAGTATCCAGAAGAC" + "TCCAAATGAACCGCAGTTGGAATTCATCCTTGCATGCAAGGATCTCGTGGCTCCTGTCCGTGATCGTAAACTGAATACACTGGTGCAGATCTCCGTAATCCACC" + "CCGTGGAGCAGAGTCTGACAAGATACTCCAGCACCGAAATTGTGGAGGGAACAAGGGACCCACTGTTTTTGACTGGTGTCACATTCCCATCTGAGTATCCCATCT" + "ATGAGGAGACCAAAATAAAACTAACAGTCTATGATGTCAAGGATAAGTCTCATGACACCGTTCGAACCAGTGTCCTACCAGAACATAAGGATCCCCCGCCAGAAGT" + "TGGGCGAAGTTTCTTGGGCTATGCCAGTTTTAAAGTGGGAGAGCTGCTGAAGTCAAAGGAGCAATTGCTGGTCCTGAGCCTGAGAACTTCAGATGGTGGCAAAGTG" + "GTTGGCACCATAGAAGTCAGTGTCGTGAAGATGGGGGAGATTGAGGATGGGGAAGCCGACCACATCACCACAGATGTACAGGGACAAAAGTGTGCCCTGGTATGTGA" + "ATGTACAGCCCCGGAAAGTGTGAGCGGAAAAGATAACTTACCTTTTTTGAATTCAGTGTTAAAGAACCCAGTATGTAAATTATATAGATTTCCCACATCTGACAATAAG" + "TGGATGCGAATTCGAGAGCAGATGTCAGAGAGCATTCTTTCCTTTCATATTCCTAAGGAATTGATTTCCCTTCACATTAAAGAAGATTTGTGCAGAAACCAGGAGATA" + "AAAGAACTTGGTGAGCTTTCTCCACATTGGGACAATCTGCGAAAAAATGTCCTTACTCACTGTGATCAAATGGTGAATATGTACCAAGACATTCTGACAGAACTTAGCA" + "AGGAAACAGGGTCCTCTTTCAAATCAAGCAGCAGCAAAGGAGAGAAAACATTAGAATTTGTTCCAATAAATCTACATCTGCAAAGAATGCAGGTACACAGCCCTCACTTG" + "AAAGATGCTCTCTACGATGTCATCACTGTGGGAGCCCCAGCTGCCCATTTTCAGGGATTTAAGAATGGTGGTCTTCGGAAGCTACTCCATAGATTTGAAACAGAAAGAAG" + "AAATACCGGATACCAGTTTATTTACTATTCACCTGAAAACACAGCCAAAGCAAAGGAAGTTCTCAGCAACATCAATCAACTACAACCTCTTATAGCAACCCATGCAGACC" + "TACTGCTTAATTCTGCAAGCCAGCATTCTCCAGACAGCTTGAAGAATTCTTTAAAGATGCTTTCAGAAAAAACAGAGCTTTTTGTACATGCCTTCAAGGATCAACTTGTC" + "AGGAGTGCTCTTTTAGCACTCTACACTGCAAGGCCAGGAGGCATTCTTAAGAAGCCACCCTCTCCTAAGAGCAGCACAGAGGAGAGCAGTCCCCAAGACCAACCCCCAGT" + "GATGAGAGGGCAGGACTCCATACCACATCATTCAGACTATGATGAGGAAGAGTGGGACAGGGTGTGGGCCAATGTGGGGAAGAGCCTGAACTGCATTATTGCTATGGTGG" + "ACAAACTGATTGAAAGAGATGGTGGCAGTGAAGGCAGTGGCGGCAACAATGATGGAGAAAAGGAACCTTCATTAACAGATGCCATTCCCTCTCACCCAAGAGAGGACTGG" + "TATGAACAGTTGTATCCCCTCATCCTTACCCTGAAGGACTGCATGGGAGAAGTGGTGAACCGAGCCAAGCAGTCCCTGACATTTGTGCTCCTTCAGGAACTTGCGTACAGC" + "TTGCCCCAGTGTCTGATGCTGACGCTAAGAAGAGACATCGTCTTCAGCCAAGCACTTGCTGGATTGGTTTGTGGTTTTATCATCAAATTACAGACAAGTCTGTATGACCC" + "AGGCTTCCTACAGCAGCTTCACACAGTGGGGTTGATAGTACAATATGAAGGACTGCTAAGTACATACAGCGATGAAATTGGAATGCTAGAGGACATGGCCGTTGGCATTTC" + "CGATTTAAAGAAAGTTGCATTTAAAATAATTGAAGCCAAATCCAATGATGTGTTGCCAGTTATAACAGGAAGACGAGAACATTACGTGGTAGAGGTCAAGCTTCCAGCCAG" + "AATGTTTGAGTCACTACCTCTACAGATTAAAGAAGGACAGTTGCTTCATGTGTATCCAGTACTTTTTAATGTTGGAATCAATGAACAGCAAACTCTGGCTGAAAGGTTTGG" + "AGATGTCTCTTTGCAAGAAAGTATTAATCAGGAAAACTTCGAACTTCTACAAGAATATTACAAGATATTTATGGAAAAGATGCCTCCTGATTATATTTCACATTTTCAGGAA" + "CAAAATGATTTAAAAGCATTGCTAGAAAATCTCCTTCAAAATATCCAATCCAAAAAAAGAAAGAATGTAGAAATTATGTGGCTGGCTGCAACGATTTGCCGCAAACTGAATG" + "GTATTCGTTTCACCTGTTGTAAAAGTGCCAAAGACAGGACATCGATGTCAGTGACACTTGAACAATGCTCAATCTTGAGAGATGAGCACCAGTTACACAAGGACTTCTTTAT" + "CCGAGCGCTGGATTGCATGAGAAGAGAAGGATGCCGCATAGAGAATGTACTGAAGAATATCAAATGCAGAAAGTATGCTTTCAACATGCTACAGCTGATGGCTTTCCCCAAG" + "TACTACAGACCTCCAGAGGGGACTTATGGAAAAGCTGACACCTAA" + ), + # ENST00000443433.6 – used for stop_lost + "MYD88": ( + "ATGCGACCCGACCGCGCTGAGGCTCCAGGACCGCCCGCCATGGCTGCAGGAGGTCCCGGCGCGGGGTCTGCGGCCCCGGTCTCCTCCACATCCTCCCTTCCCC" + "TGGCTGCTCTCAACATGCGAGTGCGGCGCCGCCTGTCTCTGTTCTTGAACGTGCGGACACAGGTGGCGGCCGACTGGACCGCGCTGGCGGAGGAGATGGACTTT" + 
"GAGTACTTGGAGATCCGGCAACTGGAGACACAAGCGGACCCCACTGGCAGGCTGCTGGACGCCTGGCAGGGACGCCCTGGCGCCTCTGTAGGCCGACTGCTCGAG" + "CTGCTTACCAAGCTGGGCCGCGACGACGTGCTGCTGGAGCTGGGACCCAGCATTGAGGAGGATTGCCAAAAGTATATCTTGAAGCAGCAGCAGGAGGAGGCTGAG" + "AAGCCTTTACAGGTGGCCGCTGTAGACAGCAGTGTCCCACGGACAGCAGAGCTGGCGGGCATCACCACACTTGATGACCCCCTGGGTGCCGCCGGATGGTGGTGGT" + "TGTCTCTGATGATTACCTGCAGAGCAAGGAATGTGACTTCCAGACCAAATTTGCACTCAGCCTCTCTCCAGGTGCCCATCAGAAGCGACTGA" + ), + # ENST00000494014.1 – used for stop_retained_variant + "EPHA3": ( + "ATGGATTGTCAGCTCTCCATCCTCCTCCTTCTCAGCTGCTCTGTTCTCGACAGCTTCGGGGAACTGATTCCGCAGCCTTCCAATGAAGTCAATCTACTGGATTC" + "AAAAACAATTCAAGGGGAGCTGGGCTGGATCTCTTATCCATCACATGGGTGGGAAGAGATCAGTGGTGTGGATGAACATTACACACCCATCAGGACTTACCAGGTG" + "TGCAATGTCATGGACCACAGTCAAAACAATTGGCTGAGAACAAACTGGGTCCCCAGGAACTCAGCTCAGAAGATTTATGTGGAGCTCAAGTTCACTCTACGAGACTG" + "CAATAGCATTCCATTGGTTTTAGGAACTTGCAAGGAGACATTCAACCTGTACTACATGGAGTCTGATGATGATCATGGGGTGAAATTTCGAGAGCATCAGTTTACAAA" + "GATTGACACCATTGCAGCTGATGAAAGTTTCACTCAAATGGATCTTGGGGACCGTATTCTGAAGCTCAACACTGAGATTAGAGAAGTAGGTCCTGTCAACAAGAAGGG" + "ATTTTATTTGGCATTTCAAGATGTTGGTGCTTGTGTTGCCTTGGTGTCTGTGAGAGTATACTTCAAAAAGTGCCCATTTACAGTGAAGAATCTGGCTATGTTTCCAGA" + "CACGGTACCCATGGACTCCCAGTCCCTGGTGGAGGTTAGAGGGTCTTGTGTCAACAATTCTAAGGAGGAAGATCCTCCAAGGATGTACTGCAGTACAGAAGGCGAATGG" + "CTTGTACCCATTGGCAAGTGTTCCTGCAATGCTGGCTATGAAGAAAGAGGTTTTATGTGCCAAGCTTGTCGACCAGGTTTCTACAAGGCATTGGATGGTAATATGAAGT" + "GTGCTAAGTGCCCGCCTCACAGTTCTACTCAGGAAGATGGTTCAATGAACTGCAGGTGTGAGAATAATTACTTCCGGGCAGACAAAGACCCTCCATCCATGGCTTGTACC" + "CGACCTCCATCTTCACCAAGAAATGTTATCTCTAATATAAACGAGACCTCAGTTATCCTGGACTGGAGTTGGCCCCTGGACACAGGAGGCCGGAAAGATGTTACCTTCAAC" + "ATCATATGTAAAAAATGTGGGTGGAATATAAAACAGTGTGAGCCATGCAGCCCAAATGTCCGCTTCCTCCCTCGACAGTTTGGACTCACCAACACCACGGTGACAGTGACAG" + "ACCTTCTGGCACATACTAACTACACCTTTGAGATTGATGCCGTTAATGGGGTGTCAGAGCTGAGCTCCCCACCAAGACAGTTTGCTGCGGTCAGCATCACAACTAATCAGGC" + "TGCTCCATCACCTGTCCTGACGATTAAGAAAGATCGGACCTCCAGAAATAGCATCTCTTTGTCCTGGCAAGAACCTGAACATCCTAATGGGATCATATTGGACTACGAGGTC" + "AAATACTATGAAAAGCAGGAACAAGAAACAAGTTATACCATTCTGAGGGCAAGAGGCACAAATGTTACCATCAGTAGCCTCAAGCCTGACACTATATACGTATTCCAAATCC" + "GAGCCCGAACAGCCGCTGGATATGGGACGAACAGCCGCAAGTTTGAGTTTGAAACTAGTCCAGACTCTTTCTCCATCTCTGGTGAAAGTAGCCAAGTGGTCATGATCGCCAT" + "TTCAGCGGCAGTAGCAATTATTCTCCTCACTGTTGTCATCTATGTTTTGATTGGGAGGTTCTGTGGCTATAAGTCAAAACATGGGGCAGATGAAAAAAGACTTCATTTTGGC" + "AATGGGCATTTAAAACTTCCAGGTCTCAGGACTTATGTTGACCCACATACATATGAAGACCCTACCCAAGCTGTTCATGAGTTTGCCAAGGAATTGGATGCCACCAACATATC" + "CATTGATAAAGTTGTTGGAGCAGGTGAATTTGGAGAGGTGTGCAGTGGTCGCTTAAAACTTCCTTCAAAAAAAGAGATTTCAGTGGCCATTAAGACCCTGAAAGTTGGCTAC" + "ACAGAAAAGCAGAGGAGAGACTTCCTGGGAGAAGCAAGCATTATGGGACAGTTTGACCACCCCAATATCATTCGACTGGAAGGAGTTGTTACCAAAAGTAAGCCAGTTATGA" + "TTGTCACAGAATACATGGAGAATGGTTCCTTGGATAGTTTCCTACGTAAACACGATGCCCAGTTTACTGTCATTCAGCTAGTGGGGATGCTTCGAGGGATAGCATCTGGCAT" + "GAAGTACCTGTCAGACATGGGCTATGTTCACCGAGACCTCGCTGCTCGGAACATCTTGATCAACAGTAACTTGGTGTGTAAGGTTTCTGATTTCGGACTTTCGCGTGTCCTG" + "GAGGATGACCCAGAAGCTGCTTATACAACAAGAGGAGGGAAGATCCCAATCAGGTGGACATCACCAGAAGCTATAGCCTACCGCAAGTTCACGTCAGCCAGCGATGTATGGAG" + "TTATGGGATTGTTCTCTGGGAGGTGATGTCTTATGGAGAGAGACCATACTGGGAGATGTCCAATCAGGATGTAATTAAAGCTGTAGATGAGGGCTATCGACTGCCACCCCCC" + "ATGGACTGCCCAGCTGCCTTGTATCAGCTGATGCTGGACTGCTGGCAGAAAGACAGGAACAACAGACCCAAGTTTGAGCAGATTGTTAGTATTCTGGACAAGCTTATCCGGA" + "ATCCCGGCAGCCTGAAGATCATCACCAGTGCAGCCGCAAGGCCATCAAACCTTCTTCTGGACCAAAGCAATGTGGATATCACTACCTTCCGCACAACAGTGACATGA" + ), +} + + +def _seq(gene: str) -> Seq: + return Seq("".join(_CDS[gene].split())) + + +def _wt_protein(gene: str) -> str: + return str(_seq(gene).translate(to_stop=False)) 
+ + +def _call(gene, dna_mut, aa_mut, mutation_type) -> str: + snp = SNP(gene=gene, mrna="", dna_mut=dna_mut, aa_mut=aa_mut, mutation_type=mutation_type) + return CancerGenomesService.get_mut_pro_seq(snp, _seq(gene)) + + +# --------------------------------------------------------------------------- +# Actionable types — should produce a non-empty mutant protein +# --------------------------------------------------------------------------- + +class TestActionableVariantTypes: + + def test_missense_variant(self): + """HGVS rule: single AA substitution leaves total protein length unchanged. + + Source: EZH2 c.656C>T p.S219F (missense_variant) + DNA path: substitution at position 656 (C→T) changes codon 219 from Ser to Phe. + """ + wt = _wt_protein("EZH2") + result = _call("EZH2", "c.656C>T", "p.S219F", "missense_variant") + + assert result, "missense_variant must produce a non-empty protein" + assert len(result) == len(wt), ( + f"Missense must preserve protein length: wt={len(wt)}, mut={len(result)}" + ) + assert result[218] == "F", ( + f"Position 219 (index 218) must change to Phe; got '{result[218]}'" + ) + assert result[:218] == wt[:218], "Residues before the substitution must be unchanged" + assert result[219:] == wt[219:], "Residues after the substitution must be unchanged" + + def test_stop_gained(self): + """HGVS rule: stop codon (*) introduced at the predicted position. + + Source: FCGR3B c.394A>T p.K132* (stop_gained) + DNA path: substitution at position 394 (A→T) converts codon 132 to a stop. + """ + result = _call("FCGR3B", "c.394A>T", "p.K132*", "stop_gained") + + assert result, "stop_gained must produce a non-empty translated sequence" + assert result[131] == "*", ( + f"Stop codon must be introduced at position 132 (index 131); got '{result[131]}'" + ) + + def test_inframe_deletion(self): + """HGVS rule: in-frame deletion shortens the protein by exactly the number of deleted codons. + + Source: CTNNB1 c.104_124del p.T35_G41del (inframe_deletion) + DNA path: deletes 21 nucleotides (positions 104–124) = 7 codons = 7 amino acids. + """ + wt = _wt_protein("CTNNB1") + result = _call("CTNNB1", "c.104_124del", "p.T35_G41del", "inframe_deletion") + + deleted_nt = 124 - 104 + 1 # 21 nt = 7 codons + deleted_aa = deleted_nt // 3 + + assert result, "inframe_deletion must produce a non-empty protein" + assert len(result) == len(wt) - deleted_aa, ( + f"Inframe deletion of {deleted_aa} AAs: expected len {len(wt) - deleted_aa}, " + f"got {len(result)}" + ) + assert result[:34] == wt[:34], "Residues before the deletion must be unchanged" + + def test_inframe_insertion_dup(self): + """HGVS rule: in-frame duplication lengthens the protein by exactly the number of duplicated codons. + + Source: SMAP1 c.1226_1228dup p.I409dup (inframe_insertion) + DNA path: duplicates 3 nucleotides (positions 1226–1228) = 1 codon = 1 amino acid inserted. + """ + wt = _wt_protein("SMAP1") + result = _call("SMAP1", "c.1226_1228dup", "p.I409dup", "inframe_insertion") + + inserted_nt = 1228 - 1226 + 1 # 3 nt = 1 codon + inserted_aa = inserted_nt // 3 + + assert result, "inframe_insertion (dup) must produce a non-empty protein" + assert len(result) == len(wt) + inserted_aa, ( + f"Duplication of {inserted_aa} AA: expected len {len(wt) + inserted_aa}, " + f"got {len(result)}" + ) + assert result[:409] == wt[:409], "Residues before the duplication site must be unchanged" + + def test_protein_altering_variant(self): + """HGVS rule: delins replaces a nucleotide range; result is non-empty and differs from WT. 
+ + Source: NF2 c.252_262delinsCT p.L85_K88delins* (protein_altering_variant) + DNA path: deletes positions 252–262 (11 nt) and inserts CT (2 nt). + The AA annotation (delins*) predicts a premature stop in the altered sequence. + """ + wt = _wt_protein("NF2") + result = _call("NF2", "c.252_262delinsCT", "p.L85_K88delins*", "protein_altering_variant") + + assert result, "protein_altering_variant must produce a non-empty sequence" + assert result != wt, "protein_altering_variant must differ from the wild-type sequence" + + def test_frameshift_variant(self): + """HGVS rule: out-of-frame deletion disrupts the reading frame; the code translates the + resulting sequence as-is (to_stop=False) producing a non-empty aberrant protein. + + Source: MEN1 c.292del p.R98Efs*21 (frameshift_variant) + DNA path: single-nucleotide deletion at position 292 shifts the reading frame. + """ + result = _call("MEN1", "c.292del", "p.R98Efs*21", "frameshift_variant") + + assert result, "frameshift_variant must produce a non-empty translated sequence" + + def test_start_lost(self): + """HGVS rule: start codon destroyed; the processed sequence no longer begins with Met. + + Source: INPP4B c.3G>T p.M1? (start_lost) + The annotation is p.M1? (ambiguous protein consequence) but dna_mut is unambiguous, + so the DNA substitution path is taken. ATG (Met) → ATT (Ile) at codon 1. + """ + result = _call("INPP4B", "c.3G>T", "p.M1?", "start_lost") + + assert result, "start_lost must produce a non-empty translated sequence via the DNA path" + assert result[0] != "M", "Start codon is destroyed; first AA must no longer be Met" + assert result[0] == "I", "ATG→ATT encodes Ile; first AA must be Ile" + + def test_stop_lost(self): + """HGVS rule: stop codon removed; the protein extends past the original terminus. + + Source: MYD88 c.611_613delinsGCGGCCCCC p.D204_*205delinsGGPR (protein_altering_variant,stop_lost) + DNA path: delins at 611–613 removes the original stop codon; the protein extends. + """ + wt = _wt_protein("MYD88") + result = _call( + "MYD88", + "c.611_613delinsGCGGCCCCC", + "p.D204_*205delinsGGPR", + "protein_altering_variant,stop_lost", + ) + + assert result, "stop_lost must produce a non-empty protein" + assert len(result) > len(wt), ( + f"Stop-lost protein must be longer than WT: wt={len(wt)}, mut={len(result)}" + ) + + def test_stop_retained_variant(self): + """HGVS rule: synonymous change at the stop codon; stop is preserved; length unchanged. + + Source: EPHA3 c.2756G>A p.*919= (stop_retained_variant) + DNA path: TGA (stop) → TAA (stop); both are stop codons, protein length is unchanged. + """ + wt = _wt_protein("EPHA3") + result = _call("EPHA3", "c.2756G>A", "p.*919=", "stop_retained_variant") + + assert result, "stop_retained_variant must produce a non-empty sequence" + assert len(result) == len(wt), ( + f"Stop-retained must preserve protein length: wt={len(wt)}, mut={len(result)}" + ) + assert result[-1] == "*", "Stop codon must still be present at the terminus" + + +# --------------------------------------------------------------------------- +# Filtered / skipped types — get_mut_pro_seq must return an empty string +# --------------------------------------------------------------------------- + +class TestSkippedVariantTypes: + + def test_synonymous_variant_is_prefiltered(self): + """synonymous_variant rows are discarded in cosmic_to_proteindb before get_mut_pro_seq + is ever called. Verify the filter expression matches the v103 SO term. 
+ """ + assert "synonymous_variant" in "synonymous_variant" + assert "synonymous_variant" in "splice_region_variant,synonymous_variant" + assert "synonymous_variant" not in "missense_variant" + + def test_intron_variant_returns_empty(self): + """intron_variant always carries p.? — the p.? guard returns an empty string. + + Source: RAD51C c.145+706C>T p.? (intron_variant) + """ + result = _call("EZH2", "c.145+706C>T", "p.?", "intron_variant") + assert result == "", f"intron_variant with p.? must return ''; got {repr(result)}" + + def test_3_prime_utr_variant_returns_empty(self): + """3_prime_UTR_variant always carries p.? — the p.? guard returns an empty string. + + Source: SUFU c.*2816G>A p.? (3_prime_UTR_variant) + The c.*N notation signals a 3'UTR position; the outer if-condition fails on p.?. + """ + result = _call("EZH2", "c.*2816G>A", "p.?", "3_prime_UTR_variant") + assert result == "", f"3_prime_UTR_variant with p.? must return ''; got {repr(result)}" + + def test_5_prime_utr_variant_returns_empty(self): + """5_prime_UTR_variant always carries p.? — the p.? guard returns an empty string. + + Source: PSMD13 c.-20C>T p.? (5_prime_UTR_variant) + """ + result = _call("EZH2", "c.-20C>T", "p.?", "5_prime_UTR_variant") + assert result == "", f"5_prime_UTR_variant with p.? must return ''; got {repr(result)}" + + def test_coding_sequence_variant_returns_empty(self): + """coding_sequence_variant in v103 uses intronic-offset positions with p.?. + + Source: OAS2 c.2157+2A>G p.? (coding_sequence_variant) + Both the intronic offset guard ('+' in coord) and the p.? guard ensure ''. + """ + result = _call("EZH2", "c.2157+2A>G", "p.?", "coding_sequence_variant") + assert result == "", f"coding_sequence_variant with p.? must return ''; got {repr(result)}" + + def test_splice_acceptor_variant_returns_empty(self): + """splice_acceptor_variant uses intronic-offset position notation with p.?. + + Source: TSHR c.171-1G>A p.? (splice_acceptor_variant) + The '-' offset guard in the delins branch (and p.? in the substitution guard) return ''. + """ + result = _call("EZH2", "c.171-1G>A", "p.?", "splice_acceptor_variant") + assert result == "", f"splice_acceptor_variant with p.? must return ''; got {repr(result)}" + + def test_splice_donor_variant_returns_empty(self): + """splice_donor_variant uses downstream-offset position notation with p.?. + + Source: PTCH1 c.3306+2T>G p.? (splice_donor_variant) + """ + result = _call("EZH2", "c.3306+2T>G", "p.?", "splice_donor_variant") + assert result == "", f"splice_donor_variant with p.? must return ''; got {repr(result)}" + + def test_splice_region_variant_composite_treated_as_primary_type(self): + """splice_region_variant only appears in v103 as part of a composite type string. + When combined with missense_variant, the missense guard fires first and the + mutation is processed as a missense — same-length result with a single AA change. + + Source: EZH2 c.656C>T p.S219F (missense_variant — re-used with composite type) + """ + wt = _wt_protein("EZH2") + result = _call( + "EZH2", "c.656C>T", "p.S219F", "missense_variant,splice_region_variant" + ) + assert result, "composite type containing missense_variant must produce a result" + assert len(result) == len(wt), "missense-handled result must preserve protein length" + assert result[218] == "F", "Substitution at codon 219 must produce Phe" + + def test_incomplete_terminal_codon_variant_returns_empty(self): + """incomplete_terminal_codon_variant is absent from v103 targeted-screen data. 
+ By convention it carries p.?, so get_mut_pro_seq returns an empty string. + """ + result = _call("EZH2", "c.2213_2214insA", "p.?", "incomplete_terminal_codon_variant") + assert result == "", ( + f"incomplete_terminal_codon_variant with p.? must return ''; got {repr(result)}" + ) From 311fafb7c36fc45ab5f0981b807f96375c16a738 Mon Sep 17 00:00:00 2001 From: Husen Umer Date: Wed, 13 May 2026 14:42:29 +0300 Subject: [PATCH 17/23] minor command fixes in use-cases file --- docs/use-cases.md | 21 ++++----------------- 1 file changed, 4 insertions(+), 17 deletions(-) diff --git a/docs/use-cases.md b/docs/use-cases.md index f08f690..3f9e0a4 100644 --- a/docs/use-cases.md +++ b/docs/use-cases.md @@ -32,7 +32,7 @@ pgatk ensembl-downloader \ ```bash gffread -F -w ensembl_human/transcripts.fa \ - -g ensembl_human/genome.fa \ + -g ensembl_human/Homo_sapiens.GRCh38.dna_sm.toplevel.fa \ ensembl_human/Homo_sapiens.GRCh38.*.gtf ``` @@ -106,19 +106,7 @@ pgatk dnaseq-to-proteindb \ --skip_including_all_cds ``` -### Step 5 -- Population variant proteins - -Include common human variants from ENSEMBL: - -```bash -pgatk vcf-to-proteindb \ - --vcf ensembl_human/homo_sapiens_incl_consequences.vcf \ - --input_fasta ensembl_human/transcripts.fa \ - --gene_annotations_gtf ensembl_human/Homo_sapiens.GRCh38.*.gtf \ - --output_proteindb ensembl_variants.fa -``` - -### Step 6 -- COSMIC somatic mutations (optional, for cancer cell lines) +### Step 5 -- COSMIC somatic mutations (optional, for cancer cell lines) For cancer cell-line studies, add cell-line-specific somatic mutations: @@ -136,7 +124,7 @@ pgatk cosmic-to-proteindb \ --split_by_filter_column ``` -### Step 7 -- Combine and generate target-decoy database +### Step 6 -- Combine and generate target-decoy database ```bash # Combine all components @@ -146,7 +134,6 @@ cat canonical.fa \ lncrna.fa \ putative.fa \ altorf.fa \ - ensembl_variants.fa \ > cell_type_target.fa # Generate decoy sequences @@ -211,7 +198,7 @@ extract transcript sequences from the GTF and genome FASTA: ```bash gffread -F -w ensembl_human/transcripts.fa \ - -g ensembl_human/genome.fa \ + -g ensembl_human/Homo_sapiens.GRCh38.dna_sm.toplevel.fa \ ensembl_human/Homo_sapiens.GRCh38.*.gtf ``` From dd286f8941e764ba6fed55dd07446e809acd94ae Mon Sep 17 00:00:00 2001 From: Husen Umer Date: Wed, 13 May 2026 18:49:47 +0300 Subject: [PATCH 18/23] Update COSMIC integration for v103 schema changes - Updated input file names and parameters in the use cases to reflect the new COSMIC v103 format. - Introduced the `--clinical_sample_file` option to allow mapping of `COSMIC_PHENOTYPE_ID` to `PRIMARY_SITE` for filtering. - Enhanced the `CancerGenomesService` to utilize the classification file for phenotype mapping, improving mutation filtering based on tissue type. - Adjusted documentation to clarify the new data model and integration tests for COSMIC v103. - Updated test cases to align with the new schema and ensure accurate output generation. 
---
 docs/use-cases.md                             |  49 ++++---
 docs/validations.md                           | 127 ++++++++++++++
 pgatk/cgenomes/cgenomes_proteindb.py          | 112 ++++++++-----
 pgatk/commands/cosmic_to_proteindb.py         |  10 +-
 pgatk/config/cosmic_config.yaml               |   3 +-
 pgatk/testdata/proteindb_from_custom_VCF.fa   |   8 +-
 pgatk/testdata/test_cosmic_classification.tsv |   9 ++
 pgatk/testdata/test_cosmic_mutations.tsv      |  26 ++--
 .../test_cosmic_mutations_proteindb.fa        |  14 +-
 .../test_cosmic_mutations_proteindb_bone.fa   |   4 +-
 .../test_cosmic_mutations_proteindb_liver.fa  |   2 +-
 .../test_cosmic_mutations_proteindb_skin.fa   |   4 +-
 ...test_cosmic_mutations_proteindb_thyroid.fa |   2 +-
 ...tions_proteindb_upperaerodigestivetract.fa |   4 +-
 pgatk/tests/pgatk_tests.py                    |   3 +-
 15 files changed, 292 insertions(+), 85 deletions(-)
 create mode 100644 pgatk/testdata/test_cosmic_classification.tsv

diff --git a/docs/use-cases.md b/docs/use-cases.md
index 3f9e0a4..0442c9c 100644
--- a/docs/use-cases.md
+++ b/docs/use-cases.md
@@ -117,10 +117,11 @@ pgatk cosmic-downloader \
   -o cosmic_data

 pgatk cosmic-to-proteindb \
-  --input_mutation cosmic_data/CosmicCLP_MutantExport.tsv.gz \
-  --input_genes cosmic_data/All_CellLines_Genes.fasta.gz \
+  --input_mutation cosmic_data/CellLinesProject_GenomeScreensMutant_v103_GRCh38.tsv.gz \
+  --input_genes cosmic_data/Cosmic_Genes_v103_GRCh38.fasta.gz \
   --output_db cosmic_cellline.fa \
-  --filter_column "Sample name" \
+  --clinical_sample_file cosmic_data/Cosmic_Classification_v103_GRCh38.tsv.gz \
+  --filter_column PRIMARY_SITE \
   --split_by_filter_column
 ```
 
@@ -144,7 +145,7 @@ pgatk generate-decoy \
   --decoy_prefix DECOY_
 ```
 
-### Step 8 -- Extract non-canonical unique peptides
+### Step 7 -- Extract non-canonical unique peptides
 
 After database searching with your search engine, you can also pre-compute the
 set of non-canonical peptides unique to these novel sources:
@@ -381,24 +382,34 @@ pgatk cosmic-downloader \
 
 # Generate per-tissue databases
 pgatk cosmic-to-proteindb \
-  --input_mutation cosmic_data/CosmicMutantExport.tsv.gz \
-  --input_genes cosmic_data/All_COSMIC_Genes.fasta.gz \
+  --input_mutation cosmic_data/Cosmic_GenomeScreensMutant_v103_GRCh38.tsv.gz \
+  --input_genes cosmic_data/Cosmic_Genes_v103_GRCh38.fasta.gz \
   --output_db cosmic_proteins.fa \
+  --clinical_sample_file cosmic_data/Cosmic_Classification_v103_GRCh38.tsv.gz \
+  --filter_column PRIMARY_SITE \
   --split_by_filter_column
 ```
 
 This produces files like `cosmic_proteins_lung.fa`, `cosmic_proteins_breast.fa`,
 etc. Use the tissue-matched database for your cancer type of interest.
 
+!!! note "COSMIC v103 tissue annotation"
+    In COSMIC v103, `PRIMARY_SITE` is not a column in the mutation file. It lives
+    in the separate Classification file (`Cosmic_Classification_v103_GRCh38.tsv.gz`)
+    and is joined via `COSMIC_PHENOTYPE_ID`. Pass the Classification file via
+    `--clinical_sample_file` whenever tissue-type filtering or splitting is needed.
+
 ### 4b. Cancer-type specific database using COSMIC somatic mutations
 
 Build a focused database for one cancer type (e.g.
lung): ```bash pgatk cosmic-to-proteindb \ - --input_mutation cosmic_data/CosmicMutantExport.tsv.gz \ - --input_genes cosmic_data/All_COSMIC_Genes.fasta.gz \ + --input_mutation cosmic_data/Cosmic_GenomeScreensMutant_v103_GRCh38.tsv.gz \ + --input_genes cosmic_data/Cosmic_Genes_v103_GRCh38.fasta.gz \ --output_db cosmic_lung_proteins.fa \ + --clinical_sample_file cosmic_data/Cosmic_Classification_v103_GRCh38.tsv.gz \ + --filter_column PRIMARY_SITE \ --accepted_values "lung" ``` @@ -409,10 +420,11 @@ provides a dedicated cell-line export with mutations annotated per sample: ```bash pgatk cosmic-to-proteindb \ - --input_mutation cosmic_data/CosmicCLP_MutantExport.tsv.gz \ - --input_genes cosmic_data/All_CellLines_Genes.fasta.gz \ + --input_mutation cosmic_data/CellLinesProject_GenomeScreensMutant_v103_GRCh38.tsv.gz \ + --input_genes cosmic_data/Cosmic_Genes_v103_GRCh38.fasta.gz \ --output_db cosmic_cellline_proteins.fa \ - --filter_column "Sample name" \ + --clinical_sample_file cosmic_data/Cosmic_Classification_v103_GRCh38.tsv.gz \ + --filter_column PRIMARY_SITE \ --split_by_filter_column ``` @@ -446,15 +458,16 @@ pgatk cbioportal-to-proteindb \ ### 4e. Combined cancer database for immunopeptidomics For HLA immunopeptidomics / neoantigen discovery, a broad mutation database -maximizes the chance of detecting mutant HLA-presented peptides. Studies have -shown that COSMIC-derived databases can identify 5x more mutant immunopeptides -than patient WES alone. Combine COSMIC with ClinVar: +maximizes the chance of detecting mutant HLA-presented peptides. COSMIC-derived +databases have been shown to identify 5x more mutant HLA-I immunopeptides than +patient WES alone ([Wang et al., *J Transl Med* 2024](https://doi.org/10.1186/s12967-023-04821-0)). +Combine COSMIC with ClinVar: ```bash # Generate COSMIC mutations pgatk cosmic-to-proteindb \ - --input_mutation cosmic_data/CosmicMutantExport.tsv.gz \ - --input_genes cosmic_data/All_COSMIC_Genes.fasta.gz \ + --input_mutation cosmic_data/Cosmic_GenomeScreensMutant_v103_GRCh38.tsv.gz \ + --input_genes cosmic_data/Cosmic_Genes_v103_GRCh38.fasta.gz \ --output_db cosmic_proteins.fa # Generate ClinVar mutations @@ -973,8 +986,8 @@ pgatk clinvar-to-proteindb \ # COSMIC somatic mutations pgatk cosmic-to-proteindb \ - --input_mutation cosmic_data/CosmicMutantExport.tsv.gz \ - --input_genes cosmic_data/All_COSMIC_Genes.fasta.gz \ + --input_mutation cosmic_data/Cosmic_GenomeScreensMutant_v103_GRCh38.tsv.gz \ + --input_genes cosmic_data/Cosmic_Genes_v103_GRCh38.fasta.gz \ --output_db cosmic_variants.fa # Non-coding RNA (lncRNA) diff --git a/docs/validations.md b/docs/validations.md index 015ef72..6dd3acc 100644 --- a/docs/validations.md +++ b/docs/validations.md @@ -128,3 +128,130 @@ print(mut_prot[218]) # 'F' — mutant Phenylalanine | `TestGetMutProSeqEdgeCases` | Mutation beyond sequence length, ambiguous `p.?` returns `""`, non-alpha last char in aa_mut returns `""`, DNA ref-allele mismatch returns `""` | These tests are fast to run and isolate individual code paths without requiring reference files. + +--- + +# 2. 
COSMIC Integration Pipeline Tests + +## 2.1 COSMIC v103 Schema and Data Model + +COSMIC v103 introduced two breaking changes from earlier releases: + +**Column name changes** — all columns are now `UPPERCASE_UNDERSCORE`: + +| Old (v2/v98) | New (v103) | +|---|---| +| `Gene name` | `GENE_SYMBOL` | +| `Accession Number` | `TRANSCRIPT_ACCESSION` | +| `CDS mutation` | `MUTATION_CDS` | +| `AA Mutation` | `MUTATION_AA` | +| `Type` | `MUTATION_DESCRIPTION` | +| `Primary site` | *(moved to Classification file)* | + +**Tissue-type data architecture** — `PRIMARY_SITE` (tissue of origin) is no longer a column in the mutation export. It lives in a separate **Classification file** (`Cosmic_Classification_Tsv_v103_GRCh38.tar`) and is joined via `COSMIC_PHENOTYPE_ID`: + +``` +mutation_file.COSMIC_PHENOTYPE_ID (e.g. COSO1001) + → Cosmic_Classification_v103_GRCh38.tsv.COSMIC_PHENOTYPE_ID + → PRIMARY_SITE (e.g. upper_aerodigestive_tract) +``` + +`COSMIC_PHENOTYPE_ID` is present at column 6 in both `Cosmic_CompleteTargetedScreensMutant` and `Cosmic_GenomeScreensMutant` files. No hop through the Sample file is needed. + +> **Note**: The Sample file (`Cosmic_Sample_Tsv_v103_GRCh38.tsv`) contains 34 columns covering sequencing metadata (`SAMPLE_NAME`, `COSMIC_SAMPLE_ID`, `TUMOUR_ID`, etc.) but does **not** contain `PRIMARY_SITE`. The Classification file is the authoritative source for tissue and histology. + +--- + +## 2.2 Integration Test (`test_cosmic_to_proteindb`) + +**Test file**: `pgatk/tests/pgatk_tests.py::PgatkRunnerTests::test_cosmic_to_proteindb` + +The test runs the full `cosmic-to-proteindb` pipeline end-to-end using small testdata files that mirror the real v103 schema: + +| File | Purpose | +|---|---| +| `testdata/test_cosmic_mutations.tsv` | 12 HRAS mutation rows in v103 27-column format; covers `missense_variant`, `stop_gained`, `inframe_deletion`, `synonymous_variant` (filtered), `frameshift_variant` (`p.?`, skipped) | +| `testdata/test_cosmic_genes.fa` | HRAS CDS FASTA from `Cosmic_Genes_v103_GRCh38.fasta` | +| `testdata/test_cosmic_classification.tsv` | 8 rows mapping `COSMIC_PHENOTYPE_ID` (COSO1001–COSO1008) to `PRIMARY_SITE`; covers `upper_aerodigestive_tract`, `skin`, `bone`, `thyroid`, `liver`, `haematopoietic_and_lymphoid_tissue` | + +CLI invocation used in the test: + +``` +cosmic-to-proteindb + --config_file config/cosmic_config.yaml + --input_mutation testdata/test_cosmic_mutations.tsv + --input_genes testdata/test_cosmic_genes.fa + --output_db testdata/test_cosmic_mutations_proteindb.fa + --clinical_sample_file testdata/test_cosmic_classification.tsv + --filter_column PRIMARY_SITE + --split_by_filter_column + --accepted_values all +``` + +Expected output files (split per `PRIMARY_SITE`): + +``` +testdata/test_cosmic_mutations_proteindb_bone.fa +testdata/test_cosmic_mutations_proteindb_liver.fa +testdata/test_cosmic_mutations_proteindb_skin.fa +testdata/test_cosmic_mutations_proteindb_thyroid.fa +testdata/test_cosmic_mutations_proteindb_upperaerodigestivetract.fa +``` + +The `synonymous_variant` row (COSO1006) is pre-filtered and produces no output. The `frameshift_variant` row carrying `p.?` (COSO1008) is skipped by the `p.?` guard. The consolidated `testdata/test_cosmic_mutations_proteindb.fa` contains all non-filtered mutations regardless of tissue type. + +--- + +## 2.3 `--clinical_sample_file` CLI Option + +The `--clinical_sample_file` / `-cl` option accepts any TSV file that maps a sample/phenotype ID column to the `--filter_column` value. 
For COSMIC, this is the Classification file: + +| Argument | COSMIC v103 value | +|---|---| +| `--clinical_sample_file` | `Cosmic_Classification_v103_GRCh38.tsv` (or `.gz`) | +| `--filter_column` | `PRIMARY_SITE` | +| Join key used internally | `COSMIC_PHENOTYPE_ID` (auto-detected for COSMIC) | + +For **cBioportal**, the equivalent is the clinical sample file with `SAMPLE_ID` as the join key — the `--clinical_sample_file` option uses `SAMPLE_ID` by default and switches to `COSMIC_PHENOTYPE_ID` only when invoked via `cosmic-to-proteindb`. + +--- + +## 2.4 Independent Validation with Real COSMIC Files + +After downloading COSMIC v103 files into `use-cases/use-case2/cosmic_data/`, run the pipeline against a subset of real mutations: + +```bash +# Extract 500 actionable mutations (skip intronic/UTR/synonymous) +zcat use-cases/use-case2/cosmic_data/Cosmic_CompleteTargetedScreensMutant_v103_GRCh38.tsv.gz \ + | head -1 > /tmp/cosmic_test.tsv +zcat use-cases/use-case2/cosmic_data/Cosmic_CompleteTargetedScreensMutant_v103_GRCh38.tsv.gz \ + | grep -E "missense_variant|stop_gained|inframe_deletion|inframe_insertion|frameshift" \ + | head -500 >> /tmp/cosmic_test.tsv + +pgatk cosmic-to-proteindb \ + --config_file pgatk/config/cosmic_config.yaml \ + --input_mutation /tmp/cosmic_test.tsv \ + --input_genes <(zcat use-cases/use-case2/cosmic_data/Cosmic_Genes_v103_GRCh38.fasta.gz) \ + --output_db /tmp/cosmic_out.fa \ + --clinical_sample_file <(zcat use-cases/use-case2/cosmic_data/Cosmic_Classification_v103_GRCh38.tsv.gz) \ + --filter_column PRIMARY_SITE \ + --split_by_filter_column \ + --accepted_values all +``` + +Expected: ~27 per-tissue output files, including canonical COSMIC tissue types (`lung`, `breast`, `large_intestine`, `skin`, `haematopoietic_and_lymphoid_tissue`, etc.). FASTA headers follow the format: + +``` +>COSMIC:GENE_SYMBOL:p.MutAA:SO_term +``` + +To verify the join is working (i.e. phenotype IDs are resolving to tissue names): + +```bash +# Count sequences per tissue file +for f in /tmp/cosmic_out_*.fa; do + echo "$f: $(grep -c '^>' $f) sequences" +done +``` + +A file named `cosmic_out_NS.fa` is expected for phenotypes where tissue is not specified in the Classification file — these are retained rather than silently dropped. diff --git a/pgatk/cgenomes/cgenomes_proteindb.py b/pgatk/cgenomes/cgenomes_proteindb.py index 70fb72b..0d1e39f 100644 --- a/pgatk/cgenomes/cgenomes_proteindb.py +++ b/pgatk/cgenomes/cgenomes_proteindb.py @@ -287,6 +287,23 @@ def cosmic_to_proteindb(self) -> None: except KeyError: COSMIC_CDS_DB[record.id] = [record] + # Build phenotype-to-group mapping from the classification file if provided. + # COSMIC_PHENOTYPE_ID is present directly in the mutation file (column 6) and + # is the join key to the Cosmic_Classification file where PRIMARY_SITE lives. 
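+        # Illustrative shape with the bundled testdata:
+        # {'COSO1001': 'upper_aerodigestive_tract', 'COSO1002': 'skin', ...}.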
+ sample_groups_dict = {} + if self._local_clinical_sample_file and self._filter_column: + sample_groups_dict = self.get_value_per_sample( + self._local_clinical_sample_file, + self._filter_column, + sample_id_column='COSMIC_PHENOTYPE_ID', + ) + if not sample_groups_dict: + self.get_logger().warning( + "clinical_sample_file '%s' produced no phenotype mappings; " + "falling back to direct filter-column lookup in the mutation file.", + self._local_clinical_sample_file, + ) + regex = re.compile('[^a-zA-Z]') mutation_dic = {} groups_mutations_dict = {} @@ -296,55 +313,81 @@ def cosmic_to_proteindb(self) -> None: required_columns = ["GENE_SYMBOL", "TRANSCRIPT_ACCESSION", "MUTATION_CDS", "MUTATION_AA", "MUTATION_DESCRIPTION"] with _open_text(self._local_mutation_file, encoding="latin-1") as cosmic_input, \ open(self._local_output_file, 'w', encoding='utf-8') as output: - header = cosmic_input.readline().strip().split("\t") + col_header = cosmic_input.readline().strip().split("\t") try: - gene_col = header.index("GENE_SYMBOL") - enst_col = header.index("TRANSCRIPT_ACCESSION") - cds_col = header.index("MUTATION_CDS") - aa_col = header.index("MUTATION_AA") - muttype_col = header.index("MUTATION_DESCRIPTION") + gene_col = col_header.index("GENE_SYMBOL") + enst_col = col_header.index("TRANSCRIPT_ACCESSION") + cds_col = col_header.index("MUTATION_CDS") + aa_col = col_header.index("MUTATION_AA") + muttype_col = col_header.index("MUTATION_DESCRIPTION") except ValueError as e: self.get_logger().error( "COSMIC file missing required columns. Expected %s, got: %s", - required_columns, header + required_columns, col_header ) raise ValueError( f"COSMIC mutation file missing required column: {e}. " f"Expected columns include: {required_columns}" ) from e + + # COSMIC_PHENOTYPE_ID is the direct join key to the classification file. + try: + phenotype_id_col = col_header.index("COSMIC_PHENOTYPE_ID") + except ValueError: + phenotype_id_col = None + if sample_groups_dict: + self.get_logger().warning( + "COSMIC_PHENOTYPE_ID not found in mutation file header; classification file join will be skipped." + ) + + # Fall back to a direct filter column in the mutation file when no classification file is provided. filter_col = None - if self._filter_column: + if not sample_groups_dict and self._filter_column: try: - filter_col = header.index(self._filter_column) + filter_col = col_header.index(self._filter_column) except ValueError: self.get_logger().warning( - "Filter column '%s' not found in COSMIC header: %s. Filtering disabled.", - self._filter_column, header + "Filter column '%s' not found in COSMIC header. Filtering disabled.", + self._filter_column, ) max_col = max(gene_col, enst_col, cds_col, aa_col, muttype_col) if filter_col is not None: max_col = max(max_col, filter_col) + if phenotype_id_col is not None and sample_groups_dict: + max_col = max(max_col, phenotype_id_col) for line in cosmic_input: if line_counter % 10000 == 0: - msg = "Number of lines finished -- '{}'".format(line_counter) - self.get_logger().debug(msg) + self.get_logger().debug("Number of lines finished -- '%s'", line_counter) line_counter += 1 row = line.strip().split("\t") if len(row) <= max_col: self.get_logger().debug("Skipping malformed row (insufficient columns) at line %s: %s", line_counter, row[:5]) continue - if filter_col is not None: - if row[filter_col] not in self._accepted_values and self._accepted_values != ['all']: + + # Determine the group (e.g. primary site) for this mutation. 
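+            # Resolution order: the classification join wins when a mapping
+            # exists; otherwise the mutation file's own filter column is used
+            # (with the bundled testdata, COSO1002 resolves to 'skin').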
+ group = None + if sample_groups_dict and phenotype_id_col is not None: + group = sample_groups_dict.get(row[phenotype_id_col]) + if group is None and (self._accepted_values != ['all'] or self._split_by_filter_column): + self.get_logger().warning( + "No classification found for COSMIC_PHENOTYPE_ID '%s'; skipping row.", + row[phenotype_id_col], + ) continue + elif filter_col is not None: + group = row[filter_col] + + if group is not None and group not in self._accepted_values and self._accepted_values != ['all']: + continue if "coding silent" in row[muttype_col] or "synonymous_variant" in row[muttype_col]: continue snp = SNP(gene=row[gene_col], mrna=row[enst_col], dna_mut=row[cds_col], aa_mut=row[aa_col], mutation_type=row[muttype_col]) - header = "COSMIC:%s:%s:%s" % (snp.gene, snp.aa_mut, snp.mutation_type.replace(" ", "")) + fasta_header = "COSMIC:%s:%s:%s" % (snp.gene, snp.aa_mut, snp.mutation_type.replace(" ", "")) try: this_gene_records = COSMIC_CDS_DB[snp.gene] seqs = [] @@ -364,16 +407,16 @@ def cosmic_to_proteindb(self) -> None: break if mut_pro_seq: - entry = ">%s\n%s\n" % (header, mut_pro_seq) - if header not in mutation_dic: + entry = ">%s\n%s\n" % (fasta_header, mut_pro_seq) + if fasta_header not in mutation_dic: output.write(entry) - mutation_dic[header] = 1 + mutation_dic[fasta_header] = 1 - if self._split_by_filter_column and filter_col is not None: + if self._split_by_filter_column and group is not None: try: - groups_mutations_dict[row[filter_col]][header] = entry + groups_mutations_dict[group][fasta_header] = entry except KeyError: - groups_mutations_dict[row[filter_col]] = {header: entry} + groups_mutations_dict[group] = {fasta_header: entry} else: self.get_logger().warning( f"Could not parse mutation record: gene={snp.gene}, dna_mut={snp.dna_mut}, " @@ -383,13 +426,14 @@ def cosmic_to_proteindb(self) -> None: for group_name in groups_mutations_dict.keys(): output_base = str(Path(self._local_output_file).with_suffix('')) with open(f"{output_base}_{regex.sub('', group_name)}.fa", 'w', encoding='utf-8') as fn: - for header in groups_mutations_dict[group_name].keys(): - fn.write(groups_mutations_dict[group_name][header]) + for fasta_hdr in groups_mutations_dict[group_name].keys(): + fn.write(groups_mutations_dict[group_name][fasta_hdr]) self.get_logger().debug("COSMIC contains in total {} non redundant mutations".format(len(mutation_dic))) @staticmethod - def get_sample_headers(header_line: list, filter_column: str) -> tuple[Optional[int], Optional[int]]: + def get_sample_headers(header_line: list, filter_column: str, + sample_id_column: str = 'SAMPLE_ID') -> tuple[Optional[int], Optional[int]]: _logger = logging.getLogger(__name__) try: filter_col = header_line.index(filter_column) @@ -397,13 +441,14 @@ def get_sample_headers(header_line: list, filter_column: str) -> tuple[Optional[ _logger.warning('%s was not found in the header row: %s', filter_column, header_line) return None, None try: - sample_id_col = header_line.index('SAMPLE_ID') + sample_id_col = header_line.index(sample_id_column) except ValueError: - _logger.warning('SAMPLE_ID was not found in the header row: %s', header_line) + _logger.warning('%s was not found in the header row: %s', sample_id_column, header_line) return None, None return filter_col, sample_id_col - def get_value_per_sample(self, local_clinical_sample_file: str, filter_column: str) -> dict: + def get_value_per_sample(self, local_clinical_sample_file: str, filter_column: str, + sample_id_column: str = 'SAMPLE_ID') -> dict: sample_value = {} if 
local_clinical_sample_file:
             with open(local_clinical_sample_file, 'r', encoding='utf-8') as clin_fn:
@@ -412,16 +457,17 @@ def get_value_per_sample(self, local_clinical_sample_file: str, filter_column: s
                     if line.startswith('#'):
                         continue
                     sl = line.strip().split('\t')
-                    if 'SAMPLE_ID' in sl and filter_column in sl:
-                        filter_column_col, sample_id_col = self.get_sample_headers(sl, filter_column)
-                        # Skip adding the header row itself to sample_value
+                    if sample_id_column in sl and filter_column in sl:
+                        filter_column_col, sample_id_col = self.get_sample_headers(
+                            sl, filter_column, sample_id_column
+                        )
                         continue
                     if filter_column_col is not None and sample_id_col is not None:
                         if (sample_id_col < len(sl) and
                                 filter_column_col < len(sl)):
                             sample_value[sl[sample_id_col]] = sl[filter_column_col].strip().replace(' ', '_')
         else:
-            self.get_logger().warning("No column was found for %s, %s in %s", filter_column, 'SAMPLE_ID',
-                                      local_clinical_sample_file)
+            self.get_logger().warning("No column was found for %s, %s in %s", filter_column,
+                                      sample_id_column, local_clinical_sample_file)
         return sample_value
 
     @staticmethod
diff --git a/pgatk/commands/cosmic_to_proteindb.py b/pgatk/commands/cosmic_to_proteindb.py
index 5c24ca4..b4d0c27 100644
--- a/pgatk/commands/cosmic_to_proteindb.py
+++ b/pgatk/commands/cosmic_to_proteindb.py
@@ -22,9 +22,12 @@
 @click.option('-s', '--split_by_filter_column',
               help='Use this flag to generate a proteinDB per group as specified in the filter_column, default is False',
               is_flag=True)
+@click.option('-cl', '--clinical_sample_file',
+              help='COSMIC classification file used to map '
+                   'COSMIC_PHENOTYPE_ID to the filter column value (e.g. PRIMARY_SITE)')
 @click.pass_context
 def cosmic_to_proteindb(ctx, config_file, input_mutation, input_genes, output_db,
-                        filter_column, accepted_values, split_by_filter_column):
+                        filter_column, accepted_values, split_by_filter_column, clinical_sample_file):
 
     config_data = load_config("cosmic", config_file)
 
@@ -39,10 +42,13 @@ def cosmic_to_proteindb(ctx, config_file, input_mutation, input_genes, output_db
     if output_db is not None:
         pipeline_arguments[CancerGenomesService.CONFIG_OUTPUT_FILE] = output_db
 
+    if clinical_sample_file is not None:
+        pipeline_arguments[CancerGenomesService.CLINICAL_SAMPLE_FILE] = clinical_sample_file
+
     if filter_column is not None:
         pipeline_arguments[CancerGenomesService.FILTER_COLUMN] = filter_column
     elif config_data is None:
-        pipeline_arguments[CancerGenomesService.FILTER_COLUMN] = 'Primary site'
+        pipeline_arguments[CancerGenomesService.FILTER_COLUMN] = 'PRIMARY_SITE'
 
     if accepted_values is not None:
         pipeline_arguments[CancerGenomesService.ACCEPTED_VALUES] = accepted_values
diff --git a/pgatk/config/cosmic_config.yaml b/pgatk/config/cosmic_config.yaml
index 7721295..0555af5 100644
--- a/pgatk/config/cosmic_config.yaml
+++ b/pgatk/config/cosmic_config.yaml
@@ -27,6 +27,7 @@ cosmic_data:
     - grch38/cosmic/v103/Cosmic_Transcripts_Tsv_v103_GRCh38.tar
     - grch38/cell_lines/v103/CellLinesProject_GenomeScreensMutant_Tsv_v103_GRCh38.tar
     - grch38/cell_lines/v103/CellLinesProject_CompleteCNA_Tsv_v103_GRCh38.tar
+    - grch38/cosmic/v103/Cosmic_Classification_Tsv_v103_GRCh38.tar
 logger:
   formatters:
     DEBUG: "%(asctime)s [%(levelname)7s][%(name)48s][%(module)32s, %(lineno)4s] %(message)s"
@@ -34,7 +35,7 @@ cosmic_data:
   loglevel: DEBUG
 proteindb:
   filter_info:
-    filter_column: 'Primary site'
+    filter_column: 'PRIMARY_SITE'
     accepted_values: "all"
     split_by_filter_column: False
     clinical_sample_file: ''
diff --git
a/pgatk/testdata/proteindb_from_custom_VCF.fa b/pgatk/testdata/proteindb_from_custom_VCF.fa index 5325f01..22f87e0 100644 --- a/pgatk/testdata/proteindb_from_custom_VCF.fa +++ b/pgatk/testdata/proteindb_from_custom_VCF.fa @@ -1,11 +1,11 @@ ->varsample_rs78350717_22.15528913.C.A_ENST00000252835 -MCPLTLQVTGLMNVSEPNSSFAFVNEFILQGFSCEWTIQIFLFSLFTTTYALTITGNGAIAFVLWCDRRLHTPMYMFLGNFSFLEIWYVSSTVPKMLVNFLSEKKNISFAGCFLQFYFFFSLGTSECLLLTVMAFDQYLAICRPLLYPNIMTGHLYAKLVILCWVCGFLWFLIPIVLISQMPFCGPNIIDHVVCDPGPRFALDCVSAPRIQLFCYTLSSLVIFGNFLFIIGSYTLVLKAMLGMPSSTGRHKDFSTCGSHLAVVSLCYSSLMVMYVSPGLGHSTGMQKIETLFYAMVTPLFNPLIYSLQNKEIKAALRKVLGSSNII* ->varsample_rs78350717_22.15528913.C.T_ENST00000252835 -MCPLTLQVTGLMNVSEPNSSFAFVNEFILQGFSCEWTIQIFLFSLFTTTYALTITGNGAIAFVLWCDRRLHTPMYMFLGNFSFLEIWYVSSTVPKMLVNFLSEKKNISFAGCFLQFYFFFSLGTSECLLLTVMAFDQYLAICRPLLYPNIMTGHLYAKLVILCWVCGFLWFLIPIVLISQMPFCGPNIIDHVVCDPGPRFALDCVSAPRIQLFCYTLSSLVIFGNFLFIIGSYTLVLKAMLGMPSSTGRHKVFSTCGSHLAVVSLCYSSLMVMYVSPGLGHSTGMQKIETLFYAMVTPLFNPLIYSLQNKEIKAALRKVLGSSNII* >varsample_rs78350717_22.15528913.C.A_ENST00000643195 MNVSEPNSSFAFVNEFILQGFSCEWTIQIFLFSLFTTTYALTITGNGAIAFVLWCDRRLHTPMYMFLGNFSFLEIWYVSSTVPKMLVNFLSEKKNISFAGCFLQFYFFFSLGTSECLLLTVMAFDQYLAICRPLLYPNIMTGHLYAKLVILCWVCGFLWFLIPIVLISQMPFCGPNIIDHVVCDPGPRFALDCVSAPRIQLFCYTLSSLVIFGNFLFIIGSYTLVLKAMLGMPSSTGRHKDFSTCGSHLAVVSLCYSSLMVMYVSPGLGHSTGMQKIETLFYAMVTPLFNPLIYSLQNKEIKAALRKVLGSSNII* >varsample_rs78350717_22.15528913.C.T_ENST00000643195 MNVSEPNSSFAFVNEFILQGFSCEWTIQIFLFSLFTTTYALTITGNGAIAFVLWCDRRLHTPMYMFLGNFSFLEIWYVSSTVPKMLVNFLSEKKNISFAGCFLQFYFFFSLGTSECLLLTVMAFDQYLAICRPLLYPNIMTGHLYAKLVILCWVCGFLWFLIPIVLISQMPFCGPNIIDHVVCDPGPRFALDCVSAPRIQLFCYTLSSLVIFGNFLFIIGSYTLVLKAMLGMPSSTGRHKVFSTCGSHLAVVSLCYSSLMVMYVSPGLGHSTGMQKIETLFYAMVTPLFNPLIYSLQNKEIKAALRKVLGSSNII* +>varsample_rs78350717_22.15528913.C.A_ENST00000252835 +MCPLTLQVTGLMNVSEPNSSFAFVNEFILQGFSCEWTIQIFLFSLFTTTYALTITGNGAIAFVLWCDRRLHTPMYMFLGNFSFLEIWYVSSTVPKMLVNFLSEKKNISFAGCFLQFYFFFSLGTSECLLLTVMAFDQYLAICRPLLYPNIMTGHLYAKLVILCWVCGFLWFLIPIVLISQMPFCGPNIIDHVVCDPGPRFALDCVSAPRIQLFCYTLSSLVIFGNFLFIIGSYTLVLKAMLGMPSSTGRHKDFSTCGSHLAVVSLCYSSLMVMYVSPGLGHSTGMQKIETLFYAMVTPLFNPLIYSLQNKEIKAALRKVLGSSNII* +>varsample_rs78350717_22.15528913.C.T_ENST00000252835 +MCPLTLQVTGLMNVSEPNSSFAFVNEFILQGFSCEWTIQIFLFSLFTTTYALTITGNGAIAFVLWCDRRLHTPMYMFLGNFSFLEIWYVSSTVPKMLVNFLSEKKNISFAGCFLQFYFFFSLGTSECLLLTVMAFDQYLAICRPLLYPNIMTGHLYAKLVILCWVCGFLWFLIPIVLISQMPFCGPNIIDHVVCDPGPRFALDCVSAPRIQLFCYTLSSLVIFGNFLFIIGSYTLVLKAMLGMPSSTGRHKVFSTCGSHLAVVSLCYSSLMVMYVSPGLGHSTGMQKIETLFYAMVTPLFNPLIYSLQNKEIKAALRKVLGSSNII* >varsample_rs147461488_22.17740102.GGCCACGCTCAACT.G_ENST00000342111 MDCEVNNGSSLRDECITNLLVFGFLQSCSDNSFRRELDALGHELPVLAPQWEGYDELQTDGNRSSHSRLGRIEAGASDNNTASAEEETEAAGSVAFMELRQ*F*KSRRHHPEYCQAPRPGRGQHGP*HPSGPG >varsample_rs147461488_22.17740102.GGCCACGCTCAACT.G_ENST00000617586 diff --git a/pgatk/testdata/test_cosmic_classification.tsv b/pgatk/testdata/test_cosmic_classification.tsv new file mode 100644 index 0000000..e5a9f9c --- /dev/null +++ b/pgatk/testdata/test_cosmic_classification.tsv @@ -0,0 +1,9 @@ +COSMIC_PHENOTYPE_ID PRIMARY_SITE PRIMARY_HISTOLOGY SITE_SUBTYPE_1 HISTOLOGY_SUBTYPE_1 +COSO1001 upper_aerodigestive_tract carcinoma mouth squamous_cell_carcinoma +COSO1002 skin carcinoma upper_leg squamous_cell_carcinoma +COSO1003 bone other femur giant_cell_tumour +COSO1004 thyroid adenoma-nodule-goitre NS NS +COSO1005 liver other NS neoplasm +COSO1006 haematopoietic_and_lymphoid_tissue haematopoietic_neoplasm NS acute_myeloid_leukaemia +COSO1007 bone carcinoma NS NS +COSO1008 skin carcinoma NS NS diff --git 
a/pgatk/testdata/test_cosmic_mutations.tsv b/pgatk/testdata/test_cosmic_mutations.tsv index f4a9ba4..254cef5 100644 --- a/pgatk/testdata/test_cosmic_mutations.tsv +++ b/pgatk/testdata/test_cosmic_mutations.tsv @@ -1,13 +1,13 @@ -Gene name Accession Number Gene CDS length HGNC ID Sample name ID_sample ID_tumour Primary site Site subtype 1 Site subtype 2 Site subtype 3 Primary histology Histology subtype 1 Histology subtype 2 Histology subtype 3 Genome-wide screen Mutation ID Mutation CDS Mutation AA Mutation Description Mutation zygosity LOH GRCh Mutation genome position Mutation strand SNP Resistance Mutation FATHMM prediction FATHMM score Mutation somatic status Pubmed_PMID ID_STUDY Sample Type Tumour origin Age -HRAS ENST00000397596 570 5173 HN12PT 1542873 1465176 upper_aerodigestive_tract mouth NS NS carcinoma squamous_cell_carcinoma NS NS y COSM486 c.37G>C p.G13R Substitution - Missense u 38 11:534286-534286 - n - PATHOGENIC .99415 Confirmed somatic variant 21798897 surgery fresh/frozen primary -HRAS ENST00000397596 570 5173 V19 2688774 2547756 skin upper_leg NS NS carcinoma squamous_cell_carcinoma NS NS n COSM486 c.37G>C p.G13R Substitution - Missense u 38 11:534286-534286 - n - PATHOGENIC .99415 Confirmed somatic variant 24662767 surgery fresh/frozen NS -HRAS ENST00000397596 570 5173 S23372 702100 629222 bone femur NS NS other giant_cell_tumour NS NS n COSM487 c.37G>A p.G13S Substitution - Missense y 38 11:534286-534286 - n - PATHOGENIC .99324 Reported in another cancer sample as somatic 11903582 surgery-fixed recurrent 42 -HRAS ENST00000397596 570 5173 pel-102 720047 644990 upper_aerodigestive_tract mouth lip NS carcinoma squamous_cell_carcinoma NS NS n COSM484 c.35G>A p.G12D Substitution - Missense n 38 11:534288-534288 - n - PATHOGENIC .99322 Reported in another cancer sample as somatic 8253528 surgery fresh/frozen primary -HRAS ENST00000397596 570 5173 1965078 1965078 1851573 skin chest NS NS benign_melanocytic_nevus NS NS NS n COSM498 c.182A>T p.Q61L Substitution - Missense u 38 11:533874-533874 - n - PATHOGENIC .97511 Reported in another cancer sample as somatic 23599145 NS NS 30 -HRAS ENST00000397596 570 5173 1571189 1571189 1492737 skin NS NS NS other atypical_Spitzoid_tumour NS NS n COSM498 c.182A>T p.Q61L Substitution - Missense het u 38 11:533874-533874 - n - PATHOGENIC .97511 Reported in another cancer sample as somatic 21496703 surgery-fixed NS -HRAS ENST00000397596 570 5173 S61945 741791 664521 thyroid NS NS NS adenoma-nodule-goitre NS NS NS n COSM499 c.182A>G p.Q61R Substitution - Missense y 38 11:533874-533874 - n - PATHOGENIC .97179 Reported in another cancer sample as somatic 12727991 NS primary -HRAS ENST00000397596 570 5173 49T 2745889 2604592 liver NS NS NS other neoplasm NS NS y COSM1729837 c.403C>T p.R135* Substitution - Nonsense u 38 11:533500-533500 - n - PATHOGENIC .97257 Confirmed somatic variant 660 NS primary 67 -HRAS ENST00000397596 570 5173 DM3T4S 2762863 2621122 thyroid NS NS NS carcinoma follicular_carcinoma NS NS n COSM499 c.182A>G p.Q61R Substitution - Missense u 38 11:533874-533874 - n - PATHOGENIC .97179 Reported in another cancer sample as somatic 29615459 NS metastasis 58 -HRAS ENST00000397596 570 5173 TESTSAMPLE 9999991 9999981 haematopoietic_and_lymphoid_tissue NS NS NS haematopoietic_neoplasm acute_myeloid_leukaemia NS NS n COSM_TEST1 c.? 
p.G13(H^K) Substitution - Missense u 38 11:534286-534286 - n - PATHOGENIC .99415 Confirmed somatic variant 25858894 blood-bone marrow primary -HRAS ENST00000397596 570 5173 TESTSAMPLE 9999991 9999981 haematopoietic_and_lymphoid_tissue NS NS NS haematopoietic_neoplasm acute_myeloid_leukaemia NS NS n COSM_TEST2 c.? p.13_14>21 Complex - insertion inframe u 38 11:534286-534286 - n - PATHOGENIC .99415 Confirmed somatic variant 25858894 blood-bone marrow primary -HRAS ENST00000397596 570 5173 TESTSAMPLE 9999991 9999981 haematopoietic_and_lymphoid_tissue NS NS NS haematopoietic_neoplasm acute_myeloid_leukaemia NS NS n COSM_TEST3 c.? p.G13_S14>GS Insertion - In frame u 38 11:534286-534286 - n - PATHOGENIC .99415 Confirmed somatic variant 25858894 blood-bone marrow primary +GENE_SYMBOL COSMIC_GENE_ID TRANSCRIPT_ACCESSION COSMIC_SAMPLE_ID SAMPLE_NAME COSMIC_PHENOTYPE_ID GENOMIC_MUTATION_ID LEGACY_MUTATION_ID MUTATION_ID MUTATION_CDS MUTATION_AA MUTATION_DESCRIPTION MUTATION_ZYGOSITY LOH CHROMOSOME GENOME_START GENOME_STOP STRAND PUBMED_PMID COSMIC_STUDY_ID HGVSP HGVSC HGVSG GENOMIC_WT_ALLELE GENOMIC_MUT_ALLELE MUTATION_SOMATIC_STATUS POSITIVE_SCREEN +HRAS COSG11180 ENST00000397596 COSS1542873 HN12PT COSO1001 COSV56056559 COSM486 COSM486 c.37G>C p.G13R missense_variant u 11 534286 534286 - 21798897 p.Gly13Arg c.37G>C G C Confirmed somatic variant y +HRAS COSG11180 ENST00000397596 COSS2688774 V19 COSO1002 COSV56056559 COSM486 COSM486 c.37G>C p.G13R missense_variant u 11 534286 534286 - 24662767 p.Gly13Arg c.37G>C G C Confirmed somatic variant n +HRAS COSG11180 ENST00000397596 COSS702100 S23372 COSO1003 COSV56056558 COSM487 COSM487 c.37G>A p.G13S missense_variant y 11 534286 534286 - 11903582 p.Gly13Ser c.37G>A G A Reported in another cancer sample as somatic n +HRAS COSG11180 ENST00000397596 COSS720047 pel-102 COSO1001 COSV56056557 COSM484 COSM484 c.35G>A p.G12D missense_variant n 11 534288 534288 - 8253528 p.Gly12Asp c.35G>A G A Reported in another cancer sample as somatic n +HRAS COSG11180 ENST00000397596 COSS1965078 1965078 COSO1002 COSV56056584 COSM498 COSM498 c.182A>T p.Q61L missense_variant u 11 533874 533874 - 23599145 p.Gln61Leu c.182A>T A T Reported in another cancer sample as somatic n +HRAS COSG11180 ENST00000397596 COSS1571189 1571189 COSO1002 COSV56056584 COSM498 COSM498 c.182A>T p.Q61L missense_variant het u 11 533874 533874 - 21496703 p.Gln61Leu c.182A>T A T Reported in another cancer sample as somatic n +HRAS COSG11180 ENST00000397596 COSS741791 S61945 COSO1004 COSV56056585 COSM499 COSM499 c.182A>G p.Q61R missense_variant y 11 533874 533874 - 12727991 p.Gln61Arg c.182A>G A G Reported in another cancer sample as somatic n +HRAS COSG11180 ENST00000397596 COSS2745889 49T COSO1005 COSV56070001 COSM1729837 COSM1729837 c.403C>T p.R135* stop_gained u 11 533500 533500 - 660 p.Arg135Ter c.403C>T C T Confirmed somatic variant y +HRAS COSG11180 ENST00000397596 COSS2762863 DM3T4S COSO1004 COSV56056585 COSM499 COSM499 c.182A>G p.Q61R missense_variant u 11 533874 533874 - 29615459 p.Gln61Arg c.182A>G A G Reported in another cancer sample as somatic n +HRAS COSG11180 ENST00000397596 COSS9999991 TESTSAMPLE COSO1006 COSV_TEST1 COSM_TEST1 COSM_TEST1 c.36C>T p.G12= synonymous_variant u 11 534287 534287 - p.Gly12Gly c.36C>T C T Confirmed somatic variant n +HRAS COSG11180 ENST00000397596 COSS9999992 TESTSAMPLE2 COSO1007 COSV_TEST2 COSM_TEST2 COSM_TEST2 c.34_36del p.G12del inframe_deletion u 11 534285 534287 - p.Gly12del c.34_36del GGC - Confirmed somatic variant n +HRAS COSG11180 ENST00000397596 
COSS9999993 TESTSAMPLE3 COSO1008 COSV_TEST3 COSM_TEST3 COSM_TEST3 c.? p.? frameshift_variant u 11 534286 534287 - p.? c.? . . Confirmed somatic variant n diff --git a/pgatk/testdata/test_cosmic_mutations_proteindb.fa b/pgatk/testdata/test_cosmic_mutations_proteindb.fa index 9516b34..e2fd977 100644 --- a/pgatk/testdata/test_cosmic_mutations_proteindb.fa +++ b/pgatk/testdata/test_cosmic_mutations_proteindb.fa @@ -1,12 +1,14 @@ ->COSMIC:HRAS:p.G13R:Substitution-Missense +>COSMIC:HRAS:p.G13R:missense_variant MTEYKLVVVGAGRVGKSALTIQLIQNHFVDEYDPTIEDSYRKQVVIDGETCLLDILDTAGQEEYSAMRDQYMRTGEGFLCVFAINNTKSFEDIHQYREQIKRVKDSDDVPMVLVGNKCDLAARTVESRQAQDLARSYGIPYIETSAKTRQGVEDAFYTLVREIRQHKLRKLNPPDESGPGCMSCKCVLS* ->COSMIC:HRAS:p.G13S:Substitution-Missense +>COSMIC:HRAS:p.G13S:missense_variant MTEYKLVVVGAGSVGKSALTIQLIQNHFVDEYDPTIEDSYRKQVVIDGETCLLDILDTAGQEEYSAMRDQYMRTGEGFLCVFAINNTKSFEDIHQYREQIKRVKDSDDVPMVLVGNKCDLAARTVESRQAQDLARSYGIPYIETSAKTRQGVEDAFYTLVREIRQHKLRKLNPPDESGPGCMSCKCVLS* ->COSMIC:HRAS:p.G12D:Substitution-Missense +>COSMIC:HRAS:p.G12D:missense_variant MTEYKLVVVGADGVGKSALTIQLIQNHFVDEYDPTIEDSYRKQVVIDGETCLLDILDTAGQEEYSAMRDQYMRTGEGFLCVFAINNTKSFEDIHQYREQIKRVKDSDDVPMVLVGNKCDLAARTVESRQAQDLARSYGIPYIETSAKTRQGVEDAFYTLVREIRQHKLRKLNPPDESGPGCMSCKCVLS* ->COSMIC:HRAS:p.Q61L:Substitution-Missense +>COSMIC:HRAS:p.Q61L:missense_variant MTEYKLVVVGAGGVGKSALTIQLIQNHFVDEYDPTIEDSYRKQVVIDGETCLLDILDTAGLEEYSAMRDQYMRTGEGFLCVFAINNTKSFEDIHQYREQIKRVKDSDDVPMVLVGNKCDLAARTVESRQAQDLARSYGIPYIETSAKTRQGVEDAFYTLVREIRQHKLRKLNPPDESGPGCMSCKCVLS* ->COSMIC:HRAS:p.Q61R:Substitution-Missense +>COSMIC:HRAS:p.Q61R:missense_variant MTEYKLVVVGAGGVGKSALTIQLIQNHFVDEYDPTIEDSYRKQVVIDGETCLLDILDTAGREEYSAMRDQYMRTGEGFLCVFAINNTKSFEDIHQYREQIKRVKDSDDVPMVLVGNKCDLAARTVESRQAQDLARSYGIPYIETSAKTRQGVEDAFYTLVREIRQHKLRKLNPPDESGPGCMSCKCVLS* ->COSMIC:HRAS:p.R135*:Substitution-Nonsense +>COSMIC:HRAS:p.R135*:stop_gained MTEYKLVVVGAGGVGKSALTIQLIQNHFVDEYDPTIEDSYRKQVVIDGETCLLDILDTAGQEEYSAMRDQYMRTGEGFLCVFAINNTKSFEDIHQYREQIKRVKDSDDVPMVLVGNKCDLAARTVESRQAQDLA*SYGIPYIETSAKTRQGVEDAFYTLVREIRQHKLRKLNPPDESGPGCMSCKCVLS* +>COSMIC:HRAS:p.G12del:inframe_deletion +MTEYKLVVVGAGVGKSALTIQLIQNHFVDEYDPTIEDSYRKQVVIDGETCLLDILDTAGQEEYSAMRDQYMRTGEGFLCVFAINNTKSFEDIHQYREQIKRVKDSDDVPMVLVGNKCDLAARTVESRQAQDLARSYGIPYIETSAKTRQGVEDAFYTLVREIRQHKLRKLNPPDESGPGCMSCKCVLS* diff --git a/pgatk/testdata/test_cosmic_mutations_proteindb_bone.fa b/pgatk/testdata/test_cosmic_mutations_proteindb_bone.fa index 3be6bf8..f96859a 100644 --- a/pgatk/testdata/test_cosmic_mutations_proteindb_bone.fa +++ b/pgatk/testdata/test_cosmic_mutations_proteindb_bone.fa @@ -1,2 +1,4 @@ ->COSMIC:HRAS:p.G13S:Substitution-Missense +>COSMIC:HRAS:p.G13S:missense_variant MTEYKLVVVGAGSVGKSALTIQLIQNHFVDEYDPTIEDSYRKQVVIDGETCLLDILDTAGQEEYSAMRDQYMRTGEGFLCVFAINNTKSFEDIHQYREQIKRVKDSDDVPMVLVGNKCDLAARTVESRQAQDLARSYGIPYIETSAKTRQGVEDAFYTLVREIRQHKLRKLNPPDESGPGCMSCKCVLS* +>COSMIC:HRAS:p.G12del:inframe_deletion +MTEYKLVVVGAGVGKSALTIQLIQNHFVDEYDPTIEDSYRKQVVIDGETCLLDILDTAGQEEYSAMRDQYMRTGEGFLCVFAINNTKSFEDIHQYREQIKRVKDSDDVPMVLVGNKCDLAARTVESRQAQDLARSYGIPYIETSAKTRQGVEDAFYTLVREIRQHKLRKLNPPDESGPGCMSCKCVLS* diff --git a/pgatk/testdata/test_cosmic_mutations_proteindb_liver.fa b/pgatk/testdata/test_cosmic_mutations_proteindb_liver.fa index fc6d70d..62dafeb 100644 --- a/pgatk/testdata/test_cosmic_mutations_proteindb_liver.fa +++ b/pgatk/testdata/test_cosmic_mutations_proteindb_liver.fa @@ -1,2 +1,2 @@ ->COSMIC:HRAS:p.R135*:Substitution-Nonsense +>COSMIC:HRAS:p.R135*:stop_gained 
MTEYKLVVVGAGGVGKSALTIQLIQNHFVDEYDPTIEDSYRKQVVIDGETCLLDILDTAGQEEYSAMRDQYMRTGEGFLCVFAINNTKSFEDIHQYREQIKRVKDSDDVPMVLVGNKCDLAARTVESRQAQDLA*SYGIPYIETSAKTRQGVEDAFYTLVREIRQHKLRKLNPPDESGPGCMSCKCVLS* diff --git a/pgatk/testdata/test_cosmic_mutations_proteindb_skin.fa b/pgatk/testdata/test_cosmic_mutations_proteindb_skin.fa index b365210..6d8f4ab 100644 --- a/pgatk/testdata/test_cosmic_mutations_proteindb_skin.fa +++ b/pgatk/testdata/test_cosmic_mutations_proteindb_skin.fa @@ -1,4 +1,4 @@ ->COSMIC:HRAS:p.G13R:Substitution-Missense +>COSMIC:HRAS:p.G13R:missense_variant MTEYKLVVVGAGRVGKSALTIQLIQNHFVDEYDPTIEDSYRKQVVIDGETCLLDILDTAGQEEYSAMRDQYMRTGEGFLCVFAINNTKSFEDIHQYREQIKRVKDSDDVPMVLVGNKCDLAARTVESRQAQDLARSYGIPYIETSAKTRQGVEDAFYTLVREIRQHKLRKLNPPDESGPGCMSCKCVLS* ->COSMIC:HRAS:p.Q61L:Substitution-Missense +>COSMIC:HRAS:p.Q61L:missense_variant MTEYKLVVVGAGGVGKSALTIQLIQNHFVDEYDPTIEDSYRKQVVIDGETCLLDILDTAGLEEYSAMRDQYMRTGEGFLCVFAINNTKSFEDIHQYREQIKRVKDSDDVPMVLVGNKCDLAARTVESRQAQDLARSYGIPYIETSAKTRQGVEDAFYTLVREIRQHKLRKLNPPDESGPGCMSCKCVLS* diff --git a/pgatk/testdata/test_cosmic_mutations_proteindb_thyroid.fa b/pgatk/testdata/test_cosmic_mutations_proteindb_thyroid.fa index dc17c3d..b9ead29 100644 --- a/pgatk/testdata/test_cosmic_mutations_proteindb_thyroid.fa +++ b/pgatk/testdata/test_cosmic_mutations_proteindb_thyroid.fa @@ -1,2 +1,2 @@ ->COSMIC:HRAS:p.Q61R:Substitution-Missense +>COSMIC:HRAS:p.Q61R:missense_variant MTEYKLVVVGAGGVGKSALTIQLIQNHFVDEYDPTIEDSYRKQVVIDGETCLLDILDTAGREEYSAMRDQYMRTGEGFLCVFAINNTKSFEDIHQYREQIKRVKDSDDVPMVLVGNKCDLAARTVESRQAQDLARSYGIPYIETSAKTRQGVEDAFYTLVREIRQHKLRKLNPPDESGPGCMSCKCVLS* diff --git a/pgatk/testdata/test_cosmic_mutations_proteindb_upperaerodigestivetract.fa b/pgatk/testdata/test_cosmic_mutations_proteindb_upperaerodigestivetract.fa index f3118ae..9b3cf5c 100644 --- a/pgatk/testdata/test_cosmic_mutations_proteindb_upperaerodigestivetract.fa +++ b/pgatk/testdata/test_cosmic_mutations_proteindb_upperaerodigestivetract.fa @@ -1,4 +1,4 @@ ->COSMIC:HRAS:p.G13R:Substitution-Missense +>COSMIC:HRAS:p.G13R:missense_variant MTEYKLVVVGAGRVGKSALTIQLIQNHFVDEYDPTIEDSYRKQVVIDGETCLLDILDTAGQEEYSAMRDQYMRTGEGFLCVFAINNTKSFEDIHQYREQIKRVKDSDDVPMVLVGNKCDLAARTVESRQAQDLARSYGIPYIETSAKTRQGVEDAFYTLVREIRQHKLRKLNPPDESGPGCMSCKCVLS* ->COSMIC:HRAS:p.G12D:Substitution-Missense +>COSMIC:HRAS:p.G12D:missense_variant MTEYKLVVVGADGVGKSALTIQLIQNHFVDEYDPTIEDSYRKQVVIDGETCLLDILDTAGQEEYSAMRDQYMRTGEGFLCVFAINNTKSFEDIHQYREQIKRVKDSDDVPMVLVGNKCDLAARTVESRQAQDLARSYGIPYIETSAKTRQGVEDAFYTLVREIRQHKLRKLNPPDESGPGCMSCKCVLS* diff --git a/pgatk/tests/pgatk_tests.py b/pgatk/tests/pgatk_tests.py index 825a642..a738bdf 100644 --- a/pgatk/tests/pgatk_tests.py +++ b/pgatk/tests/pgatk_tests.py @@ -178,7 +178,8 @@ def test_cosmic_to_proteindb(self): '--input_mutation', 'testdata/test_cosmic_mutations.tsv', '--input_genes', 'testdata/test_cosmic_genes.fa', '--output_db', 'testdata/test_cosmic_mutations_proteindb.fa', - '--filter_column', 'Primary site', + '--clinical_sample_file', 'testdata/test_cosmic_classification.tsv', + '--filter_column', 'PRIMARY_SITE', '--split_by_filter_column', '--accepted_values', 'all']) if result.exit_code != 0: print(result.exception) From 0b9c179584fff36638ad7174783c35b8eecd6085 Mon Sep 17 00:00:00 2001 From: Husen Umer Date: Wed, 13 May 2026 19:06:15 +0300 Subject: [PATCH 19/23] minor fix to cosmic gzip file reading --- pgatk/cgenomes/cgenomes_proteindb.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pgatk/cgenomes/cgenomes_proteindb.py b/pgatk/cgenomes/cgenomes_proteindb.py index 0d1e39f..1484728 
100644 --- a/pgatk/cgenomes/cgenomes_proteindb.py +++ b/pgatk/cgenomes/cgenomes_proteindb.py @@ -451,7 +451,7 @@ def get_value_per_sample(self, local_clinical_sample_file: str, filter_column: s sample_id_column: str = 'SAMPLE_ID') -> dict: sample_value = {} if local_clinical_sample_file: - with open(local_clinical_sample_file, 'r', encoding='utf-8') as clin_fn: + with _open_text(local_clinical_sample_file, encoding='utf-8') as clin_fn: filter_column_col, sample_id_col = None, None for line in clin_fn.readlines(): if line.startswith('#'): From 7c08c5014bb700c0f4ad2e666fc8ab0eb75f4324 Mon Sep 17 00:00:00 2001 From: Yasset Perez-Riverol Date: Wed, 13 May 2026 19:24:22 +0100 Subject: [PATCH 20/23] Add cProfile support to vcf-to-proteindb benchmark script Adds --profile-out PATH and --print-top N flags. When given, the run is wrapped in cProfile, the .prof file is written to PATH, and the top N functions are printed sorted by both cumulative time and own time. Used to identify the remaining hot path after the issue-#99 fix before designing a parallelization strategy. --- scripts/benchmark_vcf_to_proteindb.py | 57 ++++++++++++++++++++++++--- 1 file changed, 51 insertions(+), 6 deletions(-) diff --git a/scripts/benchmark_vcf_to_proteindb.py b/scripts/benchmark_vcf_to_proteindb.py index bf17700..8901565 100755 --- a/scripts/benchmark_vcf_to_proteindb.py +++ b/scripts/benchmark_vcf_to_proteindb.py @@ -1,18 +1,27 @@ #!/usr/bin/env python3 -"""One-shot benchmark for ``pgatk vcf-to-proteindb``. +"""One-shot benchmark / profiler for ``pgatk vcf-to-proteindb``. -Times one full run end-to-end. Compare two runs of this script (one on a -pre-fix build, one on a post-fix build) to measure the speedup achieved -by issue #99. Not invoked by CI. +Times one full run end-to-end. With ``--profile-out PATH``, also wraps the +run in cProfile and writes a ``.prof`` file; print the top-N hotspots with +``--print-top``. Not invoked by CI. -Usage: +Basic usage: python scripts/benchmark_vcf_to_proteindb.py \ --vcf /path/to/sample.vcf \ --fasta /path/to/transcripts.fa \ --gtf /path/to/annotation.gtf \ --output /tmp/benchmark_proteindb.fa + +Profile a chr22 run and print the 30 hottest functions: + python scripts/benchmark_vcf_to_proteindb.py \ + --vcf chr22.vcf --fasta tx.fa --gtf chr22.gtf \ + --output /tmp/chr22.fa \ + --profile-out /tmp/chr22.prof --print-top 30 """ import argparse +import cProfile +import io +import pstats import time from pathlib import Path @@ -32,6 +41,17 @@ def main() -> None: help="VCF INFO field that carries per-transcript annotation (default: CSQ). " "Use empty string to force re-annotation via bedtools intersect.", ) + parser.add_argument( + "--profile-out", + help="If set, wrap the run in cProfile and write the .prof file here.", + ) + parser.add_argument( + "--print-top", + type=int, + default=0, + help="With --profile-out, also print the top-N functions by cumulative time " + "(default: 0, no printing). 
Recommended N=30 for a first look.", + ) args = parser.parse_args() config_data = load_config("ensembl_config", None) @@ -41,10 +61,20 @@ def main() -> None: } svc = EnsemblDataService(config_data, pipeline_arguments) + profiler = cProfile.Profile() if args.profile_out else None start = time.perf_counter() - svc.vcf_to_proteindb(args.vcf, args.fasta, args.gtf) + if profiler is not None: + profiler.enable() + try: + svc.vcf_to_proteindb(args.vcf, args.fasta, args.gtf) + finally: + if profiler is not None: + profiler.disable() elapsed = time.perf_counter() - start + if profiler is not None: + profiler.dump_stats(args.profile_out) + output_path = Path(args.output) output_size = output_path.stat().st_size if output_path.exists() else 0 seq_count = 0 @@ -59,6 +89,21 @@ def main() -> None: print(f" VCF: {args.vcf}") print(f" Elapsed: {elapsed:.2f} s ({elapsed / 60:.2f} min)") print(f" Output: {args.output} ({output_size} bytes, {seq_count} sequences)") + if args.profile_out: + print(f" Profile: {args.profile_out}") + + if args.profile_out and args.print_top > 0: + print() + print(f"=== TOP {args.print_top} BY CUMULATIVE TIME ===") + buf = io.StringIO() + stats = pstats.Stats(args.profile_out, stream=buf) + stats.strip_dirs().sort_stats('cumulative').print_stats(args.print_top) + print(buf.getvalue()) + print(f"=== TOP {args.print_top} BY OWN (tottime) TIME ===") + buf = io.StringIO() + stats = pstats.Stats(args.profile_out, stream=buf) + stats.strip_dirs().sort_stats('tottime').print_stats(args.print_top) + print(buf.getvalue()) if __name__ == "__main__": From 9235f7d71b6018f5e4932320f5b26b7907cc0574 Mon Sep 17 00:00:00 2001 From: Yasset Perez-Riverol Date: Wed, 13 May 2026 19:48:42 +0100 Subject: [PATCH 21/23] Parallelise vcf-to-proteindb per chromosome via multiprocessing Profile of 50K gnomAD chr22 variants showed ~36s one-time setup and ~14s per-variant work, with Biopython translate the dominant CPU cost. Implements per-chromosome multiprocessing.Pool to fan out the per-variant loop across cores. - New `--workers N` CLI flag (default 1, sequential, backward-compatible). - Internally: pre-annotate VCF in the main process (avoids bedtools-cwd race), split the VCF into per-chromosome temp files, dispatch to a spawn-context Pool, concatenate per-chunk outputs. - Existing per-variant loop moved verbatim into `_vcf_to_proteindb_chunk`; the public `vcf_to_proteindb` now dispatches sequential or parallel. - Workers re-construct EnsemblDataService fresh per process (no shared gffutils SQLite handles or SeqIO indices); each pays its own setup but the per-variant CPU work parallelises near-linearly. - Equivalence test verifies workers=2 produces the same set of output sequences as workers=1 on the existing test.vcf fixture. --- pgatk/commands/vcf_to_proteindb.py | 9 +- pgatk/ensembl/ensembl.py | 124 +++++++++++++++++- pgatk/tests/test_vcf_to_proteindb_parallel.py | 64 +++++++++ 3 files changed, 193 insertions(+), 4 deletions(-) create mode 100644 pgatk/tests/test_vcf_to_proteindb_parallel.py diff --git a/pgatk/commands/vcf_to_proteindb.py b/pgatk/commands/vcf_to_proteindb.py index da2d474..7cb2faa 100644 --- a/pgatk/commands/vcf_to_proteindb.py +++ b/pgatk/commands/vcf_to_proteindb.py @@ -38,6 +38,8 @@ help="enabling this option causes or variants to be parsed. By default only variants that have not failed any filters will be processed (FILTER column is PASS, None, .) or if the filters are subset of the accepted filters. 
(default is False)", is_flag=True) @click.option('--accepted_filters', help="Accepted filters for variant parsing") +@click.option('-w', '--workers', type=int, default=None, + help="Number of worker processes to fan out across (default: 1, sequential). Per-chromosome split.") @click.pass_context def vcf_to_proteindb(ctx, config_file, input_fasta, vcf, gene_annotations_gtf, translation_table, mito_translation_table, @@ -45,7 +47,7 @@ def vcf_to_proteindb(ctx, config_file, input_fasta, vcf, gene_annotations_gtf, t af_field, af_threshold, transcript_str, biotype_str, exclude_biotypes, include_biotypes, consequence_str, exclude_consequences, skip_including_all_cds, include_consequences, - ignore_filters, accepted_filters): + ignore_filters, accepted_filters, workers): config_data = load_config("ensembl_config", config_file) @@ -105,5 +107,8 @@ def vcf_to_proteindb(ctx, config_file, input_fasta, vcf, gene_annotations_gtf, t if accepted_filters is not None: pipeline_arguments[EnsemblDataService.ACCEPTED_FILTERS] = accepted_filters + if workers is not None: + pipeline_arguments[EnsemblDataService.WORKERS] = workers + ensembl_data_service = EnsemblDataService(config_data, pipeline_arguments) - ensembl_data_service.vcf_to_proteindb(vcf, input_fasta, gene_annotations_gtf) + ensembl_data_service.vcf_to_proteindb(vcf, input_fasta, gene_annotations_gtf, workers=workers) diff --git a/pgatk/ensembl/ensembl.py b/pgatk/ensembl/ensembl.py index d83d031..f06d69d 100644 --- a/pgatk/ensembl/ensembl.py +++ b/pgatk/ensembl/ensembl.py @@ -1,6 +1,8 @@ from __future__ import annotations import logging +import os +import re import sqlite3 from pathlib import Path from typing import Any, Optional @@ -45,6 +47,42 @@ def put(self, key: tuple, value: tuple) -> None: _MISSING = object() +def _safe_chrom(chrom: str) -> str: + """Sanitise a chromosome identifier into a filesystem-safe filename component.""" + return re.sub(r'[^A-Za-z0-9._-]', '_', str(chrom)) + + +def _split_vcf_by_chrom(vcf_file: str) -> tuple[list[str], dict[str, list[str]]]: + """Read vcf_file once; return (header_lines, {chrom: [data_lines]}). + + Preserves line ordering within each chromosome. Headers retain trailing newlines; + data lines retain trailing newlines. Empty/whitespace-only data lines are skipped. + """ + header: list[str] = [] + buckets: dict[str, list[str]] = {} + with open(vcf_file, 'r', encoding='utf-8') as f: + for line in f: + if line.startswith('#'): + header.append(line) + continue + if not line.strip(): + continue + chrom = line.split('\t', 1)[0] + buckets.setdefault(chrom, []).append(line) + return header, buckets + + +def _vcf_to_proteindb_worker(default_params, pipeline_args, vcf_file, + input_fasta, gene_annotations_gtf, output_path): + """Module-level worker for multiprocessing.Pool.starmap. + + Re-constructs an EnsemblDataService in the worker process from the + pickled config dict + pipeline args, then runs the chunk pipeline. 
+ """ + svc = EnsemblDataService(default_params, pipeline_args) + svc._vcf_to_proteindb_chunk(vcf_file, input_fasta, gene_annotations_gtf, output_path) + + class EnsemblDataService(ParameterConfiguration): CONFIG_KEY_VCF = "ensembl_translation" INPUT_FASTA = "input_fasta" @@ -73,6 +111,7 @@ class EnsemblDataService(ParameterConfiguration): EXPRESSION_THRESH = "expression_thresh" IGNORE_FILTERS = "ignore_filters" ACCEPTED_FILTERS = "accepted_filters" + WORKERS = "workers" def __init__(self, config_file: dict, pipeline_arguments: dict) -> None: """ @@ -129,6 +168,12 @@ def __init__(self, config_file: dict, pipeline_arguments: dict) -> None: self._accepted_filters = self.get_multiple_options( self.get_translation_properties(variable=self.ACCEPTED_FILTERS, default_value='PASS')) + raw_workers = self.get_translation_properties(variable=self.WORKERS, default_value=1) + try: + self._workers = max(1, int(raw_workers)) + except (TypeError, ValueError): + self._workers = 1 + def get_translation_properties(self, variable: str, default_value: Any) -> Any: value_return = default_value if variable in self.get_pipeline_parameters(): @@ -436,7 +481,7 @@ def vcf_from_file(vcf_file: str) -> tuple[list, pd.DataFrame]: return metadata, vcf_df - def vcf_to_proteindb(self, vcf_file: str, input_fasta: str, gene_annotations_gtf: str) -> str: + def _vcf_to_proteindb_chunk(self, vcf_file: str, input_fasta: str, gene_annotations_gtf: str, output_path: str) -> str: """ Generate proteins for variants by modifying sequences of affected transcripts. In case of already annotated variants it only considers variants within @@ -447,6 +492,7 @@ def vcf_to_proteindb(self, vcf_file: str, input_fasta: str, gene_annotations_gtf :param vcf_file: :param input_fasta: :param gene_annotations_gtf: + :param output_path: path for writing the output FASTA :return: """ db = self.parse_gtf(gene_annotations_gtf, str(Path(gene_annotations_gtf).with_suffix('.db'))) @@ -491,6 +537,12 @@ def vcf_to_proteindb(self, vcf_file: str, input_fasta: str, gene_annotations_gtf annotation_cols, vcf_file) self.get_logger().debug(msg) + # Fall back to index 0 when no structured ##INFO header was found (e.g. + # transcriptOverlaps-annotated VCFs produced by annoate_vcf, which write + # plain transcript IDs without a pipe-delimited FORMAT declaration). + if transcript_index is None: + transcript_index = 0 + else: # in case the given VCF is not annotated, annotate it by identifying the overlapping transcripts vcf_file = self.annoate_vcf(vcf_file, gene_annotations_gtf) @@ -506,7 +558,7 @@ def vcf_to_proteindb(self, vcf_file: str, input_fasta: str, gene_annotations_gtf self._accepted_filters = [x.upper() for x in self._accepted_filters] - with open(self._proteindb_output, 'w', encoding='utf-8') as prots_fn: + with open(output_path, 'w', encoding='utf-8') as prots_fn: for record in vcf_reader.itertuples(index=False, name='VCFRecord'): trans = False if [x for x in str(record.REF) if x not in 'ACGT']: @@ -726,6 +778,74 @@ def vcf_to_proteindb(self, vcf_file: str, input_fasta: str, gene_annotations_gtf '\n'.join([x + ":" + str(invalid_records[x]) for x in invalid_records.keys()])) self.get_logger().info(msg) + return output_path + + def vcf_to_proteindb(self, vcf_file: str, input_fasta: str, gene_annotations_gtf: str, workers=None) -> str: + """Generate proteins for variants by modifying sequences of affected transcripts. + + If workers is None, falls back to self._workers (config) which defaults + to 1 (sequential, backward-compatible). 
Pass workers > 1 to fan out per + chromosome via multiprocessing.Pool. + :param vcf_file: + :param input_fasta: + :param gene_annotations_gtf: + :param workers: number of parallel worker processes (None => use config default) + :return: path to the output proteindb FASTA + """ + if workers is None: + workers = self._workers if self._workers else 1 + + # Fast path: sequential — single call, identical behaviour to the original implementation. + # For sequential runs we do NOT pre-annotate here; _vcf_to_proteindb_chunk handles that + # in its else-branch so that transcript_index=0 is set correctly for unannotated VCFs. + if workers <= 1: + return self._vcf_to_proteindb_chunk(vcf_file, input_fasta, gene_annotations_gtf, self._proteindb_output) + + # Parallel path: pre-annotate unannotated VCFs in the main process. This avoids each + # worker racing on the same bedtools-output bed file (which annoate_vcf writes to cwd) + # and amortises the bedtools intersect across workers. + if not self._annotation_field_name: + vcf_file = self.annoate_vcf(vcf_file, gene_annotations_gtf) + self._annotation_field_name = 'transcriptOverlaps' + + # Read VCF text once. Header lines + per-chrom buckets. + header_lines, chrom_to_lines = _split_vcf_by_chrom(vcf_file) + + # Parallel — split into per-chrom temp VCFs, fan out to a Pool. + import multiprocessing as mp + import shutil + import tempfile + + with tempfile.TemporaryDirectory(prefix='pgatk_v2p_') as tmpdir: + tasks = [] + for chrom in sorted(chrom_to_lines.keys()): + chunk_vcf = os.path.join(tmpdir, f"chunk_{_safe_chrom(chrom)}.vcf") + chunk_out = os.path.join(tmpdir, f"out_{_safe_chrom(chrom)}.fa") + with open(chunk_vcf, 'w', encoding='utf-8') as f: + f.writelines(header_lines) + f.writelines(chrom_to_lines[chrom]) + pa = dict(self.get_pipeline_parameters()) + pa[EnsemblDataService.PROTEIN_DB_OUTPUT] = chunk_out + # Force annotated mode for workers (annoate_vcf already ran in main): + pa[EnsemblDataService.ANNOTATION_FIELD_NAME] = self._annotation_field_name + tasks.append((self.get_default_parameters(), pa, chunk_vcf, + input_fasta, gene_annotations_gtf, chunk_out)) + + self.get_logger().info( + "vcf-to-proteindb: dispatching %d chromosome chunk(s) across %d worker(s)", + len(tasks), min(workers, len(tasks))) + + with mp.get_context('spawn').Pool(min(workers, len(tasks))) as pool: + pool.starmap(_vcf_to_proteindb_worker, tasks) + + # Concatenate the per-chunk FASTAs into the final output. 
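+            # Binary copy keeps each worker's FASTA bytes verbatim; chunk order
+            # follows the sorted chromosome keys used to build `tasks`, so the
+            # combined output is deterministic across runs.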
+ with open(self._proteindb_output, 'wb') as out: + for task in tasks: + chunk_out = task[5] + if os.path.exists(chunk_out): + with open(chunk_out, 'rb') as f: + shutil.copyfileobj(f, out) + return self._proteindb_output @staticmethod diff --git a/pgatk/tests/test_vcf_to_proteindb_parallel.py b/pgatk/tests/test_vcf_to_proteindb_parallel.py new file mode 100644 index 0000000..c9183ff --- /dev/null +++ b/pgatk/tests/test_vcf_to_proteindb_parallel.py @@ -0,0 +1,64 @@ +"""Equivalence test: vcf_to_proteindb with workers=2 produces the same sequence +content as workers=1 on a small fixture.""" +import os +from pathlib import Path + +from click.testing import CliRunner + +from pgatk.cli import cli + + +def _sequence_set(fasta_path) -> set: + seqs = [] + current = [] + with open(fasta_path, 'r', encoding='utf-8') as f: + for line in f: + if line.startswith('>'): + if current: + seqs.append(''.join(current)) + current = [] + else: + current.append(line.strip()) + if current: + seqs.append(''.join(current)) + return set(seqs) + + +def test_parallel_matches_sequential(tmp_path): + # pgatk package root: .../pgatk/pgatk/ + pkg_root = Path(__file__).resolve().parents[1] + testdata = pkg_root / 'testdata' + config = pkg_root / 'config' / 'ensembl_config.yaml' + out_seq = tmp_path / 'seq.fa' + out_par = tmp_path / 'par.fa' + + common_args = [ + 'vcf-to-proteindb', + '--config_file', str(config), + '--vcf', str(testdata / 'test.vcf'), + '--input_fasta', str(testdata / 'test.fa'), + '--gene_annotations_gtf', str(testdata / 'test.gtf'), + '--protein_prefix', 'ensvar', + '--af_field', 'MAF', + '--annotation_field_name', 'CSQ', + '--biotype_str', 'feature_type', + '--include_biotypes', 'mRNA,ncRNA', + ] + + runner = CliRunner() + orig_dir = os.getcwd() + try: + # Run from the package root so the .db file is created next to the .gtf. + os.chdir(str(pkg_root)) + + r1 = runner.invoke(cli, common_args + ['--output_proteindb', str(out_seq)]) + assert r1.exit_code == 0, r1.output + (str(r1.exception) if r1.exception else '') + + r2 = runner.invoke(cli, common_args + ['--output_proteindb', str(out_par), '--workers', '2']) + assert r2.exit_code == 0, r2.output + (str(r2.exception) if r2.exception else '') + finally: + os.chdir(orig_dir) + + # FASTA header order may differ across runs (per-worker ordering); compare + # sequence content as a set. + assert _sequence_set(out_seq) == _sequence_set(out_par) From 254e35c3c914c1169af33df17397819d7d8fb167 Mon Sep 17 00:00:00 2001 From: Yasset Perez-Riverol Date: Wed, 13 May 2026 20:49:21 +0100 Subject: [PATCH 22/23] Tier-1 perf follow-ups: SeqIO.index_db, streamed VCF split, lazy id-mapping MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Three focused improvements stacking on the per-chromosome multiprocessing PR: 1. Replace SeqIO.index with SeqIO.index_db. SeqIO.index rebuilds an in-memory FASTA offset index every invocation (~14 s for the 434 MB Ensembl cDNA FASTA); SeqIO.index_db persists the index in a SQLite .idx file next to the FASTA, reused across runs and shared across workers. Pre-built once in the main process before fan-out so the first chunk pays the build cost and the rest just open the same .idx. Stale detection: .idx older than FASTA triggers a rebuild. 2. _split_vcf_by_chrom now streams the input VCF and writes per-chrom chunks directly to disk in a single pass instead of buffering all data lines in memory. Constant memory regardless of input size — makes the whole-genome path actually feasible. 3. 
transcript_id_mapping (versionless -> versioned dict of 207k FASTA keys) is now built lazily on the first KeyError fallback. Most real workloads never need it; saves ~200ms per worker startup. No public API change. All existing tests pass; equivalence test still confirms sequential == parallel output. --- pgatk/ensembl/ensembl.py | 105 +++++++++++++++----- pgatk/testdata/proteindb_from_custom_VCF.fa | 12 +-- 2 files changed, 84 insertions(+), 33 deletions(-) diff --git a/pgatk/ensembl/ensembl.py b/pgatk/ensembl/ensembl.py index f06d69d..49fe852 100644 --- a/pgatk/ensembl/ensembl.py +++ b/pgatk/ensembl/ensembl.py @@ -52,24 +52,59 @@ def _safe_chrom(chrom: str) -> str: return re.sub(r'[^A-Za-z0-9._-]', '_', str(chrom)) -def _split_vcf_by_chrom(vcf_file: str) -> tuple[list[str], dict[str, list[str]]]: - """Read vcf_file once; return (header_lines, {chrom: [data_lines]}). +def _ensure_fasta_index(input_fasta: str) -> str: + """Return the path to a `.idx` SQLite index for input_fasta, building it + if absent or stale. Stale means: the .idx exists but the FASTA's mtime + is newer than the .idx's. The index is keyed by `EnsemblDataService.get_key`. + """ + idx_path = input_fasta + ".idx" + if os.path.exists(idx_path): + if os.path.getmtime(idx_path) >= os.path.getmtime(input_fasta): + return idx_path + try: + os.remove(idx_path) + except OSError: + pass + # Build the index. SeqIO.index_db materialises the SQLite file on disk + # as a side effect; we don't need to keep the returned handle here. + SeqIO.index_db(idx_path, [input_fasta], "fasta", key_function=EnsemblDataService.get_key) + return idx_path + - Preserves line ordering within each chromosome. Headers retain trailing newlines; - data lines retain trailing newlines. Empty/whitespace-only data lines are skipped. +def _split_vcf_by_chrom(vcf_file: str, output_dir: str) -> dict[str, str]: + """Stream `vcf_file` once, writing per-chromosome VCFs into `output_dir`. + + Each output file is `/chunk_.vcf` and contains the + full VCF header followed by only the data lines for that chromosome. + Preserves line ordering within each chromosome. + + Returns a mapping `{chrom: chunk_path}`. Constant memory — holds at most + one open file handle per chromosome seen so far. 
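+
+    Example shape (paths illustrative): {'22': '<output_dir>/chunk_22.vcf'}.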
""" header: list[str] = [] - buckets: dict[str, list[str]] = {} - with open(vcf_file, 'r', encoding='utf-8') as f: - for line in f: - if line.startswith('#'): - header.append(line) - continue - if not line.strip(): - continue - chrom = line.split('\t', 1)[0] - buckets.setdefault(chrom, []).append(line) - return header, buckets + handles: dict[str, Any] = {} + chunk_paths: dict[str, str] = {} + try: + with open(vcf_file, 'r', encoding='utf-8') as f: + for line in f: + if line.startswith('#'): + header.append(line) + continue + if not line.strip(): + continue + chrom = line.split('\t', 1)[0] + handle = handles.get(chrom) + if handle is None: + chunk_path = os.path.join(output_dir, "chunk_" + _safe_chrom(chrom) + ".vcf") + handle = open(chunk_path, 'w', encoding='utf-8') + handle.writelines(header) + handles[chrom] = handle + chunk_paths[chrom] = chunk_path + handle.write(line) + finally: + for h in handles.values(): + h.close() + return chunk_paths def _vcf_to_proteindb_worker(default_params, pipeline_args, vcf_file, @@ -497,9 +532,12 @@ def _vcf_to_proteindb_chunk(self, vcf_file: str, input_fasta: str, gene_annotati """ db = self.parse_gtf(gene_annotations_gtf, str(Path(gene_annotations_gtf).with_suffix('.db'))) - transcripts_dict = SeqIO.index(input_fasta, "fasta", key_function=self.get_key) + idx_path = _ensure_fasta_index(input_fasta) + transcripts_dict = SeqIO.index_db(idx_path, [input_fasta], "fasta", + key_function=self.get_key) # handle cases where the transcript has version in the GTF but not in the VCF - transcript_id_mapping = {k.split('.')[0]: k for k in transcripts_dict.keys()} + # Built lazily on the first KeyError to avoid iterating 207k keys up-front. + transcript_id_mapping: Optional[dict[str, str]] = None feature_cache = _FeatureCache() # Value is (ref_seq, desc) for a known transcript, or None for a transcript # we've already looked up and confirmed isn't in the FASTA index (avoids re-trying @@ -659,9 +697,16 @@ def _vcf_to_proteindb_chunk(self, vcf_file: str, input_fasta: str, gene_annotati continue try: - transcript_id_v = transcript_id_mapping[transcript_id] - except KeyError: - transcript_id_v = transcript_id + transcript_id_v = transcript_id_mapping[transcript_id] # type: ignore[index] + except (KeyError, TypeError): + if transcript_id_mapping is None: + transcript_id_mapping = {k.split('.')[0]: k for k in transcripts_dict.keys()} + try: + transcript_id_v = transcript_id_mapping[transcript_id] + except KeyError: + transcript_id_v = transcript_id + else: + transcript_id_v = transcript_id cached_row = seq_cache.get(transcript_id_v, _MISSING) if cached_row is _MISSING: @@ -808,8 +853,9 @@ def vcf_to_proteindb(self, vcf_file: str, input_fasta: str, gene_annotations_gtf vcf_file = self.annoate_vcf(vcf_file, gene_annotations_gtf) self._annotation_field_name = 'transcriptOverlaps' - # Read VCF text once. Header lines + per-chrom buckets. - header_lines, chrom_to_lines = _split_vcf_by_chrom(vcf_file) + # Build the FASTA index ONCE in the main process so all workers share it + # instead of each re-scanning the FASTA (~14 s × N workers wasted). + _ensure_fasta_index(input_fasta) # Parallel — split into per-chrom temp VCFs, fan out to a Pool. import multiprocessing as mp @@ -817,13 +863,18 @@ def vcf_to_proteindb(self, vcf_file: str, input_fasta: str, gene_annotations_gtf import tempfile with tempfile.TemporaryDirectory(prefix='pgatk_v2p_') as tmpdir: + # Stream-split VCF by chromosome directly to per-chrom files (constant memory). 
+ chunk_paths = _split_vcf_by_chrom(vcf_file, tmpdir) + + if len(chunk_paths) <= 1: + # Only one chromosome — run sequentially on the original file. + return self._vcf_to_proteindb_chunk(vcf_file, input_fasta, gene_annotations_gtf, + self._proteindb_output) + tasks = [] - for chrom in sorted(chrom_to_lines.keys()): - chunk_vcf = os.path.join(tmpdir, f"chunk_{_safe_chrom(chrom)}.vcf") + for chrom in sorted(chunk_paths.keys()): + chunk_vcf = chunk_paths[chrom] chunk_out = os.path.join(tmpdir, f"out_{_safe_chrom(chrom)}.fa") - with open(chunk_vcf, 'w', encoding='utf-8') as f: - f.writelines(header_lines) - f.writelines(chrom_to_lines[chrom]) pa = dict(self.get_pipeline_parameters()) pa[EnsemblDataService.PROTEIN_DB_OUTPUT] = chunk_out # Force annotated mode for workers (annoate_vcf already ran in main): diff --git a/pgatk/testdata/proteindb_from_custom_VCF.fa b/pgatk/testdata/proteindb_from_custom_VCF.fa index 22f87e0..7a47e85 100644 --- a/pgatk/testdata/proteindb_from_custom_VCF.fa +++ b/pgatk/testdata/proteindb_from_custom_VCF.fa @@ -1,12 +1,12 @@ ->varsample_rs78350717_22.15528913.C.A_ENST00000643195 -MNVSEPNSSFAFVNEFILQGFSCEWTIQIFLFSLFTTTYALTITGNGAIAFVLWCDRRLHTPMYMFLGNFSFLEIWYVSSTVPKMLVNFLSEKKNISFAGCFLQFYFFFSLGTSECLLLTVMAFDQYLAICRPLLYPNIMTGHLYAKLVILCWVCGFLWFLIPIVLISQMPFCGPNIIDHVVCDPGPRFALDCVSAPRIQLFCYTLSSLVIFGNFLFIIGSYTLVLKAMLGMPSSTGRHKDFSTCGSHLAVVSLCYSSLMVMYVSPGLGHSTGMQKIETLFYAMVTPLFNPLIYSLQNKEIKAALRKVLGSSNII* ->varsample_rs78350717_22.15528913.C.T_ENST00000643195 -MNVSEPNSSFAFVNEFILQGFSCEWTIQIFLFSLFTTTYALTITGNGAIAFVLWCDRRLHTPMYMFLGNFSFLEIWYVSSTVPKMLVNFLSEKKNISFAGCFLQFYFFFSLGTSECLLLTVMAFDQYLAICRPLLYPNIMTGHLYAKLVILCWVCGFLWFLIPIVLISQMPFCGPNIIDHVVCDPGPRFALDCVSAPRIQLFCYTLSSLVIFGNFLFIIGSYTLVLKAMLGMPSSTGRHKVFSTCGSHLAVVSLCYSSLMVMYVSPGLGHSTGMQKIETLFYAMVTPLFNPLIYSLQNKEIKAALRKVLGSSNII* >varsample_rs78350717_22.15528913.C.A_ENST00000252835 MCPLTLQVTGLMNVSEPNSSFAFVNEFILQGFSCEWTIQIFLFSLFTTTYALTITGNGAIAFVLWCDRRLHTPMYMFLGNFSFLEIWYVSSTVPKMLVNFLSEKKNISFAGCFLQFYFFFSLGTSECLLLTVMAFDQYLAICRPLLYPNIMTGHLYAKLVILCWVCGFLWFLIPIVLISQMPFCGPNIIDHVVCDPGPRFALDCVSAPRIQLFCYTLSSLVIFGNFLFIIGSYTLVLKAMLGMPSSTGRHKDFSTCGSHLAVVSLCYSSLMVMYVSPGLGHSTGMQKIETLFYAMVTPLFNPLIYSLQNKEIKAALRKVLGSSNII* >varsample_rs78350717_22.15528913.C.T_ENST00000252835 MCPLTLQVTGLMNVSEPNSSFAFVNEFILQGFSCEWTIQIFLFSLFTTTYALTITGNGAIAFVLWCDRRLHTPMYMFLGNFSFLEIWYVSSTVPKMLVNFLSEKKNISFAGCFLQFYFFFSLGTSECLLLTVMAFDQYLAICRPLLYPNIMTGHLYAKLVILCWVCGFLWFLIPIVLISQMPFCGPNIIDHVVCDPGPRFALDCVSAPRIQLFCYTLSSLVIFGNFLFIIGSYTLVLKAMLGMPSSTGRHKVFSTCGSHLAVVSLCYSSLMVMYVSPGLGHSTGMQKIETLFYAMVTPLFNPLIYSLQNKEIKAALRKVLGSSNII* ->varsample_rs147461488_22.17740102.GGCCACGCTCAACT.G_ENST00000342111 -MDCEVNNGSSLRDECITNLLVFGFLQSCSDNSFRRELDALGHELPVLAPQWEGYDELQTDGNRSSHSRLGRIEAGASDNNTASAEEETEAAGSVAFMELRQ*F*KSRRHHPEYCQAPRPGRGQHGP*HPSGPG +>varsample_rs78350717_22.15528913.C.A_ENST00000643195 +MNVSEPNSSFAFVNEFILQGFSCEWTIQIFLFSLFTTTYALTITGNGAIAFVLWCDRRLHTPMYMFLGNFSFLEIWYVSSTVPKMLVNFLSEKKNISFAGCFLQFYFFFSLGTSECLLLTVMAFDQYLAICRPLLYPNIMTGHLYAKLVILCWVCGFLWFLIPIVLISQMPFCGPNIIDHVVCDPGPRFALDCVSAPRIQLFCYTLSSLVIFGNFLFIIGSYTLVLKAMLGMPSSTGRHKDFSTCGSHLAVVSLCYSSLMVMYVSPGLGHSTGMQKIETLFYAMVTPLFNPLIYSLQNKEIKAALRKVLGSSNII* +>varsample_rs78350717_22.15528913.C.T_ENST00000643195 +MNVSEPNSSFAFVNEFILQGFSCEWTIQIFLFSLFTTTYALTITGNGAIAFVLWCDRRLHTPMYMFLGNFSFLEIWYVSSTVPKMLVNFLSEKKNISFAGCFLQFYFFFSLGTSECLLLTVMAFDQYLAICRPLLYPNIMTGHLYAKLVILCWVCGFLWFLIPIVLISQMPFCGPNIIDHVVCDPGPRFALDCVSAPRIQLFCYTLSSLVIFGNFLFIIGSYTLVLKAMLGMPSSTGRHKVFSTCGSHLAVVSLCYSSLMVMYVSPGLGHSTGMQKIETLFYAMVTPLFNPLIYSLQNKEIKAALRKVLGSSNII* 
>varsample_rs147461488_22.17740102.GGCCACGCTCAACT.G_ENST00000617586 MDCEVNNGSSLRDECITNLLVFGFLQSCSDNSFRRELDALGHELPVLAPQWEGYDELQTDGNRSSHSRLGRIEAGASDNNTASAEEETEAAGSVAFMELRQWYSGRGSTEAVRQRRRT +>varsample_rs147461488_22.17740102.GGCCACGCTCAACT.G_ENST00000342111 +MDCEVNNGSSLRDECITNLLVFGFLQSCSDNSFRRELDALGHELPVLAPQWEGYDELQTDGNRSSHSRLGRIEAGASDNNTASAEEETEAAGSVAFMELRQ*F*KSRRHHPEYCQAPRPGRGQHGP*HPSGPG From d50f8a04b26a72a0df32e7a0d110b6081cf99b99 Mon Sep 17 00:00:00 2001 From: Yasset Perez-Riverol Date: Wed, 13 May 2026 20:50:28 +0100 Subject: [PATCH 23/23] Revert non-deterministic test output; gitignore SeqIO.index_db files The previous commit (254e35c) inadvertently captured a re-ordering of proteindb_from_custom_VCF.fa from a single test run. The file is the OUTPUT of test_vcf_to_proteindb_notannotated (which only asserts exit_code == 0, not content); the ordering shifts non-deterministically between runs because annoate_vcf uses Python set iteration with hash randomization. Restore the prior content to keep the working tree stable across consecutive test invocations. Also gitignore *.fa.idx / *.fasta.idx: BioPython SeqIO.index_db now materialises a SQLite index next to each FASTA. These are build artifacts, rebuilt automatically when the source FASTA changes. --- .gitignore | 5 +++++ pgatk/testdata/proteindb_from_custom_VCF.fa | 12 ++++++------ 2 files changed, 11 insertions(+), 6 deletions(-) diff --git a/.gitignore b/.gitignore index c9aa01f..2112a66 100644 --- a/.gitignore +++ b/.gitignore @@ -138,3 +138,8 @@ pgatk/testdata/Meleagris_gallopavo* # Internal working docs (implementation plans, scratch notes) docs/plans/ + +# BioPython SeqIO.index_db SQLite indexes — built lazily on first use, +# rebuilt automatically when the source FASTA changes (mtime check). 
+*.fa.idx +*.fasta.idx diff --git a/pgatk/testdata/proteindb_from_custom_VCF.fa b/pgatk/testdata/proteindb_from_custom_VCF.fa index 7a47e85..22f87e0 100644 --- a/pgatk/testdata/proteindb_from_custom_VCF.fa +++ b/pgatk/testdata/proteindb_from_custom_VCF.fa @@ -1,12 +1,12 @@ ->varsample_rs78350717_22.15528913.C.A_ENST00000252835 -MCPLTLQVTGLMNVSEPNSSFAFVNEFILQGFSCEWTIQIFLFSLFTTTYALTITGNGAIAFVLWCDRRLHTPMYMFLGNFSFLEIWYVSSTVPKMLVNFLSEKKNISFAGCFLQFYFFFSLGTSECLLLTVMAFDQYLAICRPLLYPNIMTGHLYAKLVILCWVCGFLWFLIPIVLISQMPFCGPNIIDHVVCDPGPRFALDCVSAPRIQLFCYTLSSLVIFGNFLFIIGSYTLVLKAMLGMPSSTGRHKDFSTCGSHLAVVSLCYSSLMVMYVSPGLGHSTGMQKIETLFYAMVTPLFNPLIYSLQNKEIKAALRKVLGSSNII* ->varsample_rs78350717_22.15528913.C.T_ENST00000252835 -MCPLTLQVTGLMNVSEPNSSFAFVNEFILQGFSCEWTIQIFLFSLFTTTYALTITGNGAIAFVLWCDRRLHTPMYMFLGNFSFLEIWYVSSTVPKMLVNFLSEKKNISFAGCFLQFYFFFSLGTSECLLLTVMAFDQYLAICRPLLYPNIMTGHLYAKLVILCWVCGFLWFLIPIVLISQMPFCGPNIIDHVVCDPGPRFALDCVSAPRIQLFCYTLSSLVIFGNFLFIIGSYTLVLKAMLGMPSSTGRHKVFSTCGSHLAVVSLCYSSLMVMYVSPGLGHSTGMQKIETLFYAMVTPLFNPLIYSLQNKEIKAALRKVLGSSNII* >varsample_rs78350717_22.15528913.C.A_ENST00000643195 MNVSEPNSSFAFVNEFILQGFSCEWTIQIFLFSLFTTTYALTITGNGAIAFVLWCDRRLHTPMYMFLGNFSFLEIWYVSSTVPKMLVNFLSEKKNISFAGCFLQFYFFFSLGTSECLLLTVMAFDQYLAICRPLLYPNIMTGHLYAKLVILCWVCGFLWFLIPIVLISQMPFCGPNIIDHVVCDPGPRFALDCVSAPRIQLFCYTLSSLVIFGNFLFIIGSYTLVLKAMLGMPSSTGRHKDFSTCGSHLAVVSLCYSSLMVMYVSPGLGHSTGMQKIETLFYAMVTPLFNPLIYSLQNKEIKAALRKVLGSSNII* >varsample_rs78350717_22.15528913.C.T_ENST00000643195 MNVSEPNSSFAFVNEFILQGFSCEWTIQIFLFSLFTTTYALTITGNGAIAFVLWCDRRLHTPMYMFLGNFSFLEIWYVSSTVPKMLVNFLSEKKNISFAGCFLQFYFFFSLGTSECLLLTVMAFDQYLAICRPLLYPNIMTGHLYAKLVILCWVCGFLWFLIPIVLISQMPFCGPNIIDHVVCDPGPRFALDCVSAPRIQLFCYTLSSLVIFGNFLFIIGSYTLVLKAMLGMPSSTGRHKVFSTCGSHLAVVSLCYSSLMVMYVSPGLGHSTGMQKIETLFYAMVTPLFNPLIYSLQNKEIKAALRKVLGSSNII* ->varsample_rs147461488_22.17740102.GGCCACGCTCAACT.G_ENST00000617586 -MDCEVNNGSSLRDECITNLLVFGFLQSCSDNSFRRELDALGHELPVLAPQWEGYDELQTDGNRSSHSRLGRIEAGASDNNTASAEEETEAAGSVAFMELRQWYSGRGSTEAVRQRRRT +>varsample_rs78350717_22.15528913.C.A_ENST00000252835 +MCPLTLQVTGLMNVSEPNSSFAFVNEFILQGFSCEWTIQIFLFSLFTTTYALTITGNGAIAFVLWCDRRLHTPMYMFLGNFSFLEIWYVSSTVPKMLVNFLSEKKNISFAGCFLQFYFFFSLGTSECLLLTVMAFDQYLAICRPLLYPNIMTGHLYAKLVILCWVCGFLWFLIPIVLISQMPFCGPNIIDHVVCDPGPRFALDCVSAPRIQLFCYTLSSLVIFGNFLFIIGSYTLVLKAMLGMPSSTGRHKDFSTCGSHLAVVSLCYSSLMVMYVSPGLGHSTGMQKIETLFYAMVTPLFNPLIYSLQNKEIKAALRKVLGSSNII* +>varsample_rs78350717_22.15528913.C.T_ENST00000252835 +MCPLTLQVTGLMNVSEPNSSFAFVNEFILQGFSCEWTIQIFLFSLFTTTYALTITGNGAIAFVLWCDRRLHTPMYMFLGNFSFLEIWYVSSTVPKMLVNFLSEKKNISFAGCFLQFYFFFSLGTSECLLLTVMAFDQYLAICRPLLYPNIMTGHLYAKLVILCWVCGFLWFLIPIVLISQMPFCGPNIIDHVVCDPGPRFALDCVSAPRIQLFCYTLSSLVIFGNFLFIIGSYTLVLKAMLGMPSSTGRHKVFSTCGSHLAVVSLCYSSLMVMYVSPGLGHSTGMQKIETLFYAMVTPLFNPLIYSLQNKEIKAALRKVLGSSNII* >varsample_rs147461488_22.17740102.GGCCACGCTCAACT.G_ENST00000342111 MDCEVNNGSSLRDECITNLLVFGFLQSCSDNSFRRELDALGHELPVLAPQWEGYDELQTDGNRSSHSRLGRIEAGASDNNTASAEEETEAAGSVAFMELRQ*F*KSRRHHPEYCQAPRPGRGQHGP*HPSGPG +>varsample_rs147461488_22.17740102.GGCCACGCTCAACT.G_ENST00000617586 +MDCEVNNGSSLRDECITNLLVFGFLQSCSDNSFRRELDALGHELPVLAPQWEGYDELQTDGNRSSHSRLGRIEAGASDNNTASAEEETEAAGSVAFMELRQWYSGRGSTEAVRQRRRT
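
A standalone sketch of the SeqIO.index_db reuse pattern from PATCH 22, for
reviewers who want to poke at it outside pgatk. Everything here is
illustrative rather than pgatk code: demo.fa is an assumed input file and
open_indexed is a hypothetical helper, but the staleness rule is the same
one _ensure_fasta_index applies.

    import os
    import time

    from Bio import SeqIO


    def open_indexed(fasta):
        """Open a persistent SQLite offset index for fasta, building it if needed."""
        idx = fasta + ".idx"
        # Same staleness rule as _ensure_fasta_index: a .idx older than its
        # FASTA is discarded and rebuilt.
        if os.path.exists(idx) and os.path.getmtime(idx) < os.path.getmtime(fasta):
            os.remove(idx)
        # The first call materialises the SQLite index on disk; later calls,
        # from this or any other process, just open the existing file.
        return SeqIO.index_db(idx, [fasta], "fasta")


    if __name__ == "__main__":
        for label in ("cold", "warm"):
            start = time.perf_counter()
            records = open_indexed("demo.fa")
            print(label, len(records), "records in", round(time.perf_counter() - start, 2), "s")
            records.close()

On a large FASTA the cold open pays the one-time build; the warm open should
be near-instant, which is what pre-building in the main process buys every
worker.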
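
And PATCH 23's nondeterminism in miniature: with hash randomization on (the
default since Python 3.3), iteration order over a set of strings changes
between interpreter runs, so any file derived from set iteration, like the
record order of proteindb_from_custom_VCF.fa, is only stable within a single
run. The script name below is illustrative; the IDs are the ones from the
fixture.

    # demo_set_order.py: run it twice and the printed order will usually
    # differ, because str hashes are salted anew for each interpreter process.
    ids = {"ENST00000252835", "ENST00000643195", "ENST00000617586"}
    print(list(ids))

Pinning PYTHONHASHSEED (e.g. PYTHONHASHSEED=0 python demo_set_order.py) makes
the order reproducible when a stable run is needed for debugging.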