diff --git a/.codacy.yaml b/.codacy.yaml new file mode 100644 index 0000000..7a461b7 --- /dev/null +++ b/.codacy.yaml @@ -0,0 +1,15 @@ +--- +# Codacy configuration. https://docs.codacy.com/repositories-configure/codacy-configuration-file/ +# +# pydocstyle has two pairs of mutually-exclusive rules: +# D203 vs D211 (blank line before class docstring; codebase follows D203) +# D212 vs D213 (multi-line summary line; codebase follows D212) +# Both rules in each pair are enabled in Codacy's default profile, which produces +# unavoidable noise — pick one and silence the other. Settings here also apply to +# any local pydocstyle invocation that reads `.pydocstyle`. + +engines: + pydocstyle: + enabled: true + settings: + add_ignore: ["D211", "D213"] diff --git a/.gitignore b/.gitignore index 3e8faad..2112a66 100644 --- a/.gitignore +++ b/.gitignore @@ -135,3 +135,11 @@ pgatk/testdata/Meleagris_gallopavo* .DS_Store .codacy/ .cursor/ + +# Internal working docs (implementation plans, scratch notes) +docs/plans/ + +# BioPython SeqIO.index_db SQLite indexes — built lazily on first use, +# rebuilt automatically when the source FASTA changes (mtime check). +*.fa.idx +*.fasta.idx diff --git a/.pydocstyle b/.pydocstyle new file mode 100644 index 0000000..bd70be3 --- /dev/null +++ b/.pydocstyle @@ -0,0 +1,6 @@ +[pydocstyle] +# D211 conflicts with D203 (codebase follows D203: one blank line before class docstring). +# D213 conflicts with D212 (codebase follows D212: multi-line summary on the first line). +# Disabling the rules we don't follow stops the mutually-exclusive-pair noise from +# static analysers (pydocstyle, Codacy). 
+add-ignore = D211,D213 diff --git a/docs/index.md b/docs/index.md index f6e58a2..87026a3 100644 --- a/docs/index.md +++ b/docs/index.md @@ -23,6 +23,7 @@ See the [Installation](installation.md) page for more options (Bioconda, Docker, | [Introduction](introduction.md) | Overview of the proteogenomics field | | [Installation](installation.md) | How to install pgatk (pip, Bioconda, Docker, source) | | [pgatk CLI](pgatk-cli.md) | Full command-line reference for all tools | +| [Validations](validations.md) | Tests and validations to ensure correctness of the modules| | [Use Cases](use-cases.md) | End-to-end workflows and recipes for common scenarios | | [File Formats](formats.md) | BED, GTF, GCT format specifications | | [Changelog](changelog.md) | Version history and release notes | diff --git a/docs/pgatk-cli.md b/docs/pgatk-cli.md index f36971a..8bf7878 100644 --- a/docs/pgatk-cli.md +++ b/docs/pgatk-cli.md @@ -318,7 +318,7 @@ Usage: pgatk vcf-to-proteindb [OPTIONS] Options: --translation_table INTEGER Translation table (Default 1) --mito_translation_table INTEGER Mito_trans_table (default 2) - --var_prefix TEXT String to add as prefix for the variant peptides + --protein_prefix TEXT String to add as prefix for the variant peptides --report_ref_seq Also report the reference peptide from overlapping transcripts --annotation_field_name TEXT Annotation field name in INFO column (default: CSQ) --af_field TEXT Field name for variant allele frequency (default: none) @@ -467,7 +467,7 @@ Usage: pgatk dnaseq-to-proteindb [OPTIONS] --biotype_str TEXT String used to identify gene/transcript biotype (default: transcript_biotype) --expression_str TEXT String for extracting expression value (default: None) --expression_thresh FLOAT Threshold for expression value filtering (default: 5) - --var_prefix TEXT Prefix to be added to fasta headers (default: none) + --protein_prefix TEXT Prefix to be added to fasta headers (default: none) -h, --help Show this message and exit. 
``` @@ -489,7 +489,7 @@ Usage: pgatk dnaseq-to-proteindb [OPTIONS] --config_file config/ensembl_config.yaml \ --input_fasta transcript_sequences.fa \ --output_proteindb proteindb_from_lincRNA_canonical_sequences.fa \ - --var_prefix lincRNA_ \ + --protein_prefix lincRNA_ \ --include_biotypes lincRNA ``` @@ -500,7 +500,7 @@ Usage: pgatk dnaseq-to-proteindb [OPTIONS] --config_file config/ensembl_config.yaml \ --input_fasta transcript_sequences.fa \ --output_proteindb proteindb_from_processed_pseudogene.fa \ - --var_prefix pseudogene_ \ + --protein_prefix pseudogene_ \ --include_biotypes processed_pseudogene,transcribed_processed_pseudogene,translated_processed_pseudogene \ --skip_including_all_cds ``` @@ -512,7 +512,7 @@ Usage: pgatk dnaseq-to-proteindb [OPTIONS] --config_file config/ensembl_config.yaml \ --input_fasta transcript_sequences.fa \ --output_proteindb proteindb_from_altORFs.fa \ - --var_prefix altorf_ \ + --protein_prefix altorf_ \ --include_biotypes altORFs \ --skip_including_all_cds ``` diff --git a/docs/plans/2026-03-01-pgatk-graph-engine-design.md b/docs/plans/2026-03-01-pgatk-graph-engine-design.md deleted file mode 100644 index 9048e99..0000000 --- a/docs/plans/2026-03-01-pgatk-graph-engine-design.md +++ /dev/null @@ -1,270 +0,0 @@ -# pgatk Evolution: Graph-Based Transcript Modeling & Feature Parity - -**Date:** 2026-03-01 -**Status:** Approved -**Authors:** Yasset Perez-Riverol, Claude (AI assistant) - ---- - -## 1. Executive Summary - -This document describes the research and implementation plan to evolve pgatk from a linear, one-variant-at-a-time proteogenomics database generator into a graph-based transcript modeling engine capable of multi-variant co-occurrence, gene fusions, RNA editing, and circular RNA support. The plan includes a prerequisite infrastructure cleanup phase and a phased feature rollout with performance optimization. - ---- - -## 2. 
Current State Assessment - -### What pgatk does well - -- Only Python tool with native one-command downloaders for ENSEMBL, COSMIC, and cBioPortal -- Integrated decoy generation (4 methods: protein-reverse, protein-shuffle, DecoyPyrat, pgdbdeep) -- Spectrum-level validation via SpectrumAI (unique among database generators) -- Aho-Corasick exact matching + sliding-window mismatch search (BlastGetPosition) -- Published benchmark: 43,501 non-canonical peptides across 64 human cell lines (Umer et al. 2022) -- Pangenome proteogenomics demonstrated (Wang et al. 2024 preprint): 4,991 novel peptides from HPRC - -### Critical gaps vs. the field (2024-2026) - -| Gap | Impact | Solved by | -|---|---|---| -| No multi-variant co-occurrence | Transcript with N SNPs produces 1 sequence instead of up to 2^N | moPepGen (DAG), vcf2prot (phased VCF) | -| No gene fusions | Major cancer event type missed entirely | moPepGen | -| No RNA editing | A-to-I editing creates novel peptides invisible to DNA-only | moPepGen | -| No circular RNA | Emerging class of translated non-coding RNA | moPepGen | -| No ClinVar/NCBI support | Open issue #24 since 2019 | Manual workaround only | -| pandas iterrows in VCF processing | ~100x slower than vectorized operations | Internal bottleneck | -| No transcript feature caching | Redundant gffutils DB queries per variant | Internal bottleneck | - -### Competitive landscape - -**moPepGen** (Nature Biotechnology, June 2025): Graph-based (DAG per transcript), handles all event types, ~4x more non-canonical peptides than prior methods. The algorithmic benchmark to match. - -**vcf2prot** (ElAbd et al. 2022, bioRxiv): Rust-based Sequence Intermediate Representation (SIR) approach. ~1000x faster than PrecisionProDB. Processes 99,254 variants across 8,192 patients in ~11 minutes. Demonstrates the performance ceiling achievable with systems-level optimization. Limited to phased VCFs with BCFtools/csq annotation. - -**PG2** (J. 
Proteome Research 2023): Snakemake pipeline integrating genome + transcriptome. Handles splicing and fusions but requires heavy infrastructure. - -**pXg** (MCP 2024): RNA-Seq + proteomics + de novo sequencing for immunopeptidomics. - -**NeoDisc** (Nature Biotechnology 2024): End-to-end clinical neoantigen pipeline. - -**PepCentric** (bioRxiv 2025): Repository-scale validation against 2.3B spectra. Complementary to database generators. - -### Key code quality findings - -- `get_altseq()` (ensembl.py): 60-line heart of variant processing, handles SNP/ins/del but only one variant at a time -- `vcf_to_proteindb()` (ensembl.py): ~260 lines, main bottleneck is `vcf_reader.iterrows()` + per-row gffutils DB lookups -- No `pathlib`, no `dataclasses`, inconsistent type hints -- `db/` standalone scripts use `sys.argv` at import time (not importable as modules) -- Variable shadowing bug in `protein_database_decoy.py` (`decoy_sequence` variable shadows the imported `pyteomics.fasta.decoy_sequence` function) -- Broad `except Exception` in several places suppresses unexpected errors -- Mixed `print()` and logger usage for error reporting -- Repetitive 3-layer config fallback pattern across all service classes - ---- - -## 3. 
Architecture: TranscriptGraph Engine - -### Core Concept - -Each transcript is modeled as a directed acyclic graph (DAG): - -- **Nodes** represent sequence segments (reference sequence between variant positions) -- **Edges** represent either the reference path or an alternative (variant) path -- **Traversal** of all root-to-leaf paths generates all combinatorial protein sequences - -``` -Reference: ──[seg1]──[seg2]──[seg3]──[seg4]── - │ │ -Variant A: └─[alt_A]─┘ │ -Variant B: └─[alt_B]─┘ - -Paths: ref, A only, B only, A+B → 4 protein sequences -``` - -### Module Structure - -``` -pgatk/ - graph/ # NEW module - __init__.py - transcript_graph.py # TranscriptGraph class (DAG builder + traverser) - variant_nodes.py # Node types: SNP, Insertion, Deletion, Fusion, RNAEdit, CircRNA - graph_enumerator.py # Path enumeration with combinatorial explosion controls - event_parsers/ # Input parsers for each event type - __init__.py - vcf_parser.py # VCF → graph edges (replaces current get_altseq) - fusion_parser.py # Gene fusion calls → cross-transcript edges - rnaedit_parser.py # RNA editing sites → substitution edges - circrna_parser.py # Back-splice junctions → circular edges - clinvar_parser.py # ClinVar VCF → graph edges (issue #24) - filters/ - __init__.py - allele_frequency.py # AF-based pruning - consequence.py # VEP consequence filtering - expression.py # Expression-level filtering - max_variants.py # Cap combinatorial explosion -``` - -### Key Classes - -```python -@dataclass -class VariantNode: - """Base class for all genomic events.""" - position: int # CDS position - ref_allele: str - alt_allele: str - event_type: EventType # SNP, INS, DEL, FUSION, RNA_EDIT, CIRC_RNA - metadata: dict # AF, consequence, source, etc. - - -class TranscriptGraph: - """DAG representing a transcript with all its variants.""" - - def __init__(self, transcript_id: str, reference_seq: str, strand: str) -> None: ... - def add_variant(self, variant: VariantNode) -> None: ... 
- def add_fusion(self, partner_graph: 'TranscriptGraph', breakpoints: tuple) -> None: ... - def enumerate_paths(self, max_paths: int = 1000) -> Iterator[ProteinSequence]: ... - def to_fasta(self, output: TextIO) -> int: ... # returns count written -``` - -### Combinatorial Explosion Controls - -With N variants, naive enumeration produces 2^N sequences. Controls: - -1. **Max variants per transcript** (default: 10) — skip transcripts exceeding this -2. **Max paths per transcript** (default: 1000) — stop enumeration after limit -3. **Allele frequency pruning** — only include variants above a threshold -4. **Phasing support** — if phased VCF, only enumerate haplotype-consistent paths -5. **Consequence filtering** — only include protein-altering consequences - -### Integration Strategy - -The graph engine sits alongside existing modules (no replacement): - -- **Downloaders** remain unchanged -- **New CLI commands** (`graph-vcf-to-proteindb`, etc.) use the graph engine -- **Existing commands** continue working for backward compatibility -- **Shared infrastructure** (`toolbox/`, `config/`) is reused - ---- - -## 4. Phased Implementation Plan - -### Phase 0 — Infrastructure & Code Quality - -**Goal:** Fix existing issues so the graph engine is built on solid ground. - -**Deliverables:** - -1. Fix variable shadowing bug in `protein_database_decoy.py` -2. Convert `db/digest_mutant_protein.py` and `db/map_peptide2genome.py` from `sys.argv`/`getopt` import-time execution to proper Click CLI commands -3. Add `conftest.py` with proper fixtures; remove relative path dependencies in tests -4. Replace `print()` error reporting with proper logger usage across all modules -5. Add type hints to all public APIs -6. Use `dataclasses` for data models (`SNP` in `cgenomes/models.py`, new models) -7. Use `pathlib.Path` for file operations (replace string concatenation) -8. Replace broad `except Exception` with specific exception handling -9. 
Standardize the 3-layer config fallback pattern (DRY up repetitive `get_*_parameters()` methods) -10. Expand unit test coverage for core functions: `get_altseq()`, `get_mut_pro_seq()`, `revswitch()` -11. Replace `pathos` with `concurrent.futures.ProcessPoolExecutor` (stdlib) - ---- - -### Phase 1 — Graph Core + SNP/Indel Support - -**Goal:** Working graph engine for basic variants, producing a superset of current output. - -**Deliverables:** - -1. `TranscriptGraph` class with add/enumerate/serialize -2. `VariantNode` dataclass hierarchy (SNP, Insertion, Deletion) -3. `vcf_parser.py` — reads VCF (VEP-annotated or unannotated) into graph edges -4. `graph_enumerator.py` — path enumeration with explosion controls -5. New CLI command: `graph-vcf-to-proteindb` -6. Transcript feature caching (eliminate redundant gffutils queries) -7. Vectorized VCF grouping (replace `iterrows()`) -8. Unit tests for graph operations + integration tests -9. Benchmark against current `vcf_to_proteindb()` on test VCFs - -**Validation:** Graph engine must produce a superset of sequences from the linear approach. - ---- - -### Phase 2 — Cancer Genomics + ClinVar - -**Goal:** Bring cancer mutation sources into the graph model and add ClinVar (issue #24). - -**Deliverables:** - -1. `clinvar_parser.py` — parse ClinVar VCF into graph edges -2. Refactor `CancerGenomesService.get_mut_pro_seq()` to produce `VariantNode` objects -3. Graph-aware COSMIC processing (multiple mutations on same gene → combinatorial) -4. Graph-aware cBioPortal processing -5. New CLI commands: `clinvar-to-proteindb`, `graph-cosmic-to-proteindb` -6. Tests with real COSMIC data (multiple mutations per transcript) - ---- - -### Phase 3 — Novel Event Types: Fusions, RNA Editing, Circular RNA - -**Goal:** Add the event types that differentiate moPepGen from other tools. - -**Deliverables:** - -1. `fusion_parser.py` — parse gene fusion calls (STAR-Fusion, Arriba, FusionCatcher formats) -2. 
`rnaedit_parser.py` — parse RNA editing databases (REDIportal, DARNED) or user sites -3. `circrna_parser.py` — parse back-splice junction files (CIRCexplorer, CIRI) -4. Extended `TranscriptGraph` for cross-transcript fusions and circular paths -5. New CLI commands: `fusion-to-proteindb`, `rnaedit-to-proteindb`, `circrna-to-proteindb` -6. Combined command: `multi-event-to-proteindb` (merges all event types per transcript) - ---- - -### Phase 4 — Performance Optimization - -**Goal:** Make the graph engine fast enough for genome-scale datasets. - -**Deliverables:** - -1. Profile Phases 1-3 on real-world datasets (gnomAD, TCGA) -2. Identify hot paths via `cProfile` / `py-spy` -3. Optimize graph traversal (topological sort, memoization of shared subpaths) -4. Replace remaining `pathos` usage with `concurrent.futures` -5. Optional: C extension (via `cffi` or `cython`) for innermost graph traversal — only if profiling shows it's the bottleneck -6. Benchmark against moPepGen and vcf2prot on same datasets - ---- - -### Phase Dependencies - -``` -Phase 0 (Infrastructure & Quality) - └── Phase 1 (Graph Core + SNP/Indel) - ├── Phase 2 (Cancer + ClinVar) - │ └── Phase 3 (Fusions, RNA Edit, CircRNA) - │ └── Phase 4 (Performance) - └── (ongoing quality improvements) -``` - ---- - -## 5. References - -### Primary pgatk publication -- Umer HM, Audain E, Zhu Y, Pfeuffer J, Sachsenberg T, Lehtiö J, Branca RM, Perez-Riverol Y. Generation of ENSEMBL-based proteogenomics databases boosts the identification of non-canonical peptides. *Bioinformatics*. 2022;38(5):1470-1472. doi:10.1093/bioinformatics/btab838 - -### Pangenome proteogenomics -- Wang D, Bouwmeester R, Zheng P, Dai C, Sanchez A, Shu K, Bai M, Umer HM, Perez-Riverol Y. Proteogenomics analysis of human tissues using pangenomes. *bioRxiv*. 2024. doi:10.1101/2024.05.24.595489 - -### Competing tools -- moPepGen: Graph-based proteogenomic database generation. *Nature Biotechnology*. June 2025. 
doi:10.1038/s41587-025-02701-0 -- vcf2prot: ElAbd H, Degenhardt F, Lenz TL, Franke A, Wendorff M. VCF2Prot: An efficient and parallel tool for generating personalized proteomes from VCF files. *bioRxiv*. 2022. doi:10.1101/2022.01.21.477084 -- ProteomeGenerator2: *J. Proteome Research*. 2023. doi:10.1021/acs.jproteome.3c00005 -- pXg: *Molecular & Cellular Proteomics*. 2024. doi:10.1016/j.mcpro.2024.100733 -- NeoDisc: *Nature Biotechnology*. October 2024. doi:10.1038/s41587-024-02420-y -- PepCentric: *bioRxiv*. February 2025. doi:10.1101/2025.02.24.639867 - -### Related ecosystem -- quantms: Dai C et al. *Nature Methods*. 2024. doi:10.1038/s41592-024-02343-1 -- PRIDE 2025: *Nucleic Acids Research*. 2025;53(D1):D543. doi:10.1093/nar/gkae1011 diff --git a/docs/plans/2026-03-03-protein-accession-design.md b/docs/plans/2026-03-03-protein-accession-design.md deleted file mode 100644 index cfe0b8d..0000000 --- a/docs/plans/2026-03-03-protein-accession-design.md +++ /dev/null @@ -1,96 +0,0 @@ -# Protein Accession and FASTA Header Design - -Issue: https://github.com/bigbio/pgatk/issues/18 -Branch: `feature/protein-accession-design` -Date: 2026-03-03 - -## Problem - -Current pgatk FASTA headers are inconsistent across variant sources (VCF, COSMIC, ClinVar) and incompatible with major search engines like SearchGUI, which cannot parse ENSEMBL-style IDs. - -## Design - -### Two protein categories, two prefix strategies - -| Category | Prefix | Accession | Description | -|----------|--------|-----------|-------------| -| Canonical (reference) | Keep original (`sp\|`, `tr\|`, `ensp\|`) | Original accession | Untouched from source database | -| Variant (mutated) | `pgvar\|` | `{TRANSCRIPT_ID}-{INDEX}` | pgatk-generated variant protein | - -### Variant header format - -``` ->pgvar|{TRANSCRIPT_ID}-{INDEX}|{GENE_SYMBOL} {key=value metadata} -``` - -**Fields:** - -- `pgvar` -- database tag identifying pgatk-generated variant proteins. 
-- `{TRANSCRIPT_ID}-{INDEX}` -- accession composed of parent transcript ID and a dash-separated 1-based index (per transcript, per run). Mirrors UniProt isoform convention (`P12345-2`). -- `{GENE_SYMBOL}` -- gene name, first token after the second pipe. -- Metadata key=value pairs in the description field: - -| Key | Description | Example | -|-----|-------------|---------| -| `VariantSource` | Origin database | `COSMIC`, `ClinVar`, `gnomAD`, `dbSNP` | -| `GenomicCoord` | `chr:pos:ref:alt` | `12:25245347:C:G` | -| `AAChange` | HGVS protein notation | `p.G13R` | -| `MutationType` | SO term or short label | `missense_variant` | -| `dbSNP` | rsID if available | `rs121913529` | -| `ORF` | Reading frame number (only when multi-ORF) | `1`, `2`, `3` | - -### Examples - -```fasta -# Canonical proteins -- untouched from source databases ->sp|P01112|RASH_HUMAN GTPase HRas OS=Homo sapiens ->ensp|ENSP00000309845|BRCA1 - -# Variant proteins -- unified pgvar| prefix regardless of source ->pgvar|ENST00000311189-1|HRAS VariantSource=COSMIC AAChange=p.G13R MutationType=missense_variant GenomicCoord=12:25245347:C:G ->pgvar|ENST00000311189-2|HRAS VariantSource=COSMIC AAChange=p.Q61L MutationType=missense_variant GenomicCoord=12:25245350:A:T ->pgvar|ENST00000357654-1|BRCA1 VariantSource=ClinVar AAChange=p.R1699Q MutationType=missense_variant GenomicCoord=17:43094464:G:A dbSNP=rs41293455 - -# Multiple ORFs -- each ORF gets its own index, ORF number in metadata ->pgvar|ENST00000311189-3|HRAS VariantSource=COSMIC AAChange=p.G13R ORF=1 ->pgvar|ENST00000311189-4|HRAS VariantSource=COSMIC AAChange=p.G13R ORF=2 ->pgvar|ENST00000311189-5|HRAS VariantSource=COSMIC AAChange=p.G13R ORF=3 -``` - -### Indexing logic - -The `-{index}` is per-transcript, per-file generation run: - -- First variant on `ENST00000311189` gets `-1` -- Second variant on same transcript gets `-2` -- Multi-ORF outputs each consume an index (3 ORFs = 3 indices) -- First variant on a different transcript resets to `-1` - -### 
Search engine compatibility - -| Engine | Compatible | Notes | -|--------|-----------|-------| -| SearchGUI / PeptideShaker | Yes | Matches UniProt-like `db\|acc\|name` pattern | -| MaxQuant | Yes | Default UniProt parse rule works | -| MSFragger / FragPipe | Yes | Reads full header, splits on first whitespace | -| Comet | Yes | Parses `>db\|acc\|` natively | -| DIA-NN | Yes | Follows UniProt-style parsing | -| Proteome Discoverer | Yes | Supports pipe-delimited headers | - -### Files to modify - -| File | Change | -|------|--------| -| `pgatk/ensembl/ensembl.py` | Refactor `vcf_to_proteindb()` header construction (lines 661-664) | -| `pgatk/clinvar/clinvar_service.py` | Refactor header construction (lines 554-560) | -| `pgatk/cgenomes/cgenomes_proteindb.py` | Refactor COSMIC header (line 317), cBioPortal header | -| `pgatk/toolbox/vcf_utils.py` | Update `write_output()` to handle new format cleanly | -| `pgatk/config/` | Add constants for `PGVAR_PREFIX`, metadata keys | - -### Design decisions - -1. **Dash separator** (`-`) between transcript and index, consistent with UniProt isoform convention. -2. **No ORF suffix in accession** -- ORF number is metadata (`ORF=N`), each ORF gets its own index. -3. **Canonical proteins are pass-through** -- pgatk does not reformat existing database headers. -4. **Unified format across all sources** -- COSMIC, ClinVar, VCF variants all use `pgvar|` regardless of origin. -5. **Key=value metadata** in description field for structured downstream parsing. diff --git a/docs/use-cases.md b/docs/use-cases.md index bd3cf20..0442c9c 100644 --- a/docs/use-cases.md +++ b/docs/use-cases.md @@ -6,7 +6,7 @@ a search-ready protein database. --- -## Cell-Type Specific Non-Canonical Peptide Discovery +## USE CASE 1: Cell-Type Specific Non-Canonical Peptide Discovery !!! 
abstract "Featured workflow" This workflow reproduces the analysis from [Umer et al., *Bioinformatics* 2022](https://doi.org/10.1093/bioinformatics/btab838), @@ -32,8 +32,8 @@ pgatk ensembl-downloader \ ```bash gffread -F -w ensembl_human/transcripts.fa \ - -g ensembl_human/genome.fa \ - ensembl_human/Homo_sapiens.GRCh38.*.gtf.gz + -g ensembl_human/Homo_sapiens.GRCh38.dna_sm.toplevel.fa \ + ensembl_human/Homo_sapiens.GRCh38.*.gtf ``` ### Step 3 -- Canonical protein-coding sequences @@ -57,8 +57,8 @@ detectable peptides. Translate them in three reading frames: pgatk dnaseq-to-proteindb \ --input_fasta ensembl_human/transcripts.fa \ --output_proteindb pseudogene.fa \ - --var_prefix pseudo_ \ - --include_biotypes processed_pseudogene,unprocessed_pseudogene,transcribed_processed_pseudogene,transcribed_unprocessed_pseudogene,translated_processed_pseudogene \ + --protein_prefix pseudo_ \ + --include_biotypes pseudogene,processed_pseudogene,unprocessed_pseudogene,transcribed_processed_pseudogene,transcribed_unprocessed_pseudogene,translated_processed_pseudogene,unitary_pseudogene,transcribed_unitary_pseudogene,rRNA_pseudogene,IG_V_pseudogene,TR_V_pseudogene,IG_C_pseudogene,TR_J_pseudogene,IG_J_pseudogene,IG_pseudogene \ --num_orfs 3 \ --skip_including_all_cds ``` @@ -72,39 +72,41 @@ micropeptides: pgatk dnaseq-to-proteindb \ --input_fasta ensembl_human/transcripts.fa \ --output_proteindb lncrna.fa \ - --var_prefix lncrna_ \ - --include_biotypes lincRNA,antisense,sense_intronic,sense_overlapping \ + --protein_prefix lncrna_ \ + --include_biotypes lncRNA \ --num_orfs 3 \ --skip_including_all_cds ``` -#### Alternative ORFs (exonic out-of-frame translation) +#### Putative proteins (three-frame) -Non-canonical reading frames of protein-coding mRNAs can produce cryptic -peptides: +Protein coding genes without CDs that are not validated but annoated: ```bash pgatk dnaseq-to-proteindb \ --input_fasta ensembl_human/transcripts.fa \ - --output_proteindb altorf.fa \ - --var_prefix 
altorf_ \ - --include_biotypes altORFs \ + --output_proteindb putative.fa \ + --protein_prefix putative_ \ + --include_biotypes protein_coding_CDS_not_defined,TEC,translated_processed_pseudogene \ + --num_orfs 3 \ --skip_including_all_cds ``` -### Step 5 -- Population variant proteins +#### Alternative ORFs (exonic out-of-frame translation) -Include common human variants from ENSEMBL: +Non-canonical reading frames of protein-coding mRNAs can produce cryptic +peptides: ```bash -pgatk vcf-to-proteindb \ - --vcf ensembl_human/homo_sapiens_incl_consequences.vcf.gz \ +pgatk dnaseq-to-proteindb \ --input_fasta ensembl_human/transcripts.fa \ - --gene_annotations_gtf ensembl_human/Homo_sapiens.GRCh38.*.gtf.gz \ - --output_proteindb ensembl_variants.fa + --output_proteindb altorf.fa \ + --protein_prefix altorf_ \ + --include_biotypes altORFs \ + --skip_including_all_cds ``` -### Step 6 -- COSMIC somatic mutations (optional, for cancer cell lines) +### Step 5 -- COSMIC somatic mutations (optional, for cancer cell lines) For cancer cell-line studies, add cell-line-specific somatic mutations: @@ -115,14 +117,15 @@ pgatk cosmic-downloader \ -o cosmic_data pgatk cosmic-to-proteindb \ - --input_mutation cosmic_data/CosmicCLP_MutantExport.tsv.gz \ - --input_genes cosmic_data/All_CellLines_Genes.fasta.gz \ + --input_mutation cosmic_data/CellLinesProject_GenomeScreensMutant_v103_GRCh38.tsv.gz \ + --input_genes cosmic_data/Cosmic_Genes_v103_GRCh38.fasta.gz \ --output_db cosmic_cellline.fa \ - --filter_column "Sample name" \ + --clinical_sample_file cosmic_data/Cosmic_Classification_v103_GRCh38.tsv.gz \ + --filter_column PRIMARY_SITE \ --split_by_filter_column ``` -### Step 7 -- Combine and generate target-decoy database +### Step 6 -- Combine and generate target-decoy database ```bash # Combine all components @@ -130,8 +133,8 @@ pgatk cosmic-to-proteindb \ cat canonical.fa \ pseudogene.fa \ lncrna.fa \ + putative.fa \ altorf.fa \ - ensembl_variants.fa \ > cell_type_target.fa # 
Generate decoy sequences @@ -142,14 +145,14 @@ pgatk generate-decoy \ --decoy_prefix DECOY_ ``` -### Step 8 -- Extract non-canonical unique peptides +### Step 7 -- Extract non-canonical unique peptides After database searching with your search engine, you can also pre-compute the set of non-canonical peptides unique to these novel sources: ```bash pgatk digest-mutant-protein \ - --input pseudogene.fa,lncrna.fa,altorf.fa,ensembl_variants.fa \ + --input pseudogene.fa,lncrna.fa,putative.fa,altorf.fa,ensembl_variants.fa \ --fasta canonical.fa \ --output non_canonical_peptides.fa \ --min-len 7 \ @@ -166,7 +169,7 @@ pgatk digest-mutant-protein \ --- -## 1. Human Variant Protein Database from ENSEMBL +## USE CASE 2: Human Variant Protein Database from ENSEMBL Build a variant protein database using ENSEMBL population variants (common SNPs and indels) for human proteogenomics searches. This is the most common starting @@ -196,8 +199,8 @@ extract transcript sequences from the GTF and genome FASTA: ```bash gffread -F -w ensembl_human/transcripts.fa \ - -g ensembl_human/genome.fa \ - ensembl_human/Homo_sapiens.GRCh38.*.gtf.gz + -g ensembl_human/Homo_sapiens.GRCh38.dna_sm.toplevel.fa \ + ensembl_human/Homo_sapiens.GRCh38.*.gtf ``` ### Step 3 -- Generate the variant protein database @@ -206,9 +209,9 @@ Translate all ENSEMBL variants that affect protein-coding transcripts: ```bash pgatk vcf-to-proteindb \ - --vcf ensembl_human/homo_sapiens_incl_consequences.vcf.gz \ + --vcf ensembl_human/homo_sapiens_incl_consequences.vcf \ --input_fasta ensembl_human/transcripts.fa \ - --gene_annotations_gtf ensembl_human/Homo_sapiens.GRCh38.*.gtf.gz \ + --gene_annotations_gtf ensembl_human/Homo_sapiens.GRCh38.*.gtf \ --output_proteindb ensembl_human/variant_proteins.fa ``` @@ -242,7 +245,7 @@ The file `target_decoy.fa` is ready for database searching. --- -## 2. 
Population-Specific Variant Database +## USE CASE 3: Population-Specific Variant Database Population-level genetic variants cause amino acid changes that are invisible to standard reference database searches. Studies have shown that incorporating @@ -255,7 +258,7 @@ Include only variants present in at least 1% of the population: ```bash pgatk vcf-to-proteindb \ - --vcf homo_sapiens_incl_consequences.vcf.gz \ + --vcf homo_sapiens_incl_consequences.vcf \ --input_fasta transcripts.fa \ --gene_annotations_gtf genes.gtf \ --af_field MAF \ @@ -270,7 +273,7 @@ detectable by mass spectrometry: ```bash pgatk vcf-to-proteindb \ - --vcf homo_sapiens_incl_consequences.vcf.gz \ + --vcf homo_sapiens_incl_consequences.vcf \ --input_fasta transcripts.fa \ --gene_annotations_gtf genes.gtf \ --af_field MAF \ @@ -289,7 +292,7 @@ a specific population: pgatk vcf-to-proteindb \ --vcf gnomad.exomes.v4.1.sites.vcf.bgz \ --input_fasta gencode_transcripts.fa \ - --gene_annotations_gtf gencode.v44.annotation.gtf.gz \ + --gene_annotations_gtf gencode.v44.annotation.gtf \ --annotation_field_name vep \ --af_field AF_afr \ --af_threshold 0.01 \ @@ -308,7 +311,7 @@ pgatk vcf-to-proteindb \ --- -## 3. ClinVar Clinical Variant Database +## USE CASE 4: ClinVar Clinical Variant Database [ClinVar](https://www.ncbi.nlm.nih.gov/clinvar/) catalogs the relationship between human variants and clinical phenotypes. 
Building a ClinVar-derived @@ -324,18 +327,18 @@ pgatk ncbi-downloader -o ncbi_data This downloads four files to `ncbi_data/`: -- `GRCh38_latest_genomic.gtf.gz` -- RefSeq gene annotations -- `GRCh38_latest_rna.fna.gz` -- RefSeq transcript nucleotide sequences +- `GRCh38_latest_genomic.gtf` -- RefSeq gene annotations +- `GRCh38_latest_rna.fna` -- RefSeq transcript nucleotide sequences - `GRCh38_latest_assembly_report.txt` -- Chromosome name mapping -- `clinvar.vcf.gz` -- ClinVar variant calls +- `clinvar.vcf` -- ClinVar variant calls ### Step 2 -- Generate the ClinVar protein database ```bash pgatk clinvar-to-proteindb \ - --vcf ncbi_data/clinvar.vcf.gz \ - --gtf ncbi_data/GRCh38_latest_genomic.gtf.gz \ - --fasta ncbi_data/GRCh38_latest_rna.fna.gz \ + --vcf ncbi_data/clinvar.vcf \ + --gtf ncbi_data/GRCh38_latest_genomic.gtf \ + --fasta ncbi_data/GRCh38_latest_rna.fna \ --assembly-report ncbi_data/GRCh38_latest_assembly_report.txt \ --output clinvar_proteins.fa ``` @@ -356,7 +359,7 @@ pgatk generate-decoy \ --- -## 4. Tumor-Specific Databases for Cancer Proteogenomics +## USE CASE 5: Condition-Specific Databases for Proteogenomics Cancer proteogenomics studies (such as those from CPTAC) build tumor-specific protein databases to detect somatic mutant peptides, understand @@ -364,7 +367,7 @@ therapy resistance, and prioritize neoantigen candidates. The databases combine somatic mutations from cancer-specific sources (COSMIC, cBioPortal) and/or patient-matched whole-exome sequencing. -### 4a. COSMIC somatic mutations by cancer type +### 4a. Cancer-speific database using COSMIC somatic mutations Generate one protein database per primary tissue site. 
This is the standard approach for large-scale cancer proteogenomics when patient-level WES is not @@ -379,38 +382,49 @@ pgatk cosmic-downloader \ # Generate per-tissue databases pgatk cosmic-to-proteindb \ - --input_mutation cosmic_data/CosmicMutantExport.tsv.gz \ - --input_genes cosmic_data/All_COSMIC_Genes.fasta.gz \ + --input_mutation cosmic_data/Cosmic_GenomeScreensMutant_v103_GRCh38.tsv.gz \ + --input_genes cosmic_data/Cosmic_Genes_v103_GRCh38.fasta.gz \ --output_db cosmic_proteins.fa \ + --clinical_sample_file cosmic_data/Cosmic_Classification_v103_GRCh38.tsv.gz \ + --filter_column PRIMARY_SITE \ --split_by_filter_column ``` This produces files like `cosmic_proteins_lung.fa`, `cosmic_proteins_breast.fa`, etc. Use the tissue-matched database for your cancer type of interest. -### 4b. Single cancer type +!!! note "COSMIC v103 tissue annotation" + In COSMIC v103, `PRIMARY_SITE` is not a column in the mutation file. It lives + in the separate Classification file (`Cosmic_Classification_v103_GRCh38.tsv.gz`) + and is joined via `COSMIC_PHENOTYPE_ID`. Pass the Classification file via + `--clinical_sample_file` whenever tissue-type filtering or splitting is needed. + +### 4b. Cancer-type speific database using COSMIC somatic mutations Build a focused database for one cancer type (e.g. lung): ```bash pgatk cosmic-to-proteindb \ - --input_mutation cosmic_data/CosmicMutantExport.tsv.gz \ - --input_genes cosmic_data/All_COSMIC_Genes.fasta.gz \ + --input_mutation cosmic_data/Cosmic_GenomeScreensMutant_v103_GRCh38.tsv.gz \ + --input_genes cosmic_data/Cosmic_Genes_v103_GRCh38.fasta.gz \ --output_db cosmic_lung_proteins.fa \ + --clinical_sample_file cosmic_data/Cosmic_Classification_v103_GRCh38.tsv.gz \ + --filter_column PRIMARY_SITE \ --accepted_values "lung" ``` -### 4c. Cell-line proteogenomics +### 4c. Cell-type specific databases When analyzing cell-line proteomes, use cell-line-specific mutations. 
COSMIC provides a dedicated cell-line export with mutations annotated per sample: ```bash pgatk cosmic-to-proteindb \ - --input_mutation cosmic_data/CosmicCLP_MutantExport.tsv.gz \ - --input_genes cosmic_data/All_CellLines_Genes.fasta.gz \ + --input_mutation cosmic_data/CellLinesProject_GenomeScreensMutant_v103_GRCh38.tsv.gz \ + --input_genes cosmic_data/Cosmic_Genes_v103_GRCh38.fasta.gz \ --output_db cosmic_cellline_proteins.fa \ - --filter_column "Sample name" \ + --clinical_sample_file cosmic_data/Cosmic_Classification_v103_GRCh38.tsv.gz \ + --filter_column PRIMARY_SITE \ --split_by_filter_column ``` @@ -444,15 +458,16 @@ pgatk cbioportal-to-proteindb \ ### 4e. Combined cancer database for immunopeptidomics For HLA immunopeptidomics / neoantigen discovery, a broad mutation database -maximizes the chance of detecting mutant HLA-presented peptides. Studies have -shown that COSMIC-derived databases can identify 5x more mutant immunopeptides -than patient WES alone. Combine COSMIC with ClinVar: +maximizes the chance of detecting mutant HLA-presented peptides. COSMIC-derived +databases have been shown to identify 5x more mutant HLA-I immunopeptides than +patient WES alone ([Wang et al., *J Transl Med* 2024](https://doi.org/10.1186/s12967-023-04821-0)). +Combine COSMIC with ClinVar: ```bash # Generate COSMIC mutations pgatk cosmic-to-proteindb \ - --input_mutation cosmic_data/CosmicMutantExport.tsv.gz \ - --input_genes cosmic_data/All_COSMIC_Genes.fasta.gz \ + --input_mutation cosmic_data/Cosmic_GenomeScreensMutant_v103_GRCh38.tsv.gz \ + --input_genes cosmic_data/Cosmic_Genes_v103_GRCh38.fasta.gz \ --output_db cosmic_proteins.fa # Generate ClinVar mutations @@ -475,7 +490,7 @@ pgatk generate-decoy \ --- -## 5. 
Patient-Specific Database from WGS/WES +## USE CASE 6: Patient-Specific Database from WGS/WES When matched whole-genome or whole-exome sequencing data is available for a sample, build a personalized protein database from the patient's own @@ -530,24 +545,24 @@ pgatk vcf-to-proteindb \ --- -## 6. Novel ORF and Micropeptide Discovery +## USE CASE 7: Novel ORF and Micropeptide Discovery Proteogenomics is a key approach for discovering novel coding regions: small open reading frames (smORFs), micropeptides from lncRNAs, pseudogene-encoded proteins, and alternative reading frames. Studies have found that non-canonical peptides can account for over 5% of total identifications. -### lincRNA-derived proteins +### lncRNA-derived proteins -Long intergenic non-coding RNAs (lincRNAs) can encode small proteins. Translate +Long intergenic non-coding RNAs (lncRNAs) can encode small proteins. Translate them in three reading frames: ```bash pgatk dnaseq-to-proteindb \ --input_fasta transcripts.fa \ - --output_proteindb lincRNA_proteins.fa \ - --var_prefix lincRNA_ \ - --include_biotypes lincRNA \ + --output_proteindb lncRNA_proteins.fa \ + --protein_prefix lncRNA_ \ + --include_biotypes lncRNA \ --num_orfs 3 \ --skip_including_all_cds ``` @@ -560,7 +575,7 @@ Some pseudogenes are transcribed and may produce functional peptides: pgatk dnaseq-to-proteindb \ --input_fasta transcripts.fa \ --output_proteindb pseudogene_proteins.fa \ - --var_prefix pseudogene_ \ + --protein_prefix pseudogene_ \ --include_biotypes processed_pseudogene,transcribed_processed_pseudogene,translated_processed_pseudogene \ --num_orfs 3 \ --skip_including_all_cds @@ -575,7 +590,7 @@ discover upstream ORFs (uORFs), overlapping ORFs, and downstream ORFs: pgatk dnaseq-to-proteindb \ --input_fasta transcripts.fa \ --output_proteindb altorf_proteins.fa \ - --var_prefix altorf_ \ + --protein_prefix altorf_ \ --include_biotypes altORFs \ --skip_including_all_cds ``` @@ -586,7 +601,7 @@ pgatk dnaseq-to-proteindb \ pgatk 
dnaseq-to-proteindb \ --input_fasta transcripts.fa \ --output_proteindb antisense_proteins.fa \ - --var_prefix antisense_ \ + --protein_prefix antisense_ \ --include_biotypes antisense,antisense_RNA \ --num_orfs 3 \ --skip_including_all_cds @@ -598,7 +613,7 @@ Combine all non-canonical sources with the canonical proteome: ```bash cat canonical_proteins.fa \ - lincRNA_proteins.fa \ + lncRNA_proteins.fa \ pseudogene_proteins.fa \ altorf_proteins.fa \ antisense_proteins.fa \ @@ -616,7 +631,7 @@ pgatk generate-decoy \ ```bash pgatk digest-mutant-protein \ - --input lincRNA_proteins.fa,pseudogene_proteins.fa,altorf_proteins.fa \ + --input lncRNA_proteins.fa,pseudogene_proteins.fa,altorf_proteins.fa \ --fasta canonical_proteins.fa \ --output novel_unique_peptides.fa ``` @@ -971,16 +986,16 @@ pgatk clinvar-to-proteindb \ # COSMIC somatic mutations pgatk cosmic-to-proteindb \ - --input_mutation cosmic_data/CosmicMutantExport.tsv.gz \ - --input_genes cosmic_data/All_COSMIC_Genes.fasta.gz \ + --input_mutation cosmic_data/Cosmic_GenomeScreensMutant_v103_GRCh38.tsv.gz \ + --input_genes cosmic_data/Cosmic_Genes_v103_GRCh38.fasta.gz \ --output_db cosmic_variants.fa -# Non-coding RNA (lincRNA) +# Non-coding RNA (lncRNA) pgatk dnaseq-to-proteindb \ --input_fasta ensembl_data/transcripts.fa \ - --output_proteindb lincRNA.fa \ - --var_prefix lincRNA_ \ - --include_biotypes lincRNA \ + --output_proteindb lncRNA.fa \ + --protein_prefix lncRNA_ \ + --include_biotypes lncRNA \ --num_orfs 3 \ --skip_including_all_cds @@ -988,7 +1003,7 @@ pgatk dnaseq-to-proteindb \ pgatk dnaseq-to-proteindb \ --input_fasta ensembl_data/transcripts.fa \ --output_proteindb pseudogene.fa \ - --var_prefix pseudogene_ \ + --protein_prefix pseudogene_ \ --include_biotypes processed_pseudogene,transcribed_processed_pseudogene \ --num_orfs 3 \ --skip_including_all_cds @@ -1001,7 +1016,8 @@ cat canonical.fa \ ensembl_variants.fa \ clinvar_variants.fa \ cosmic_variants.fa \ - lincRNA.fa \ + lncRNA.fa \ + 
putative.fa \ pseudogene.fa \ > combined_target.fa ``` @@ -1027,7 +1043,7 @@ pgatk generate-decoy \ ```bash pgatk digest-mutant-protein \ - --input ensembl_variants.fa,clinvar_variants.fa,cosmic_variants.fa,lincRNA.fa,pseudogene.fa \ + --input ensembl_variants.fa,clinvar_variants.fa,cosmic_variants.fa,lncRNA.fa,pseudogene.fa \ --fasta canonical.fa \ --output unique_variant_peptides.fa \ --min-len 7 \ diff --git a/docs/validations.md b/docs/validations.md new file mode 100644 index 0000000..6dd3acc --- /dev/null +++ b/docs/validations.md @@ -0,0 +1,257 @@ +# Test Validations + +This page documents the validation strategy for PGATK, describing what the automated tests verify and how to independently confirm their correctness. +# 1. Variant Translation Tests - COSMIC +## 1.1 Validation per variant type (`test_variant_types_hgvs.py`) + +These tests verify that `CancerGenomesService.get_mut_pro_seq()` correctly processes all 18 Sequence Ontology (SO) variant types present in COSMIC v103 mutation files, following [HGVS nomenclature rules](https://hgvs-nomenclature.org/stable/). + +Each test uses a real mutation from `Cosmic_CompleteTargetedScreensMutant_v103_GRCh38.tsv` paired with the matching CDS from `Cosmic_Genes_v103_GRCh38.fasta`. + +--- + +### Actionable Variant Types + +These types are expected to produce a non-empty mutant protein sequence. Tests assert biologically specific properties, not just that the result is non-empty. 
+ +| Variant Type | Gene | Mutation | HGVS Rule Verified | +|---|---|---|---| +| `missense_variant` | EZH2 | c.656C>T p.S219F | Protein length unchanged; position 218 (0-indexed) is `F`; all other residues identical to WT | +| `stop_gained` | FCGR3B | c.394A>T p.K132* | Position 131 (0-indexed) is `*` (stop codon) in the translated output | +| `inframe_deletion` | CTNNB1 | c.104_124del p.T35_G41del | Protein shortened by exactly 7 AA (21 nt deleted ÷ 3); residues before position 35 unchanged | +| `inframe_insertion` (dup) | SMAP1 | c.1226_1228dup p.I409dup | Protein lengthened by exactly 1 AA (3 nt duplicated ÷ 3); residues before the dup site unchanged | +| `protein_altering_variant` | NF2 | c.252_262delinsCT p.L85_K88delins* | Result is non-empty and differs from the wild-type sequence | +| `frameshift_variant` | MEN1 | c.292del p.R98Efs*21 | Non-empty translated sequence produced from the frame-shifted CDS | +| `start_lost` | INPP4B | c.3G>T p.M1? | First AA is not Met; it is Ile (`ATG→ATT` via the DNA substitution path) | +| `stop_lost` | MYD88 | c.611_613delinsGCGGCCCCC p.D204_*205delinsGGPR | Mutant protein is longer than WT (stop codon removed; read-through occurs) | +| `stop_retained_variant` | EPHA3 | c.2756G>A p.*919= | Length unchanged; last residue is still `*` (`TGA→TAA`, both stop codons) | + +--- + +### Filtered / Skipped Variant Types + +These types must produce an empty string `""`. There are two mechanisms: + +- **Pre-filter** (`cosmic_to_proteindb`): `synonymous_variant` rows are discarded before `get_mut_pro_seq` is called. +- **`p.?` guard** (`get_mut_pro_seq`): When `aa_mut == "p.?"`, the function immediately returns `""`. All intronic, UTR, and splice variants in COSMIC v103 carry `p.?`. + +| Variant Type | Mechanism | Source Mutation | +|---|---|---| +| `synonymous_variant` | Pre-filter in `cosmic_to_proteindb` | Filter logic validated directly on the SO term strings | +| `intron_variant` | `p.?` guard | RAD51C c.145+706C>T p.? 
| +| `3_prime_UTR_variant` | `p.?` guard | SUFU c.*2816G>A p.? | +| `5_prime_UTR_variant` | `p.?` guard | PSMD13 c.-20C>T p.? | +| `coding_sequence_variant` | `p.?` guard (also has intronic offset `+`) | OAS2 c.2157+2A>G p.? | +| `splice_acceptor_variant` | `p.?` guard | TSHR c.171-1G>A p.? | +| `splice_donor_variant` | `p.?` guard | PTCH1 c.3306+2T>G p.? | +| `splice_region_variant` (composite) | Fires missense branch when combined with `missense_variant` | EZH2 c.656C>T p.S219F with type `missense_variant,splice_region_variant` | +| `incomplete_terminal_codon_variant` | `p.?` guard (absent from v103 targeted-screen data; tested by convention) | c.2213_2214insA p.? | + +!!! note + `splice_region_variant` only appears in COSMIC v103 as part of a composite type string (e.g. `missense_variant,splice_region_variant`). When combined with an actionable type, the actionable type's branch fires first and the mutation is processed normally. + +--- + +## 1.2 Independent Validation of the Tests + +### 1. Re-derive CDS sequences from the FASTA + +The CDS strings embedded in the test file came from `Cosmic_Genes_v103_GRCh38.fasta`. To verify a gene (e.g. EZH2): + +```bash +python3 - <<'EOF' +from Bio import SeqIO +fasta = "use-cases/cosmic_data/Cosmic_Genes_v103_GRCh38.fasta" +for rec in SeqIO.parse(fasta, "fasta"): + if "EZH2" in rec.id and "ENST00000483967" in rec.id: + print(f"len={len(rec.seq)}") + print(f"prot[218]={rec.seq.translate(to_stop=False)[218]}") + print(rec.seq[:50]) +EOF +``` + +Expected: `len=2211`, `prot[218]=S` (which becomes `F` after the c.656C>T substitution). + +### 2. Confirm mutations exist in the COSMIC TSV + +Every test documents its source mutation. To verify a mutation is real and not fabricated: + +```bash +grep "EZH2" use-cases/cosmic_data/Cosmic_CompleteTargetedScreensMutant_v103_GRCh38.tsv | \ + grep "c.656C>T" | \ + cut -f1-10 +``` + +### 3. 
Manually compute expected protein output + +For the missense test, manually apply the substitution and translate: + +```python +from Bio.Seq import Seq + +cds = '''ATGGGCCAGACTGGGAAGAAATCTGAGAAGGGACCAGTTTGTTGGCGGAAGCGTGTAAAATCAGAGTACATGCGACTGAGACAGCTCAAGAGGTTCAGACGAGCTGATGAAGTAAAGAGTATGTTTAGTTCCAATCGTCAGAAAATT +TTGGAAAGAACGGAAATCTTAAACCAAGAATGGAAACAGCGAAGGATACAGCCTGTGCACATCCTGACTTCTTGTTCGGTGACCAGTGACTTGGATTTTCCAACACAAGTCATCCCATTAAAGACTCTGAATGCAGTTGCTTCAGTACCCATAATG +TATTCTTGGTCTCCCCTACAGCAGAATTTTATGGTGGAAGATGAAACTGTTTTACATAACATTCCTTATATGGGAGATGAAGTTTTAGATCAGGATGGTACTTTCATTGAAGAACTAATAAAAAATTATGATGGGAAAGTACACGGGGATAGAGAA +TGTGGGTTTATAAATGATGAAATTTTTGTGGAGTTGGTGAATGCCCTTGGTCAATATAATGATGATGACGATGATGATGATGGAGACGATCCTGAAGAAAGAGAAGAAAAGCAGAAAGATCTGGAGGATCACCGAGATGATAAAGAAAGCCGCCCA +CCTCGGAAATTTCCTTCTGATAAAATTTTTGAAGCCATTTCCTCAATGTTTCCAGATAAGGGCACAGCAGAAGAACTAAAGGAAAAATATAAAGAACTCACCGAACAGCAGCTCCCAGGCGCACTTCCTCCTGAATGTACCCCCAACATAGATGGA +CCAAATGCTAAATCTGTTCAGAGAGAGCAAAGCTTACACTCCTTTCATACGCTTTTCTGTAGGCGATGTTTTAAATATGACTGCTTCCTACATCCTTTTCATGCAACACCCAACACTTATAAGCGGAAGAACACAGAAACAGCTCTAGACAACAAA +CCTTGTGGACCACAGTGTTACCAGCATTTGGAGGGAGCAAAGGAGTTTGCTGCTGCTCTCACCGCTGAGCGGATAAAGACCCCACCAAAACGTCCAGGAGGCCGCAGAAGAGGACGGCTTCCCAATAACAGTAGCAGGCCCAGCACCCCCACCATT +AATGTGCTGGAATCAAAGGATACAGACAGTGATAGGGAAGCAGGGACTGAAACGGGGGGAGAGAACAATGATAAAGAAGAAGAAGAGAAGAAAGATGAAACTTCGAGCTCCTCTGAAGCAAATTCTCGGTGTCAAACACCAATAAAGATGAAGCCA +AATATTGAACCTCCTGAGAATGTGGAGTGGAGTGGTGCTGAAGCCTCAATGTTTAGAGTCCTCATTGGCACTTACTATGACAATTTCTGTGCCATTGCTAGGTTAATTGGGACCAAAACATGTAGACAGGTGTATGAGTTTAGAGTCAAAGAATCT +AGCATCATAGCTCCAGCTCCCGCTGAGGATGTGGATACTCCTCCAAGGAAAAAGAAGAGGAAACACCGGTTGTGGGCTGCACACTGCAGAAAGATACAGCTGAAAAAGGACGGCTCCTCTAACCATGTTTACAACTATCAACCCTGTGATCATCCA +CGGCAGCCTTGTGACAGTTCGTGCCCTTGTGTGATAGCACAAAATTTTTGTGAAAAGTTTTGTCAATGTAGTTCAGAGTGTCAAAACCGCTTTCCGGGATGCCGCTGCAAAGCACAGTGCAACACCAAGCAGTGCCCGTGCTACCTGGCTGTCCGA 
+GAGTGTGACCCTGACCTCTGTCTTACTTGTGGAGCCGCTGACCATTGGGACAGTAAAAATGTGTCCTGCAAGAACTGCAGTATTCAGCGGGGCTCCAAAAAGCATCTATTGCTGGCACCATCTGACGTGGCAGGCTGGGGGATTTTTATCAAAGAT +CCTGTGCAGAAAAATGAATTCATCTCAGAATACTGTGGAGAGATTATTTCTCAAGATGAAGCTGACAGAAGAGGGAAAGTGTATGATAAATACATGTGCAGCTTTCTGTTCAACTTGAACAATGATTTTGTGGTGGATGCAACCCGCAAGGGTAAC +AAAATTCGTTTTGCAAATCATTCGGTAAATCCAAACTGCTATGCAAAAGTTATGATGGTTAACGGTGATCACAGGATAGGTATTTTTGCCAAGAGAGCCATCCAGACTGGCGAAGAGCTGTTTTTTGATTACAGATACAGCCAGGCTGATGCCCTG +AAGTATGTCGGCATCGAAAGAGAAATGGAAATCCCTTGA'''.replace('\n','') # EZH2 CDS string + +seq = Seq(cds) +wt_prot = seq.translate(to_stop=False) +print(wt_prot[218]) # 'S' — wild-type Serine at position 219 + +# Apply c.656C>T: position 656 is 0-indexed 655 +mut_seq = seq[:655] + "T" + seq[656:] +mut_prot = mut_seq.translate(to_stop=False) +print(mut_prot[218]) # 'F' — mutant Phenylalanine +``` + +--- + +## 1.3 Core Unit Tests (`test_cosmic_core.py`) + +14 Unit tests are implemented to verify the internal logic of `get_mut_pro_seq()` using minimal synthetic sequences (`Seq("ATGAAATTT")` etc.). + +| Class | Coverage | +|---|---| +| `TestGetMutProSeqDnaBranch` | DNA substitution, insertion, single-base deletion, two-position deletion, ambiguous `c.?` routing to protein branch | +| `TestGetMutProSeqProteinBranch` | Missense, nonsense truncation, in-frame insertion, two-position in-frame deletion, single-position in-frame deletion | +| `TestGetMutProSeqEdgeCases` | Mutation beyond sequence length, ambiguous `p.?` returns `""`, non-alpha last char in aa_mut returns `""`, DNA ref-allele mismatch returns `""` | + +These tests are fast to run and isolate individual code paths without requiring reference files. + +--- + +# 2. 
COSMIC Integration Pipeline Tests + +## 2.1 COSMIC v103 Schema and Data Model + +COSMIC v103 introduced two breaking changes from earlier releases: + +**Column name changes** — all columns are now `UPPERCASE_UNDERSCORE`: + +| Old (v2/v98) | New (v103) | +|---|---| +| `Gene name` | `GENE_SYMBOL` | +| `Accession Number` | `TRANSCRIPT_ACCESSION` | +| `CDS mutation` | `MUTATION_CDS` | +| `AA Mutation` | `MUTATION_AA` | +| `Type` | `MUTATION_DESCRIPTION` | +| `Primary site` | *(moved to Classification file)* | + +**Tissue-type data architecture** — `PRIMARY_SITE` (tissue of origin) is no longer a column in the mutation export. It lives in a separate **Classification file** (`Cosmic_Classification_Tsv_v103_GRCh38.tar`) and is joined via `COSMIC_PHENOTYPE_ID`: + +``` +mutation_file.COSMIC_PHENOTYPE_ID (e.g. COSO1001) + → Cosmic_Classification_v103_GRCh38.tsv.COSMIC_PHENOTYPE_ID + → PRIMARY_SITE (e.g. upper_aerodigestive_tract) +``` + +`COSMIC_PHENOTYPE_ID` is present at column 6 in both `Cosmic_CompleteTargetedScreensMutant` and `Cosmic_GenomeScreensMutant` files. No hop through the Sample file is needed. + +> **Note**: The Sample file (`Cosmic_Sample_Tsv_v103_GRCh38.tsv`) contains 34 columns covering sequencing metadata (`SAMPLE_NAME`, `COSMIC_SAMPLE_ID`, `TUMOUR_ID`, etc.) but does **not** contain `PRIMARY_SITE`. The Classification file is the authoritative source for tissue and histology. 
+ +--- + +## 2.2 Integration Test (`test_cosmic_to_proteindb`) + +**Test file**: `pgatk/tests/pgatk_tests.py::PgatkRunnerTests::test_cosmic_to_proteindb` + +The test runs the full `cosmic-to-proteindb` pipeline end-to-end using small testdata files that mirror the real v103 schema: + +| File | Purpose | +|---|---| +| `testdata/test_cosmic_mutations.tsv` | 12 HRAS mutation rows in v103 27-column format; covers `missense_variant`, `stop_gained`, `inframe_deletion`, `synonymous_variant` (filtered), `frameshift_variant` (`p.?`, skipped) | +| `testdata/test_cosmic_genes.fa` | HRAS CDS FASTA from `Cosmic_Genes_v103_GRCh38.fasta` | +| `testdata/test_cosmic_classification.tsv` | 8 rows mapping `COSMIC_PHENOTYPE_ID` (COSO1001–COSO1008) to `PRIMARY_SITE`; covers `upper_aerodigestive_tract`, `skin`, `bone`, `thyroid`, `liver`, `haematopoietic_and_lymphoid_tissue` | + +CLI invocation used in the test: + +``` +cosmic-to-proteindb + --config_file config/cosmic_config.yaml + --input_mutation testdata/test_cosmic_mutations.tsv + --input_genes testdata/test_cosmic_genes.fa + --output_db testdata/test_cosmic_mutations_proteindb.fa + --clinical_sample_file testdata/test_cosmic_classification.tsv + --filter_column PRIMARY_SITE + --split_by_filter_column + --accepted_values all +``` + +Expected output files (split per `PRIMARY_SITE`): + +``` +testdata/test_cosmic_mutations_proteindb_bone.fa +testdata/test_cosmic_mutations_proteindb_liver.fa +testdata/test_cosmic_mutations_proteindb_skin.fa +testdata/test_cosmic_mutations_proteindb_thyroid.fa +testdata/test_cosmic_mutations_proteindb_upperaerodigestivetract.fa +``` + +The `synonymous_variant` row (COSO1006) is pre-filtered and produces no output. The `frameshift_variant` row carrying `p.?` (COSO1008) is skipped by the `p.?` guard. The consolidated `testdata/test_cosmic_mutations_proteindb.fa` contains all non-filtered mutations regardless of tissue type. 
+ +--- + +## 2.3 `--clinical_sample_file` CLI Option + +The `--clinical_sample_file` / `-cl` option accepts any TSV file that maps a sample/phenotype ID column to the `--filter_column` value. For COSMIC, this is the Classification file: + +| Argument | COSMIC v103 value | +|---|---| +| `--clinical_sample_file` | `Cosmic_Classification_v103_GRCh38.tsv` (or `.gz`) | +| `--filter_column` | `PRIMARY_SITE` | +| Join key used internally | `COSMIC_PHENOTYPE_ID` (auto-detected for COSMIC) | + +For **cBioportal**, the equivalent is the clinical sample file with `SAMPLE_ID` as the join key — the `--clinical_sample_file` option uses `SAMPLE_ID` by default and switches to `COSMIC_PHENOTYPE_ID` only when invoked via `cosmic-to-proteindb`. + +--- + +## 2.4 Independent Validation with Real COSMIC Files + +After downloading COSMIC v103 files into `use-cases/use-case2/cosmic_data/`, run the pipeline against a subset of real mutations: + +```bash +# Extract 500 actionable mutations (skip intronic/UTR/synonymous) +zcat use-cases/use-case2/cosmic_data/Cosmic_CompleteTargetedScreensMutant_v103_GRCh38.tsv.gz \ + | head -1 > /tmp/cosmic_test.tsv +zcat use-cases/use-case2/cosmic_data/Cosmic_CompleteTargetedScreensMutant_v103_GRCh38.tsv.gz \ + | grep -E "missense_variant|stop_gained|inframe_deletion|inframe_insertion|frameshift" \ + | head -500 >> /tmp/cosmic_test.tsv + +pgatk cosmic-to-proteindb \ + --config_file pgatk/config/cosmic_config.yaml \ + --input_mutation /tmp/cosmic_test.tsv \ + --input_genes <(zcat use-cases/use-case2/cosmic_data/Cosmic_Genes_v103_GRCh38.fasta.gz) \ + --output_db /tmp/cosmic_out.fa \ + --clinical_sample_file <(zcat use-cases/use-case2/cosmic_data/Cosmic_Classification_v103_GRCh38.tsv.gz) \ + --filter_column PRIMARY_SITE \ + --split_by_filter_column \ + --accepted_values all +``` + +Expected: ~27 per-tissue output files, including canonical COSMIC tissue types (`lung`, `breast`, `large_intestine`, `skin`, `haematopoietic_and_lymphoid_tissue`, etc.). 
FASTA headers follow the format: + +``` +>COSMIC:GENE_SYMBOL:p.MutAA:SO_term +``` + +To verify the join is working (i.e. phenotype IDs are resolving to tissue names): + +```bash +# Count sequences per tissue file +for f in /tmp/cosmic_out_*.fa; do + echo "$f: $(grep -c '^>' $f) sequences" +done +``` + +A file named `cosmic_out_NS.fa` is expected for phenotypes where tissue is not specified in the Classification file — these are retained rather than silently dropped. diff --git a/pgatk/cgenomes/cgenomes_proteindb.py b/pgatk/cgenomes/cgenomes_proteindb.py index f42d7b8..1484728 100644 --- a/pgatk/cgenomes/cgenomes_proteindb.py +++ b/pgatk/cgenomes/cgenomes_proteindb.py @@ -140,19 +140,28 @@ def get_mut_pro_seq(snp: SNP, seq: Seq) -> Optional[str]: ref_dna = re.sub("[^A-Z]+", "", tmplist[0]) mut_dna = re.sub("[^A-Z]+", "", tmplist[1]) index = int(positions[0]) - 1 - if ref_dna == str(seq[index]).upper() and mut_dna in nucleotide: # + if ref_dna == str(seq[index]).upper() and mut_dna in nucleotide: seq_mut = seq[:index] + mut_dna + seq[index + 1:] mut_pro_seq = str(seq_mut.translate(to_stop=False)) elif "delins" in snp.dna_mut: - # Deletion-insertion: delete range then insert new bases - insert_dna = snp.dna_mut.split("delins")[1] - if insert_dna.isalpha() and len(positions) >= 2: - del_index1 = int(positions[0]) - 1 - del_index2 = int(positions[1]) + # HGVS delins: one or more nucleotides replaced by one or more other nucleotides. + coord_part, insert_raw = snp.dna_mut.split("delins", 1) + insert_dna = insert_raw.upper() + if re.search(r'\d[+-]|\*|-\d', coord_part): + # Intronic (c.N+X, c.N-X), 5'UTR (c.-N) or 3'UTR (c.*N) offsets + # cannot be mapped onto a CDS-only FASTA sequence; skip. + pass + elif not insert_dna.isalpha(): + # Unknown ('?'), N[n], or conversion-notation insertions cannot be resolved. + pass + elif len(positions) >= 2: + # Range delins: delete positions[0]..positions[1] (1-indexed, inclusive). 
+ del_index1 = int(positions[0]) - 1 # 0-indexed start + del_index2 = int(positions[1]) # 0-indexed exclusive end seq_mut = seq[:del_index1] + insert_dna + seq[del_index2:] mut_pro_seq = str(seq_mut.translate(to_stop=False)) - elif insert_dna.isalpha() and len(positions) == 1: - # Single-position delins: replace one base + elif len(positions) == 1: + # Single-position delins: replace the one nucleotide at positions[0]. del_index1 = int(positions[0]) - 1 seq_mut = seq[:del_index1] + insert_dna + seq[del_index1 + 1:] mut_pro_seq = str(seq_mut.translate(to_stop=False)) @@ -165,6 +174,20 @@ def get_mut_pro_seq(snp: SNP, seq: Seq) -> Optional[str]: seq_mut = seq[:ins_index1] + insert_dna + seq[ins_index1:] mut_pro_seq = str(seq_mut.translate(to_stop=False)) + elif "dup" in snp.dna_mut: + # Tandem duplication: re-insert the duplicated range after its end position. + if len(positions) == 2: + dup_start = int(positions[0]) - 1 + dup_end = int(positions[1]) + dup_seq = str(seq[dup_start:dup_end]) + seq_mut = seq[:dup_end] + dup_seq + seq[dup_end:] + mut_pro_seq = str(seq_mut.translate(to_stop=False)) + elif len(positions) == 1: + dup_pos = int(positions[0]) + dup_seq = str(seq[dup_pos - 1:dup_pos]) + seq_mut = seq[:dup_pos] + dup_seq + seq[dup_pos:] + mut_pro_seq = str(seq_mut.translate(to_stop=False)) + elif "del" in snp.dna_mut: if len(positions) == 2: del_index1 = int(positions[0]) - 1 @@ -180,7 +203,7 @@ def get_mut_pro_seq(snp: SNP, seq: Seq) -> Optional[str]: positions = re.findall(r'\d+', snp.aa_mut) protein_seq = str(seq.translate(to_stop=False)) - if "Missense" in snp.mutation_type: + if "Missense" in snp.mutation_type or "missense_variant" in snp.mutation_type: # Extract the mutant residue from HGVS like p.V600E (1-letter) # or p.Val600Glu (3-letter). For 1-letter the last char is the # AA; for 3-letter we need to convert the last triplet. 
@@ -190,20 +213,27 @@ def get_mut_pro_seq(snp: SNP, seq: Seq) -> Optional[str]: return '' index = int(positions[0]) - 1 mut_pro_seq = protein_seq[:index] + mut_aa + protein_seq[index + 1:] - elif "Nonsense" in snp.mutation_type: + elif "Nonsense" in snp.mutation_type or "stop_gained" in snp.mutation_type: index = int(positions[0]) - 1 mut_pro_seq = protein_seq[:index] - elif "Insertion - In frame" in snp.mutation_type: - try: - index = snp.aa_mut.index("ins") - except ValueError: - return '' - insert_aa_raw = snp.aa_mut[index + 3:] - insert_aa = _three_to_one(insert_aa_raw) - if insert_aa.isalpha(): - ins_index1 = int(positions[0]) - mut_pro_seq = protein_seq[:ins_index1] + insert_aa + protein_seq[ins_index1:] - elif "Deletion - In frame" in snp.mutation_type: + elif "Insertion - In frame" in snp.mutation_type or "inframe_insertion" in snp.mutation_type: + if "dup" in snp.aa_mut: + # Protein-level tandem dup: re-insert the duplicated residues after the range. + dup_start = int(positions[0]) - 1 + dup_end = int(positions[-1]) + dup_aa = protein_seq[dup_start:dup_end] + mut_pro_seq = protein_seq[:dup_end] + dup_aa + protein_seq[dup_end:] + else: + try: + index = snp.aa_mut.index("ins") + except ValueError: + return '' + insert_aa_raw = snp.aa_mut[index + 3:] + insert_aa = _three_to_one(insert_aa_raw) + if insert_aa.isalpha(): + ins_index1 = int(positions[0]) + mut_pro_seq = protein_seq[:ins_index1] + insert_aa + protein_seq[ins_index1:] + elif "Deletion - In frame" in snp.mutation_type or "inframe_deletion" in snp.mutation_type: if len(positions) == 2: del_index1 = int(positions[0]) - 1 del_index2 = int(positions[1]) @@ -211,7 +241,8 @@ def get_mut_pro_seq(snp: SNP, seq: Seq) -> Optional[str]: elif len(positions) == 1: del_index1 = int(positions[0]) - 1 mut_pro_seq = protein_seq[:del_index1] + protein_seq[del_index1 + 1:] - elif "Complex" in snp.mutation_type and "frameshift" not in snp.mutation_type: + elif ("Complex" in snp.mutation_type or 
"protein_altering_variant" in snp.mutation_type) \ + and "frameshift" not in snp.mutation_type: try: index = snp.aa_mut.index(">") except ValueError: @@ -256,65 +287,107 @@ def cosmic_to_proteindb(self) -> None: except KeyError: COSMIC_CDS_DB[record.id] = [record] + # Build phenotype-to-group mapping from the classification file if provided. + # COSMIC_PHENOTYPE_ID is present directly in the mutation file (column 6) and + # is the join key to the Cosmic_Classification file where PRIMARY_SITE lives. + sample_groups_dict = {} + if self._local_clinical_sample_file and self._filter_column: + sample_groups_dict = self.get_value_per_sample( + self._local_clinical_sample_file, + self._filter_column, + sample_id_column='COSMIC_PHENOTYPE_ID', + ) + if not sample_groups_dict: + self.get_logger().warning( + "clinical_sample_file '%s' produced no phenotype mappings; " + "falling back to direct filter-column lookup in the mutation file.", + self._local_clinical_sample_file, + ) + regex = re.compile('[^a-zA-Z]') mutation_dic = {} groups_mutations_dict = {} self.get_logger().debug("Reading input CosmicMutantExport.tsv ...") line_counter = 1 - required_columns = ["Gene name", "Accession Number", "Mutation CDS", "Mutation AA", "Mutation Description"] + required_columns = ["GENE_SYMBOL", "TRANSCRIPT_ACCESSION", "MUTATION_CDS", "MUTATION_AA", "MUTATION_DESCRIPTION"] with _open_text(self._local_mutation_file, encoding="latin-1") as cosmic_input, \ open(self._local_output_file, 'w', encoding='utf-8') as output: - header = cosmic_input.readline().strip().split("\t") + col_header = cosmic_input.readline().strip().split("\t") try: - gene_col = header.index("Gene name") - enst_col = header.index("Accession Number") - cds_col = header.index("Mutation CDS") - aa_col = header.index("Mutation AA") - muttype_col = header.index("Mutation Description") + gene_col = col_header.index("GENE_SYMBOL") + enst_col = col_header.index("TRANSCRIPT_ACCESSION") + cds_col = col_header.index("MUTATION_CDS") + 
aa_col = col_header.index("MUTATION_AA") + muttype_col = col_header.index("MUTATION_DESCRIPTION") except ValueError as e: self.get_logger().error( "COSMIC file missing required columns. Expected %s, got: %s", - required_columns, header + required_columns, col_header ) raise ValueError( f"COSMIC mutation file missing required column: {e}. " f"Expected columns include: {required_columns}" ) from e + + # COSMIC_PHENOTYPE_ID is the direct join key to the classification file. + try: + phenotype_id_col = col_header.index("COSMIC_PHENOTYPE_ID") + except ValueError: + phenotype_id_col = None + if sample_groups_dict: + self.get_logger().warning( + "COSMIC_PHENOTYPE_ID not found in mutation file header; classification file join will be skipped." + ) + + # Fall back to a direct filter column in the mutation file when no classification file is provided. filter_col = None - if self._filter_column: + if not sample_groups_dict and self._filter_column: try: - filter_col = header.index(self._filter_column) + filter_col = col_header.index(self._filter_column) except ValueError: self.get_logger().warning( - "Filter column '%s' not found in COSMIC header: %s. Filtering disabled.", - self._filter_column, header + "Filter column '%s' not found in COSMIC header. 
Filtering disabled.", + self._filter_column, ) max_col = max(gene_col, enst_col, cds_col, aa_col, muttype_col) if filter_col is not None: max_col = max(max_col, filter_col) + if phenotype_id_col is not None and sample_groups_dict: + max_col = max(max_col, phenotype_id_col) for line in cosmic_input: if line_counter % 10000 == 0: - msg = "Number of lines finished -- '{}'".format(line_counter) - self.get_logger().debug(msg) + self.get_logger().debug("Number of lines finished -- '%s'", line_counter) line_counter += 1 row = line.strip().split("\t") if len(row) <= max_col: self.get_logger().debug("Skipping malformed row (insufficient columns) at line %s: %s", line_counter, row[:5]) continue - # filter out mutations from unspecified groups - if filter_col is not None: - if row[filter_col] not in self._accepted_values and self._accepted_values != ['all']: + + # Determine the group (e.g. primary site) for this mutation. + group = None + if sample_groups_dict and phenotype_id_col is not None: + group = sample_groups_dict.get(row[phenotype_id_col]) + if group is None and (self._accepted_values != ['all'] or self._split_by_filter_column): + self.get_logger().warning( + "No classification found for COSMIC_PHENOTYPE_ID '%s'; skipping row.", + row[phenotype_id_col], + ) continue + elif filter_col is not None: + group = row[filter_col] + + if group is not None and group not in self._accepted_values and self._accepted_values != ['all']: + continue - if "coding silent" in row[muttype_col]: + if "coding silent" in row[muttype_col] or "synonymous_variant" in row[muttype_col]: continue snp = SNP(gene=row[gene_col], mrna=row[enst_col], dna_mut=row[cds_col], aa_mut=row[aa_col], mutation_type=row[muttype_col]) - header = "COSMIC:%s:%s:%s" % (snp.gene, snp.aa_mut, snp.mutation_type.replace(" ", "")) + fasta_header = "COSMIC:%s:%s:%s" % (snp.gene, snp.aa_mut, snp.mutation_type.replace(" ", "")) try: this_gene_records = COSMIC_CDS_DB[snp.gene] seqs = [] @@ -334,16 +407,16 @@ def 
@staticmethod
def get_sample_headers(header_line: list, filter_column: str,
                       sample_id_column: str = 'SAMPLE_ID') -> tuple[Optional[int], Optional[int]]:
    """Locate the filter and sample-id columns in a clinical-file header row.

    :param header_line: header row already split into a list of column names
    :param filter_column: name of the filtering column (e.g. PRIMARY_SITE)
    :param sample_id_column: name of the sample identifier column
    :return: ``(filter_col_index, sample_id_col_index)``, or ``(None, None)``
             when either column is absent (a warning is logged for the first
             missing one).
    """
    _logger = logging.getLogger(__name__)
    resolved_indices = []
    for column_name in (filter_column, sample_id_column):
        if column_name not in header_line:
            _logger.warning('%s was not found in the header row: %s', column_name, header_line)
            return None, None
        resolved_indices.append(header_line.index(column_name))
    return resolved_indices[0], resolved_indices[1]

def get_value_per_sample(self, local_clinical_sample_file: str, filter_column: str,
                         sample_id_column: str = 'SAMPLE_ID') -> dict:
    """Map each sample id to its filter-column value from a clinical annotation file.

    The header row is detected on the fly (first non-comment row containing
    both column names); every subsequent data row contributes one
    ``sample_id -> value`` entry. Spaces inside the value are replaced with
    underscores so the value stays safe for fasta headers and filenames.
    Returns an empty dict when no file is given.
    """
    sample_to_value: dict = {}
    if not local_clinical_sample_file:
        return sample_to_value
    with _open_text(local_clinical_sample_file, encoding='utf-8') as clinical_fh:
        filter_idx, sample_idx = None, None
        for raw_line in clinical_fh:
            if raw_line.startswith('#'):
                continue
            cells = raw_line.strip().split('\t')
            if sample_id_column in cells and filter_column in cells:
                # Header row: (re-)resolve column indices; never treat it as data.
                filter_idx, sample_idx = self.get_sample_headers(
                    cells, filter_column, sample_id_column
                )
                continue
            if filter_idx is None or sample_idx is None:
                self.get_logger().warning("No column was found for %s, %s in %s", filter_column,
                                          sample_id_column, local_clinical_sample_file)
            elif sample_idx < len(cells) and filter_idx < len(cells):
                sample_to_value[cells[sample_idx]] = cells[filter_idx].strip().replace(' ', '_')
    return sample_to_value
set(header_cols.keys()).issubset(set(row)): header_cols = self.get_mut_header_cols(header_cols, row) continue - # check if any is none in header_cols then continue if None in header_cols.values(): self.get_logger().error("Incorrect header column is given") continue - # get filter value and check it group = None if self._accepted_values != ['all'] or self._split_by_filter_column: try: diff --git a/pgatk/cgenomes/cosmic_downloader.py b/pgatk/cgenomes/cosmic_downloader.py index 3332014..65a066b 100644 --- a/pgatk/cgenomes/cosmic_downloader.py +++ b/pgatk/cgenomes/cosmic_downloader.py @@ -1,7 +1,9 @@ import base64 -import gzip -import requests import os +import tarfile + +import requests +from tqdm import tqdm from pgatk.toolbox.exceptions import AppConfigException from pgatk.toolbox.general import ParameterConfiguration, check_create_folders @@ -12,14 +14,22 @@ class CosmicDownloadService(ParameterConfiguration): CONFIG_OUTPUT_DIRECTORY = 'output_directory' CONFIG_COSMIC_SERVER = 'cosmic_server' CONFIG_COSMIC_FTP_URL = 'cosmic_ftp' + CONFIG_API_ENDPOINT = 'api_endpoint' + CONFIG_BUCKET = 'bucket' CONFIG_COSMIC_FTP_USER = "cosmic_user" CONFIG_COSMIC_FTP_PASSWORD = "cosmic_password" - CONFIG_COSMIC_MUTATIONS_URL = "mutations_url" - CONFIG_COSMIC_MUTATIONS_FILE = "mutations_file" - CONFIG_COSMIC_CELLLINE_MUTATIONS_URL = "mutations_cellline_url" - CONFIG_COSMIC_CELLLINE_MUTATIONS_FILE = "mutations_cellline_file" - CONFIG_COSMIC_CDS_GENES_FILE = "all_cds_genes_file" - CONFIG_COSMIC_CELLLINES_GENES_FILE = "all_celllines_genes_file" + CONFIG_PRODUCTS = "products" + + # Built-in fallback list, used only if `products` is missing from config. + # Real default lives in pgatk/config/cosmic_config.yaml and is the source of truth. 
+ _DEFAULT_PRODUCTS = ( + 'grch38/cosmic/v103/Cosmic_GenomeScreensMutant_Tsv_v103_GRCh38.tar', + 'grch38/cosmic/v103/Cosmic_CompleteTargetedScreensMutant_Tsv_v103_GRCh38.tar', + 'grch38/cosmic/v103/Cosmic_Genes_Fasta_v103_GRCh38.tar', + 'grch38/cosmic/v103/Cosmic_Transcripts_Tsv_v103_GRCh38.tar', + 'grch38/cell_lines/v103/CellLinesProject_GenomeScreensMutant_Tsv_v103_GRCh38.tar', + 'grch38/cell_lines/v103/CellLinesProject_CompleteCNA_Tsv_v103_GRCh38.tar', + ) def __init__(self, config_file, pipeline_arguments): """ @@ -29,44 +39,41 @@ def __init__(self, config_file, pipeline_arguments): """ super(CosmicDownloadService, self).__init__(self.CONFIG_KEY_DATA_DOWNLOADER, config_file, pipeline_arguments) - self._local_path_cosmic = self.get_configuration_default_params(variable=self.CONFIG_OUTPUT_DIRECTORY, - default_value='./database_cosmic/') - self._cosmic_ftp_url = self.get_configuration_default_params(variable=self.CONFIG_COSMIC_FTP_URL, - default_value='https://cancer.sanger.ac.uk') - self._cosmic_user = self.get_configuration_default_params(variable=self.CONFIG_COSMIC_FTP_USER, - default_value='') - self._cosmic_password = self.get_configuration_default_params(variable=self.CONFIG_COSMIC_FTP_PASSWORD, - default_value='') - self._cosmic_mutation_url = self.get_configuration_default_params(variable=self.CONFIG_COSMIC_MUTATIONS_URL, - default_value='cosmic/file_download/GRCh38/cosmic/v94') - self._cosmic_mutations_file = self.get_configuration_default_params(variable=self.CONFIG_COSMIC_MUTATIONS_FILE, - default_value='CosmicMutantExport.tsv.gz') - self._cosmic_cellline_mutation_url = self.get_configuration_default_params( - variable=self.CONFIG_COSMIC_CELLLINE_MUTATIONS_URL, - default_value='cosmic/file_download/GRCh38/cell_lines/v94') - self._cosmic_cellline_mutation_file = self.get_configuration_default_params( - variable=self.CONFIG_COSMIC_CELLLINE_MUTATIONS_FILE, default_value='CosmicCLP_MutantExport.tsv.gz') - self._cosmic_cellline_mutation_gene = 
def get_configuration_default_params(self, variable: str, default_value):
    """Resolve a configuration value with the precedence:
    pipeline (CLI) arguments > top-level ``cosmic_data.<variable>`` >
    ``cosmic_data.cosmic_server.<variable>`` > the supplied default.

    :param variable: configuration key to look up
    :param default_value: value returned when the key is absent everywhere
    """
    if variable in self.get_pipeline_parameters():
        return self.get_pipeline_parameters()[variable]
    # A missing config file or an empty YAML section parses to None --
    # coalesce to {} so the membership tests below cannot TypeError.
    defaults = self.get_default_parameters() or {}
    if self.CONFIG_KEY_DATA_DOWNLOADER not in defaults:
        return default_value
    data = defaults[self.CONFIG_KEY_DATA_DOWNLOADER] or {}
    # `output_directory` lives at cosmic_data.<variable>; everything else is
    # nested under cosmic_data.cosmic_server.<variable>. The isinstance guard
    # stops the nested `cosmic_server` mapping itself from being returned.
    if variable in data and not isinstance(data[variable], dict):
        return data[variable]
    # An empty `cosmic_server:` section is None in YAML, not {} -- coalesce.
    server = data.get(self.CONFIG_COSMIC_SERVER) or {}
    if variable in server:
        return server[variable]
    return default_value

def build_api_url(self, product_path):
    """Build the COSMIC scripted-download API URL for a single product.

    :param product_path: the `path=` value for one product, e.g.
        ``grch38/cosmic/v103/Cosmic_Genes_Fasta_v103_GRCh38.tar``
    """
    return "{server}/{endpoint}?path={path}&bucket={bucket}".format(
        server=self._cosmic_ftp_url.rstrip('/'),
        endpoint=self._api_endpoint.strip('/'),
        path=product_path,
        bucket=self._bucket,
    )
def download_mutation_file(self, url_file_name=None):
    """
    Fetch every product listed in `self._products` through COSMIC's v103+
    scripted-download API. Each product entry is a `path=` value such as
    `grch38/cosmic/v103/Cosmic_GenomeScreensMutant_Tsv_v103_GRCh38.tar`.

    If url_file_name is provided, write the API URLs to that file instead of downloading.
    """
    product_paths = list(self._products)
    if not product_paths:
        self.get_logger().warning("No COSMIC products configured; nothing to download.")
        return

    destination_dir = self.get_local_path_root_cosmic_repo()
    # One (api_url, local_target) pair per configured product.
    targets = [
        (self.build_api_url(product_path),
         "{}/{}".format(destination_dir, os.path.basename(product_path)))
        for product_path in product_paths
    ]

    if url_file_name is not None:
        # Dry-run mode: emit <api_url>\t<local_file> rows and stop.
        with open(url_file_name, 'w', encoding='utf-8') as url_file:
            url_file.writelines("{}\t{}\n".format(api_url, local_target)
                                for api_url, local_target in targets)
        return

    auth_header = "Basic {}".format(self._cosmic_token)
    for api_url, local_target in targets:
        self.download_file_cosmic(api_url, local_target, auth_header)
def download_file_cosmic(self, api_url, local_file, token):
    """
    Two-step download from COSMIC's scripted-download API:
      1) GET api_url with Basic auth -> JSON body {"url": "<presigned S3 URL>"}
      2) GET the presigned URL (no auth, short TTL) -> stream to local_file

    If local_file ends in .tar, extract its contents to the same directory
    and delete the archive.

    :param api_url: COSMIC scripted-download endpoint URL for one product
    :param local_file: destination path for the downloaded archive
    :param token: value for the Authorization header ("Basic <b64credentials>")
    :raises AppConfigException: on API/S3 failure or a malformed API response
    """
    # `with` closes the response (and releases its pooled connection) on every
    # path -- the previous implementation never closed either response, which
    # with stream=True leaks one connection per call until GC.
    with requests.get(api_url, headers={'Authorization': token}, timeout=30) as api_response:
        if api_response.status_code != 200:
            msg = ("COSMIC API request failed: HTTP {} for {}. Body: {!r}"
                   .format(api_response.status_code, api_url, api_response.text[:200]))
            self.get_logger().error(msg)
            raise AppConfigException(msg)
        try:
            payload = api_response.json()
            download_url = payload['url']
        except (ValueError, KeyError) as exc:
            msg = ("COSMIC API returned 200 but the body is not a JSON object with a 'url' "
                   "key (got: {!r}). This usually means the `path=` value is wrong for the "
                   "release/assembly, or your account lacks access. URL was: {}. Error: {}"
                   .format(api_response.text[:200], api_url, exc))
            self.get_logger().error(msg)
            raise AppConfigException(msg)

    # The presigned URL embeds short-lived AWS credentials in its query string;
    # log only the path so the secret material doesn't leak into log aggregators.
    self.get_logger().debug("Downloading file from signed URL (path %s)", download_url.split('?', 1)[0])
    with requests.get(download_url, stream=True, timeout=30) as data_response:
        if data_response.status_code != 200:
            msg = ("COSMIC S3 download failed: HTTP {} for {}"
                   .format(data_response.status_code, download_url))
            self.get_logger().error(msg)
            raise AppConfigException(msg)

        total_size = int(data_response.headers.get('content-length') or 0)
        chunk_size = 1024 * 1024
        progress = tqdm(
            total=total_size if total_size > 0 else None,
            unit='B', unit_scale=True, unit_divisor=1024,
            desc=os.path.basename(local_file),
            leave=True,
        )
        try:
            with open(local_file, 'wb') as f:
                for chunk in data_response.iter_content(chunk_size=chunk_size):
                    if chunk:
                        f.write(chunk)
                        progress.update(len(chunk))
        finally:
            progress.close()
    self.get_logger().debug("Download finished: '{}'".format(local_file))

    if local_file.endswith('.tar'):
        extract_dir = os.path.dirname(local_file) or '.'
        print("Extracting {}...".format(os.path.basename(local_file)), flush=True)
        self._safe_extract_tar(local_file, extract_dir)
        os.remove(local_file)
        self.get_logger().debug("Extracted archive into '{}'".format(extract_dir))

@staticmethod
def _safe_extract_tar(tar_path, dest_dir):
    """Extract tar_path into dest_dir, rejecting unsafe members.

    Guards against path traversal (CVE-2007-4559) via absolute paths or
    ``..`` components in member names, and ensures any symlink/hardlink
    targets resolve within dest_dir. COSMIC archives are trusted in
    practice, but defence in depth costs nothing and silences static
    analysers.
    """
    dest_abs = os.path.realpath(dest_dir)
    with tarfile.open(tar_path, 'r') as tar:
        safe_members: list[tarfile.TarInfo] = []
        for member in tar.getmembers():
            member_path = os.path.realpath(os.path.join(dest_abs, member.name))
            if os.path.commonpath([dest_abs, member_path]) != dest_abs:
                raise AppConfigException(
                    "Refusing to extract tar member {!r}: would escape target dir {!r}"
                    .format(member.name, dest_abs)
                )
            if member.issym() or member.islnk():
                link_target = member.linkname
                if os.path.isabs(link_target):
                    raise AppConfigException(
                        "Refusing to extract tar member {!r}: absolute symlink/hardlink target {!r}"
                        .format(member.name, link_target)
                    )
                resolved = os.path.realpath(os.path.join(dest_abs, os.path.dirname(member.name), link_target))
                if os.path.commonpath([dest_abs, resolved]) != dest_abs:
                    raise AppConfigException(
                        "Refusing to extract tar member {!r}: symlink/hardlink would escape target dir"
                        .format(member.name)
                    )
            safe_members.append(member)
        # All members in safe_members were validated against dest_abs above:
        # absolute paths, .. traversal, and unsafe symlink targets all rejected.
        tar.extractall(path=dest_abs, members=safe_members)  # nosec B202
Find overlapping transcripts via BedTools (from DataFrame) overlap_map = self._build_overlap_map(vcf_df, self._gtf_file, chrom_mapper) logger.info("Found %d variants with transcript overlaps", len(overlap_map)) @@ -411,7 +406,6 @@ def run(self) -> str: for _, record in vcf_df.iterrows(): stats["variants_processed"] += 1 - # --- Validate alleles --- ref = str(record.REF) if any(c not in "ACGT" for c in ref): continue @@ -423,26 +417,22 @@ def run(self) -> str: if not alts: continue - # --- CLNSIG filter --- info = str(record.INFO) clnsig = self._get_info_field(info, "CLNSIG") if not self.passes_clnsig_filter(clnsig, self._clnsig_exclude): stats["variants_filtered_clnsig"] += 1 continue - # --- MC consequence filter --- mc_field = self._get_info_field(info, "MC") if not self.passes_mc_filter(mc_field, self._include_consequences): stats["variants_filtered_mc"] += 1 continue - # --- Parse gene symbol and CLNSIG for description --- gene_symbol, _ = self.parse_geneinfo( self._get_info_field(info, "GENEINFO") ) desc_str = f"{clnsig}|{gene_symbol}" if gene_symbol else clnsig - # --- Find overlapping transcripts --- chrom = str(record.CHROM) pos = int(record.POS) @@ -465,7 +455,6 @@ def run(self) -> str: continue processed_pairs.add(pair_key) - # Resolve transcript in FASTA tid = transcript_id if tid not in transcripts_dict: tid = transcript_id_mapping.get( @@ -479,7 +468,6 @@ def run(self) -> str: ) continue - # --- Biotype filter --- if self._include_biotypes != ["all"]: biotype = self._get_transcript_biotype(db, tid) if biotype and biotype not in self._include_biotypes: @@ -489,7 +477,6 @@ def run(self) -> str: ref_seq = fasta_record.seq desc = str(fasta_record.description) - # Determine CDS info and feature types cds_info: list[int] = [] feature_types = ["exon"] num_orfs = 3 @@ -512,19 +499,16 @@ def run(self) -> str: desc, ) - # Get features from GTF feat_chrom, strand, features_info = self._get_features( db, tid, feature_types ) if feat_chrom is None: continue - # 
Check overlap at feature level var_end = pos + len(ref) - 1 if not check_overlap(pos, var_end, features_info): continue - # Apply variant coding_ref_seq, coding_alt_seq = get_altseq( ref_seq, Seq(ref), @@ -538,7 +522,6 @@ def run(self) -> str: if coding_alt_seq == "": continue - # Translate ref_orfs, alt_orfs = get_orfs_vcf( coding_ref_seq, coding_alt_seq, @@ -546,7 +529,6 @@ def run(self) -> str: num_orfs, ) - # Build sequence ID record_id = "" if record.ID and str(record.ID) != ".": record_id = str(record.ID) diff --git a/pgatk/commands/cosmic_downloader.py b/pgatk/commands/cosmic_downloader.py index 5696097..fe35216 100644 --- a/pgatk/commands/cosmic_downloader.py +++ b/pgatk/commands/cosmic_downloader.py @@ -16,9 +16,19 @@ help="Username for cosmic database -- please if you don't have one register here (https://cancer.sanger.ac.uk/cosmic/register)") @click.option('-p', '--password', help="Password for cosmic database -- please if you don't have one register here (https://cancer.sanger.ac.uk/cosmic/register)") -@click.option("--url_file", help='Add the url to a downloaded file') +@click.option('-P', '--products', multiple=True, + help='COSMIC scripted-download `path=` value to fetch (e.g. ' + 'grch38/cosmic/v103/Cosmic_GenomeScreensMutant_Tsv_v103_GRCh38.tar). ' + 'May be repeated. Overrides the products list in the config file when provided. ' + 'Find available paths at https://cancer.sanger.ac.uk/cosmic/download/cosmic ' + '(open the "Scripted Download" panel for any product).') +@click.option("--url_file", + help='If set, do not download anything. Instead, write a TSV with one row per ' + 'product (`\\t`) so the downloads can be driven ' + 'externally (e.g. wget/curl loop). 
The api_url is the COSMIC scripted-download ' + 'endpoint URL — fetching it returns JSON containing the actual signed S3 URL.') @click.pass_context -def cosmic_downloader(ctx, config_file, output_directory, username, password, url_file): +def cosmic_downloader(ctx, config_file, output_directory, username, password, products, url_file): config_data = load_config("cosmic", config_file) @@ -31,5 +41,8 @@ def cosmic_downloader(ctx, config_file, output_directory, username, password, ur if password is not None: pipeline_arguments[CosmicDownloadService.CONFIG_COSMIC_FTP_PASSWORD] = password + if products: + pipeline_arguments[CosmicDownloadService.CONFIG_PRODUCTS] = list(products) + cosmic_downloader_service = CosmicDownloadService(config_data, pipeline_arguments) cosmic_downloader_service.download_mutation_file(url_file_name=url_file) diff --git a/pgatk/commands/cosmic_to_proteindb.py b/pgatk/commands/cosmic_to_proteindb.py index 5c24ca4..b4d0c27 100644 --- a/pgatk/commands/cosmic_to_proteindb.py +++ b/pgatk/commands/cosmic_to_proteindb.py @@ -22,9 +22,12 @@ @click.option('-s', '--split_by_filter_column', help='Use this flag to generate a proteinDB per group as specified in the filter_column, default is False', is_flag=True) +@click.option('-cl', '--clinical_sample_file', + help='COSMIC sample annotation file used to map ' + 'COSMIC_SAMPLE_ID to the filter column value (e.g. 
PRIMARY_SITE)') @click.pass_context def cosmic_to_proteindb(ctx, config_file, input_mutation, input_genes, output_db, - filter_column, accepted_values, split_by_filter_column): + filter_column, accepted_values, split_by_filter_column, clinical_sample_file): config_data = load_config("cosmic", config_file) @@ -39,10 +42,13 @@ def cosmic_to_proteindb(ctx, config_file, input_mutation, input_genes, output_db if output_db is not None: pipeline_arguments[CancerGenomesService.CONFIG_OUTPUT_FILE] = output_db + if clinical_sample_file is not None: + pipeline_arguments[CancerGenomesService.CLINICAL_SAMPLE_FILE] = clinical_sample_file + if filter_column is not None: pipeline_arguments[CancerGenomesService.FILTER_COLUMN] = filter_column elif config_data is None: - pipeline_arguments[CancerGenomesService.FILTER_COLUMN] = 'Primary site' + pipeline_arguments[CancerGenomesService.FILTER_COLUMN] = 'PRIMARY_SITE' if accepted_values is not None: pipeline_arguments[CancerGenomesService.ACCEPTED_VALUES] = accepted_values diff --git a/pgatk/commands/ensembl_downloader.py b/pgatk/commands/ensembl_downloader.py index 83adb88..0885cd7 100644 --- a/pgatk/commands/ensembl_downloader.py +++ b/pgatk/commands/ensembl_downloader.py @@ -38,7 +38,6 @@ def ensembl_downloader(ctx, config_file, output_directory, taxonomy, folder_pref if taxonomy is None and ensembl_name is None: raise click.UsageError("Either --taxonomy or --ensembl_name is required.") - # Parse pipelines parameters. pipeline_arguments = {} if output_directory is not None: diff --git a/pgatk/commands/vcf_to_proteindb.py b/pgatk/commands/vcf_to_proteindb.py index da2d474..7cb2faa 100644 --- a/pgatk/commands/vcf_to_proteindb.py +++ b/pgatk/commands/vcf_to_proteindb.py @@ -38,6 +38,8 @@ help="enabling this option causes or variants to be parsed. By default only variants that have not failed any filters will be processed (FILTER column is PASS, None, .) or if the filters are subset of the accepted filters. 
(default is False)", is_flag=True) @click.option('--accepted_filters', help="Accepted filters for variant parsing") +@click.option('-w', '--workers', type=int, default=None, + help="Number of worker processes to fan out across (default: 1, sequential). Per-chromosome split.") @click.pass_context def vcf_to_proteindb(ctx, config_file, input_fasta, vcf, gene_annotations_gtf, translation_table, mito_translation_table, @@ -45,7 +47,7 @@ def vcf_to_proteindb(ctx, config_file, input_fasta, vcf, gene_annotations_gtf, t af_field, af_threshold, transcript_str, biotype_str, exclude_biotypes, include_biotypes, consequence_str, exclude_consequences, skip_including_all_cds, include_consequences, - ignore_filters, accepted_filters): + ignore_filters, accepted_filters, workers): config_data = load_config("ensembl_config", config_file) @@ -105,5 +107,8 @@ def vcf_to_proteindb(ctx, config_file, input_fasta, vcf, gene_annotations_gtf, t if accepted_filters is not None: pipeline_arguments[EnsemblDataService.ACCEPTED_FILTERS] = accepted_filters + if workers is not None: + pipeline_arguments[EnsemblDataService.WORKERS] = workers + ensembl_data_service = EnsemblDataService(config_data, pipeline_arguments) - ensembl_data_service.vcf_to_proteindb(vcf, input_fasta, gene_annotations_gtf) + ensembl_data_service.vcf_to_proteindb(vcf, input_fasta, gene_annotations_gtf, workers=workers) diff --git a/pgatk/config/cosmic_config.yaml b/pgatk/config/cosmic_config.yaml index 6b6da9b..0555af5 100644 --- a/pgatk/config/cosmic_config.yaml +++ b/pgatk/config/cosmic_config.yaml @@ -2,14 +2,32 @@ cosmic_data: output_directory: database_cosmic cosmic_server: cosmic_ftp: https://cancer.sanger.ac.uk - mutations_url: cosmic/file_download/GRCh38/cosmic/v94 - all_cds_genes_file: All_COSMIC_Genes.fasta.gz - mutations_file: CosmicMutantExport.tsv.gz - mutations_cellline_url: cosmic/file_download/GRCh38/cell_lines/v94 - mutations_cellline_file: CosmicCLP_MutantExport.tsv.gz - all_celllines_genes_file: 
All_CellLines_Genes.fasta.gz + api_endpoint: api/mono/products/v1/downloads/scripted + bucket: downloads cosmic_user: '' cosmic_password: '' + # List of COSMIC scripted-download paths to fetch. Each entry is the `path=` + # value from the curl command shown on the COSMIC UI's "Scripted Download" panel + # for a product (https://cancer.sanger.ac.uk/cosmic/download/cosmic). + # + # Path format: /// + # - is lowercase (`grch37`, `grch38`) + # - is `cosmic` for the main COSMIC project, `cell_lines` for the + # Cell Lines Project (other projects: `cancer_mutation_census`, `actionability`) + # - is the release (`v103`, etc.) + # - is the exact .tar archive name shown in the UI + # + # Add or remove entries to change what gets downloaded. Defaults below cover the + # three files the cosmic-to-proteindb pipeline expects (mutations TSV + gene FASTA + # for the main COSMIC project, plus cell-line mutations TSV). + products: + - grch38/cosmic/v103/Cosmic_GenomeScreensMutant_Tsv_v103_GRCh38.tar + - grch38/cosmic/v103/Cosmic_CompleteTargetedScreensMutant_Tsv_v103_GRCh38.tar + - grch38/cosmic/v103/Cosmic_Genes_Fasta_v103_GRCh38.tar + - grch38/cosmic/v103/Cosmic_Transcripts_Tsv_v103_GRCh38.tar + - grch38/cell_lines/v103/CellLinesProject_GenomeScreensMutant_Tsv_v103_GRCh38.tar + - grch38/cell_lines/v103/CellLinesProject_CompleteCNA_Tsv_v103_GRCh38.tar + - grch38/cosmic/v103/Cosmic_Classification_Tsv_v103_GRCh38.tar logger: formatters: DEBUG: "%(asctime)s [%(levelname)7s][%(name)48s][%(module)32s, %(lineno)4s] %(message)s" @@ -17,7 +35,7 @@ cosmic_data: loglevel: DEBUG proteindb: filter_info: - filter_column: 'Primary site' + filter_column: 'PRIMARY_SITE' accepted_values: "all" split_by_filter_column: False clinical_sample_file: '' diff --git a/pgatk/config/ensembl_config.yaml b/pgatk/config/ensembl_config.yaml index 1838c7c..ac679ed 100644 --- a/pgatk/config/ensembl_config.yaml +++ b/pgatk/config/ensembl_config.yaml @@ -11,7 +11,7 @@ ensembl_translation: exclude_biotypes: '' 
exclude_consequences: 'downstream_gene_variant, upstream_gene_variant, intergenic_variant, intron_variant, synonymous_variant, regulatory_region_variant' skip_including_all_cds: False - include_biotypes: 'protein_coding,polymorphic_pseudogene,non_stop_decay,nonsense_mediated_decay,IG_C_gene,IG_D_gene,IG_J_gene,IG_V_gene,TR_C_gene,TR_D_gene,TR_J_gene,TR_V_gene,TEC,mRNA' + include_biotypes: 'IG_C_gene,IG_D_gene,IG_J_gene,IG_V_gene,nonsense_mediated_decay,non_stop_decay,protein_coding,protein_coding_LoF,TR_C_gene,TR_D_gene,TR_J_gene,TR_V_gene' include_consequences: 'all' biotype_str: transcript_biotype transcript_description_sep: ';' diff --git a/pgatk/db/digest_mutant_protein.py b/pgatk/db/digest_mutant_protein.py index 7b49201..6651aaa 100644 --- a/pgatk/db/digest_mutant_protein.py +++ b/pgatk/db/digest_mutant_protein.py @@ -18,7 +18,7 @@ def trypsin_cleavage(proseq: str, miss_cleavage: int): except IndexError: pass - if aa in ['K', 'R'] and next_aa != 'P': # for trypsin peptides + if aa in ['K', 'R'] and next_aa != 'P': if len(peptide) > 0: peptides.append(peptide) peptide = '' @@ -63,7 +63,6 @@ def digest_mutant_proteins( :param max_length: Maximum peptide length to retain (default: 40). :param missed_cleavages: Number of missed cleavages for trypsin digestion (default: 0). 
""" - # Build canonical peptidome from reference FASTA handle1 = SeqIO.parse(fasta_file, 'fasta') peptidome = {} @@ -78,7 +77,6 @@ def digest_mutant_proteins( log.info("Known peptides number: %d", len(peptidome)) handle1.close() - # Parse mutant protein input files filelist = input_file.split(",") handle_list = [] for f in filelist: diff --git a/pgatk/db/map_peptide2genome.py b/pgatk/db/map_peptide2genome.py index a09b119..c573dc0 100644 --- a/pgatk/db/map_peptide2genome.py +++ b/pgatk/db/map_peptide2genome.py @@ -87,7 +87,7 @@ def parse_gtf(infile): dic = {} with open(infile, "r", encoding='utf-8') as infile_object: for line in infile_object: - if line[0] != "#": # skip lines commented out + if line[0] != "#": row = line.strip().split("\t") if row[2] == "CDS": attri_list = row[8].split(";") @@ -153,7 +153,6 @@ def map_peptides_to_genome( pep_dic = {} with open(input_file, 'r', encoding='utf-8') as input_stream: - # peptide table with two columns, peptide sequence in first column, protein ID in second column input_stream.readline() for line in input_stream: row = line.strip().split("\t") @@ -188,7 +187,6 @@ def map_peptides_to_genome( exons, pep_trans_start, pep_trans_end ) - # handle exceptions if pep_chr_start > pep_chr_end: non_mapped_pep += 1 continue diff --git a/pgatk/ensembl/data_downloader.py b/pgatk/ensembl/data_downloader.py index 1ad4812..0be653b 100644 --- a/pgatk/ensembl/data_downloader.py +++ b/pgatk/ensembl/data_downloader.py @@ -6,7 +6,6 @@ 2. Given a species ID, collect its GTF data, with the option of decompressing it or not. 
""" -# App imports from json import loads import re from pgatk.toolbox.general import ParameterConfiguration, check_create_folders, download_file @@ -432,7 +431,6 @@ def get_vcf_files(self, species: dict, url_file=None) -> list: file_name=self.get_local_path_root_ensembl_repo() + '/' + file_name, log=self.get_logger(), url_file=url_file) if downloaded_file is not None: - # if chr1 is downloaded then try all others files.append(downloaded_file) for chrN in range(2, 23): # chr2-22 file_name = '{}_incl_consequences-chr{}.vcf.gz'.format(species['name'], chrN) diff --git a/pgatk/ensembl/ensembl.py b/pgatk/ensembl/ensembl.py index e1d8b7e..49fe852 100644 --- a/pgatk/ensembl/ensembl.py +++ b/pgatk/ensembl/ensembl.py @@ -1,6 +1,8 @@ from __future__ import annotations import logging +import os +import re import sqlite3 from pathlib import Path from typing import Any, Optional @@ -19,6 +21,103 @@ ) +class _FeatureCache: + + """Per-run memoization of get_features() results, keyed by transcript id. + + Bounded by a soft maxsize to avoid unbounded growth on whole-genome runs. + Defaults to 50000, which comfortably covers human ENSEMBL (~250k transcripts + across all chromosomes; a single chr is ~10-20k). + """ + + def __init__(self, maxsize: int = 50000) -> None: + self._cache: dict[tuple, tuple] = {} + self._maxsize = maxsize + + def get(self, key: tuple) -> Optional[tuple]: + return self._cache.get(key) + + def put(self, key: tuple, value: tuple) -> None: + if len(self._cache) >= self._maxsize: + for k in list(self._cache.keys())[: self._maxsize // 5]: + del self._cache[k] + self._cache[key] = value + + +_MISSING = object() + + +def _safe_chrom(chrom: str) -> str: + """Sanitise a chromosome identifier into a filesystem-safe filename component.""" + return re.sub(r'[^A-Za-z0-9._-]', '_', str(chrom)) + + +def _ensure_fasta_index(input_fasta: str) -> str: + """Return the path to a `.idx` SQLite index for input_fasta, building it + if absent or stale. 
Stale means: the .idx exists but the FASTA's mtime + is newer than the .idx's. The index is keyed by `EnsemblDataService.get_key`. + """ + idx_path = input_fasta + ".idx" + if os.path.exists(idx_path): + if os.path.getmtime(idx_path) >= os.path.getmtime(input_fasta): + return idx_path + try: + os.remove(idx_path) + except OSError: + pass + # Build the index. SeqIO.index_db materialises the SQLite file on disk + # as a side effect; we don't need to keep the returned handle here. + SeqIO.index_db(idx_path, [input_fasta], "fasta", key_function=EnsemblDataService.get_key) + return idx_path + + +def _split_vcf_by_chrom(vcf_file: str, output_dir: str) -> dict[str, str]: + """Stream `vcf_file` once, writing per-chromosome VCFs into `output_dir`. + + Each output file is `{output_dir}/chunk_{chrom}.vcf` and contains the + full VCF header followed by only the data lines for that chromosome. + Preserves line ordering within each chromosome. + + Returns a mapping `{chrom: chunk_path}`. Constant memory — holds at most + one open file handle per chromosome seen so far. + """ + header: list[str] = [] + handles: dict[str, Any] = {} + chunk_paths: dict[str, str] = {} + try: + with open(vcf_file, 'r', encoding='utf-8') as f: + for line in f: + if line.startswith('#'): + header.append(line) + continue + if not line.strip(): + continue + chrom = line.split('\t', 1)[0] + handle = handles.get(chrom) + if handle is None: + chunk_path = os.path.join(output_dir, "chunk_" + _safe_chrom(chrom) + ".vcf") + handle = open(chunk_path, 'w', encoding='utf-8') + handle.writelines(header) + handles[chrom] = handle + chunk_paths[chrom] = chunk_path + handle.write(line) + finally: + for h in handles.values(): + h.close() + return chunk_paths + + +def _vcf_to_proteindb_worker(default_params, pipeline_args, vcf_file, + input_fasta, gene_annotations_gtf, output_path): + """Module-level worker for multiprocessing.Pool.starmap.
+ + Re-constructs an EnsemblDataService in the worker process from the + pickled config dict + pipeline args, then runs the chunk pipeline. + """ + svc = EnsemblDataService(default_params, pipeline_args) + svc._vcf_to_proteindb_chunk(vcf_file, input_fasta, gene_annotations_gtf, output_path) + + class EnsemblDataService(ParameterConfiguration): CONFIG_KEY_VCF = "ensembl_translation" INPUT_FASTA = "input_fasta" @@ -47,6 +146,7 @@ class EnsemblDataService(ParameterConfiguration): EXPRESSION_THRESH = "expression_thresh" IGNORE_FILTERS = "ignore_filters" ACCEPTED_FILTERS = "accepted_filters" + WORKERS = "workers" def __init__(self, config_file: dict, pipeline_arguments: dict) -> None: """ @@ -85,7 +185,7 @@ def __init__(self, config_file: dict, pipeline_arguments: dict) -> None: default_value=False) self._include_biotypes = self.get_multiple_options( self.get_translation_properties(variable=self.INCLUDE_BIOTYPES, - default_value='protein_coding,polymorphic_pseudogene,non_stop_decay,nonsense_mediated_decay,IG_C_gene,IG_D_gene,IG_J_gene,IG_V_gene,TR_C_gene,TR_D_gene,TR_J_gene,TR_V_gene,TEC,mRNA')) + default_value='protein_coding,protein_coding_CDS_not_defined,protein_coding_LoF,nonsense_mediated_decay,non_stop_decay,translated_processed_pseudogene,IG_C_gene,IG_D_gene,IG_J_gene,IG_V_gene,TR_C_gene,TR_D_gene,TR_J_gene,TR_V_gene,TEC')) self._include_consequences = self.get_multiple_options( self.get_translation_properties(variable=self.INCLUDE_CONSEQUENCES, default_value='all')) @@ -103,6 +203,12 @@ def __init__(self, config_file: dict, pipeline_arguments: dict) -> None: self._accepted_filters = self.get_multiple_options( self.get_translation_properties(variable=self.ACCEPTED_FILTERS, default_value='PASS')) + raw_workers = self.get_translation_properties(variable=self.WORKERS, default_value=1) + try: + self._workers = max(1, int(raw_workers)) + except (TypeError, ValueError): + self._workers = 1 + def get_translation_properties(self, variable: str, default_value: Any) -> 
Any: value_return = default_value if variable in self.get_pipeline_parameters(): @@ -237,10 +343,10 @@ def dnaseq_to_proteindb(self, input_fasta: str) -> str: with open(self._proteindb_output, 'w', encoding='utf-8') as prots_fn: for record_id in seq_dict.keys(): - ref_seq = seq_dict[record_id].seq # get the seq and desc for the record from the fasta of the gtf + ref_seq = seq_dict[record_id].seq desc = str(seq_dict[record_id].description) - key_values = {} # extract key=value in the desc into a dict + key_values = {} sep = self._transcript_description_sep desc = desc.replace(' ', sep) for value in desc.split(sep): @@ -264,7 +370,6 @@ def dnaseq_to_proteindb(self, input_fasta: str) -> str: record_id, desc) self.get_logger().debug(msg) - # only include features that have the specified biotypes or they have CDSs info if 'CDS' in key_values.keys() and ( not self._skip_including_all_cds or 'altORFs' in self._include_biotypes): pass @@ -273,7 +378,6 @@ def dnaseq_to_proteindb(self, input_fasta: str) -> str: 'all']))): continue - # check wether to filter on expression and if it passes if self._expression_str: try: if float(key_values[self._expression_str]) < self._expression_thresh: @@ -360,12 +464,10 @@ def annoate_vcf(vcf_file: str, gtf_file: str, muts_dict.setdefault(key, []).append(transcript_id) with open(f"{vcf_stem}_annotated.vcf", 'w', encoding='utf-8') as ann, open(vcf_file, 'r', encoding='utf-8') as v: - # write vcf headers to the output file for line in v.readlines(): if line.startswith('#'): ann.write(line) else: - # write the mutations and their overlapping transcript to output file sl = line.strip().split('\t') if len(sl) < 8: ann.write(line) @@ -414,7 +516,7 @@ def vcf_from_file(vcf_file: str) -> tuple[list, pd.DataFrame]: return metadata, vcf_df - def vcf_to_proteindb(self, vcf_file: str, input_fasta: str, gene_annotations_gtf: str) -> str: + def _vcf_to_proteindb_chunk(self, vcf_file: str, input_fasta: str, gene_annotations_gtf: str, output_path: str) -> 
str: """ Generate proteins for variants by modifying sequences of affected transcripts. In case of already annotated variants it only considers variants within @@ -425,13 +527,22 @@ def vcf_to_proteindb(self, vcf_file: str, input_fasta: str, gene_annotations_gtf :param vcf_file: :param input_fasta: :param gene_annotations_gtf: + :param output_path: path for writing the output FASTA :return: """ db = self.parse_gtf(gene_annotations_gtf, str(Path(gene_annotations_gtf).with_suffix('.db'))) - transcripts_dict = SeqIO.index(input_fasta, "fasta", key_function=self.get_key) + idx_path = _ensure_fasta_index(input_fasta) + transcripts_dict = SeqIO.index_db(idx_path, [input_fasta], "fasta", + key_function=self.get_key) # handle cases where the transcript has version in the GTF but not in the VCF - transcript_id_mapping = {k.split('.')[0]: k for k in transcripts_dict.keys()} + # Built lazily on the first KeyError to avoid iterating 207k keys up-front. + transcript_id_mapping: Optional[dict[str, str]] = None + feature_cache = _FeatureCache() + # Value is (ref_seq, desc) for a known transcript, or None for a transcript + # we've already looked up and confirmed isn't in the FASTA index (avoids re-trying + # the disk seek). _MISSING sentinel below distinguishes "not yet looked up". + seq_cache: dict[str, Optional[tuple]] = {} transcript_index, consequence_index, biotype_index = None, None, None if self._annotation_field_name: @@ -445,7 +556,6 @@ def vcf_to_proteindb(self, vcf_file: str, input_fasta: str, gene_annotations_gtf except IndexError: pass - # try to extract index of transcript ID and consequence from the VCF metadata in the header try: transcript_index = annotation_cols.index(self._transcript_str.upper()) except ValueError: @@ -465,6 +575,12 @@ def vcf_to_proteindb(self, vcf_file: str, input_fasta: str, gene_annotations_gtf annotation_cols, vcf_file) self.get_logger().debug(msg) + # Fall back to index 0 when no structured ##INFO header was found (e.g. 
+ # transcriptOverlaps-annotated VCFs produced by annoate_vcf, which write + # plain transcript IDs without a pipe-delimited FORMAT declaration). + if transcript_index is None: + transcript_index = 0 + else: # in case the given VCF is not annotated, annotate it by identifying the overlapping transcripts vcf_file = self.annoate_vcf(vcf_file, gene_annotations_gtf) @@ -480,13 +596,13 @@ def vcf_to_proteindb(self, vcf_file: str, input_fasta: str, gene_annotations_gtf self._accepted_filters = [x.upper() for x in self._accepted_filters] - with open(self._proteindb_output, 'w', encoding='utf-8') as prots_fn: - for _, record in vcf_reader.iterrows(): + with open(output_path, 'w', encoding='utf-8') as prots_fn: + for record in vcf_reader.itertuples(index=False, name='VCFRecord'): trans = False if [x for x in str(record.REF) if x not in 'ACGT']: - msg = "Invalid VCF record, skipping: {}".format(record) invalid_records['# variants with invalid record'] += 1 - self.get_logger().debug(msg) + if self.get_logger().isEnabledFor(logging.DEBUG): + self.get_logger().debug("Invalid VCF record, skipping: %s", record) continue alts = [] @@ -494,33 +610,35 @@ def vcf_to_proteindb(self, vcf_file: str, input_fasta: str, gene_annotations_gtf if alt is None: continue elif [x for x in str(alt) if x not in 'ACGT']: - # check if all alt alleles are nucleotides continue alts.append(alt) if not alts: - msg = "Invalid VCF record, skipping: {}".format(record) invalid_records['# variants with invalid record'] += 1 - self.get_logger().debug(msg) + if self.get_logger().isEnabledFor(logging.DEBUG): + self.get_logger().debug("Invalid VCF record, skipping: %s", record) continue + # Parse INFO once; avoids repeated split-and-search list-comprehensions per variant. + info_kv: dict[str, str] = {} + for entry in record.INFO.split(';'): + k, _, v = entry.partition('=') + info_kv[k] = v + if not self._ignore_filters and self._accepted_filters != ['ALL']: - if record.FILTER and record.FILTER != '.' 
and record.FILTER != 'NA' and record.FILTER != '': # if not PASS: None and empty means PASS + if record.FILTER and record.FILTER != '.' and record.FILTER != 'NA' and record.FILTER != '': # None and empty means PASS filters = set(record.FILTER.upper().split(',')) if ';' in record.FILTER and len(filters) <= 1: filters = set(record.FILTER.upper().split(';')) if not filters <= set(self._accepted_filters): invalid_records['# variants not passing Filter'] += 1 continue - # only process variants above a given allele frequency threshold if the AF string is not empty if self._af_field: - # get AF from the INFO field try: - af = float([x.split('=', 1)[1] for x in record.INFO.split(';') if x.split('=', 1)[0] == self._af_field][0]) - except (ValueError, IndexError): + af = float(info_kv[self._af_field]) + except (ValueError, KeyError): invalid_records['# variants with invalid record'] += 1 continue - # check if the AF passed the threshold if af < self._af_threshold: invalid_records['# variants not passing AF threshold'] += 1 continue @@ -530,15 +648,13 @@ def vcf_to_proteindb(self, vcf_file: str, input_fasta: str, gene_annotations_gtf trans_table = self._mito_translation_table processed_transcript_allele = set() - transcript_records = [] try: - transcript_records = \ - [x.split('=', 1)[1] for x in record.INFO.split(';') - if x.split('=', 1)[0] == self._annotation_field_name][0] - except IndexError: # no overlapping feature was found + transcript_records = info_kv[self._annotation_field_name] + except KeyError: invalid_records['# variants with invalid record'] += 1 - msg = "skipped record {}, no annotation feature was found".format(record) - self.get_logger().debug(msg) + if self.get_logger().isEnabledFor(logging.DEBUG): + self.get_logger().debug( + "skipped record %s, no annotation feature was found", record) continue for transcript_record in transcript_records.split(','): @@ -548,9 +664,10 @@ def vcf_to_proteindb(self, vcf_file: str, input_fasta: str, gene_annotations_gtf 
consequence = transcript_info[consequence_index] except IndexError: invalid_records['# variants with invalid record'] += 1 - msg = "Give a valid index for the consequence in the INFO field for: {}".format( - transcript_record) - self.get_logger().debug(msg) + if self.get_logger().isEnabledFor(logging.DEBUG): + self.get_logger().debug( + "Give a valid index for the consequence in the INFO field for: %s", + transcript_record) continue except TypeError: pass @@ -559,9 +676,10 @@ def vcf_to_proteindb(self, vcf_file: str, input_fasta: str, gene_annotations_gtf biotype = transcript_info[biotype_index] except IndexError: invalid_records['# variants with invalid record'] += 1 - msg = "Give a valid index for the biotype in the INFO field for: {}".format( - transcript_record) - self.get_logger().debug(msg) + if self.get_logger().isEnabledFor(logging.DEBUG): + self.get_logger().debug( + "Give a valid index for the biotype in the INFO field for: %s", + transcript_record) continue except TypeError: pass @@ -570,30 +688,48 @@ def vcf_to_proteindb(self, vcf_file: str, input_fasta: str, gene_annotations_gtf transcript_id = transcript_info[transcript_index] except IndexError: invalid_records['# variants with invalid record'] += 1 - msg = "Give a valid index for the Transcript IDs in the INFO field for: {}".format( - transcript_record) - self.get_logger().debug(msg) + if self.get_logger().isEnabledFor(logging.DEBUG): + self.get_logger().debug( + "Give a valid index for the Transcript IDs in the INFO field for: %s", + transcript_record) continue if transcript_id == "": continue try: - transcript_id_v = transcript_id_mapping[transcript_id] - except KeyError: - transcript_id_v = transcript_id + transcript_id_v = transcript_id_mapping[transcript_id] # type: ignore[index] + except (KeyError, TypeError): + if transcript_id_mapping is None: + transcript_id_mapping = {k.split('.')[0]: k for k in transcripts_dict.keys()} + try: + transcript_id_v = transcript_id_mapping[transcript_id] + 
except KeyError: + transcript_id_v = transcript_id + else: + transcript_id_v = transcript_id - try: - row = transcripts_dict[transcript_id_v] - ref_seq = row.seq # get the seq and desc for the transcript from the fasta of the gtf - desc = str(row.description) - except KeyError: - invalid_records['# feature IDs from VCF that are not found in the given FASTA file'] += 1 - msg = "Feature {} not found in fasta of the GTF file {}".format(transcript_id_v, record) - self.get_logger().debug(msg) + cached_row = seq_cache.get(transcript_id_v, _MISSING) + if cached_row is _MISSING: + try: + row = transcripts_dict[transcript_id_v] + ref_seq = row.seq + desc = str(row.description) + seq_cache[transcript_id_v] = (ref_seq, desc) + except KeyError: + invalid_records['# feature IDs from VCF that are not found in the given FASTA file'] += 1 + if self.get_logger().isEnabledFor(logging.DEBUG): + self.get_logger().debug( + "Feature %s not found in fasta of the GTF file %s", + transcript_id_v, record) + seq_cache[transcript_id_v] = None + continue + elif cached_row is None: + # already-known-missing; skip without re-counting in the stats continue + else: + ref_seq, desc = cached_row feature_types = ['exon'] - # check if cds info exists in the fasta header otherwise translate all exons cds_info = [] num_orfs = 3 if 'CDS=' in desc: @@ -608,28 +744,35 @@ def vcf_to_proteindb(self, vcf_file: str, input_fasta: str, gene_annotations_gtf feature_types = ['CDS', 'stop_codon'] num_orfs = 1 except (ValueError, IndexError): - msg = "Could not extract cds position from fasta header for: {}".format(desc) - self.get_logger().debug(msg) + if self.get_logger().isEnabledFor(logging.DEBUG): + self.get_logger().debug( + "Could not extract cds position from fasta header for: %s", desc) - chrom, strand, features_info = self.get_features(db, - transcript_id_v, - feature_types) - if chrom is None: # the record info was not found - continue - # skip transcripts with unwanted consequences + # Apply 
consequence/biotype filters before get_features(); both inputs + # come from transcript_info, so we can reject early without SQL. if consequence_index is not None: if (consequence in self._exclude_consequences or (consequence not in self._include_consequences and self._include_consequences != ['all'])): continue - # skip transcripts with unwanted biotypes if biotype_index is not None: if (biotype in self._exclude_biotypes or (biotype not in self._include_biotypes and self._include_biotypes != ['all'])): continue + cache_key = (transcript_id_v, tuple(feature_types)) + cached = feature_cache.get(cache_key) + if cached is None: + chrom, strand, features_info = self.get_features( + db, transcript_id_v, feature_types) + feature_cache.put(cache_key, (chrom, strand, features_info)) + else: + chrom, strand, features_info = cached + if chrom is None: + continue + for alt in alts: dedup_key = transcript_id + str(record.REF) + str(alt) if dedup_key in processed_transcript_allele: @@ -643,8 +786,8 @@ def vcf_to_proteindb(self, vcf_file: str, input_fasta: str, gene_annotations_gtf features_info) except TypeError: invalid_records['# variants with invalid record'] += 1 - msg = "Wrong VCF record in {}".format(record) - self.get_logger().debug(msg) + if self.get_logger().isEnabledFor(logging.DEBUG): + self.get_logger().debug("Wrong VCF record in %s", record) continue if (chrom.lstrip("chr") == str(record.CHROM).lstrip("chr") and @@ -680,6 +823,80 @@ def vcf_to_proteindb(self, vcf_file: str, input_fasta: str, gene_annotations_gtf '\n'.join([x + ":" + str(invalid_records[x]) for x in invalid_records.keys()])) self.get_logger().info(msg) + return output_path + + def vcf_to_proteindb(self, vcf_file: str, input_fasta: str, gene_annotations_gtf: str, workers=None) -> str: + """Generate proteins for variants by modifying sequences of affected transcripts. + + If workers is None, falls back to self._workers (config) which defaults + to 1 (sequential, backward-compatible). 
Pass workers > 1 to fan out per + chromosome via multiprocessing.Pool. + :param vcf_file: + :param input_fasta: + :param gene_annotations_gtf: + :param workers: number of parallel worker processes (None => use config default) + :return: path to the output proteindb FASTA + """ + if workers is None: + workers = self._workers if self._workers else 1 + + # Fast path: sequential — single call, identical behaviour to the original implementation. + # For sequential runs we do NOT pre-annotate here; _vcf_to_proteindb_chunk handles that + # in its else-branch so that transcript_index=0 is set correctly for unannotated VCFs. + if workers <= 1: + return self._vcf_to_proteindb_chunk(vcf_file, input_fasta, gene_annotations_gtf, self._proteindb_output) + + # Parallel path: pre-annotate unannotated VCFs in the main process. This avoids each + # worker racing on the same bedtools-output bed file (which annoate_vcf writes to cwd) + # and amortises the bedtools intersect across workers. + if not self._annotation_field_name: + vcf_file = self.annoate_vcf(vcf_file, gene_annotations_gtf) + self._annotation_field_name = 'transcriptOverlaps' + + # Build the FASTA index ONCE in the main process so all workers share it + # instead of each re-scanning the FASTA (~14 s × N workers wasted). + _ensure_fasta_index(input_fasta) + + # Parallel — split into per-chrom temp VCFs, fan out to a Pool. + import multiprocessing as mp + import shutil + import tempfile + + with tempfile.TemporaryDirectory(prefix='pgatk_v2p_') as tmpdir: + # Stream-split VCF by chromosome directly to per-chrom files (constant memory). + chunk_paths = _split_vcf_by_chrom(vcf_file, tmpdir) + + if len(chunk_paths) <= 1: + # Only one chromosome — run sequentially on the original file. 
+ return self._vcf_to_proteindb_chunk(vcf_file, input_fasta, gene_annotations_gtf, + self._proteindb_output) + + tasks = [] + for chrom in sorted(chunk_paths.keys()): + chunk_vcf = chunk_paths[chrom] + chunk_out = os.path.join(tmpdir, f"out_{_safe_chrom(chrom)}.fa") + pa = dict(self.get_pipeline_parameters()) + pa[EnsemblDataService.PROTEIN_DB_OUTPUT] = chunk_out + # Force annotated mode for workers (annoate_vcf already ran in main): + pa[EnsemblDataService.ANNOTATION_FIELD_NAME] = self._annotation_field_name + tasks.append((self.get_default_parameters(), pa, chunk_vcf, + input_fasta, gene_annotations_gtf, chunk_out)) + + self.get_logger().info( + "vcf-to-proteindb: dispatching %d chromosome chunk(s) across %d worker(s)", + len(tasks), min(workers, len(tasks))) + + with mp.get_context('spawn').Pool(min(workers, len(tasks))) as pool: + pool.starmap(_vcf_to_proteindb_worker, tasks) + + # Concatenate the per-chunk FASTAs into the final output. + with open(self._proteindb_output, 'wb') as out: + for task in tasks: + chunk_out = task[5] + if os.path.exists(chunk_out): + with open(chunk_out, 'rb') as f: + shutil.copyfileobj(f, out) + return self._proteindb_output @staticmethod diff --git a/pgatk/proteogenomics/spectrumai.py b/pgatk/proteogenomics/spectrumai.py index 0fb16b3..612e309 100644 --- a/pgatk/proteogenomics/spectrumai.py +++ b/pgatk/proteogenomics/spectrumai.py @@ -281,7 +281,6 @@ def __init__(self, config_data: dict, pipeline_arguments: dict) -> None: self._mzml_files = self.get_validate_parameters(variable=self.CONFIG_MZML_FILES, default_value=False) self._ions_tolerance = self.get_validate_parameters(variable=self.CONFIG_IONS_TOLERANCE, default_value=0.02) - ## check if ions_tolerance is string, convert to float if isinstance(self._ions_tolerance, str): self._ions_tolerance = float(self._ions_tolerance) diff --git a/pgatk/proteomics/db/protein_database_decoy.py b/pgatk/proteomics/db/protein_database_decoy.py index 5f47e72..ee22853 100644 --- 
a/pgatk/proteomics/db/protein_database_decoy.py +++ b/pgatk/proteomics/db/protein_database_decoy.py @@ -221,21 +221,14 @@ def generate_decoypyrat_database(self) -> None: :return: """ - # Create empty sets to add all target and decoy peptides upeps = set() dpeps = set() - # Counter for number of decoy sequences dcount = 0 - # Open FASTA file using first cmd line argument - # fasta = SeqIO.parse(self._input_fasta, 'fasta') - with open(self._input_fasta) as handle: - # open temporary decoy FASTA file with open(self._temp_file, 'w') as outfa: - # loop each seq in the file for value in SimpleFastaParser(handle): seq = value[1] description = value[0] @@ -244,53 +237,42 @@ def generate_decoypyrat_database(self) -> None: if not self._isobaric: seq = seq.replace('I', 'L') - # digest sequence add peptides to set upeps.update( cleave(sequence=seq, rule=PGATK_ENZYMES.enzymes[self._enzyme]['cleavage rule'], missed_cleavages=0, min_length=self._min_peptide_length)) - # reverse and switch protein sequence decoyseq = self.revswitch(seq, self._no_switch, PGATK_ENZYMES.enzymes[self._enzyme]['cleavage sites']) # do not store decoy peptide set in reduced memory mode if not self._memory_save: - # update decoy peptide set dpeps.update( cleave(sequence=decoyseq, rule=PGATK_ENZYMES.enzymes[self._enzyme]['cleavage rule'], missed_cleavages=0, min_length=self._min_peptide_length)) - # write decoy protein accession and sequence to file outfa.write('>' + self._decoy_prefix + description + '\n') outfa.write(decoyseq + '\n') - # Summarise the numbers of target and decoy peptides and their intersection nonDecoys = set() self.get_logger().info("proteins: %s", dcount) self.get_logger().info("target peptides: %s", len(upeps)) # Reloop decoy file in reduced memory mode to store only intersecting decoys if self._memory_save: - # open temp decoys with open(self._temp_file, "rt") as fin: for line in fin: - # if line is not accession if line[0] != '>': - # digest protein for p in 
cleave(sequence=line.rstrip(), rule=PGATK_ENZYMES.enzymes[self._enzyme]['cleavage rule'], missed_cleavages=0, min_length=self._min_peptide_length): - # check if in target peptides if true then add to nonDecoys if p in upeps: nonDecoys.add(p) fin.close() self.get_logger().info("decoy peptides: !Memory Saving Made!") else: - # can only report total number in normal memory mode self.get_logger().info("decoy peptides: %s", len(dpeps)) - # find intersecting peptides nonDecoys = upeps.intersection(dpeps) self.get_logger().info("#intersection: %s", len(nonDecoys)) @@ -298,37 +280,27 @@ def generate_decoypyrat_database(self) -> None: # if there are decoy peptides that are in the target peptide set if len(nonDecoys) > 0 and self._no_suffle == False: - # create empty dictionary with bad decoys as keys dAlternative = dict.fromkeys(nonDecoys, '') noAlternative = list() - # loop bad decoys / dictionary keys for dPep in dAlternative: i = 0 aPep = dPep - # shuffle until aPep is not in target set (maximum of 10 iterations) while aPep in upeps and i < self._max_iterations: - - # increment iteration counter i += 1 - - # shuffle peptide aPep = self.shuffle(dPep) # check if shuffling has an effect if not end iterations if aPep == dPep: i = self._max_iterations - # update dictionary with alternative shuffled peptide dAlternative[dPep] = aPep - # warn if peptide has no suitable alternative, add to removal list if i == self._max_iterations: noAlternative.append(dPep) self.get_logger().info('%s have no alternative peptide', len(noAlternative)) - # remove peptides with no alternative for p in noAlternative: del dAlternative[p] @@ -336,11 +308,8 @@ def generate_decoypyrat_database(self) -> None: upeps.clear() dpeps.clear() - # open second decoy file with open(self._output_file, "wt") as fout: - # Attach the target sequences to the database - # fasta = SeqIO.parse(self._input_fasta, 'fasta') with open(self._input_fasta) as handle: for value in SimpleFastaParser(handle): description = 
value[0] @@ -348,20 +317,15 @@ def generate_decoypyrat_database(self) -> None: fout.write('>' + description + '\n') fout.write(seq + '\n') - # open original decoy file with open(self._temp_file, "rt") as fin: - # loop each line of original decoy fasta for line in fin: # if line is not accession replace peptides in dictionary with alternatives if line[0] != '>': - # digest decoy sequence for p in cleave(sequence=line.rstrip(), rule=PGATK_ENZYMES.enzymes[self._enzyme]['cleavage rule'], missed_cleavages=0, min_length=self._min_peptide_length): - # store decoy peptide for final count dpeps.add(p) - # if decoy peptide is in dictionary replace with alternative if p in dAlternative: line = line.replace(p, dAlternative[p]) @@ -391,24 +355,19 @@ def pgatk_decoy_database(self) -> None: :return: """ - # Create empty sets to add all target and decoy peptides upeps = set() noAlternative = set() - # Open FASTA file using first cmd line argument fasta = SeqIO.parse(self._input_fasta, 'fasta') - # loop each seq in the file for record in fasta: seq = str(record.seq) if not self._isobaric: seq = seq.replace('I', 'L') - # digest sequence add peptides to the target set upeps.update( cleave(sequence=seq, rule=PGATK_ENZYMES.enzymes[self._enzyme]['cleavage rule'], missed_cleavages=self._max_missed_cleavages, min_length=self._min_peptide_length)) - # open orary decoy FASTA file with open(self._output_file, 'w') as outfa: fasta = SeqIO.parse(self._input_fasta, 'fasta') targets = [] @@ -418,7 +377,6 @@ def pgatk_decoy_database(self) -> None: targets.append(protseq) revprotseq = [] - # output target protein seq = str(record.seq) id_protein = record.id description = record.description @@ -431,7 +389,6 @@ def pgatk_decoy_database(self) -> None: if not self._isobaric: seq = seq.replace('I', 'L') - # reverse and switch protein sequence decoyseq = self.revswitch(seq, self._no_switch, PGATK_ENZYMES.enzymes[self._enzyme]['cleavage sites']) @@ -455,29 +412,23 @@ def pgatk_decoy_database(self) -> 
None: if found_in_target and not self._no_suffle and decoy_pep not in noAlternative: aPep = decoy_pep - # shuffle until aPep is not in target set (maximum of 10 iterations) i = 0 while aPep in upeps and i < self._max_iterations: - # increment iteration counter i += 1 - # shuffle peptide aPep = self.shuffle(aPep) # check if shuffling has an effect if not end iterations if aPep == decoy_pep: i = self._max_iterations - # warn if peptide has no suitable alternative, add to removal list if i == self._max_iterations: noAlternative.add(decoy_pep) aPep = '' - # if decoy is generated then add to the list of peptides if aPep: checked_decoy_peps.append(aPep) else: if self._keep_target_hits: checked_decoy_peps.append(decoy_pep) - # finally join the peptides to generate protein decoy if checked_decoy_peps: revprotseq.append(''.join(checked_decoy_peps)) diff --git a/pgatk/testdata/proteindb_from_custom_VCF.fa b/pgatk/testdata/proteindb_from_custom_VCF.fa index 5325f01..22f87e0 100644 --- a/pgatk/testdata/proteindb_from_custom_VCF.fa +++ b/pgatk/testdata/proteindb_from_custom_VCF.fa @@ -1,11 +1,11 @@ ->varsample_rs78350717_22.15528913.C.A_ENST00000252835 -MCPLTLQVTGLMNVSEPNSSFAFVNEFILQGFSCEWTIQIFLFSLFTTTYALTITGNGAIAFVLWCDRRLHTPMYMFLGNFSFLEIWYVSSTVPKMLVNFLSEKKNISFAGCFLQFYFFFSLGTSECLLLTVMAFDQYLAICRPLLYPNIMTGHLYAKLVILCWVCGFLWFLIPIVLISQMPFCGPNIIDHVVCDPGPRFALDCVSAPRIQLFCYTLSSLVIFGNFLFIIGSYTLVLKAMLGMPSSTGRHKDFSTCGSHLAVVSLCYSSLMVMYVSPGLGHSTGMQKIETLFYAMVTPLFNPLIYSLQNKEIKAALRKVLGSSNII* ->varsample_rs78350717_22.15528913.C.T_ENST00000252835 -MCPLTLQVTGLMNVSEPNSSFAFVNEFILQGFSCEWTIQIFLFSLFTTTYALTITGNGAIAFVLWCDRRLHTPMYMFLGNFSFLEIWYVSSTVPKMLVNFLSEKKNISFAGCFLQFYFFFSLGTSECLLLTVMAFDQYLAICRPLLYPNIMTGHLYAKLVILCWVCGFLWFLIPIVLISQMPFCGPNIIDHVVCDPGPRFALDCVSAPRIQLFCYTLSSLVIFGNFLFIIGSYTLVLKAMLGMPSSTGRHKVFSTCGSHLAVVSLCYSSLMVMYVSPGLGHSTGMQKIETLFYAMVTPLFNPLIYSLQNKEIKAALRKVLGSSNII* >varsample_rs78350717_22.15528913.C.A_ENST00000643195 
MNVSEPNSSFAFVNEFILQGFSCEWTIQIFLFSLFTTTYALTITGNGAIAFVLWCDRRLHTPMYMFLGNFSFLEIWYVSSTVPKMLVNFLSEKKNISFAGCFLQFYFFFSLGTSECLLLTVMAFDQYLAICRPLLYPNIMTGHLYAKLVILCWVCGFLWFLIPIVLISQMPFCGPNIIDHVVCDPGPRFALDCVSAPRIQLFCYTLSSLVIFGNFLFIIGSYTLVLKAMLGMPSSTGRHKDFSTCGSHLAVVSLCYSSLMVMYVSPGLGHSTGMQKIETLFYAMVTPLFNPLIYSLQNKEIKAALRKVLGSSNII* >varsample_rs78350717_22.15528913.C.T_ENST00000643195 MNVSEPNSSFAFVNEFILQGFSCEWTIQIFLFSLFTTTYALTITGNGAIAFVLWCDRRLHTPMYMFLGNFSFLEIWYVSSTVPKMLVNFLSEKKNISFAGCFLQFYFFFSLGTSECLLLTVMAFDQYLAICRPLLYPNIMTGHLYAKLVILCWVCGFLWFLIPIVLISQMPFCGPNIIDHVVCDPGPRFALDCVSAPRIQLFCYTLSSLVIFGNFLFIIGSYTLVLKAMLGMPSSTGRHKVFSTCGSHLAVVSLCYSSLMVMYVSPGLGHSTGMQKIETLFYAMVTPLFNPLIYSLQNKEIKAALRKVLGSSNII* +>varsample_rs78350717_22.15528913.C.A_ENST00000252835 +MCPLTLQVTGLMNVSEPNSSFAFVNEFILQGFSCEWTIQIFLFSLFTTTYALTITGNGAIAFVLWCDRRLHTPMYMFLGNFSFLEIWYVSSTVPKMLVNFLSEKKNISFAGCFLQFYFFFSLGTSECLLLTVMAFDQYLAICRPLLYPNIMTGHLYAKLVILCWVCGFLWFLIPIVLISQMPFCGPNIIDHVVCDPGPRFALDCVSAPRIQLFCYTLSSLVIFGNFLFIIGSYTLVLKAMLGMPSSTGRHKDFSTCGSHLAVVSLCYSSLMVMYVSPGLGHSTGMQKIETLFYAMVTPLFNPLIYSLQNKEIKAALRKVLGSSNII* +>varsample_rs78350717_22.15528913.C.T_ENST00000252835 +MCPLTLQVTGLMNVSEPNSSFAFVNEFILQGFSCEWTIQIFLFSLFTTTYALTITGNGAIAFVLWCDRRLHTPMYMFLGNFSFLEIWYVSSTVPKMLVNFLSEKKNISFAGCFLQFYFFFSLGTSECLLLTVMAFDQYLAICRPLLYPNIMTGHLYAKLVILCWVCGFLWFLIPIVLISQMPFCGPNIIDHVVCDPGPRFALDCVSAPRIQLFCYTLSSLVIFGNFLFIIGSYTLVLKAMLGMPSSTGRHKVFSTCGSHLAVVSLCYSSLMVMYVSPGLGHSTGMQKIETLFYAMVTPLFNPLIYSLQNKEIKAALRKVLGSSNII* >varsample_rs147461488_22.17740102.GGCCACGCTCAACT.G_ENST00000342111 MDCEVNNGSSLRDECITNLLVFGFLQSCSDNSFRRELDALGHELPVLAPQWEGYDELQTDGNRSSHSRLGRIEAGASDNNTASAEEETEAAGSVAFMELRQ*F*KSRRHHPEYCQAPRPGRGQHGP*HPSGPG >varsample_rs147461488_22.17740102.GGCCACGCTCAACT.G_ENST00000617586 diff --git a/pgatk/testdata/test_cosmic_classification.tsv b/pgatk/testdata/test_cosmic_classification.tsv new file mode 100644 index 0000000..e5a9f9c --- /dev/null +++ b/pgatk/testdata/test_cosmic_classification.tsv @@ -0,0 +1,9 @@ +COSMIC_PHENOTYPE_ID PRIMARY_SITE 
PRIMARY_HISTOLOGY SITE_SUBTYPE_1 HISTOLOGY_SUBTYPE_1 +COSO1001 upper_aerodigestive_tract carcinoma mouth squamous_cell_carcinoma +COSO1002 skin carcinoma upper_leg squamous_cell_carcinoma +COSO1003 bone other femur giant_cell_tumour +COSO1004 thyroid adenoma-nodule-goitre NS NS +COSO1005 liver other NS neoplasm +COSO1006 haematopoietic_and_lymphoid_tissue haematopoietic_neoplasm NS acute_myeloid_leukaemia +COSO1007 bone carcinoma NS NS +COSO1008 skin carcinoma NS NS diff --git a/pgatk/testdata/test_cosmic_mutations.tsv b/pgatk/testdata/test_cosmic_mutations.tsv index f4a9ba4..254cef5 100644 --- a/pgatk/testdata/test_cosmic_mutations.tsv +++ b/pgatk/testdata/test_cosmic_mutations.tsv @@ -1,13 +1,13 @@ -Gene name Accession Number Gene CDS length HGNC ID Sample name ID_sample ID_tumour Primary site Site subtype 1 Site subtype 2 Site subtype 3 Primary histology Histology subtype 1 Histology subtype 2 Histology subtype 3 Genome-wide screen Mutation ID Mutation CDS Mutation AA Mutation Description Mutation zygosity LOH GRCh Mutation genome position Mutation strand SNP Resistance Mutation FATHMM prediction FATHMM score Mutation somatic status Pubmed_PMID ID_STUDY Sample Type Tumour origin Age -HRAS ENST00000397596 570 5173 HN12PT 1542873 1465176 upper_aerodigestive_tract mouth NS NS carcinoma squamous_cell_carcinoma NS NS y COSM486 c.37G>C p.G13R Substitution - Missense u 38 11:534286-534286 - n - PATHOGENIC .99415 Confirmed somatic variant 21798897 surgery fresh/frozen primary -HRAS ENST00000397596 570 5173 V19 2688774 2547756 skin upper_leg NS NS carcinoma squamous_cell_carcinoma NS NS n COSM486 c.37G>C p.G13R Substitution - Missense u 38 11:534286-534286 - n - PATHOGENIC .99415 Confirmed somatic variant 24662767 surgery fresh/frozen NS -HRAS ENST00000397596 570 5173 S23372 702100 629222 bone femur NS NS other giant_cell_tumour NS NS n COSM487 c.37G>A p.G13S Substitution - Missense y 38 11:534286-534286 - n - PATHOGENIC .99324 Reported in another cancer sample as 
somatic 11903582 surgery-fixed recurrent 42 -HRAS ENST00000397596 570 5173 pel-102 720047 644990 upper_aerodigestive_tract mouth lip NS carcinoma squamous_cell_carcinoma NS NS n COSM484 c.35G>A p.G12D Substitution - Missense n 38 11:534288-534288 - n - PATHOGENIC .99322 Reported in another cancer sample as somatic 8253528 surgery fresh/frozen primary -HRAS ENST00000397596 570 5173 1965078 1965078 1851573 skin chest NS NS benign_melanocytic_nevus NS NS NS n COSM498 c.182A>T p.Q61L Substitution - Missense u 38 11:533874-533874 - n - PATHOGENIC .97511 Reported in another cancer sample as somatic 23599145 NS NS 30 -HRAS ENST00000397596 570 5173 1571189 1571189 1492737 skin NS NS NS other atypical_Spitzoid_tumour NS NS n COSM498 c.182A>T p.Q61L Substitution - Missense het u 38 11:533874-533874 - n - PATHOGENIC .97511 Reported in another cancer sample as somatic 21496703 surgery-fixed NS -HRAS ENST00000397596 570 5173 S61945 741791 664521 thyroid NS NS NS adenoma-nodule-goitre NS NS NS n COSM499 c.182A>G p.Q61R Substitution - Missense y 38 11:533874-533874 - n - PATHOGENIC .97179 Reported in another cancer sample as somatic 12727991 NS primary -HRAS ENST00000397596 570 5173 49T 2745889 2604592 liver NS NS NS other neoplasm NS NS y COSM1729837 c.403C>T p.R135* Substitution - Nonsense u 38 11:533500-533500 - n - PATHOGENIC .97257 Confirmed somatic variant 660 NS primary 67 -HRAS ENST00000397596 570 5173 DM3T4S 2762863 2621122 thyroid NS NS NS carcinoma follicular_carcinoma NS NS n COSM499 c.182A>G p.Q61R Substitution - Missense u 38 11:533874-533874 - n - PATHOGENIC .97179 Reported in another cancer sample as somatic 29615459 NS metastasis 58 -HRAS ENST00000397596 570 5173 TESTSAMPLE 9999991 9999981 haematopoietic_and_lymphoid_tissue NS NS NS haematopoietic_neoplasm acute_myeloid_leukaemia NS NS n COSM_TEST1 c.? 
p.G13(H^K) Substitution - Missense u 38 11:534286-534286 - n - PATHOGENIC .99415 Confirmed somatic variant 25858894 blood-bone marrow primary -HRAS ENST00000397596 570 5173 TESTSAMPLE 9999991 9999981 haematopoietic_and_lymphoid_tissue NS NS NS haematopoietic_neoplasm acute_myeloid_leukaemia NS NS n COSM_TEST2 c.? p.13_14>21 Complex - insertion inframe u 38 11:534286-534286 - n - PATHOGENIC .99415 Confirmed somatic variant 25858894 blood-bone marrow primary -HRAS ENST00000397596 570 5173 TESTSAMPLE 9999991 9999981 haematopoietic_and_lymphoid_tissue NS NS NS haematopoietic_neoplasm acute_myeloid_leukaemia NS NS n COSM_TEST3 c.? p.G13_S14>GS Insertion - In frame u 38 11:534286-534286 - n - PATHOGENIC .99415 Confirmed somatic variant 25858894 blood-bone marrow primary +GENE_SYMBOL COSMIC_GENE_ID TRANSCRIPT_ACCESSION COSMIC_SAMPLE_ID SAMPLE_NAME COSMIC_PHENOTYPE_ID GENOMIC_MUTATION_ID LEGACY_MUTATION_ID MUTATION_ID MUTATION_CDS MUTATION_AA MUTATION_DESCRIPTION MUTATION_ZYGOSITY LOH CHROMOSOME GENOME_START GENOME_STOP STRAND PUBMED_PMID COSMIC_STUDY_ID HGVSP HGVSC HGVSG GENOMIC_WT_ALLELE GENOMIC_MUT_ALLELE MUTATION_SOMATIC_STATUS POSITIVE_SCREEN +HRAS COSG11180 ENST00000397596 COSS1542873 HN12PT COSO1001 COSV56056559 COSM486 COSM486 c.37G>C p.G13R missense_variant u 11 534286 534286 - 21798897 p.Gly13Arg c.37G>C G C Confirmed somatic variant y +HRAS COSG11180 ENST00000397596 COSS2688774 V19 COSO1002 COSV56056559 COSM486 COSM486 c.37G>C p.G13R missense_variant u 11 534286 534286 - 24662767 p.Gly13Arg c.37G>C G C Confirmed somatic variant n +HRAS COSG11180 ENST00000397596 COSS702100 S23372 COSO1003 COSV56056558 COSM487 COSM487 c.37G>A p.G13S missense_variant y 11 534286 534286 - 11903582 p.Gly13Ser c.37G>A G A Reported in another cancer sample as somatic n +HRAS COSG11180 ENST00000397596 COSS720047 pel-102 COSO1001 COSV56056557 COSM484 COSM484 c.35G>A p.G12D missense_variant n 11 534288 534288 - 8253528 p.Gly12Asp c.35G>A G A Reported in another cancer sample as somatic n 
+HRAS COSG11180 ENST00000397596 COSS1965078 1965078 COSO1002 COSV56056584 COSM498 COSM498 c.182A>T p.Q61L missense_variant u 11 533874 533874 - 23599145 p.Gln61Leu c.182A>T A T Reported in another cancer sample as somatic n +HRAS COSG11180 ENST00000397596 COSS1571189 1571189 COSO1002 COSV56056584 COSM498 COSM498 c.182A>T p.Q61L missense_variant het u 11 533874 533874 - 21496703 p.Gln61Leu c.182A>T A T Reported in another cancer sample as somatic n +HRAS COSG11180 ENST00000397596 COSS741791 S61945 COSO1004 COSV56056585 COSM499 COSM499 c.182A>G p.Q61R missense_variant y 11 533874 533874 - 12727991 p.Gln61Arg c.182A>G A G Reported in another cancer sample as somatic n +HRAS COSG11180 ENST00000397596 COSS2745889 49T COSO1005 COSV56070001 COSM1729837 COSM1729837 c.403C>T p.R135* stop_gained u 11 533500 533500 - 660 p.Arg135Ter c.403C>T C T Confirmed somatic variant y +HRAS COSG11180 ENST00000397596 COSS2762863 DM3T4S COSO1004 COSV56056585 COSM499 COSM499 c.182A>G p.Q61R missense_variant u 11 533874 533874 - 29615459 p.Gln61Arg c.182A>G A G Reported in another cancer sample as somatic n +HRAS COSG11180 ENST00000397596 COSS9999991 TESTSAMPLE COSO1006 COSV_TEST1 COSM_TEST1 COSM_TEST1 c.36C>T p.G12= synonymous_variant u 11 534287 534287 - p.Gly12Gly c.36C>T C T Confirmed somatic variant n +HRAS COSG11180 ENST00000397596 COSS9999992 TESTSAMPLE2 COSO1007 COSV_TEST2 COSM_TEST2 COSM_TEST2 c.34_36del p.G12del inframe_deletion u 11 534285 534287 - p.Gly12del c.34_36del GGC - Confirmed somatic variant n +HRAS COSG11180 ENST00000397596 COSS9999993 TESTSAMPLE3 COSO1008 COSV_TEST3 COSM_TEST3 COSM_TEST3 c.? p.? frameshift_variant u 11 534286 534287 - p.? c.? . . 
Confirmed somatic variant n diff --git a/pgatk/testdata/test_cosmic_mutations_proteindb.fa b/pgatk/testdata/test_cosmic_mutations_proteindb.fa index 9516b34..e2fd977 100644 --- a/pgatk/testdata/test_cosmic_mutations_proteindb.fa +++ b/pgatk/testdata/test_cosmic_mutations_proteindb.fa @@ -1,12 +1,14 @@ ->COSMIC:HRAS:p.G13R:Substitution-Missense +>COSMIC:HRAS:p.G13R:missense_variant MTEYKLVVVGAGRVGKSALTIQLIQNHFVDEYDPTIEDSYRKQVVIDGETCLLDILDTAGQEEYSAMRDQYMRTGEGFLCVFAINNTKSFEDIHQYREQIKRVKDSDDVPMVLVGNKCDLAARTVESRQAQDLARSYGIPYIETSAKTRQGVEDAFYTLVREIRQHKLRKLNPPDESGPGCMSCKCVLS* ->COSMIC:HRAS:p.G13S:Substitution-Missense +>COSMIC:HRAS:p.G13S:missense_variant MTEYKLVVVGAGSVGKSALTIQLIQNHFVDEYDPTIEDSYRKQVVIDGETCLLDILDTAGQEEYSAMRDQYMRTGEGFLCVFAINNTKSFEDIHQYREQIKRVKDSDDVPMVLVGNKCDLAARTVESRQAQDLARSYGIPYIETSAKTRQGVEDAFYTLVREIRQHKLRKLNPPDESGPGCMSCKCVLS* ->COSMIC:HRAS:p.G12D:Substitution-Missense +>COSMIC:HRAS:p.G12D:missense_variant MTEYKLVVVGADGVGKSALTIQLIQNHFVDEYDPTIEDSYRKQVVIDGETCLLDILDTAGQEEYSAMRDQYMRTGEGFLCVFAINNTKSFEDIHQYREQIKRVKDSDDVPMVLVGNKCDLAARTVESRQAQDLARSYGIPYIETSAKTRQGVEDAFYTLVREIRQHKLRKLNPPDESGPGCMSCKCVLS* ->COSMIC:HRAS:p.Q61L:Substitution-Missense +>COSMIC:HRAS:p.Q61L:missense_variant MTEYKLVVVGAGGVGKSALTIQLIQNHFVDEYDPTIEDSYRKQVVIDGETCLLDILDTAGLEEYSAMRDQYMRTGEGFLCVFAINNTKSFEDIHQYREQIKRVKDSDDVPMVLVGNKCDLAARTVESRQAQDLARSYGIPYIETSAKTRQGVEDAFYTLVREIRQHKLRKLNPPDESGPGCMSCKCVLS* ->COSMIC:HRAS:p.Q61R:Substitution-Missense +>COSMIC:HRAS:p.Q61R:missense_variant MTEYKLVVVGAGGVGKSALTIQLIQNHFVDEYDPTIEDSYRKQVVIDGETCLLDILDTAGREEYSAMRDQYMRTGEGFLCVFAINNTKSFEDIHQYREQIKRVKDSDDVPMVLVGNKCDLAARTVESRQAQDLARSYGIPYIETSAKTRQGVEDAFYTLVREIRQHKLRKLNPPDESGPGCMSCKCVLS* ->COSMIC:HRAS:p.R135*:Substitution-Nonsense +>COSMIC:HRAS:p.R135*:stop_gained MTEYKLVVVGAGGVGKSALTIQLIQNHFVDEYDPTIEDSYRKQVVIDGETCLLDILDTAGQEEYSAMRDQYMRTGEGFLCVFAINNTKSFEDIHQYREQIKRVKDSDDVPMVLVGNKCDLAARTVESRQAQDLA*SYGIPYIETSAKTRQGVEDAFYTLVREIRQHKLRKLNPPDESGPGCMSCKCVLS* +>COSMIC:HRAS:p.G12del:inframe_deletion 
+MTEYKLVVVGAGVGKSALTIQLIQNHFVDEYDPTIEDSYRKQVVIDGETCLLDILDTAGQEEYSAMRDQYMRTGEGFLCVFAINNTKSFEDIHQYREQIKRVKDSDDVPMVLVGNKCDLAARTVESRQAQDLARSYGIPYIETSAKTRQGVEDAFYTLVREIRQHKLRKLNPPDESGPGCMSCKCVLS* diff --git a/pgatk/testdata/test_cosmic_mutations_proteindb_bone.fa b/pgatk/testdata/test_cosmic_mutations_proteindb_bone.fa index 3be6bf8..f96859a 100644 --- a/pgatk/testdata/test_cosmic_mutations_proteindb_bone.fa +++ b/pgatk/testdata/test_cosmic_mutations_proteindb_bone.fa @@ -1,2 +1,4 @@ ->COSMIC:HRAS:p.G13S:Substitution-Missense +>COSMIC:HRAS:p.G13S:missense_variant MTEYKLVVVGAGSVGKSALTIQLIQNHFVDEYDPTIEDSYRKQVVIDGETCLLDILDTAGQEEYSAMRDQYMRTGEGFLCVFAINNTKSFEDIHQYREQIKRVKDSDDVPMVLVGNKCDLAARTVESRQAQDLARSYGIPYIETSAKTRQGVEDAFYTLVREIRQHKLRKLNPPDESGPGCMSCKCVLS* +>COSMIC:HRAS:p.G12del:inframe_deletion +MTEYKLVVVGAGVGKSALTIQLIQNHFVDEYDPTIEDSYRKQVVIDGETCLLDILDTAGQEEYSAMRDQYMRTGEGFLCVFAINNTKSFEDIHQYREQIKRVKDSDDVPMVLVGNKCDLAARTVESRQAQDLARSYGIPYIETSAKTRQGVEDAFYTLVREIRQHKLRKLNPPDESGPGCMSCKCVLS* diff --git a/pgatk/testdata/test_cosmic_mutations_proteindb_liver.fa b/pgatk/testdata/test_cosmic_mutations_proteindb_liver.fa index fc6d70d..62dafeb 100644 --- a/pgatk/testdata/test_cosmic_mutations_proteindb_liver.fa +++ b/pgatk/testdata/test_cosmic_mutations_proteindb_liver.fa @@ -1,2 +1,2 @@ ->COSMIC:HRAS:p.R135*:Substitution-Nonsense +>COSMIC:HRAS:p.R135*:stop_gained MTEYKLVVVGAGGVGKSALTIQLIQNHFVDEYDPTIEDSYRKQVVIDGETCLLDILDTAGQEEYSAMRDQYMRTGEGFLCVFAINNTKSFEDIHQYREQIKRVKDSDDVPMVLVGNKCDLAARTVESRQAQDLA*SYGIPYIETSAKTRQGVEDAFYTLVREIRQHKLRKLNPPDESGPGCMSCKCVLS* diff --git a/pgatk/testdata/test_cosmic_mutations_proteindb_skin.fa b/pgatk/testdata/test_cosmic_mutations_proteindb_skin.fa index b365210..6d8f4ab 100644 --- a/pgatk/testdata/test_cosmic_mutations_proteindb_skin.fa +++ b/pgatk/testdata/test_cosmic_mutations_proteindb_skin.fa @@ -1,4 +1,4 @@ ->COSMIC:HRAS:p.G13R:Substitution-Missense +>COSMIC:HRAS:p.G13R:missense_variant 
MTEYKLVVVGAGRVGKSALTIQLIQNHFVDEYDPTIEDSYRKQVVIDGETCLLDILDTAGQEEYSAMRDQYMRTGEGFLCVFAINNTKSFEDIHQYREQIKRVKDSDDVPMVLVGNKCDLAARTVESRQAQDLARSYGIPYIETSAKTRQGVEDAFYTLVREIRQHKLRKLNPPDESGPGCMSCKCVLS* ->COSMIC:HRAS:p.Q61L:Substitution-Missense +>COSMIC:HRAS:p.Q61L:missense_variant MTEYKLVVVGAGGVGKSALTIQLIQNHFVDEYDPTIEDSYRKQVVIDGETCLLDILDTAGLEEYSAMRDQYMRTGEGFLCVFAINNTKSFEDIHQYREQIKRVKDSDDVPMVLVGNKCDLAARTVESRQAQDLARSYGIPYIETSAKTRQGVEDAFYTLVREIRQHKLRKLNPPDESGPGCMSCKCVLS* diff --git a/pgatk/testdata/test_cosmic_mutations_proteindb_thyroid.fa b/pgatk/testdata/test_cosmic_mutations_proteindb_thyroid.fa index dc17c3d..b9ead29 100644 --- a/pgatk/testdata/test_cosmic_mutations_proteindb_thyroid.fa +++ b/pgatk/testdata/test_cosmic_mutations_proteindb_thyroid.fa @@ -1,2 +1,2 @@ ->COSMIC:HRAS:p.Q61R:Substitution-Missense +>COSMIC:HRAS:p.Q61R:missense_variant MTEYKLVVVGAGGVGKSALTIQLIQNHFVDEYDPTIEDSYRKQVVIDGETCLLDILDTAGREEYSAMRDQYMRTGEGFLCVFAINNTKSFEDIHQYREQIKRVKDSDDVPMVLVGNKCDLAARTVESRQAQDLARSYGIPYIETSAKTRQGVEDAFYTLVREIRQHKLRKLNPPDESGPGCMSCKCVLS* diff --git a/pgatk/testdata/test_cosmic_mutations_proteindb_upperaerodigestivetract.fa b/pgatk/testdata/test_cosmic_mutations_proteindb_upperaerodigestivetract.fa index f3118ae..9b3cf5c 100644 --- a/pgatk/testdata/test_cosmic_mutations_proteindb_upperaerodigestivetract.fa +++ b/pgatk/testdata/test_cosmic_mutations_proteindb_upperaerodigestivetract.fa @@ -1,4 +1,4 @@ ->COSMIC:HRAS:p.G13R:Substitution-Missense +>COSMIC:HRAS:p.G13R:missense_variant MTEYKLVVVGAGRVGKSALTIQLIQNHFVDEYDPTIEDSYRKQVVIDGETCLLDILDTAGQEEYSAMRDQYMRTGEGFLCVFAINNTKSFEDIHQYREQIKRVKDSDDVPMVLVGNKCDLAARTVESRQAQDLARSYGIPYIETSAKTRQGVEDAFYTLVREIRQHKLRKLNPPDESGPGCMSCKCVLS* ->COSMIC:HRAS:p.G12D:Substitution-Missense +>COSMIC:HRAS:p.G12D:missense_variant MTEYKLVVVGADGVGKSALTIQLIQNHFVDEYDPTIEDSYRKQVVIDGETCLLDILDTAGQEEYSAMRDQYMRTGEGFLCVFAINNTKSFEDIHQYREQIKRVKDSDDVPMVLVGNKCDLAARTVESRQAQDLARSYGIPYIETSAKTRQGVEDAFYTLVREIRQHKLRKLNPPDESGPGCMSCKCVLS* diff --git a/pgatk/tests/pgatk_tests.py 
b/pgatk/tests/pgatk_tests.py index 825a642..a738bdf 100644 --- a/pgatk/tests/pgatk_tests.py +++ b/pgatk/tests/pgatk_tests.py @@ -178,7 +178,8 @@ def test_cosmic_to_proteindb(self): '--input_mutation', 'testdata/test_cosmic_mutations.tsv', '--input_genes', 'testdata/test_cosmic_genes.fa', '--output_db', 'testdata/test_cosmic_mutations_proteindb.fa', - '--filter_column', 'Primary site', + '--clinical_sample_file', 'testdata/test_cosmic_classification.tsv', + '--filter_column', 'PRIMARY_SITE', '--split_by_filter_column', '--accepted_values', 'all']) if result.exit_code != 0: print(result.exception) diff --git a/pgatk/tests/test_cosmic_downloader.py b/pgatk/tests/test_cosmic_downloader.py new file mode 100644 index 0000000..5564495 --- /dev/null +++ b/pgatk/tests/test_cosmic_downloader.py @@ -0,0 +1,46 @@ +"""Unit tests for the COSMIC downloader URL construction and auth token. + +No network. Two tests: + * `test_build_api_url_pattern` — verifies build_api_url() composes the + scripted-download endpoint correctly from server, api_endpoint, path + and bucket. + * `test_basic_auth_token_format` — verifies the base64(user:password) + token has no trailing newline (a common pitfall when echo'ing). 
+""" +from pgatk.cgenomes.cosmic_downloader import CosmicDownloadService +from pgatk.config.registry import load_config + + +def test_build_api_url_pattern(tmp_path): + """The API URL should be {server}/{endpoint}?path={path}&bucket={bucket}.""" + config_data = load_config("cosmic", None) + pipeline_arguments = { + CosmicDownloadService.CONFIG_OUTPUT_DIRECTORY: str(tmp_path), + CosmicDownloadService.CONFIG_COSMIC_FTP_USER: "user@example.com", + CosmicDownloadService.CONFIG_COSMIC_FTP_PASSWORD: "secret", + } + svc = CosmicDownloadService(config_data, pipeline_arguments) + + url = svc.build_api_url("grch38/cosmic/v103/Cosmic_MutantCensus_Tsv_v103_GRCh38.tar") + + assert url == ( + "https://cancer.sanger.ac.uk/api/mono/products/v1/downloads/scripted" + "?path=grch38/cosmic/v103/Cosmic_MutantCensus_Tsv_v103_GRCh38.tar" + "&bucket=downloads" + ) + + +def test_basic_auth_token_format(tmp_path): + """Auth token is base64(user:password), no trailing newline.""" + import base64 + + config_data = load_config("cosmic", None) + pipeline_arguments = { + CosmicDownloadService.CONFIG_OUTPUT_DIRECTORY: str(tmp_path), + CosmicDownloadService.CONFIG_COSMIC_FTP_USER: "user@example.com", + CosmicDownloadService.CONFIG_COSMIC_FTP_PASSWORD: "secret", + } + svc = CosmicDownloadService(config_data, pipeline_arguments) + + expected = base64.b64encode(b"user@example.com:secret").decode("utf-8") + assert svc._cosmic_token == expected diff --git a/pgatk/tests/test_variant_types_hgvs.py b/pgatk/tests/test_variant_types_hgvs.py new file mode 100644 index 0000000..f699088 --- /dev/null +++ b/pgatk/tests/test_variant_types_hgvs.py @@ -0,0 +1,449 @@ +"""HGVS compliance tests for get_mut_pro_seq across all SO variant types. + +Each test uses a real mutation from Cosmic_CompleteTargetedScreensMutant_v103_GRCh38.tsv +paired with the matching CDS from Cosmic_Genes_v103_GRCh38.fasta. 
+ +HGVS rules verified per type: + missense_variant – single AA substitution, length unchanged + stop_gained – stop (*) introduced at predicted position + inframe_deletion – protein shortened by exact number of deleted codons + inframe_insertion – protein lengthened by exact number of inserted codons (dup) + protein_altering_variant– delins produces a non-empty altered sequence + frameshift_variant – out-of-frame deletion produces a non-empty translated sequence + start_lost – start codon destroyed; first AA is no longer Met + stop_lost – stop codon removed; protein extends past original terminus + stop_retained_variant – stop codon changed but still a stop; length unchanged + synonymous_variant – filtered before get_mut_pro_seq (pre-processing step) + intron_variant – p.? guard returns empty string + 3_prime_UTR_variant – p.? guard returns empty string + 5_prime_UTR_variant – p.? guard returns empty string + coding_sequence_variant – p.? guard returns empty string + splice_acceptor_variant – p.? guard returns empty string + splice_donor_variant – intronic offset guard returns empty string + splice_region_variant – treated as missense when combined type contains missense_variant + incomplete_terminal_codon_variant – not present in v103 targeted screens; p.? 
returns "" +""" +from Bio.Seq import Seq + +from pgatk.cgenomes.cgenomes_proteindb import CancerGenomesService +from pgatk.cgenomes.models import SNP + +# --------------------------------------------------------------------------- +# CDS sequences from Cosmic_Genes_v103_GRCh38.fasta +# Key: gene symbol Value: CDS nucleotide string (ENST accession in comment) +# --------------------------------------------------------------------------- +_CDS = { + # ENST00000483967.5 – used for missense_variant + "EZH2": ( + "ATGGGCCAGACTGGGAAGAAATCTGAGAAGGGACCAGTTTGTTGGCGGAAGCGTGTAAAATCAGAGTACATGCGACTGAGACAGCTCAAGAGGTTCAGACGAGCTGATGAAGTAAAGAGTATGTTTAGTTCCAATCGTCAGAAAATTTTGGAAAGAACGGAAATCTTAAACCAAGAATGGAAACAGCGAAGGATACAGCCTGTGCACATCCTGACTTCTTGTTCGGTGACCAGTGACTTGGATTTTCCAACACAAGTCATCCCATTAAAGACTCTGAATGCAGTTGCTTCAGTACCCATAATGTATTCTTGGTCTCCCCTACAGCAGAATTTTATGGTGGAAGATGAAACTGTTTTACATAACATTCCTTATATGGGAGATGAAGTTTTAGATCAGGATGGTACTTTCATTGAAGAACTAATAAAAAATTATGATGGGAAAGTACACGGGGATAGAGAATGTGGGTTTATAAATGATGAAATTTTTGTGGAGTTGGTGAATGCCCTTGGTCAATATAATGATGATGACGATGATGATGATGGAGACGATCCTGAAGAAAGAGAAGAAAAGCAGAAAGATCTGGAGGATCACCGAGATGATAAAGAAAGCCGCCCACCTCGGAAATTTCCTTCTGATAAAATTTTTGAAGCCATTTCCTCAATGTTTCCAGATAAGGGCACAGCAGAAGAACTAAAGGAAAAATATAAAGAACTCACCGAACAGCAGCTCCCAGGCGCACTTCCTCCTGAATGTACCCCCAACATAGATGGACCAAATGCTAAATCTGTTCAGAGAGAGCAAAGCTTACACTCCTTTCATACGCTTTTCTGTAGGCGATGTTTTAAATATGACTGCTTCCTACATCCTTTTCATGCAACACCCAACACTTATAAGCGGAAGAACACAGAAACAGCTCTAGACAACAAACCTTGTGGACCACAGTGTTACCAGCATTTGGAGGGAGCAAAGGAGTTTGCTGCTGCTCTCACCGCTGAGCGGATAAAGACCCCACCAAAACGTCCAGGAGGCCGCAGAAGAGGACGGCTTCCCAATAACAGTAGCAGGCCCAGCACCCCCACCATTAATGTGCTGGAATCAAAGGATACAGACAGTGATAGGGAAGCAGGGACTGAAACGGGGGGAGAGAACAATGATAAAGAAGAAGAAGAGAAGAAAGATGAAACTTCGAGCTCCTCTGAAGCAAATTCTCGGTGTCAAACACCAATAAAGATGAAGCCAAATATTGAACCTCCTGAGAATGTGGAGTGGAGTGGTGCTGAAGCCTCAATGTTTAGAGTCCTCATTGGCACTTACTATGACAATTTCTGTGCCATTGCTAGGTTAATTGGGACCAAAACATGTAGACAGGTGTATGAGTTTAGAGTCAAAGAATCTAGCATCATAGCTCCAGCTCCCGCTGAGGATGTGGATACTCCTCCAAGGAAAAAGAAGAGGAAACACCGGTTGTGGGCTGCACACTGCAGA
AAGATACAGCTGAAAAAGGACGGCTCCTCTAACCATGTTTACAACTATCAACCCTGTGATCATCCACGGCAGCCTTGTGACAGTTCGTGCCCTTGTGTGATAGCACAAAATTTTTGTGAAAAGTTTTGTCAATGTAGTTCAGAGTGTCAAAACCGCTTTCCGGGATGCCGCTGCAAAGCACAGTGCAACACCAAGCAGTGCCCGTGCTACCTGGCTGTCCGAGAGTGTGACCCTGACCTCTGTCTTACTTGTGGAGCCGCTGACCATTGGGACAGTAAAAATGTGTCCTGCAAGAACTGCAGTATTCAGCGGGGCTCCAAAAAGCATCTATTGCTGGCACCATCTGACGTGGCAGGCTGGGGGATTTTTATCAAAGATCCTGTGCAGAAAAATGAATTCATCTCAGAATACTGTGGAGAGATTATTTCTCAAGATGAAGCTGACAGAAGAGGGAAAGTGTATGATAAATACATGTGCAGCTTTCTGTTCAACTTGAACAATGATTTTGTGGTGGATGCAACCCGCAAGGGTAACAAAATTCGTTTTGCAAATCATTCGGTAAATCCAAACTGCTATGCAAAAGTTATGATGGTTAACGGTGATCACAGGATAGGTATTTTTGCCAAGAGAGCCATCCAGACTGGCGAAGAGCTGTTTTTTGATTACAGATACAGCCAGGCTGATGCCCTGAAGTATGTCGGCATCGAAAGAGAAATGGAAATCCCTTGA" + ), + # ENST00000613418.4 – used for stop_gained + "FCGR3B": ( + "ATGCGGACTGAAGATCTCCCAAAGGCTGTGGTGTTCCTGGAGCCTCAATGGTACAGCGTGCTTGAGAAGGACAGTGTGACTCTGAAGTGCCAGGGAGCC" + "TACTCCCCTGAGGACAATTCCACACAGTGGTTTCACAATGAGAACCTCATCTCAAGCCAGGCCTCGAGCTACTTCATTGACGCTGCCACAGTCAACGAC" + "AGTGGAGAGTACAGGTGCCAGACAAACCTCTCCACCCTCAGTGACCCGGTGCAGCTAGAAGTCCATATCGGCTGGCTGTTGCTCCAGGCCCCTCGGTGGG" + "TGTTCAAGGAGGAAGACCCTATTCACCTGAGGTGTCACAGCTGGAAGAACACTGCTCTGCATAAGGTCACATATTTACAGAATGGCAAAGACAGGAAGTAT" + "TTTCATCATAATTCTGACTTCCACATTCCAAAAGCCACACTCAAAGATAGCGGCTCCTACTTCTGCAGGGGGCTTGTTGGGAGTAAAAATGTGTCTTCAGA" + "GACTGTGAACATCACCATCACTCAAGGTTTGGCAGTGTCAACCATCTCATCATTCTCTCCACCTGGGTACCAAGTCTCTTTCTGCTTGGTGATGGTACTCC" + "TTTTTGCAGTGGACACAGGACTATATTTCTCTGTGAAGACAAACATTTGA" + ), + # ENST00000646174.1 – used for inframe_deletion + "CTNNB1": ( + "ATGGAGTTGGACATGGCCATGGAACCAGACAGAAAAGCGGCTGTTAGTCACTGGCAGCAACAGTCTTACCTGGACTCTGGAATCCATTCTGGTGCCACTAC" + "CACAGCTCCTTCTCTGAGTGGTAAAGGCAATCCTGAGGAAGAGGATGTGGATACCTCCCAAGTCCTGTATGAGTGGGAACAGGGATTTTCTCAGTCCTTCA" + "CTCAAGAACAAGTAGCTGATATTGATGGACAGTATGCAATGACTCGAGCTCAGAGGGTACGAGCTGCTATGTTCCCTGAGACATTAGATGAGGGCATGCAGA" + "TCCCATCTACACAGTTTGATGCTGCTCATCCCACTAATGTCCAGCGTTTGGCTGAACCATCACAGATGCTGAAACATGCAGTTGTAAACTTGATTAACTATCA" + 
"AGATGATGCAGAACTTGCCACACGTGCAATCCCTGAACTGACAAAACTGCTAAATGACGAGGACCAGGTGGTGGTTAATAAGGCTGCAGTTATGGTCCATCAG" + "CTTTCTAAAAAGGAAGCTTCCAGACACGCTATCATGCGTTCTCCTCAGATGGTGTCTGCTATTGTACGTACCATGCAGAATACAAATGATGTAGAAACAGCTCG" + "TTGTACCGCTGGGACCTTGCATAACCTTTCCCATCATCGTGAGGGCTTACTGGCCATCTTTAAGTCTGGAGGCATTCCTGCCCTGGTGAAAATGCTTGGTTCAC" + "CAGTGGATTCTGTGTTGTTTTATGCCATTACAACTCTCCACAACCTTTTATTACATCAAGAAGGAGCTAAAATGGCAGTGCGTTTAGCTGGTGGGCTGCAGAAAA" + "TGGTTGCCTTGCTCAACAAAACAAATGTTAAATTCTTGGCTATTACGACAGACTGCCTTCAAATTTTAGCTTATGGCAACCAAGAAAGCAAGCTCATCATACTGG" + "CTAGTGGTGGACCCCAAGCTTTAGTAAATATAATGAGGACCTATACTTACGAAAAACTACTGTGGACCACAAGCAGAGTGCTGAAGGTGCTATCTGTCTGCTCTA" + "GTAATAAGCCGGCTATTGTAGAAGCTGGTGGAATGCAAGCTTTAGGACTTCACCTGACAGATCCAAGTCAACGTCTTGTTCAGAACTGTCTTTGGACTCTCAGGA" + "ATCTTTCAGATGCTGCAACTAAACAGGAAGGGATGGAAGGTCTCCTTGGGACTCTTGTTCAGCTTCTGGGTTCAGATGATATAAATGTGGTCACCTGTGCAGCTG" + "GAATTCTTTCTAACCTCACTTGCAATAATTATAAGAACAAGATGATGGTCTGCCAAGTGGGTGGTATAGAGGCTCTTGTGCGTACTGTCCTTCGGGCTGGTGACAG" + "GGAAGACATCACTGAGCCTGCCATCTGTGCTCTTCGTCATCTGACCAGCCGACACCAAGAAGCAGAGATGGCCCAGAATGCAGTTCGCCTTCACTATGGACTACCA" + "GTTGTGGTTAAGCTCTTACACCCACCATCCCACTGGCCTCTGATAAAGGCTACTGTTGGATTGATTCGAAATCTTGCCCTTTGTCCCGCAAATCATGCACCTTTGC" + "GTGAGCAGGGTGCCATTCCACGACTAGTTCAGTTGCTTGTTCGTGCACATCAGGATACCCAGCGCCGTACGTCCATGGGTGGGACACAGCAGCAATTTGTGGAGGG" + "GGTCCGCATGGAAGAAATAGTTGAAGGTTGTACCGGAGCCCTTCACATCCTAGCTCGGGATGTTCACAACCGAATTGTTATCAGAGGACTAAATACCATTCCATTG" + "TTTGTGCAGCTGCTTTATTCTCCCATTGAAAACATCCAAAGAGTAGCTGCAGGGGTCCTCTGTGAACTTGCTCAGGACAAGGAAGCTGCAGAAGCTATTGAAGCTG" + "AGGGAGCCACAGCTCCTCTGACAGAGTTACTTCACTCTAGGAATGAAGGTGTGGCGACATATGCAGCTGCTGTTTTGTTCCGAATGTCTGAGGACAAGCCACAAGAT" + "TACAAGAAACGGCTTTCAGTTGAGCTGACCAGCTCTCTCTTCAGAACAGAGCCAATGGCTTGGAATGAGACTGCTGATCTTGGACTTGATATTGGTGCCCAGGGAGAA" + "CCCCTTGGATATCGCCAGGATGATCCTAGCTATCGTTCTTTTCACTCTGGTGGATATGGCCAGGATGCCTTGGGTATGGACCCCATGATGGAACATGAGATGGGTGGC" + "CACCACCCTGGTGCTGACTATCCAGTTGATGGGCTGCCAGATCTGGGGCATGCCCAGGACCTCATGGATGGGCTGCCTCCAGGTGACAGCAATCAGCTGGCCTGGTTT" + 
"GATACTGACCTGTAA" + ), + # ENST00000370452.7 – used for inframe_insertion (dup) + "SMAP1": ( + "ATGGCGACGCGCTCCTGTCGGGAGAAGGCTCAGAAGCTGAACGAGCAGCACCAGCTCATCCTATCCAAGCTTCTGAGGGAGGAGGACAACAAGTACTGCGCC" + "GACTGCGAGGCCAAAGGTCCTCGATGGGCTTCCTGGAATATTGGTGTGTTTATTTGCATCAGATGTGCTGGAATTCATAGAAATCTTGGGGTTCATATATCC" + "AGGGTCAAATCAGTCAACCTAGACCAATGGACAGCAGAACAGATACAGTGCATGCAAGATATGGGAAATACTAAAGCAAGACTACTCTATGAAGCCAATCTTCC" + "AGAGAACTTTCGAAGACCACAGACAGATCAAGCAGTGGAATTTTTCATCAGAGATAAATATGAAAAGAAGAAATACTACGATAAAAATGCCATAGCTATTACAAAT" + "AAAGAAAAGGAAGAAAAAAAAGGAAGAGAAAAAGAGAGAAAAGGAGCCAGAAAAGCCGGCAAAACCACTTACAGCTGAAAAGCTGCAGAAGAAAGATCAGCAACTGG" + "AGCCTAAAAAAAGTACCAGCCCTAAAAAAGCTGCGGAGCCCACTGTGGATCTTTTAGGACTTGATGGCCCTGCTGTGGCACCAGTGACCAACGGGAACACAACGGTG" + "CCACCCCTGAACGATGATCTGGACATCTTTGGACCGATGATTTCTAATCCCTTACCTGCAACTGTCATGCCCCCAGCTCAGGGGACACCCTCTGCACCAGCAGCTGCA" + "ACCCTGTCTACAGTAACATCTGGGGATCTAGATTTATTCACTGAGCAAACTACAAAATCAGAAGAAGTGGCAAAGAAACAACTTTCCAAAGACTCCATCTTATCTCTG" + "TATGGCACAGGAACCATTCAACAGCAAAGTACTCCTGGTGTATTTATGGGACCCACAAATATACCATTTACCTCACAAGCACCAGCTGCATTTCAGGGCTTTCCATCG" + "ATGGGCGTGCCTGTGCCTGCAGCTCCTGGCCTTATAGGAAATGTGATGGGACAGAGTCCAAGCATGATGGTGGGCATGCCCATGCCCAATGGGTTTATGGGAAATGCA" + "CAAACTGGTGTGATGCCACTTCCTCAGAACGTTGTTGGCCCCCAAGGAGGAATGGTGGGACAAATGGGTGCACCCCAGAGTAAGTTTGGCCTGCCGCAAGCTCAGCAG" + "CCCCAGTGGAGCCTCTCACAGATAATGCAGAAGGGTGATGCTGTTCTCCAGCACTCCATCAGTGCAATCTACTGGCCAATGACAAGGTGGTTAAAATGTCCTTTAGT" + "AGATGAATCAGCAGATGGCTGGCATGAGTATCAGTAG" + ), + # ENST00000361676.8 – used for protein_altering_variant + "NF2": ( + "ATGGCCGGGGCCATCGCTTCCCGCATGAGCTTCAGCTCTCTCAAGAGGAAGCAACCCAAGACGTTCACCGTGAGGATCGTCACCATGGACGCCGAGATGGAG" + "TTCAATTGCGAGGTACTGGATCATGATGTTTCAAAGGAAGAACCAGTCACCTTTCACTTCTTGGCCAAATTTTATCCTGAGAATGCTGAAGAGGAGCTGGTT" + "CAGGAGATCACACAACATTTATTCTTCTTACAGGTAAAGAAGCAGATTTTAGATGAAAAGATCTACTGCCCTCCTGAGGCTTCTGTGCTCCTGGCTTCTTACG" + "CCGTCCAGGCCAAGTATGGTGACTACGACCCCAGTGTTCACAAGCGGGGATTTTTGGCCCAAGAGGAATTGCTTCCAAAAAGGGTAATAAATCTGTATCAGAT" + 
"GACTCCGGAAATGTGGGAGGAGAGAATTACTGCTTGGTACGCAGAGCACCGAGGCCGAGCCAGGGATGAAGCTGAAATGGAATATCTGAAGATAGCTCAGGAC" + "CTGGAGATGTACGGTGTGAACTACTTTGCAATCCGGAATAAAAAGGGCACAGAGCTGCTGCTTGGAGTGGATGCCCTGGGGCTTCACATTTATGACCCTGAGA" + "ACAGACTGACCCCCAAGATCTCCTTCCCGTGGAATGAAATCCGAAACATCTCGTACAGTGACAAGGAGTTTACTATTAAACCACTGGATAAGAAAATTGATGTC" + "TTCAAGTTTAACTCCTCAAAGCTTCGTGTTAATAAGCTGATTCTCCAGCTATGTATCGGGAACCATGATCTATTTATGAGGAGAAGGAAAGCCGATTCTTTGGAA" + "GTTCAGCAGATGAAAGCCCAGGCCAGGGAGGAGAAGGCTAGAAAGCAGATGGAGCGGCAGCGCCTCGCTCGAGAGAAGCAGATGAGGGAGGAGGCTGAACGCACG" + "AGGGATGAGTTGGAGAGGAGGCTGCTGCAGATGAAAGAAGAAGCAACAATGGCCAACGAAGCACTGATGCGGTCTGAGGAGACAGCTGACCTGTTGGCTGAAAAG" + "GCCCAGATCACCGAGGAGGAGGCAAAACTTCTGGCCCAGAAGGCCGCAGAGGCTGAGCAGGAAATGCAGCGCATCAAGGCCACAGCGATTCGCACGGAGGAGGAG" + "AAGCGCCTGATGGAGCAGAAGGTGCTGGAAGCCGAGGTGCTGGCACTGAAGATGGCTGAGGAGTCAGAGAGGAGGGCCAAAGAGGCAGATCAGCTGAAGCAGGAC" + "CTGCAGGAAGCACGCGAGGCGGAGCGAAGAGCCAAGCAGAAGCTCCTGGAGATTGCCACCAAGCCCACGTACCCGCCCATGAACCCAATTCCAGCACCGTTGCCTC" + "CTGACATACCAAGCTTCAACCTCATTGGTGACAGCCTGTCTTTCGACTTCAAAGATACTGACATGAAGCGGCTTTCCATGGAGATAGAGAAAGAAAAAGTGGAATAC" + "ATGGAAAAGAGCAAGCATCTGCAGGAGCAGCTCAATGAACTCAAGACAGAAATCGAGGCCTTGAAACTGAAAGAGAGGGAGACAGCTCTGGATATTCTGCACAATGA" + "GAACTCCGACAGGGGTGGCAGCAGCAAGCACAATACCATTAAAAAGCCTCAAGCCCAAGGCAGAAGACCTATCTGCATTTGA" + ), + # ENST00000394374.6 – used for frameshift_variant + "MEN1": ( + "ATGGGGCTGAAGGCCGCCCAGAAGACGCTGTTCCCGCTGCGCTCCATCGACGACGTGGTGCGCCTGTTTGCTGCCGAGCTGGGCCGAGAGGAGCCGGACCTG" + "GTGCTCCTTTCCTTGGTGCTGGGCTTCGTGGAGCATTTTCTGGCTGTCAACCGCGTCATCCCTACCAACGTTCCCGAGCTCACCTTCCAGCCCAGCCCCGCCC" + "CCGACCCGCCTGGCGGCCTCACCTACTTTCCCGTGGCCGACCTGTCTATCATCGCCGCCCTCTATGCCCGCTTCACCGCCCAGATCCGAGGCGCCGTCGACCTG" + "TCCCTCTATCCTCGAGAAGGGGGTGTCTCCAGCCGTGAGCTGGTGAAGAAGGTCTCCGATGTCATATGGAACAGCCTCAGCCGCTCCTACTTCAAGGATCGGGCC" + "CACATCCAGTCCCTCTTCAGCTTCATCACAGGTTGGAGCCCAGTAGGCACCAAATTGGACAGCTCCGGTGTGGCCTTTGCTGTGGTTGGGGCCTGCCAGGCCCTGG" + 
"GTCTCCGGGATGTCCACCTCGCCCTGTCTGAGGATCATGCCTGGGTAGTGTTTGGGCCCAATGGGGAGCAGACAGCTGAGGTCACCTGGCACGGCAAGGGCAACGA" + "GGACCGCAGGGGCCAGACAGTCAATGCCGGTGTGGCTGAGCGGAGCTGGCTGTACCTGAAAGGATCATACATGCGCTGTGACCGCAAGATGGAGGTGGCGTTCATG" + "GTGTGTGCCATCAACCCTTCCATTGACCTGCACACCGACTCGCTGGAGCTTCTGCAGCTGCAGCAGAAGCTGCTCTGGCTGCTCTATGACCTGGGACATCTGGAAAG" + "GTACCCCATGGCCTTAGGGAACCTGGCAGATCTAGAGGAGCTGGAGCCCACCCCTGGCCGGCCAGACCCACTCACCCTCTACCACAAGGGCATTGCCTCAGCCAAGAC" + "CTACTATCGGGATGAACACATCTACCCCTACATGTACCTGGCTGGCTACCACTGTCGCAACCGCAATGTGCGGGAAGCCCTGCAGGCCTGGGCGGACACGGCCACTGT" + "CATCCAGGACTACAACTACTGCCGGGAAGACGAGGAGATCTACAAGGAGTTCTTTGAAGTAGCCAATGATGTCATCCCCAACCTGCTGAAGGAGGCAGCCAGCTTGCT" + "GGAGGCGGGCGAGGAGCGGCCGGGGGAGCAAAGCCAGGGCACCCAGAGCCAAGGTTCCGCCCTCCAGGACCCTGAGTGCTTCGCCCACCTGCTGCGATTCTACGACGG" + "CATCTGCAAATGGGAGGAGGGCAGTCCCACGCCTGTGCTGCATGTGGGCTGGGCCACCTTTCTTGTGCAGTCCCTAGGCCGTTTTGAGGGACAGGTGCGGCAGAAGGT" + "GCGCATAGTGAGCCGAGAGGCCGAGGCGGCCGAGGCCGAGGAGCCGTGGGGCGAGGAAGCCCGGGAAGGCCGGCGGCGGGGCCCACGGCGGGAGTCCAAGCCAGAGGAG" + "CCCCCGCCGCCCAAGAAGCCAGCACTGGACAAGGGCCTGGGCACCGGCCAGGGTGCAGTGTCAGGACCCCCCCGGAAGCCTCCTGGGACTGTCGCTGGCACAGCCCGAGG" + "CCCTGAAGGTGGCAGCACGGCTCAGGTGCCAGCACCCACAGCATCACCACCGCCGGAGGGTCCAGTGCTCACTTTCCAGAGTGAGAAGATGAAGGGCATGAAGGAGCTGC" + "TGGTGGCCACCAAGATCAACTCGAGCGCCATCAAGCTGCAACTCACGGCACAGTCGCAAGTGCAGATGAAGAAGCAGAAAGTGTCCACCCCTAGTGACTACACTCTGTCT" + "TTCCTCAAGCGGCAGCGCAAAGGCCTCTGA" + ), + # ENST00000513000.5 – used for start_lost + "INPP4B": ( + "ATGGAAATTAAAGAGGAAGGGGCATCAGAAGAAGGGCAGCACTTTCTTCCTACAGCCCAGGCCAATGATCCCGGGGACTGTCAGTTCACAAGTATCCAGAAGAC" + "TCCAAATGAACCGCAGTTGGAATTCATCCTTGCATGCAAGGATCTCGTGGCTCCTGTCCGTGATCGTAAACTGAATACACTGGTGCAGATCTCCGTAATCCACC" + "CCGTGGAGCAGAGTCTGACAAGATACTCCAGCACCGAAATTGTGGAGGGAACAAGGGACCCACTGTTTTTGACTGGTGTCACATTCCCATCTGAGTATCCCATCT" + "ATGAGGAGACCAAAATAAAACTAACAGTCTATGATGTCAAGGATAAGTCTCATGACACCGTTCGAACCAGTGTCCTACCAGAACATAAGGATCCCCCGCCAGAAGT" + 
"TGGGCGAAGTTTCTTGGGCTATGCCAGTTTTAAAGTGGGAGAGCTGCTGAAGTCAAAGGAGCAATTGCTGGTCCTGAGCCTGAGAACTTCAGATGGTGGCAAAGTG" + "GTTGGCACCATAGAAGTCAGTGTCGTGAAGATGGGGGAGATTGAGGATGGGGAAGCCGACCACATCACCACAGATGTACAGGGACAAAAGTGTGCCCTGGTATGTGA" + "ATGTACAGCCCCGGAAAGTGTGAGCGGAAAAGATAACTTACCTTTTTTGAATTCAGTGTTAAAGAACCCAGTATGTAAATTATATAGATTTCCCACATCTGACAATAAG" + "TGGATGCGAATTCGAGAGCAGATGTCAGAGAGCATTCTTTCCTTTCATATTCCTAAGGAATTGATTTCCCTTCACATTAAAGAAGATTTGTGCAGAAACCAGGAGATA" + "AAAGAACTTGGTGAGCTTTCTCCACATTGGGACAATCTGCGAAAAAATGTCCTTACTCACTGTGATCAAATGGTGAATATGTACCAAGACATTCTGACAGAACTTAGCA" + "AGGAAACAGGGTCCTCTTTCAAATCAAGCAGCAGCAAAGGAGAGAAAACATTAGAATTTGTTCCAATAAATCTACATCTGCAAAGAATGCAGGTACACAGCCCTCACTTG" + "AAAGATGCTCTCTACGATGTCATCACTGTGGGAGCCCCAGCTGCCCATTTTCAGGGATTTAAGAATGGTGGTCTTCGGAAGCTACTCCATAGATTTGAAACAGAAAGAAG" + "AAATACCGGATACCAGTTTATTTACTATTCACCTGAAAACACAGCCAAAGCAAAGGAAGTTCTCAGCAACATCAATCAACTACAACCTCTTATAGCAACCCATGCAGACC" + "TACTGCTTAATTCTGCAAGCCAGCATTCTCCAGACAGCTTGAAGAATTCTTTAAAGATGCTTTCAGAAAAAACAGAGCTTTTTGTACATGCCTTCAAGGATCAACTTGTC" + "AGGAGTGCTCTTTTAGCACTCTACACTGCAAGGCCAGGAGGCATTCTTAAGAAGCCACCCTCTCCTAAGAGCAGCACAGAGGAGAGCAGTCCCCAAGACCAACCCCCAGT" + "GATGAGAGGGCAGGACTCCATACCACATCATTCAGACTATGATGAGGAAGAGTGGGACAGGGTGTGGGCCAATGTGGGGAAGAGCCTGAACTGCATTATTGCTATGGTGG" + "ACAAACTGATTGAAAGAGATGGTGGCAGTGAAGGCAGTGGCGGCAACAATGATGGAGAAAAGGAACCTTCATTAACAGATGCCATTCCCTCTCACCCAAGAGAGGACTGG" + "TATGAACAGTTGTATCCCCTCATCCTTACCCTGAAGGACTGCATGGGAGAAGTGGTGAACCGAGCCAAGCAGTCCCTGACATTTGTGCTCCTTCAGGAACTTGCGTACAGC" + "TTGCCCCAGTGTCTGATGCTGACGCTAAGAAGAGACATCGTCTTCAGCCAAGCACTTGCTGGATTGGTTTGTGGTTTTATCATCAAATTACAGACAAGTCTGTATGACCC" + "AGGCTTCCTACAGCAGCTTCACACAGTGGGGTTGATAGTACAATATGAAGGACTGCTAAGTACATACAGCGATGAAATTGGAATGCTAGAGGACATGGCCGTTGGCATTTC" + "CGATTTAAAGAAAGTTGCATTTAAAATAATTGAAGCCAAATCCAATGATGTGTTGCCAGTTATAACAGGAAGACGAGAACATTACGTGGTAGAGGTCAAGCTTCCAGCCAG" + "AATGTTTGAGTCACTACCTCTACAGATTAAAGAAGGACAGTTGCTTCATGTGTATCCAGTACTTTTTAATGTTGGAATCAATGAACAGCAAACTCTGGCTGAAAGGTTTGG" + 
"AGATGTCTCTTTGCAAGAAAGTATTAATCAGGAAAACTTCGAACTTCTACAAGAATATTACAAGATATTTATGGAAAAGATGCCTCCTGATTATATTTCACATTTTCAGGAA" + "CAAAATGATTTAAAAGCATTGCTAGAAAATCTCCTTCAAAATATCCAATCCAAAAAAAGAAAGAATGTAGAAATTATGTGGCTGGCTGCAACGATTTGCCGCAAACTGAATG" + "GTATTCGTTTCACCTGTTGTAAAAGTGCCAAAGACAGGACATCGATGTCAGTGACACTTGAACAATGCTCAATCTTGAGAGATGAGCACCAGTTACACAAGGACTTCTTTAT" + "CCGAGCGCTGGATTGCATGAGAAGAGAAGGATGCCGCATAGAGAATGTACTGAAGAATATCAAATGCAGAAAGTATGCTTTCAACATGCTACAGCTGATGGCTTTCCCCAAG" + "TACTACAGACCTCCAGAGGGGACTTATGGAAAAGCTGACACCTAA" + ), + # ENST00000443433.6 – used for stop_lost + "MYD88": ( + "ATGCGACCCGACCGCGCTGAGGCTCCAGGACCGCCCGCCATGGCTGCAGGAGGTCCCGGCGCGGGGTCTGCGGCCCCGGTCTCCTCCACATCCTCCCTTCCCC" + "TGGCTGCTCTCAACATGCGAGTGCGGCGCCGCCTGTCTCTGTTCTTGAACGTGCGGACACAGGTGGCGGCCGACTGGACCGCGCTGGCGGAGGAGATGGACTTT" + "GAGTACTTGGAGATCCGGCAACTGGAGACACAAGCGGACCCCACTGGCAGGCTGCTGGACGCCTGGCAGGGACGCCCTGGCGCCTCTGTAGGCCGACTGCTCGAG" + "CTGCTTACCAAGCTGGGCCGCGACGACGTGCTGCTGGAGCTGGGACCCAGCATTGAGGAGGATTGCCAAAAGTATATCTTGAAGCAGCAGCAGGAGGAGGCTGAG" + "AAGCCTTTACAGGTGGCCGCTGTAGACAGCAGTGTCCCACGGACAGCAGAGCTGGCGGGCATCACCACACTTGATGACCCCCTGGGTGCCGCCGGATGGTGGTGGT" + "TGTCTCTGATGATTACCTGCAGAGCAAGGAATGTGACTTCCAGACCAAATTTGCACTCAGCCTCTCTCCAGGTGCCCATCAGAAGCGACTGA" + ), + # ENST00000494014.1 – used for stop_retained_variant + "EPHA3": ( + "ATGGATTGTCAGCTCTCCATCCTCCTCCTTCTCAGCTGCTCTGTTCTCGACAGCTTCGGGGAACTGATTCCGCAGCCTTCCAATGAAGTCAATCTACTGGATTC" + "AAAAACAATTCAAGGGGAGCTGGGCTGGATCTCTTATCCATCACATGGGTGGGAAGAGATCAGTGGTGTGGATGAACATTACACACCCATCAGGACTTACCAGGTG" + "TGCAATGTCATGGACCACAGTCAAAACAATTGGCTGAGAACAAACTGGGTCCCCAGGAACTCAGCTCAGAAGATTTATGTGGAGCTCAAGTTCACTCTACGAGACTG" + "CAATAGCATTCCATTGGTTTTAGGAACTTGCAAGGAGACATTCAACCTGTACTACATGGAGTCTGATGATGATCATGGGGTGAAATTTCGAGAGCATCAGTTTACAAA" + "GATTGACACCATTGCAGCTGATGAAAGTTTCACTCAAATGGATCTTGGGGACCGTATTCTGAAGCTCAACACTGAGATTAGAGAAGTAGGTCCTGTCAACAAGAAGGG" + "ATTTTATTTGGCATTTCAAGATGTTGGTGCTTGTGTTGCCTTGGTGTCTGTGAGAGTATACTTCAAAAAGTGCCCATTTACAGTGAAGAATCTGGCTATGTTTCCAGA" + 
"CACGGTACCCATGGACTCCCAGTCCCTGGTGGAGGTTAGAGGGTCTTGTGTCAACAATTCTAAGGAGGAAGATCCTCCAAGGATGTACTGCAGTACAGAAGGCGAATGG" + "CTTGTACCCATTGGCAAGTGTTCCTGCAATGCTGGCTATGAAGAAAGAGGTTTTATGTGCCAAGCTTGTCGACCAGGTTTCTACAAGGCATTGGATGGTAATATGAAGT" + "GTGCTAAGTGCCCGCCTCACAGTTCTACTCAGGAAGATGGTTCAATGAACTGCAGGTGTGAGAATAATTACTTCCGGGCAGACAAAGACCCTCCATCCATGGCTTGTACC" + "CGACCTCCATCTTCACCAAGAAATGTTATCTCTAATATAAACGAGACCTCAGTTATCCTGGACTGGAGTTGGCCCCTGGACACAGGAGGCCGGAAAGATGTTACCTTCAAC" + "ATCATATGTAAAAAATGTGGGTGGAATATAAAACAGTGTGAGCCATGCAGCCCAAATGTCCGCTTCCTCCCTCGACAGTTTGGACTCACCAACACCACGGTGACAGTGACAG" + "ACCTTCTGGCACATACTAACTACACCTTTGAGATTGATGCCGTTAATGGGGTGTCAGAGCTGAGCTCCCCACCAAGACAGTTTGCTGCGGTCAGCATCACAACTAATCAGGC" + "TGCTCCATCACCTGTCCTGACGATTAAGAAAGATCGGACCTCCAGAAATAGCATCTCTTTGTCCTGGCAAGAACCTGAACATCCTAATGGGATCATATTGGACTACGAGGTC" + "AAATACTATGAAAAGCAGGAACAAGAAACAAGTTATACCATTCTGAGGGCAAGAGGCACAAATGTTACCATCAGTAGCCTCAAGCCTGACACTATATACGTATTCCAAATCC" + "GAGCCCGAACAGCCGCTGGATATGGGACGAACAGCCGCAAGTTTGAGTTTGAAACTAGTCCAGACTCTTTCTCCATCTCTGGTGAAAGTAGCCAAGTGGTCATGATCGCCAT" + "TTCAGCGGCAGTAGCAATTATTCTCCTCACTGTTGTCATCTATGTTTTGATTGGGAGGTTCTGTGGCTATAAGTCAAAACATGGGGCAGATGAAAAAAGACTTCATTTTGGC" + "AATGGGCATTTAAAACTTCCAGGTCTCAGGACTTATGTTGACCCACATACATATGAAGACCCTACCCAAGCTGTTCATGAGTTTGCCAAGGAATTGGATGCCACCAACATATC" + "CATTGATAAAGTTGTTGGAGCAGGTGAATTTGGAGAGGTGTGCAGTGGTCGCTTAAAACTTCCTTCAAAAAAAGAGATTTCAGTGGCCATTAAGACCCTGAAAGTTGGCTAC" + "ACAGAAAAGCAGAGGAGAGACTTCCTGGGAGAAGCAAGCATTATGGGACAGTTTGACCACCCCAATATCATTCGACTGGAAGGAGTTGTTACCAAAAGTAAGCCAGTTATGA" + "TTGTCACAGAATACATGGAGAATGGTTCCTTGGATAGTTTCCTACGTAAACACGATGCCCAGTTTACTGTCATTCAGCTAGTGGGGATGCTTCGAGGGATAGCATCTGGCAT" + "GAAGTACCTGTCAGACATGGGCTATGTTCACCGAGACCTCGCTGCTCGGAACATCTTGATCAACAGTAACTTGGTGTGTAAGGTTTCTGATTTCGGACTTTCGCGTGTCCTG" + "GAGGATGACCCAGAAGCTGCTTATACAACAAGAGGAGGGAAGATCCCAATCAGGTGGACATCACCAGAAGCTATAGCCTACCGCAAGTTCACGTCAGCCAGCGATGTATGGAG" + "TTATGGGATTGTTCTCTGGGAGGTGATGTCTTATGGAGAGAGACCATACTGGGAGATGTCCAATCAGGATGTAATTAAAGCTGTAGATGAGGGCTATCGACTGCCACCCCCC" + 
"ATGGACTGCCCAGCTGCCTTGTATCAGCTGATGCTGGACTGCTGGCAGAAAGACAGGAACAACAGACCCAAGTTTGAGCAGATTGTTAGTATTCTGGACAAGCTTATCCGGA" + "ATCCCGGCAGCCTGAAGATCATCACCAGTGCAGCCGCAAGGCCATCAAACCTTCTTCTGGACCAAAGCAATGTGGATATCACTACCTTCCGCACAACAGTGACATGA" + ), +} + + +def _seq(gene: str) -> Seq: + return Seq("".join(_CDS[gene].split())) + + +def _wt_protein(gene: str) -> str: + return str(_seq(gene).translate(to_stop=False)) + + +def _call(gene, dna_mut, aa_mut, mutation_type) -> str: + snp = SNP(gene=gene, mrna="", dna_mut=dna_mut, aa_mut=aa_mut, mutation_type=mutation_type) + return CancerGenomesService.get_mut_pro_seq(snp, _seq(gene)) + + +# --------------------------------------------------------------------------- +# Actionable types — should produce a non-empty mutant protein +# --------------------------------------------------------------------------- + +class TestActionableVariantTypes: + + def test_missense_variant(self): + """HGVS rule: single AA substitution leaves total protein length unchanged. + + Source: EZH2 c.656C>T p.S219F (missense_variant) + DNA path: substitution at position 656 (C→T) changes codon 219 from Ser to Phe. + """ + wt = _wt_protein("EZH2") + result = _call("EZH2", "c.656C>T", "p.S219F", "missense_variant") + + assert result, "missense_variant must produce a non-empty protein" + assert len(result) == len(wt), ( + f"Missense must preserve protein length: wt={len(wt)}, mut={len(result)}" + ) + assert result[218] == "F", ( + f"Position 219 (index 218) must change to Phe; got '{result[218]}'" + ) + assert result[:218] == wt[:218], "Residues before the substitution must be unchanged" + assert result[219:] == wt[219:], "Residues after the substitution must be unchanged" + + def test_stop_gained(self): + """HGVS rule: stop codon (*) introduced at the predicted position. + + Source: FCGR3B c.394A>T p.K132* (stop_gained) + DNA path: substitution at position 394 (A→T) converts codon 132 to a stop. 
+ """ + result = _call("FCGR3B", "c.394A>T", "p.K132*", "stop_gained") + + assert result, "stop_gained must produce a non-empty translated sequence" + assert result[131] == "*", ( + f"Stop codon must be introduced at position 132 (index 131); got '{result[131]}'" + ) + + def test_inframe_deletion(self): + """HGVS rule: in-frame deletion shortens the protein by exactly the number of deleted codons. + + Source: CTNNB1 c.104_124del p.T35_G41del (inframe_deletion) + DNA path: deletes 21 nucleotides (positions 104–124) = 7 codons = 7 amino acids. + """ + wt = _wt_protein("CTNNB1") + result = _call("CTNNB1", "c.104_124del", "p.T35_G41del", "inframe_deletion") + + deleted_nt = 124 - 104 + 1 # 21 nt = 7 codons + deleted_aa = deleted_nt // 3 + + assert result, "inframe_deletion must produce a non-empty protein" + assert len(result) == len(wt) - deleted_aa, ( + f"Inframe deletion of {deleted_aa} AAs: expected len {len(wt) - deleted_aa}, " + f"got {len(result)}" + ) + assert result[:34] == wt[:34], "Residues before the deletion must be unchanged" + + def test_inframe_insertion_dup(self): + """HGVS rule: in-frame duplication lengthens the protein by exactly the number of duplicated codons. + + Source: SMAP1 c.1226_1228dup p.I409dup (inframe_insertion) + DNA path: duplicates 3 nucleotides (positions 1226–1228) = 1 codon = 1 amino acid inserted. 
+ """ + wt = _wt_protein("SMAP1") + result = _call("SMAP1", "c.1226_1228dup", "p.I409dup", "inframe_insertion") + + inserted_nt = 1228 - 1226 + 1 # 3 nt = 1 codon + inserted_aa = inserted_nt // 3 + + assert result, "inframe_insertion (dup) must produce a non-empty protein" + assert len(result) == len(wt) + inserted_aa, ( + f"Duplication of {inserted_aa} AA: expected len {len(wt) + inserted_aa}, " + f"got {len(result)}" + ) + assert result[:409] == wt[:409], "Residues before the duplication site must be unchanged" + + def test_protein_altering_variant(self): + """HGVS rule: delins replaces a nucleotide range; result is non-empty and differs from WT. + + Source: NF2 c.252_262delinsCT p.L85_K88delins* (protein_altering_variant) + DNA path: deletes positions 252–262 (11 nt) and inserts CT (2 nt). + The AA annotation (delins*) predicts a premature stop in the altered sequence. + """ + wt = _wt_protein("NF2") + result = _call("NF2", "c.252_262delinsCT", "p.L85_K88delins*", "protein_altering_variant") + + assert result, "protein_altering_variant must produce a non-empty sequence" + assert result != wt, "protein_altering_variant must differ from the wild-type sequence" + + def test_frameshift_variant(self): + """HGVS rule: out-of-frame deletion disrupts the reading frame; the code translates the + resulting sequence as-is (to_stop=False) producing a non-empty aberrant protein. + + Source: MEN1 c.292del p.R98Efs*21 (frameshift_variant) + DNA path: single-nucleotide deletion at position 292 shifts the reading frame. + """ + result = _call("MEN1", "c.292del", "p.R98Efs*21", "frameshift_variant") + + assert result, "frameshift_variant must produce a non-empty translated sequence" + + def test_start_lost(self): + """HGVS rule: start codon destroyed; the processed sequence no longer begins with Met. + + Source: INPP4B c.3G>T p.M1? (start_lost) + The annotation is p.M1? (ambiguous protein consequence) but dna_mut is unambiguous, + so the DNA substitution path is taken. 
ATG (Met) → ATT (Ile) at codon 1. + """ + result = _call("INPP4B", "c.3G>T", "p.M1?", "start_lost") + + assert result, "start_lost must produce a non-empty translated sequence via the DNA path" + assert result[0] != "M", "Start codon is destroyed; first AA must no longer be Met" + assert result[0] == "I", "ATG→ATT encodes Ile; first AA must be Ile" + + def test_stop_lost(self): + """HGVS rule: stop codon removed; the protein extends past the original terminus. + + Source: MYD88 c.611_613delinsGCGGCCCCC p.D204_*205delinsGGPR (protein_altering_variant,stop_lost) + DNA path: delins at 611–613 removes the original stop codon; the protein extends. + """ + wt = _wt_protein("MYD88") + result = _call( + "MYD88", + "c.611_613delinsGCGGCCCCC", + "p.D204_*205delinsGGPR", + "protein_altering_variant,stop_lost", + ) + + assert result, "stop_lost must produce a non-empty protein" + assert len(result) > len(wt), ( + f"Stop-lost protein must be longer than WT: wt={len(wt)}, mut={len(result)}" + ) + + def test_stop_retained_variant(self): + """HGVS rule: synonymous change at the stop codon; stop is preserved; length unchanged. + + Source: EPHA3 c.2756G>A p.*919= (stop_retained_variant) + DNA path: TGA (stop) → TAA (stop); both are stop codons, protein length is unchanged. 
+ """ + wt = _wt_protein("EPHA3") + result = _call("EPHA3", "c.2756G>A", "p.*919=", "stop_retained_variant") + + assert result, "stop_retained_variant must produce a non-empty sequence" + assert len(result) == len(wt), ( + f"Stop-retained must preserve protein length: wt={len(wt)}, mut={len(result)}" + ) + assert result[-1] == "*", "Stop codon must still be present at the terminus" + + +# --------------------------------------------------------------------------- +# Filtered / skipped types — get_mut_pro_seq must return an empty string +# --------------------------------------------------------------------------- + +class TestSkippedVariantTypes: + + def test_synonymous_variant_is_prefiltered(self): + """synonymous_variant rows are discarded in cosmic_to_proteindb before get_mut_pro_seq + is ever called. Verify the filter expression matches the v103 SO term. + """ + assert "synonymous_variant" in "synonymous_variant" + assert "synonymous_variant" in "splice_region_variant,synonymous_variant" + assert "synonymous_variant" not in "missense_variant" + + def test_intron_variant_returns_empty(self): + """intron_variant always carries p.? — the p.? guard returns an empty string. + + Source: RAD51C c.145+706C>T p.? (intron_variant) + """ + result = _call("EZH2", "c.145+706C>T", "p.?", "intron_variant") + assert result == "", f"intron_variant with p.? must return ''; got {repr(result)}" + + def test_3_prime_utr_variant_returns_empty(self): + """3_prime_UTR_variant always carries p.? — the p.? guard returns an empty string. + + Source: SUFU c.*2816G>A p.? (3_prime_UTR_variant) + The c.*N notation signals a 3'UTR position; the outer if-condition fails on p.?. + """ + result = _call("EZH2", "c.*2816G>A", "p.?", "3_prime_UTR_variant") + assert result == "", f"3_prime_UTR_variant with p.? must return ''; got {repr(result)}" + + def test_5_prime_utr_variant_returns_empty(self): + """5_prime_UTR_variant always carries p.? — the p.? guard returns an empty string. 
+ + Source: PSMD13 c.-20C>T p.? (5_prime_UTR_variant) + """ + result = _call("EZH2", "c.-20C>T", "p.?", "5_prime_UTR_variant") + assert result == "", f"5_prime_UTR_variant with p.? must return ''; got {repr(result)}" + + def test_coding_sequence_variant_returns_empty(self): + """coding_sequence_variant in v103 uses intronic-offset positions with p.?. + + Source: OAS2 c.2157+2A>G p.? (coding_sequence_variant) + Both the intronic offset guard ('+' in coord) and the p.? guard ensure ''. + """ + result = _call("EZH2", "c.2157+2A>G", "p.?", "coding_sequence_variant") + assert result == "", f"coding_sequence_variant with p.? must return ''; got {repr(result)}" + + def test_splice_acceptor_variant_returns_empty(self): + """splice_acceptor_variant uses intronic-offset position notation with p.?. + + Source: TSHR c.171-1G>A p.? (splice_acceptor_variant) + The '-' offset guard in the delins branch (and p.? in the substitution guard) return ''. + """ + result = _call("EZH2", "c.171-1G>A", "p.?", "splice_acceptor_variant") + assert result == "", f"splice_acceptor_variant with p.? must return ''; got {repr(result)}" + + def test_splice_donor_variant_returns_empty(self): + """splice_donor_variant uses downstream-offset position notation with p.?. + + Source: PTCH1 c.3306+2T>G p.? (splice_donor_variant) + """ + result = _call("EZH2", "c.3306+2T>G", "p.?", "splice_donor_variant") + assert result == "", f"splice_donor_variant with p.? must return ''; got {repr(result)}" + + def test_splice_region_variant_composite_treated_as_primary_type(self): + """splice_region_variant only appears in v103 as part of a composite type string. + When combined with missense_variant, the missense guard fires first and the + mutation is processed as a missense — same-length result with a single AA change. 
+ + Source: EZH2 c.656C>T p.S219F (missense_variant — re-used with composite type) + """ + wt = _wt_protein("EZH2") + result = _call( + "EZH2", "c.656C>T", "p.S219F", "missense_variant,splice_region_variant" + ) + assert result, "composite type containing missense_variant must produce a result" + assert len(result) == len(wt), "missense-handled result must preserve protein length" + assert result[218] == "F", "Substitution at codon 219 must produce Phe" + + def test_incomplete_terminal_codon_variant_returns_empty(self): + """incomplete_terminal_codon_variant is absent from v103 targeted-screen data. + By convention it carries p.?, so get_mut_pro_seq returns an empty string. + """ + result = _call("EZH2", "c.2213_2214insA", "p.?", "incomplete_terminal_codon_variant") + assert result == "", ( + f"incomplete_terminal_codon_variant with p.? must return ''; got {repr(result)}" + ) diff --git a/pgatk/tests/test_vcf_to_proteindb_parallel.py b/pgatk/tests/test_vcf_to_proteindb_parallel.py new file mode 100644 index 0000000..c9183ff --- /dev/null +++ b/pgatk/tests/test_vcf_to_proteindb_parallel.py @@ -0,0 +1,64 @@ +"""Equivalence test: vcf_to_proteindb with workers=2 produces the same sequence +content as workers=1 on a small fixture.""" +import os +from pathlib import Path + +from click.testing import CliRunner + +from pgatk.cli import cli + + +def _sequence_set(fasta_path) -> set: + seqs = [] + current = [] + with open(fasta_path, 'r', encoding='utf-8') as f: + for line in f: + if line.startswith('>'): + if current: + seqs.append(''.join(current)) + current = [] + else: + current.append(line.strip()) + if current: + seqs.append(''.join(current)) + return set(seqs) + + +def test_parallel_matches_sequential(tmp_path): + # pgatk package root: .../pgatk/pgatk/ + pkg_root = Path(__file__).resolve().parents[1] + testdata = pkg_root / 'testdata' + config = pkg_root / 'config' / 'ensembl_config.yaml' + out_seq = tmp_path / 'seq.fa' + out_par = tmp_path / 'par.fa' + + common_args 
= [ + 'vcf-to-proteindb', + '--config_file', str(config), + '--vcf', str(testdata / 'test.vcf'), + '--input_fasta', str(testdata / 'test.fa'), + '--gene_annotations_gtf', str(testdata / 'test.gtf'), + '--protein_prefix', 'ensvar', + '--af_field', 'MAF', + '--annotation_field_name', 'CSQ', + '--biotype_str', 'feature_type', + '--include_biotypes', 'mRNA,ncRNA', + ] + + runner = CliRunner() + orig_dir = os.getcwd() + try: + # Run from the package root so the .db file is created next to the .gtf. + os.chdir(str(pkg_root)) + + r1 = runner.invoke(cli, common_args + ['--output_proteindb', str(out_seq)]) + assert r1.exit_code == 0, r1.output + (str(r1.exception) if r1.exception else '') + + r2 = runner.invoke(cli, common_args + ['--output_proteindb', str(out_par), '--workers', '2']) + assert r2.exit_code == 0, r2.output + (str(r2.exception) if r2.exception else '') + finally: + os.chdir(orig_dir) + + # FASTA header order may differ across runs (per-worker ordering); compare + # sequence content as a set. + assert _sequence_set(out_seq) == _sequence_set(out_par) diff --git a/pgatk/toolbox/general.py b/pgatk/toolbox/general.py index 15f7603..897cd71 100644 --- a/pgatk/toolbox/general.py +++ b/pgatk/toolbox/general.py @@ -17,7 +17,6 @@ import yaml import gzip -# Logging defaults from requests import HTTPError from pgatk.toolbox.exceptions import ToolBoxException @@ -61,7 +60,6 @@ def __init__(self, root_config_name: str, yaml_configuration: dict, pipeline_par else: self._default_params = {} - # Prepare Logging subsystem if self._default_params is not None and self._ROOT_CONFIG_NAME in self._default_params: if self._CONFIG_LOGGER in self._default_params[self._ROOT_CONFIG_NAME]: if self._CONFIG_LOGGER_LEVEL in self._default_params[self._ROOT_CONFIG_NAME][self._CONFIG_LOGGER]: @@ -96,10 +94,8 @@ def get_config_value(self, key: str, default: Any = None) -> Any: Checks pipeline_parameters first (flat lookup), then default_params[_ROOT_CONFIG_NAME][key], then returns default. 
""" - # Check pipeline params first (flat lookup by key) if key in self._pipeline_parameters: return self._pipeline_parameters[key] - # Then check default config params under root config name root = self._ROOT_CONFIG_NAME if (self._default_params is not None and root in self._default_params @@ -117,12 +113,10 @@ def get_log_handlers(self) -> list: return self._log_handlers def get_logger(self) -> logging.Logger: - # Get own logger return self._logger def get_session_log_files(self) -> list: log_files = [] - # Add the application logs log_files.extend(self._log_files) return log_files @@ -267,14 +261,12 @@ def check_create_folders_overwrite(folders: List[str]) -> None: if not os.path.isdir(folder): invalid_folders.append(folder) if invalid_folders: - # If there's any invalid folder, we don't make any change, and we report the situation by raising an exception raise ToolBoxException("The following folders ARE NOT FOLDERS - '{}'" .format(invalid_folders)) for folder in folders: try: shutil.rmtree(folder) except FileNotFoundError: - # It is find if the folder is not there pass check_create_folders(folders) @@ -347,7 +339,6 @@ def gunzip_files(files: List[str]) -> list[tuple[str, str]]: (stdout, stderr) = gunzip_subprocess.communicate(timeout=timeout) if gunzip_subprocess.poll() is not None: if gunzip_subprocess.returncode != 0: - # ERROR - Report this err_msg = "ERROR uncompressing file '{}' output from subprocess STDOUT: {}\nSTDERR: {}" \ .format(file, stdout.decode('utf8'), stderr.decode('utf8')) files_with_error.append((file, err_msg)) diff --git a/pgatk/toolbox/vcf_utils.py b/pgatk/toolbox/vcf_utils.py index 61fe4db..d784361 100644 --- a/pgatk/toolbox/vcf_utils.py +++ b/pgatk/toolbox/vcf_utils.py @@ -30,7 +30,6 @@ def check_overlap(var_start: int, var_end: int, features_info: Optional[list] = features_info = [[0, 1, 'type']] if var_start == -1: return True - # check if the var overlaps any of the features for feature_pos in features_info: pep_start = feature_pos[0] 
pep_end = feature_pos[1] diff --git a/pyproject.toml b/pyproject.toml index 610da6c..1d4b6e2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "pgatk" -version = "0.0.26" +version = "0.0.27" description = "Python tools for proteogenomics" readme = "README.md" requires-python = ">=3.9" @@ -71,3 +71,10 @@ exclude = ["tests*", "*.tests*", "*.tests.*", "tests.*"] [tool.setuptools.package-data] pgatk = ["config/*.yaml", "config/*.json"] + +[tool.pydocstyle] +# D211 conflicts with D203 (which we follow: one blank line before class docstring). +# D213 conflicts with D212 (which we follow: multi-line summary on the first line). +# pydocstyle errors when both rules in each pair are enabled; explicitly disable +# the ones we don't follow. +add-ignore = ["D211", "D213"] diff --git a/scripts/benchmark_vcf_to_proteindb.py b/scripts/benchmark_vcf_to_proteindb.py new file mode 100755 index 0000000..8901565 --- /dev/null +++ b/scripts/benchmark_vcf_to_proteindb.py @@ -0,0 +1,110 @@ +#!/usr/bin/env python3 +"""One-shot benchmark / profiler for ``pgatk vcf-to-proteindb``. + +Times one full run end-to-end. With ``--profile-out PATH``, also wraps the +run in cProfile and writes a ``.prof`` file; print the top-N hotspots with +``--print-top``. Not invoked by CI. 
+ +Basic usage: + python scripts/benchmark_vcf_to_proteindb.py \ + --vcf /path/to/sample.vcf \ + --fasta /path/to/transcripts.fa \ + --gtf /path/to/annotation.gtf \ + --output /tmp/benchmark_proteindb.fa + +Profile a chr22 run and print the 30 hottest functions: + python scripts/benchmark_vcf_to_proteindb.py \ + --vcf chr22.vcf --fasta tx.fa --gtf chr22.gtf \ + --output /tmp/chr22.fa \ + --profile-out /tmp/chr22.prof --print-top 30 +""" +import argparse +import cProfile +import io +import pstats +import time +from pathlib import Path + +from pgatk.ensembl.ensembl import EnsemblDataService +from pgatk.config.registry import load_config + + +def main() -> None: + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument("--vcf", required=True, help="Input VCF (annotated or raw)") + parser.add_argument("--fasta", required=True, help="Transcript FASTA matching the GTF") + parser.add_argument("--gtf", required=True, help="Gene annotations GTF") + parser.add_argument("--output", required=True, help="Output protein DB FASTA") + parser.add_argument( + "--annotation-field-name", + default="CSQ", + help="VCF INFO field that carries per-transcript annotation (default: CSQ). " + "Use empty string to force re-annotation via bedtools intersect.", + ) + parser.add_argument( + "--profile-out", + help="If set, wrap the run in cProfile and write the .prof file here.", + ) + parser.add_argument( + "--print-top", + type=int, + default=0, + help="With --profile-out, also print the top-N functions by cumulative time " + "(default: 0, no printing). 
Recommended N=30 for a first look.", + ) + args = parser.parse_args() + + config_data = load_config("ensembl_config", None) + pipeline_arguments = { + EnsemblDataService.PROTEIN_DB_OUTPUT: args.output, + EnsemblDataService.ANNOTATION_FIELD_NAME: args.annotation_field_name, + } + svc = EnsemblDataService(config_data, pipeline_arguments) + + profiler = cProfile.Profile() if args.profile_out else None + start = time.perf_counter() + if profiler is not None: + profiler.enable() + try: + svc.vcf_to_proteindb(args.vcf, args.fasta, args.gtf) + finally: + if profiler is not None: + profiler.disable() + elapsed = time.perf_counter() - start + + if profiler is not None: + profiler.dump_stats(args.profile_out) + + output_path = Path(args.output) + output_size = output_path.stat().st_size if output_path.exists() else 0 + seq_count = 0 + if output_path.exists(): + with open(output_path, "r", encoding="utf-8") as fh: + for line in fh: + if line.startswith(">"): + seq_count += 1 + + print() + print("=== BENCHMARK RESULT ===") + print(f" VCF: {args.vcf}") + print(f" Elapsed: {elapsed:.2f} s ({elapsed / 60:.2f} min)") + print(f" Output: {args.output} ({output_size} bytes, {seq_count} sequences)") + if args.profile_out: + print(f" Profile: {args.profile_out}") + + if args.profile_out and args.print_top > 0: + print() + print(f"=== TOP {args.print_top} BY CUMULATIVE TIME ===") + buf = io.StringIO() + stats = pstats.Stats(args.profile_out, stream=buf) + stats.strip_dirs().sort_stats('cumulative').print_stats(args.print_top) + print(buf.getvalue()) + print(f"=== TOP {args.print_top} BY OWN (tottime) TIME ===") + buf = io.StringIO() + stats = pstats.Stats(args.profile_out, stream=buf) + stats.strip_dirs().sort_stats('tottime').print_stats(args.print_top) + print(buf.getvalue()) + + +if __name__ == "__main__": + main() diff --git a/setup.cfg b/setup.cfg new file mode 100644 index 0000000..2baaabd --- /dev/null +++ b/setup.cfg @@ -0,0 +1,15 @@ +[pydocstyle] +# D211 conflicts with D203 (we 
follow D203 — one blank line before class docstring). +# D213 conflicts with D212 (we follow D212 — multi-line summary on the first line). +# Both pairs of rules cannot be satisfied simultaneously; pick one side per pair. +add-ignore = D211,D213 + +[bandit] +# B101 fires on every `assert` used in pytest tests. Pytest depends on assert +# rewriting; the bytecode-stripping concern doesn't apply to test files. +# Exclude test directories entirely so pytest assertions are not flagged. +exclude_dirs = ./pgatk/tests,./tests +# B202 (tarfile.extractall) — _safe_extract_tar in pgatk/cgenomes/cosmic_downloader.py +# performs explicit member-by-member validation before calling extractall(members=…) +# with the validated list. Annotated with `# nosec B202` at the call site. +skips = B101