From 047f026e184467cb1df602c5ec9cfb0813667603 Mon Sep 17 00:00:00 2001 From: Yasset Perez-Riverol Date: Tue, 3 Mar 2026 21:34:05 +0000 Subject: [PATCH] minor changes to the algorithms and bugs --- ...6-03-01-pgatk-rename-refactoring-design.md | 127 -- ...026-03-01-pgatk-rename-refactoring-plan.md | 817 -------- .../plans/2026-03-01-phase0-infrastructure.md | 1199 ------------ .../2026-03-02-clinvar-ncbi-implementation.md | 1652 ----------------- .../2026-03-02-clinvar-ncbi-support-design.md | 208 --- .../2026-03-03-clinvar-pipeline-hardening.md | 618 ------ docs/use-cases.md | 11 +- pgatk/clinvar/clinvar_service.py | 7 +- .../test_clinvar/test_clinvar_service.py | 7 + pgatk/tests/test_vcf_utils.py | 48 + pgatk/toolbox/vcf_utils.py | 22 + 11 files changed, 87 insertions(+), 4629 deletions(-) delete mode 100644 docs/plans/2026-03-01-pgatk-rename-refactoring-design.md delete mode 100644 docs/plans/2026-03-01-pgatk-rename-refactoring-plan.md delete mode 100644 docs/plans/2026-03-01-phase0-infrastructure.md delete mode 100644 docs/plans/2026-03-02-clinvar-ncbi-implementation.md delete mode 100644 docs/plans/2026-03-02-clinvar-ncbi-support-design.md delete mode 100644 docs/plans/2026-03-03-clinvar-pipeline-hardening.md diff --git a/docs/plans/2026-03-01-pgatk-rename-refactoring-design.md b/docs/plans/2026-03-01-pgatk-rename-refactoring-design.md deleted file mode 100644 index 399edd30..00000000 --- a/docs/plans/2026-03-01-pgatk-rename-refactoring-design.md +++ /dev/null @@ -1,127 +0,0 @@ -# Design: py-pgatk → pgatk Rename & Refactoring - -**Date:** 2026-03-01 -**Status:** Approved - -## Context - -The project is currently named `py-pgatk` with a Python package `pypgatk`. We are doing a full cleanup: rename, config system overhaul, bug fixes, and dead code removal. - -## Decisions - -| Decision | Choice | Rationale | -|---|---|---| -| Scope | Full cleanup | Rename + config + bugs + dead code in one effort | -| Delivery | 4 separate PRs | Reviewable, independent after PR 1 | -| Backward compat | Clean break | No shim package, rename everything | -| Config approach | Config Registry | Central mapping, auto-load bundled defaults | -| CLI entry point | `pgatk` | Matches project name | - -## PR 1: Rename (branch: `refactor/rename`) - -**Rename `pypgatk/` → `pgatk/`** across the entire codebase. - -### Changes - -- Rename directory: `pypgatk/` → `pgatk/` -- Rename CLI module: `pypgatkc.py` → `cli.py` -- Update `pyproject.toml`: - - `name = "pgatk"` - - Entry point: `pgatk = "pgatk.cli:main"` - - Package find: `include = ["pgatk*"]` - - Package data: `pgatk = ["config/*.yaml", "config/*.json"]` - - URLs: `bigbio/pgatk` -- Update all internal imports: `from pypgatk.xxx` → `from pgatk.xxx` -- Update docs references (mkdocs.yml, README.md, doc pages) -- Update GitHub Actions workflows -- Update test imports - -### Files affected - -- Every `.py` file with `pypgatk` imports (~44 files) -- `pyproject.toml` -- `mkdocs.yml` -- `README.md` -- `.github/workflows/*.yml` -- `docs/*.md` - -## PR 2: Config Registry (branch: `refactor/config-registry`) - -**Auto-load bundled config defaults so `-c` flag becomes optional.** - -### New module: `pgatk/config/registry.py` - -```python -import os -from pgatk.toolbox.general import read_yaml_from_file - -CONFIG_DIR = os.path.dirname(__file__) - -COMMAND_CONFIGS = { - "ensembl_downloader": "ensembl_downloader_config.yaml", - "ensembl_config": "ensembl_config.yaml", - "cosmic": "cosmic_config.yaml", - "cbioportal": "cbioportal_config.yaml", - "protein_decoy": "protein_decoy.yaml", - "openms_analysis": "openms_analysis.yaml", -} - -def load_config(config_key, user_config_file=None): - """Load config: user file if provided, otherwise bundled default.""" - if user_config_file is not None: - return read_yaml_from_file(user_config_file) - default_file = COMMAND_CONFIGS.get(config_key) - if default_file is None: - return {} - return read_yaml_from_file(os.path.join(CONFIG_DIR, default_file)) -``` - -### Command changes - -Replace the 3-line boilerplate in every command: - -```python -# Before -config_data = None -if config_file is not None: - config_data = read_yaml_from_file(config_file) - -# After -from pgatk.config.registry import load_config -config_data = load_config("ensembl_downloader", config_file) -``` - -### Files affected - -- New: `pgatk/config/registry.py` -- All 14 command files in `pgatk/commands/` - -## PR 3: Bug Fixes (branch: `fix/bugs`) - -| Bug | File | Fix | -|---|---|---| -| `_filter_write_idxml_with_df` writes unfiltered data | `proteomics/openms.py` | Write only filtered entries | -| ISO-8859 encoding typo | `cgenomes/cosmic_downloader.py` | `'ISO-8859'` → `'ISO-8859-1'` | -| Deprecated `Bio.pairwise2` | `proteomics/openms.py` | Replace with `Bio.Align.PairwiseAligner` | -| Enzyme name typos | `proteogenomics/models.py` | `'ArgC/P'` → `'Arg-C/P'` etc. | -| `print_help()` for required args | `commands/*.py` | Use Click's `required=True` | -| `download_file` log overwrite | `toolbox/general.py:193` | `if log is not None:` → `if log is None:` | - -## PR 4: Cleanup (branch: `refactor/cleanup`) - -| Item | File | Action | -|---|---|---| -| Dead `Species`/`SpeciesService` classes | `ensembl/ensembl.py` | Remove if unused | -| Commented-out code after return | `toolbox/general.py:241-253` | Delete | -| Hardcoded version `'0.0.26'` | `cli.py` | Use `importlib.metadata.version("pgatk")` | -| Log files created in CWD | `toolbox/general.py` | Log to stderr only by default | - -## Dependency Order - -``` -PR 1 (Rename) ──► PR 2 (Config Registry) - ──► PR 3 (Bug Fixes) - ──► PR 4 (Cleanup) -``` - -PR 1 must merge first. PRs 2, 3, 4 are independent of each other. diff --git a/docs/plans/2026-03-01-pgatk-rename-refactoring-plan.md b/docs/plans/2026-03-01-pgatk-rename-refactoring-plan.md deleted file mode 100644 index 11252f28..00000000 --- a/docs/plans/2026-03-01-pgatk-rename-refactoring-plan.md +++ /dev/null @@ -1,817 +0,0 @@ -# pgatk Rename & Refactoring Implementation Plan - -> **For Claude:** REQUIRED SUB-SKILL: Use superpowers:executing-plans to implement this plan task-by-task. - -**Goal:** Rename `py-pgatk`/`pypgatk` to `pgatk` everywhere, add config auto-loading, fix bugs, and remove dead code. - -**Architecture:** 4 independent PRs — PR 1 (rename) merges first, PRs 2-4 are independent of each other. Each PR gets its own branch off `dev`. - -**Tech Stack:** Python 3.9+, Click CLI, PyYAML, BioPython, pyOpenMS, MkDocs Material - ---- - -## PR 1: Rename `pypgatk` → `pgatk` (branch: `refactor/rename`) - -### Task 1.1: Rename the package directory and CLI module - -**Files:** -- Rename: `pypgatk/` → `pgatk/` (entire directory tree) -- Rename: `pgatk/pypgatkc.py` → `pgatk/cli.py` - -**Step 1: Rename the directory** - -```bash -cd /Users/yperez/work/proteogenomics/pgatk -git mv pypgatk pgatk -``` - -**Step 2: Rename the CLI module** - -```bash -git mv pgatk/pypgatkc.py pgatk/cli.py -``` - -**Step 3: Verify the rename** - -```bash -ls pgatk/ -# Should show: __init__.py cli.py cgenomes/ commands/ config/ db/ ensembl/ logs/ proteogenomics/ proteomics/ test_all.bed test_annotated.vcf testdata/ tests/ toolbox/ -``` - ---- - -### Task 1.2: Update `__init__.py` - -**Files:** -- Modify: `pgatk/__init__.py` - -**Step 1: Update the package name** - -Change `name = "pypgatk"` to `name = "pgatk"`. - ---- - -### Task 1.3: Update `cli.py` (formerly `pypgatkc.py`) - -**Files:** -- Modify: `pgatk/cli.py` - -**Step 1: Update all imports from `pypgatk` to `pgatk`** - -Replace all 14 import lines: -```python -from pgatk.commands import ensembl_downloader as ensembl_downloader_cmd -from pgatk.commands import ensembl_database as ensembl_database_cmd -from pgatk.commands import cosmic_downloader as cosmic_downloader_cmd -from pgatk.commands import cbioportal_downloader as cbioportal_downloader_cmd -from pgatk.commands import cosmic_to_proteindb as cosmic_to_proteindb_cmd -from pgatk.commands import cbioportal_to_proteindb as cbioportal_to_proteindb_cmd -from pgatk.commands import threeframe_translation as threeframe_translation_cmd -from pgatk.commands import vcf_to_proteindb as vcf_to_proteindb_cmd -from pgatk.commands import dnaseq_to_proteindb as dnase_to_proteindb_cmd -from pgatk.commands import proteindb_decoy as proteindb_decoy_cmd -from pgatk.commands import peptide_class_fdr as peptide_class_fdr_cmd -from pgatk.commands import validate_peptides as validate_peptides_cmd -from pgatk.commands import mztab_class_fdr as mztab_class_fdr_cmd -from pgatk.commands import blast_get_position as blast_get_position_cmd -``` - -**Step 2: Update docstring** - -Change `"py-pgatk"` references to `"pgatk"` in the module docstring and CLI help text. - ---- - -### Task 1.4: Update all Python imports across all modules - -**Files (all files under `pgatk/` that import from `pypgatk`):** - -Use a global find-and-replace: `from pypgatk.` → `from pgatk.` - -These files need updating: -- `pgatk/commands/blast_get_position.py` -- `pgatk/commands/cbioportal_downloader.py` -- `pgatk/commands/cbioportal_to_proteindb.py` -- `pgatk/commands/cosmic_downloader.py` -- `pgatk/commands/cosmic_to_proteindb.py` -- `pgatk/commands/dnaseq_to_proteindb.py` -- `pgatk/commands/ensembl_database.py` -- `pgatk/commands/ensembl_downloader.py` -- `pgatk/commands/mztab_class_fdr.py` -- `pgatk/commands/peptide_class_fdr.py` -- `pgatk/commands/proteindb_decoy.py` -- `pgatk/commands/threeframe_translation.py` -- `pgatk/commands/validate_peptides.py` -- `pgatk/commands/vcf_to_proteindb.py` -- `pgatk/cgenomes/cbioportal_downloader.py` -- `pgatk/cgenomes/cgenomes_proteindb.py` -- `pgatk/cgenomes/cosmic_downloader.py` -- `pgatk/ensembl/data_downloader.py` -- `pgatk/ensembl/ensembl.py` -- `pgatk/ensembl/exceptions.py` -- `pgatk/proteogenomics/blast_get_position.py` -- `pgatk/proteogenomics/mztab_class_fdr.py` -- `pgatk/proteogenomics/spectrumai.py` -- `pgatk/proteomics/openms.py` -- `pgatk/proteomics/db/protein_database_decoy.py` -- `pgatk/toolbox/general.py` - -**Step 1: Run a sed-like replacement across all `.py` files** - -In every `.py` file under `pgatk/`, replace `from pypgatk.` with `from pgatk.` and `import pypgatk` with `import pgatk`. - -**Step 2: Verify no remaining references** - -```bash -grep -r "pypgatk" pgatk/ --include="*.py" -# Should return nothing (except possibly string literals in help text) -``` - ---- - -### Task 1.5: Update test files - -**Files:** -- Modify: `pgatk/tests/pypgatk_tests.py` -- Modify: `pgatk/tests/classes_tests.py` - -**Step 1: Update test imports** - -In `pypgatk_tests.py`: -```python -# Before -from pypgatk.pypgatkc import cli -# After -from pgatk.cli import cli -``` - -In `classes_tests.py`: -```python -# Before -from pypgatk.proteomics.models import PYGPATK_ENZYMES -# After -from pgatk.proteomics.models import PYGPATK_ENZYMES -``` - -**Step 2: Rename the constant `PYGPATK_ENZYMES` → `PGATK_ENZYMES`** - -In `pgatk/proteomics/models.py`, rename the constant: -```python -# Before -PYGPATK_ENZYMES = ... -# After -PGATK_ENZYMES = ... -``` - -Update all references (in `models.py`, `classes_tests.py`, `protein_database_decoy.py`, and any command files). - ---- - -### Task 1.6: Update `pyproject.toml` - -**Files:** -- Modify: `pyproject.toml` - -**Step 1: Update all references** - -```toml -[project] -name = "pgatk" - -[project.scripts] -pgatk = "pgatk.cli:main" - -[project.urls] -Homepage = "http://github.com/bigbio/pgatk" -Repository = "http://github.com/bigbio/pgatk" -Issues = "http://github.com/bigbio/pgatk/issues" - -[tool.setuptools.packages.find] -include = ["pgatk*"] - -[tool.setuptools.package-data] -pgatk = ["config/*.yaml", "config/*.json"] -``` - ---- - -### Task 1.7: Update non-Python files - -**Files:** -- Modify: `MANIFEST.in` -- Modify: `Dockerfile` -- Modify: `.gitignore` -- Modify: `conda-enviroment.yaml` - -**Step 1: Update MANIFEST.in** - -``` -recursive-include pgatk *.json -recursive-include pgatk *.yaml -``` - -**Step 2: Update Dockerfile** - -Replace `pypgatk` references with `pgatk`: -- `LABEL software="pgatk"`, `container="pgatk"` -- `ENV PATH=$PATH:/tool/source/pgatk/` -- `RUN chmod +x /tool/source/pgatk/cli.py` -- Update git clone URL to `bigbio/pgatk` - -**Step 3: Update .gitignore** - -Replace `pypgatk/testdata/` paths with `pgatk/testdata/`. - -**Step 4: Update conda-enviroment.yaml** - -Change `name: py-pgatk` to `name: pgatk`. - ---- - -### Task 1.8: Update GitHub Actions workflows - -**Files:** -- Modify: `.github/workflows/pythonapp.yml` -- Modify: `.github/workflows/pythonpackage.yml` - -**Step 1: Update test directory references** - -In both files, change: -```yaml -# Before -cd pypgatk -# After -cd pgatk -``` - ---- - -### Task 1.9: Update documentation files - -**Files:** -- Modify: `mkdocs.yml` -- Modify: `README.md` -- Modify: `docs/index.md` -- Modify: `docs/installation.md` -- Modify: `docs/pypgatk.md` (rename to `docs/pgatk-cli.md`) -- Modify: `docs/changelog.md` - -**Step 1: Update mkdocs.yml** - -```yaml -repo_name: bigbio/pgatk -repo_url: https://github.com/bigbio/pgatk -# Update nav -nav: - - Home: index.md - - Introduction: introduction.md - - Installation: installation.md - - pgatk CLI: pgatk-cli.md - - File Formats: formats.md - - Changelog: changelog.md - - Support: support.md -# Update social links -- icon: fontawesome/brands/github - link: https://github.com/bigbio/pgatk -- icon: fontawesome/brands/python - link: https://pypi.org/project/pgatk/ -``` - -**Step 2: Rename docs/pypgatk.md → docs/pgatk-cli.md** - -```bash -git mv docs/pypgatk.md docs/pgatk-cli.md -``` - -**Step 3: Update all doc content** - -In all doc files, replace: -- `pypgatk` (the CLI command) → `pgatk` -- `py-pgatk` (repo name) → `pgatk` -- `pip install pypgatk` → `pip install pgatk` -- Badge URLs: `bigbio/py-pgatk` → `bigbio/pgatk` -- PyPI badges: `pypgatk` → `pgatk` - ---- - -### Task 1.10: Run tests and verify - -**Step 1: Install in editable mode** - -```bash -cd /Users/yperez/work/proteogenomics/pgatk -pip install -e . -``` - -**Step 2: Run tests** - -```bash -cd pgatk -python -m unittest discover -s tests -p "*_tests.py" -v -``` - -Expected: All tests pass. - -**Step 3: Verify CLI entry point** - -```bash -pgatk --version -# Expected: pgatk, version 0.0.26 -``` - -**Step 4: Commit** - -```bash -git add -A -git commit -m "refactor: rename pypgatk to pgatk - -- Rename package directory pypgatk/ → pgatk/ -- Rename CLI module pypgatkc.py → cli.py -- Update all Python imports from pypgatk to pgatk -- Rename constant PYGPATK_ENZYMES → PGATK_ENZYMES -- Update pyproject.toml (name, entry point, URLs, package config) -- Update Dockerfile, MANIFEST.in, .gitignore, conda env -- Update GitHub Actions workflows -- Update MkDocs documentation -- CLI command: pypgatk → pgatk -- PyPI package: pypgatk → pgatk" -``` - ---- - -## PR 2: Config Registry (branch: `refactor/config-registry`) - -### Task 2.1: Create `pgatk/config/registry.py` - -**Files:** -- Create: `pgatk/config/registry.py` - -**Step 1: Write the registry module** - -```python -""" -Central config registry: maps config keys to bundled default YAML files. -Enables auto-loading of defaults when user doesn't pass -c. -""" -import os - -from pgatk.toolbox.general import read_yaml_from_file - -CONFIG_DIR = os.path.dirname(__file__) - -COMMAND_CONFIGS = { - "ensembl_downloader": "ensembl_downloader_config.yaml", - "ensembl_config": "ensembl_config.yaml", - "cosmic": "cosmic_config.yaml", - "cbioportal": "cbioportal_config.yaml", - "protein_decoy": "protein_decoy.yaml", - "openms_analysis": "openms_analysis.yaml", -} - - -def load_config(config_key, user_config_file=None): - """ - Load configuration for a command. - - If user_config_file is provided, loads that file. - Otherwise, loads the bundled default config for the given key. - - :param config_key: key in COMMAND_CONFIGS - :param user_config_file: optional path to user-provided config file - :return: parsed YAML config dict, or empty dict if no config found - """ - if user_config_file is not None: - return read_yaml_from_file(user_config_file) - - default_file = COMMAND_CONFIGS.get(config_key) - if default_file is None: - return {} - return read_yaml_from_file(os.path.join(CONFIG_DIR, default_file)) -``` - ---- - -### Task 2.2: Update command files to use `load_config` - -**Files (all 14 command files in `pgatk/commands/`):** - -For each command file, replace: - -```python -# Before -from pgatk.toolbox.general import read_yaml_from_file - -# ... in the command function: -config_data = None -if config_file is not None: - config_data = read_yaml_from_file(config_file) -``` - -With: - -```python -# After -from pgatk.config.registry import load_config - -# ... in the command function: -config_data = load_config("", config_file) -``` - -**Config key mapping per command file:** - -| Command file | Config key | -|---|---| -| `ensembl_downloader.py` | `"ensembl_downloader"` | -| `ensembl_database.py` | `"ensembl_config"` | -| `vcf_to_proteindb.py` | `"ensembl_config"` | -| `dnaseq_to_proteindb.py` | `"ensembl_config"` | -| `threeframe_translation.py` | `"ensembl_config"` | -| `cosmic_downloader.py` | `"cosmic"` | -| `cosmic_to_proteindb.py` | `"cosmic"` | -| `cbioportal_downloader.py` | `"cbioportal"` | -| `cbioportal_to_proteindb.py` | `"cbioportal"` | -| `proteindb_decoy.py` | `"protein_decoy"` | -| `peptide_class_fdr.py` | `"openms_analysis"` | -| `validate_peptides.py` | `"ensembl_config"` | -| `mztab_class_fdr.py` | `"ensembl_config"` | -| `blast_get_position.py` | (no config needed — uses None) | - ---- - -### Task 2.3: Update tests that pass `--config_file` - -**Files:** -- Modify: `pgatk/tests/pypgatk_tests.py` - -Some tests currently pass `--config_file` explicitly. These should still work (user override). -Add a new test that runs a command without `--config_file` to verify auto-loading: - -```python -def test_generate_decoy_database_autoconfig(self): - """Test that generate-decoy works without explicit --config_file (auto-loads bundled default).""" - runner = CliRunner() - result = runner.invoke(cli, - ['generate-decoy', '-in', 'testdata/test_db.fa', - '-out', 'testdata/output_decoy.fa', - '--method', 'protein-reverse']) - self.assertEqual(result.exit_code, 0) -``` - -Note: `test_generate_decoy_database_noconfig` already tests this case — verify it still passes. - -**Step 1: Run tests** - -```bash -cd pgatk -python -m unittest discover -s tests -p "*_tests.py" -v -``` - -**Step 2: Commit** - -```bash -git add pgatk/config/registry.py pgatk/commands/ -git commit -m "refactor: add config registry for auto-loading bundled defaults - -- Add pgatk/config/registry.py with central COMMAND_CONFIGS mapping -- Update all 14 command files to use load_config() -- -c flag is now optional: bundled defaults are loaded automatically -- User can still override with -c path/to/custom_config.yaml" -``` - ---- - -## PR 3: Bug Fixes (branch: `fix/bugs`) - -### Task 3.1: Fix `_filter_write_idxml_with_df` writing unfiltered data - -**Files:** -- Modify: `pgatk/proteomics/openms.py:437` - -**Step 1: Fix the store call** - -```python -# Before (line 437) -idxml_parser().store(output_file, prot_ids, pep_ids) -# After -idxml_parser().store(output_file, prot_ids, new_pep_ids) -``` - ---- - -### Task 3.2: Fix ISO-8859 encoding typo - -**Files:** -- Modify: `pgatk/cgenomes/cosmic_downloader.py:156` - -**Step 1: Fix the encoding name** - -The en-dash (U+2013) in `'ISO-8859–1'` must be replaced with a regular hyphen: - -```python -# Before (line 156) -outfile.write(gzip.decompress(open(local_file, 'rb').read()).decode('ISO-8859–1')) -# After -outfile.write(gzip.decompress(open(local_file, 'rb').read()).decode('ISO-8859-1')) -``` - ---- - -### Task 3.3: Fix `download_file` log inversion bug - -**Files:** -- Modify: `pgatk/toolbox/general.py:192-193` - -**Step 1: Fix the condition** - -```python -# Before (line 192-193) -if log is not None: - log = logging -# After -if log is None: - log = logging -``` - ---- - -### Task 3.4: Fix enzyme name typos in models.py - -**Files:** -- Modify: `pgatk/proteomics/models.py:21-22` - -**Step 1: Fix the typos** - -```python -# Before (lines 21-22 — "cleavege" is misspelled) -'Trypsin/P': {'cleavage rule': '(?<=[KRX])', 'PSIID': 'MS:1001313', 'cleavege sites': 'KR'}, -'V8-DE': {'cleavage rule': '(?<=[DBEZX])(?!P)', 'PSIID': 'MS:1001314', 'cleavege sites': 'DBEZX'}, -# After -'Trypsin/P': {'cleavage rule': '(?<=[KRX])', 'PSIID': 'MS:1001313', 'cleavage sites': 'KR'}, -'V8-DE': {'cleavage rule': '(?<=[DBEZX])(?!P)', 'PSIID': 'MS:1001314', 'cleavage sites': 'DBEZX'}, -``` - -**Important:** Check if `'cleavege sites'` is used as a dictionary key anywhere — if so, update those references too. Search for `cleavege` across the codebase. - ---- - -### Task 3.5: Replace `print_help()` with Click's `required=True` - -**Files (11 command files):** - -For each command file listed below, add `required=True` to the relevant `@click.option()` decorators and remove the `print_help()` check. - -| File | Options to make required | -|---|---| -| `ensembl_database.py` | `--input_fasta` | -| `vcf_to_proteindb.py` | `--input_fasta`, `--vcf`, `--gene_annotations_gtf` | -| `cbioportal_to_proteindb.py` | `--input_mutation`, `--input_cds`, `--output_db` | -| `cosmic_to_proteindb.py` | `--input_mutation`, `--input_genes`, `--output_db` | -| `threeframe_translation.py` | `--input_fasta` | -| `proteindb_decoy.py` | `--input_database` | -| `peptide_class_fdr.py` | `--input_file`, `--output_file` | -| `dnaseq_to_proteindb.py` | `--input_fasta` | -| `blast_get_position.py` | `--input_psm_to_blast`, `--input_reference_database`, `--output_psm` | -| `mztab_class_fdr.py` | `--input_mztab`, `--outfile_name` | - -**Special case: `ensembl_downloader.py`** — This command requires `--taxonomy` OR `--ensembl_name` (either one). Can't use `required=True` directly. Use a Click callback or keep a validation check (but raise `click.UsageError` instead of `print_help()`): - -```python -if taxonomy is None and ensembl_name is None: - raise click.UsageError("Either --taxonomy or --ensembl_name is required.") -``` - -**Special case: `validate_peptides.py`** — Review the specific validation logic before converting. - -**Step 1: For each file, add `required=True` and remove `print_help()` import and call** - -Example pattern: -```python -# Before -from pgatk.commands.utils import print_help - -@click.option('-in', '--input_fasta', help='...') -def command(ctx, config_file, input_fasta, ...): - if input_fasta is None: - print_help() - -# After (remove print_help import, add required=True) -@click.option('-in', '--input_fasta', help='...', required=True) -def command(ctx, config_file, input_fasta, ...): - # print_help() call removed -``` - -**Step 2: Remove `print_help` from `pgatk/commands/utils.py`** if no longer used anywhere. - -**Step 3: Run tests** - -```bash -cd pgatk -python -m unittest discover -s tests -p "*_tests.py" -v -``` - ---- - -### Task 3.6: Replace deprecated `Bio.pairwise2` - -**Files:** -- Modify: `pgatk/proteogenomics/blast_get_position.py` - -**Step 1: Replace import** - -```python -# Before -from Bio import pairwise2, SeqIO -# After -from Bio import SeqIO -from Bio.Align import PairwiseAligner -``` - -**Step 2: Replace usage in `peptide_blast_protein` function** - -```python -# Before -score = pairwise2.align.localms(sequenceA=fasta, sequenceB=peptide, - match=1, mismatch=0, open=-2, extend=-2, score_only=True) -if score == length-1: - alignment = pairwise2.align.localms(sequenceA=fasta, sequenceB=peptide, - match=1, mismatch=0, open=-2, extend=-2)[0] - -# After -aligner = PairwiseAligner() -aligner.mode = 'local' -aligner.match_score = 1 -aligner.mismatch_score = 0 -aligner.open_gap_score = -2 -aligner.extend_gap_score = -2 -alignments = aligner.align(fasta, peptide) -score = alignments.score if alignments else 0 -if score == length - 1: - alignment = alignments[0] - # Update attribute access: alignment.aligned gives coordinate pairs - # Need to adapt get_details() calls to work with new alignment format -``` - -**Note:** The `PairwiseAligner` API differs from `pairwise2`. The alignment object attributes change: -- `alignment.seqA` / `alignment.seqB` → access via `alignment.target` / `alignment.query` -- `alignment.start` / `alignment.end` → access via `alignment.aligned` coordinate arrays - -Carefully test the `get_details()` function calls to ensure they still work with the new alignment format. - -**Step 3: Commit** - -```bash -git add -A -git commit -m "fix: resolve multiple bugs - -- Fix _filter_write_idxml_with_df writing unfiltered data (openms.py) -- Fix ISO-8859 encoding en-dash typo (cosmic_downloader.py) -- Fix download_file log inversion bug (general.py) -- Fix enzyme name typos 'cleavege' → 'cleavage' (models.py) -- Replace print_help() with Click required=True (11 command files) -- Replace deprecated Bio.pairwise2 with PairwiseAligner (blast_get_position.py)" -``` - ---- - -## PR 4: Cleanup (branch: `refactor/cleanup`) - -### Task 4.1: Remove dead `Species` and `SpeciesService` classes - -**Files:** -- Modify: `pgatk/ensembl/models.py` - -**Step 1: Verify they are unused** - -```bash -grep -r "Species\|SpeciesService" pgatk/ --include="*.py" -l -# Should only show pgatk/ensembl/models.py -``` - -**Step 2: Remove both classes** (lines 9-180 approximately). Keep any other code in the file (e.g., imports used elsewhere). - ---- - -### Task 4.2: Remove dead code after return statement - -**Files:** -- Modify: `pgatk/toolbox/general.py` - -**Step 1: Delete commented-out code** at lines 241-253 (after `return downloaded_file` in `download_file()`). - ---- - -### Task 4.3: Dynamic version from package metadata - -**Files:** -- Modify: `pgatk/cli.py` - -**Step 1: Use importlib.metadata** - -```python -# Before -@click.version_option(version='0.0.26') -def cli(): - -# After -from importlib.metadata import version, PackageNotFoundError - -try: - __version__ = version("pgatk") -except PackageNotFoundError: - __version__ = "dev" - -@click.version_option(version=__version__) -def cli(): -``` - ---- - -### Task 4.4: Fix CWD log file creation - -**Files:** -- Modify: `pgatk/toolbox/general.py` - -**Step 1: Replace FileHandler with StreamHandler** - -In `ParameterConfiguration.__init__`, replace the file-based logging with stderr-based logging: - -```python -# Before (lines 72-89) -self._log_handlers = [] -log_handlers_prefix = self._ROOT_CONFIG_NAME + '-' -log_handlers_extension = '.log' - -self._logger = logging.getLogger(__name__) -self._logger.setLevel(getattr(logging, self._log_level)) -self._log_files = [] -for llevel, lformat in self._logger_formatters.items(): - logfile = os.path.join(log_handlers_prefix + llevel.lower() + log_handlers_extension) - lformatter = logging.Formatter(lformat) - lhandler = logging.FileHandler(logfile, mode='w') - lhandler.setLevel(getattr(logging, llevel)) - lhandler.setFormatter(lformatter) - self._log_handlers.append(lhandler) - self._logger.addHandler(lhandler) - self._log_files.append(logfile) - -# After -self._log_handlers = [] -self._logger = logging.getLogger(__name__) -self._logger.setLevel(getattr(logging, self._log_level)) -self._log_files = [] - -# Log to stderr instead of creating files in CWD -for llevel, lformat in self._logger_formatters.items(): - lformatter = logging.Formatter(lformat) - lhandler = logging.StreamHandler() - lhandler.setLevel(getattr(logging, llevel)) - lhandler.setFormatter(lformatter) - self._log_handlers.append(lhandler) - self._logger.addHandler(lhandler) -``` - -**Step 2: Clean up `get_session_log_files()`** - -This method returns `self._log_files` which will now be empty. Keep the method (in case downstream code calls it) but it will return `[]`. - ---- - -### Task 4.5: Run full test suite and commit - -**Step 1: Run tests** - -```bash -cd pgatk -python -m unittest discover -s tests -p "*_tests.py" -v -``` - -**Step 2: Commit** - -```bash -git add -A -git commit -m "refactor: remove dead code and improve logging - -- Remove unused Species and SpeciesService classes (ensembl/models.py) -- Remove commented-out dead code after return (toolbox/general.py) -- Use dynamic version from package metadata instead of hardcoded '0.0.26' -- Log to stderr instead of creating log files in CWD" -``` - ---- - -## Summary: Branch and PR Strategy - -| PR | Branch | Base | Description | -|---|---|---|---| -| 1 | `refactor/rename` | `dev` | Full rename: pypgatk → pgatk | -| 2 | `refactor/config-registry` | `refactor/rename` | Auto-load bundled config defaults | -| 3 | `fix/bugs` | `refactor/rename` | Fix 6 bugs | -| 4 | `refactor/cleanup` | `refactor/rename` | Remove dead code, fix logging | - -PR 1 merges first → then PRs 2, 3, 4 can merge in any order. - -## Post-Merge (Manual) - -1. Rename GitHub repo: Settings → Repository name → `pgatk` -2. Publish new `pgatk` package to PyPI -3. Update Bioconda recipe diff --git a/docs/plans/2026-03-01-phase0-infrastructure.md b/docs/plans/2026-03-01-phase0-infrastructure.md deleted file mode 100644 index d70a21d7..00000000 --- a/docs/plans/2026-03-01-phase0-infrastructure.md +++ /dev/null @@ -1,1199 +0,0 @@ -# Phase 0: Infrastructure & Code Quality — Implementation Plan - -> **For Claude:** REQUIRED SUB-SKILL: Use superpowers:executing-plans to implement this plan task-by-task. - -**Goal:** Fix existing bugs, modernize code patterns, and expand test coverage so the graph engine (Phase 1) is built on solid ground. - -**Architecture:** No architectural changes — this is a cleanup phase touching existing files. All changes are backward-compatible. The test suite must pass before and after each task. - -**Tech Stack:** Python 3.9+, pytest, dataclasses, pathlib, concurrent.futures, logging - ---- - -## Task 1: Add conftest.py with proper fixtures and absolute paths - -**Files:** -- Create: `pgatk/tests/conftest.py` -- Modify: `pgatk/tests/pgatk_tests.py` -- Modify: `pgatk/tests/classes_tests.py` - -**Step 1: Create conftest.py with path fixtures** - -```python -# pgatk/tests/conftest.py -import os -from pathlib import Path - -import pytest - -# Absolute path to testdata/ directory, works regardless of CWD -TESTDATA_DIR = Path(__file__).resolve().parent.parent / "testdata" -CONFIG_DIR = Path(__file__).resolve().parent.parent / "config" - - -@pytest.fixture -def testdata_dir(): - """Absolute path to the testdata directory.""" - return TESTDATA_DIR - - -@pytest.fixture -def config_dir(): - """Absolute path to the config directory.""" - return CONFIG_DIR - - -@pytest.fixture(autouse=True) -def change_to_package_dir(tmp_path, monkeypatch): - """Ensure tests run from a consistent directory (the pgatk package root).""" - monkeypatch.chdir(Path(__file__).resolve().parent.parent) -``` - -**Step 2: Run existing tests to verify they still pass** - -Run: `cd /Users/yperez/work/proteogenomics/pgatk && python -m pytest pgatk/tests/ -v --timeout=120 -x` -Expected: All existing tests pass (the `autouse` fixture changes CWD to the package root, preserving relative path behavior). - -**Step 3: Commit** - -```bash -git add pgatk/tests/conftest.py -git commit -m "test: add conftest.py with testdata/config path fixtures" -``` - ---- - -## Task 2: Fix variable shadowing bug in protein_database_decoy.py - -**Files:** -- Modify: `pgatk/proteomics/db/protein_database_decoy.py:170,180,203` -- Create: `pgatk/tests/test_decoy_shadowing.py` - -**Step 1: Write the failing test** - -```python -# pgatk/tests/test_decoy_shadowing.py -"""Verify that the decoy_sequence import is not shadowed by local variables.""" -from pyteomics.fasta import decoy_sequence as pyteomics_decoy_sequence -from pgatk.proteomics.db import protein_database_decoy - - -def test_decoy_sequence_import_not_shadowed(): - """The module-level import of decoy_sequence must remain callable after - print_target_decoy_composition defines a local variable of the same name.""" - # Verify the import at module level is still the pyteomics function - module_ref = protein_database_decoy.decoy_sequence - assert callable(module_ref), ( - "decoy_sequence at module level should be the pyteomics function, " - f"got {type(module_ref)}" - ) - assert module_ref is pyteomics_decoy_sequence -``` - -**Step 2: Run test to verify it passes (this is a module-level check, so it should pass)** - -Run: `cd /Users/yperez/work/proteogenomics/pgatk && python -m pytest pgatk/tests/test_decoy_shadowing.py -v` -Expected: PASS (the shadow is local to the method, not module-level — but we fix it for safety) - -**Step 3: Rename the shadowing variable** - -In `pgatk/proteomics/db/protein_database_decoy.py`, rename the local variable `decoy_sequence` to `decoy_seq_accumulated` at lines 170, 180, and 203: - -- Line 170: `decoy_sequence = ''` → `decoy_seq_accumulated = ''` -- Line 180: `decoy_sequence = decoy_sequence + str(record.seq)` → `decoy_seq_accumulated = decoy_seq_accumulated + str(record.seq)` -- Line 203: `decoy_aa_composition = self.count_aa_in_dictionary(decoy_aa_composition, decoy_sequence)` → `decoy_aa_composition = self.count_aa_in_dictionary(decoy_aa_composition, decoy_seq_accumulated)` - -**Step 4: Run all tests to verify nothing broke** - -Run: `cd /Users/yperez/work/proteogenomics/pgatk && python -m pytest pgatk/tests/ -v --timeout=120 -x` -Expected: All PASS - -**Step 5: Commit** - -```bash -git add pgatk/proteomics/db/protein_database_decoy.py pgatk/tests/test_decoy_shadowing.py -git commit -m "fix: rename shadowing variable decoy_sequence in print_target_decoy_composition" -``` - ---- - -## Task 3: Convert SNP class to dataclass - -**Files:** -- Modify: `pgatk/cgenomes/models.py` -- Modify: `pgatk/cgenomes/cgenomes_proteindb.py` (update `snp.type` → `snp.mutation_type`) -- Create: `pgatk/tests/test_models.py` - -**Step 1: Write the failing test** - -```python -# pgatk/tests/test_models.py -import dataclasses -from pgatk.cgenomes.models import SNP - - -def test_snp_is_dataclass(): - assert dataclasses.is_dataclass(SNP) - - -def test_snp_defaults(): - snp = SNP() - assert snp.gene is None - assert snp.mrna is None - assert snp.dna_mut is None - assert snp.aa_mut is None - assert snp.mutation_type is None - - -def test_snp_with_values(): - snp = SNP(gene="BRCA1", mrna="NM_007294", dna_mut="c.5266dupC", - aa_mut="p.Q1756fs", mutation_type="Insertion") - assert snp.gene == "BRCA1" - assert snp.mutation_type == "Insertion" - - -def test_snp_no_type_builtin_shadow(): - """The old SNP class stored mutation_type as self.type, shadowing the builtin.""" - snp = SNP(mutation_type="Substitution") - assert not hasattr(snp, 'type') or snp.mutation_type == "Substitution" -``` - -**Step 2: Run test to verify it fails** - -Run: `cd /Users/yperez/work/proteogenomics/pgatk && python -m pytest pgatk/tests/test_models.py -v` -Expected: FAIL — `test_snp_is_dataclass` fails because SNP is not a dataclass yet - -**Step 3: Convert SNP to dataclass** - -Replace the entire content of `pgatk/cgenomes/models.py`: - -```python -from dataclasses import dataclass -from typing import Optional - - -@dataclass -class SNP: - """Represents a single nucleotide polymorphism or mutation.""" - gene: Optional[str] = None - mrna: Optional[str] = None - dna_mut: Optional[str] = None - aa_mut: Optional[str] = None - mutation_type: Optional[str] = None -``` - -**Step 4: Update references from `snp.type` to `snp.mutation_type`** - -In `pgatk/cgenomes/cgenomes_proteindb.py`, find all uses of `.type` on SNP objects: -- Search for `snp.type` or `.type =` in the context of SNP objects -- Line where SNP is constructed: `snp = SNP()` followed by `snp.type = ...` → change to `snp.mutation_type = ...` -- Any reads of `snp.type` → `snp.mutation_type` - -Specifically, look for patterns like: -```python -snp.type = mutation_type # → snp.mutation_type = mutation_type -``` -And reads like: -```python -snp.type # → snp.mutation_type -``` - -**Step 5: Run tests to verify they pass** - -Run: `cd /Users/yperez/work/proteogenomics/pgatk && python -m pytest pgatk/tests/test_models.py pgatk/tests/ -v --timeout=120 -x` -Expected: All PASS - -**Step 6: Commit** - -```bash -git add pgatk/cgenomes/models.py pgatk/cgenomes/cgenomes_proteindb.py pgatk/tests/test_models.py -git commit -m "refactor: convert SNP to dataclass, rename .type to .mutation_type" -``` - ---- - -## Task 4: Replace pathos with concurrent.futures in blast_get_position.py - -**Files:** -- Modify: `pgatk/proteogenomics/blast_get_position.py:1-10,85,151-155` -- Modify: `pyproject.toml` (remove pathos from dependencies after both files are migrated) - -**Step 1: Write the test** - -```python -# pgatk/tests/test_blast_imports.py -"""Verify blast_get_position uses stdlib concurrent.futures, not pathos.""" -import importlib - - -def test_no_pathos_import_in_blast(): - """blast_get_position should not import pathos.""" - import pgatk.proteogenomics.blast_get_position as mod - source_file = mod.__file__ - with open(source_file, 'r') as f: - source = f.read() - assert 'pathos' not in source, "blast_get_position.py still imports pathos" - assert 'concurrent.futures' in source or 'ProcessPoolExecutor' in source -``` - -**Step 2: Run test to verify it fails** - -Run: `cd /Users/yperez/work/proteogenomics/pgatk && python -m pytest pgatk/tests/test_blast_imports.py -v` -Expected: FAIL — pathos is still imported - -**Step 3: Replace pathos with concurrent.futures** - -In `pgatk/proteogenomics/blast_get_position.py`: - -Replace imports (lines 1-9): -```python -import pandas as pd -from Bio import SeqIO -import datetime -from concurrent.futures import ProcessPoolExecutor -from tqdm import tqdm -import ahocorasick - -from pgatk.toolbox.general import ParameterConfiguration -``` - -Remove `Manager` import and usage. In `__init__`, replace line 85: -```python -# OLD: self.blast_dict = Manager().dict() -self.blast_dict = {} -``` - -Replace the pool usage (lines 151-155) in `blast()`: -```python -# OLD: -# pool = Pool(int(self._number_of_processes)) -# list(tqdm(pool.imap(self._result, seq_list), total=len(seq_list), desc="Blast", unit="peptide")) -# pool.close() -# pool.join() - -# NEW: -with ProcessPoolExecutor(max_workers=int(self._number_of_processes)) as executor: - list(tqdm(executor.map(self._result, seq_list, chunksize=100), - total=len(seq_list), desc="Blast", unit="peptide")) -``` - -Note: Since `self._result` writes to `self.blast_dict` which is a shared Manager dict, and we're removing Manager, we need to refactor `_result()` to return results instead of writing to shared state. Then collect results in the main process: - -```python -# Refactor _result to return data instead of writing to shared dict -def _result(self, sequence): - """Process a single peptide and return (peptide, details) or None.""" - result = peptide_blast_protein(self.fasta_dict, sequence) - if result: - return (sequence, result) - return None - -# In blast(): -with ProcessPoolExecutor(max_workers=int(self._number_of_processes)) as executor: - results = list(tqdm(executor.map(self._result, seq_list, chunksize=100), - total=len(seq_list), desc="Blast", unit="peptide")) -for r in results: - if r is not None: - self.blast_dict[r[0]] = r[1] -``` - -**Step 4: Run the existing blast test** - -Run: `cd /Users/yperez/work/proteogenomics/pgatk && python -m pytest pgatk/tests/pgatk_tests.py::PgatkRunnerTests::test_blast -v --timeout=120` -Expected: PASS - -**Step 5: Run import test** - -Run: `cd /Users/yperez/work/proteogenomics/pgatk && python -m pytest pgatk/tests/test_blast_imports.py -v` -Expected: PASS - -**Step 6: Commit** - -```bash -git add pgatk/proteogenomics/blast_get_position.py pgatk/tests/test_blast_imports.py -git commit -m "refactor: replace pathos with concurrent.futures in blast_get_position" -``` - ---- - -## Task 5: Replace pathos with concurrent.futures in spectrumai.py - -**Files:** -- Modify: `pgatk/proteogenomics/spectrumai.py:1-10,47,290,314-318` - -**Step 1: Write the test** - -```python -# pgatk/tests/test_spectrumai_imports.py -"""Verify spectrumai uses stdlib concurrent.futures, not pathos.""" - - -def test_no_pathos_import_in_spectrumai(): - import pgatk.proteogenomics.spectrumai as mod - with open(mod.__file__, 'r') as f: - source = f.read() - assert 'pathos' not in source, "spectrumai.py still imports pathos" -``` - -**Step 2: Run test to verify it fails** - -Run: `cd /Users/yperez/work/proteogenomics/pgatk && python -m pytest pgatk/tests/test_spectrumai_imports.py -v` -Expected: FAIL - -**Step 3: Replace pathos with concurrent.futures** - -In `pgatk/proteogenomics/spectrumai.py`: - -Replace imports (lines 1-10): -```python -import datetime -import os.path -import re -import pandas as pd -from concurrent.futures import ProcessPoolExecutor -from pyopenms import (TheoreticalSpectrumGenerator, MSSpectrum, - AASequence, Param, MzMLFile, MSExperiment, SpectrumLookup) -from tqdm import tqdm -from pgatk.toolbox.general import ParameterConfiguration -``` - -Remove Manager usage. In `__init__`, replace line 47: -```python -# OLD: self.df_list = Manager().list() -# Remove this line entirely — we'll collect results from executor.map() -``` - -Refactor `_multiprocess_inspect_spectrum` (around line 290) to return the result instead of appending to shared list: -```python -def _multiprocess_inspect_spectrum(self, df): - """Process a single mzML group and return the annotated DataFrame.""" - return self._inspect_spectrum(df, self._mzml_path, self._mzml_files) -``` - -Replace pool usage (lines 314-318) in `validate()`: -```python -# OLD: -# pool = Pool(int(self._number_of_processes)) -# list(tqdm(pool.imap(self._multiprocess_inspect_spectrum, list_of_dfs), ...)) -# pool.close() -# pool.join() - -# NEW: -with ProcessPoolExecutor(max_workers=int(self._number_of_processes)) as executor: - results = list(tqdm( - executor.map(self._multiprocess_inspect_spectrum, list_of_dfs), - total=len(list_of_dfs), desc="Validate By Each mzML", unit="mzML" - )) - -# Replace the line that concatenates self.df_list: -# OLD: df_output = pd.concat(self.df_list, axis=0, ignore_index=True) -# NEW: -df_output = pd.concat(results, axis=0, ignore_index=True) -``` - -**Step 4: Run test to verify it passes** - -Run: `cd /Users/yperez/work/proteogenomics/pgatk && python -m pytest pgatk/tests/test_spectrumai_imports.py -v` -Expected: PASS - -**Step 5: Now remove pathos from pyproject.toml** - -In `pyproject.toml`, remove the `pathos` dependency line: -``` -# Remove: "pathos>=0.2.8,<1.0", -``` -Also remove from `requirements.txt` if present. - -**Step 6: Run full test suite** - -Run: `cd /Users/yperez/work/proteogenomics/pgatk && python -m pytest pgatk/tests/ -v --timeout=120 -x` -Expected: All PASS - -**Step 7: Commit** - -```bash -git add pgatk/proteogenomics/spectrumai.py pgatk/tests/test_spectrumai_imports.py pyproject.toml requirements.txt -git commit -m "refactor: replace pathos with concurrent.futures in spectrumai, remove pathos dependency" -``` - ---- - -## Task 6: Replace print() with logger across all modules - -This is a large task covering many files. Work through them one file at a time. - -**Files:** -- Modify: `pgatk/proteomics/db/protein_database_decoy.py` (14 print sites) -- Modify: `pgatk/ensembl/ensembl.py` (9 print sites) -- Modify: `pgatk/cgenomes/cgenomes_proteindb.py` (18 print sites) -- Modify: `pgatk/proteogenomics/blast_get_position.py` (8 print sites) -- Modify: `pgatk/proteogenomics/spectrumai.py` (6 print sites) - -**Step 1: Write a test that checks for print() usage in source files** - -```python -# pgatk/tests/test_no_print_statements.py -"""Verify that production code uses logger instead of print().""" -import ast -from pathlib import Path - -PGATK_ROOT = Path(__file__).resolve().parent.parent - -# Files that should not contain print() calls (excluding tests and __init__) -CHECKED_FILES = [ - "proteomics/db/protein_database_decoy.py", - "ensembl/ensembl.py", - "cgenomes/cgenomes_proteindb.py", - "proteogenomics/blast_get_position.py", - "proteogenomics/spectrumai.py", -] - - -def _count_print_calls(filepath: Path) -> list[int]: - """Return line numbers of print() function calls in a Python file.""" - source = filepath.read_text() - tree = ast.parse(source) - lines = [] - for node in ast.walk(tree): - if isinstance(node, ast.Call): - func = node.func - if isinstance(func, ast.Name) and func.id == "print": - lines.append(node.lineno) - return lines - - -def test_no_print_in_production_code(): - violations = {} - for rel_path in CHECKED_FILES: - filepath = PGATK_ROOT / rel_path - if filepath.exists(): - lines = _count_print_calls(filepath) - if lines: - violations[rel_path] = lines - assert not violations, ( - f"print() calls found in production code (use logger instead):\n" - + "\n".join(f" {f}: lines {lines}" for f, lines in violations.items()) - ) -``` - -**Step 2: Run test to verify it fails** - -Run: `cd /Users/yperez/work/proteogenomics/pgatk && python -m pytest pgatk/tests/test_no_print_statements.py -v` -Expected: FAIL with list of all print() locations - -**Step 3: Replace print() calls file by file** - -For each file, the pattern is: -1. Ensure the file has a logger (most already do via `ParameterConfiguration.get_logger()`) -2. Replace `print(...)` with `self.get_logger().info(...)` for informational output -3. Replace `print("ERROR: ...")` with `self.get_logger().error(...)` -4. Replace `print("Warning: ...")` with `self.get_logger().warning(...)` -5. For static methods or module-level functions without `self`, use `logging.getLogger(__name__)` - -**Key replacements by file:** - -**protein_database_decoy.py** — all `print()` inside instance methods → `self.get_logger().info()`: -- Lines 158-160, 193, 195, 197, 199-201: diagnostic stats → `self.get_logger().info(...)` -- Lines 261, 262, 279, 282, 286, 320, 367: decoypyrat progress → `self.get_logger().info(...)` -- Lines 481, 483-484: final stats → `self.get_logger().info(...)` - -**ensembl.py** — mixed instance and static methods: -- Line 131: `print("skip entries...")` → `self.get_logger().warning("skip entries...")` -- Line 248: `print("Database already exists...")` → `logging.getLogger(__name__).debug("Database already exists: %s %s", e, gtf_db_file)` -- Line 272-274: `print("""Feature {} ...""")` → `logging.getLogger(__name__).warning(...)` -- Line 400: `print("Could not extra cds...")` → `self.get_logger().warning(...)` -- Line 755: redundant `print(msg)` after `logger` call → remove the print -- Lines 823-829: `print(" total number...")` → `self.get_logger().info(...)` -- Line 857-858: `print("ERROR: This script...")` → `logging.getLogger(__name__).error(...)` - -**cgenomes_proteindb.py** — add `import logging` at top, use `self.get_logger()` in instance methods and `logging.getLogger(__name__)` in static methods. Replace all 18 print calls. - -**blast_get_position.py** — replace 8 print calls with `self.get_logger()`. - -**spectrumai.py** — replace 6 print calls with `self.get_logger()`. - -**Step 4: Run the no-print test** - -Run: `cd /Users/yperez/work/proteogenomics/pgatk && python -m pytest pgatk/tests/test_no_print_statements.py -v` -Expected: PASS - -**Step 5: Run full test suite** - -Run: `cd /Users/yperez/work/proteogenomics/pgatk && python -m pytest pgatk/tests/ -v --timeout=120 -x` -Expected: All PASS - -**Step 6: Commit** - -```bash -git add pgatk/proteomics/db/protein_database_decoy.py pgatk/ensembl/ensembl.py pgatk/cgenomes/cgenomes_proteindb.py pgatk/proteogenomics/blast_get_position.py pgatk/proteogenomics/spectrumai.py pgatk/tests/test_no_print_statements.py -git commit -m "refactor: replace print() with logger across all production modules" -``` - ---- - -## Task 7: Replace broad except Exception with specific exceptions - -**Files:** -- Modify: `pgatk/toolbox/general.py:163,229,342` -- Modify: `pgatk/ensembl/ensembl.py:247` -- Modify: `pgatk/proteogenomics/spectrumai.py:163,180` - -**Step 1: Fix each broad except block** - -**`toolbox/general.py` line 163:** -```python -# OLD: -except Exception as e: - raise ToolBoxException(str(e)) -# NEW: -except (OSError, PermissionError) as e: - raise ToolBoxException(str(e)) from e -``` - -**`toolbox/general.py` line 229 (download_file retry):** -```python -# OLD: -except Exception as error: - remaining_download_tries -= 1 - log.error("Error code: " + str(error)) -# NEW: -except (URLError, ContentTooShortError, OSError, HTTPError) as error: - remaining_download_tries -= 1 - log.error("Download error: %s", error) -``` - -**`toolbox/general.py` line 342 (gunzip_files):** -```python -# OLD: -except Exception as e: - err_msg = "UNKNOWN ERROR uncompressing file..." -# NEW: -except (OSError, subprocess.CalledProcessError) as e: - err_msg = "Error uncompressing file..." -``` - -**`ensembl/ensembl.py` line 247 (parse_gtf):** -```python -# OLD: -except Exception as e: - print("Database already exists: " + str(e), gtf_db_file) -# NEW: -except (gffutils.FeatureNotFoundError, ValueError, sqlite3.OperationalError) as e: - logging.getLogger(__name__).debug("GTF database already exists: %s %s", e, gtf_db_file) -``` -Add `import sqlite3` to the imports. - -**`spectrumai.py` line 163 (mzML loading):** -```python -# OLD: -except Exception as e: - print(mzml_file + " has ERROR!") -# NEW: -except (OSError, RuntimeError, ValueError) as e: - self.get_logger().error("Error loading mzML file %s: %s", mzml_file, e) -``` - -**`spectrumai.py` line 180 (scan lookup):** -```python -# OLD: -except Exception as e: - print("ERROR: ...") -# NEW: -except (RuntimeError, IndexError, KeyError) as e: - self.get_logger().error("Scan lookup error in %s scan %s: %s", mzml_file, scan_num, e) -``` - -**Step 2: Run full test suite** - -Run: `cd /Users/yperez/work/proteogenomics/pgatk && python -m pytest pgatk/tests/ -v --timeout=120 -x` -Expected: All PASS - -**Step 3: Commit** - -```bash -git add pgatk/toolbox/general.py pgatk/ensembl/ensembl.py pgatk/proteogenomics/spectrumai.py -git commit -m "refactor: replace broad except Exception with specific exception types" -``` - ---- - -## Task 8: Standardize config fallback pattern with helper method - -**Files:** -- Modify: `pgatk/toolbox/general.py` (add `_get_config_value` helper to `ParameterConfiguration`) -- Modify: `pgatk/ensembl/ensembl.py` (simplify `__init__` config loading) -- Modify: `pgatk/cgenomes/cgenomes_proteindb.py` (simplify `get_mutations_default_options`) -- Modify: `pgatk/proteogenomics/blast_get_position.py` (simplify `get_blast_parameters`) -- Modify: `pgatk/proteogenomics/spectrumai.py` (simplify parameter loading) - -**Step 1: Write the test** - -```python -# pgatk/tests/test_parameter_configuration.py -"""Test the ParameterConfiguration config fallback helper.""" -from pgatk.toolbox.general import ParameterConfiguration - - -def test_get_config_value_from_pipeline_params(): - """Pipeline params should take precedence.""" - config = ParameterConfiguration.__new__(ParameterConfiguration) - config._pipeline_params = {"root": {"key1": "from_pipeline"}} - config._default_params = {"root": {"key1": "from_default"}} - config._ROOT_CONFIG_NAME = "root" - result = config.get_config_value("key1", "fallback") - assert result == "from_pipeline" - - -def test_get_config_value_from_defaults(): - """Default params used when pipeline params missing.""" - config = ParameterConfiguration.__new__(ParameterConfiguration) - config._pipeline_params = {"root": {}} - config._default_params = {"root": {"key1": "from_default"}} - config._ROOT_CONFIG_NAME = "root" - result = config.get_config_value("key1", "fallback") - assert result == "from_default" - - -def test_get_config_value_fallback(): - """Fallback returned when neither source has the key.""" - config = ParameterConfiguration.__new__(ParameterConfiguration) - config._pipeline_params = {"root": {}} - config._default_params = {"root": {}} - config._ROOT_CONFIG_NAME = "root" - result = config.get_config_value("missing_key", "fallback_value") - assert result == "fallback_value" -``` - -**Step 2: Run test to verify it fails** - -Run: `cd /Users/yperez/work/proteogenomics/pgatk && python -m pytest pgatk/tests/test_parameter_configuration.py -v` -Expected: FAIL — `get_config_value` does not exist yet - -**Step 3: Add the helper method to ParameterConfiguration** - -In `pgatk/toolbox/general.py`, add to the `ParameterConfiguration` class: - -```python -def get_config_value(self, key: str, default=None): - """Get a configuration value with standard 3-layer fallback. - - Checks pipeline_params first, then default_params, then returns default. - """ - root = self._ROOT_CONFIG_NAME - # Check pipeline params first - if (self._pipeline_params is not None - and root in self._pipeline_params - and key in self._pipeline_params[root]): - return self._pipeline_params[root][key] - # Then check default config params - if (self._default_params is not None - and root in self._default_params - and key in self._default_params[root]): - return self._default_params[root][key] - return default -``` - -**Step 4: Run test to verify it passes** - -Run: `cd /Users/yperez/work/proteogenomics/pgatk && python -m pytest pgatk/tests/test_parameter_configuration.py -v` -Expected: PASS - -**Step 5: Refactor subclasses to use the new helper** - -Replace the repetitive fallback patterns in each subclass with `self.get_config_value(key, default)`. For example, in `blast_get_position.py`, the `get_blast_parameters()` method: - -```python -# OLD pattern (repeated everywhere): -def get_blast_parameters(self, variable: str, default_value): - value = default_value - if self._pipeline_params is not None and self._ROOT_CONFIG_NAME in self._pipeline_params: - if variable in self._pipeline_params[self._ROOT_CONFIG_NAME]: - value = self._pipeline_params[self._ROOT_CONFIG_NAME][variable] - elif self._default_params is not None and self._ROOT_CONFIG_NAME in self._default_params: - if variable in self._default_params[self._ROOT_CONFIG_NAME]: - value = self._default_params[self._ROOT_CONFIG_NAME][variable] - return value - -# NEW: -def get_blast_parameters(self, variable: str, default_value): - return self.get_config_value(variable, default_value) -``` - -Apply the same simplification to equivalent methods in `cgenomes_proteindb.py`, `spectrumai.py`, and `ensembl.py`. - -**Step 6: Run full test suite** - -Run: `cd /Users/yperez/work/proteogenomics/pgatk && python -m pytest pgatk/tests/ -v --timeout=120 -x` -Expected: All PASS - -**Step 7: Commit** - -```bash -git add pgatk/toolbox/general.py pgatk/ensembl/ensembl.py pgatk/cgenomes/cgenomes_proteindb.py pgatk/proteogenomics/blast_get_position.py pgatk/proteogenomics/spectrumai.py pgatk/tests/test_parameter_configuration.py -git commit -m "refactor: add get_config_value helper, DRY up config fallback pattern" -``` - ---- - -## Task 9: Convert db/digest_mutant_protein.py to proper CLI command - -**Files:** -- Modify: `pgatk/db/digest_mutant_protein.py` (wrap in Click command) -- Modify: `pgatk/cli.py` (register new command) -- Create: `pgatk/commands/digest_mutant_protein.py` (Click wrapper) - -**Step 1: Refactor the script into a function** - -Wrap all logic in `pgatk/db/digest_mutant_protein.py` inside a function with proper parameters: - -```python -# pgatk/db/digest_mutant_protein.py -import re -import logging -from collections import OrderedDict -from typing import Optional - -from Bio import SeqIO - -logger = logging.getLogger(__name__) - - -def trypsin_cleavage(proseq: str, miss_cleavage: int) -> list: - """Perform in-silico tryptic digestion.""" - # (existing trypsin_cleavage function body, unchanged) - ... - - -def digest_mutant_proteins( - input_file: str, - fasta_file: str, - output_file: str, - decoy_prefix: str = "DECOY_", - min_length: int = 7, - max_length: int = 40, - missed_cleavages: int = 2, -) -> None: - """Digest mutant proteins and filter against canonical peptidome. - - Reads mutant protein sequences, performs tryptic digestion, removes peptides - found in the canonical proteome, and writes unique variant peptides. - """ - # (existing script logic from lines 83-139, using function parameters - # instead of getopt-parsed variables) - ... -``` - -Remove all `sys.argv`, `getopt`, and `sys.exit()` code. Remove the import-time execution. - -**Step 2: Create Click command wrapper** - -```python -# pgatk/commands/digest_mutant_protein.py -import click - -from pgatk.db.digest_mutant_protein import digest_mutant_proteins - - -@click.command("digest-mutant-protein", short_help="Digest mutant proteins and filter against canonical proteome") -@click.option("-i", "--input", "input_file", required=True, help="Input mutant protein FASTA") -@click.option("-f", "--fasta", "fasta_file", required=True, help="Reference canonical protein FASTA") -@click.option("-o", "--output", "output_file", required=True, help="Output file for unique variant peptides") -@click.option("--prefix", default="DECOY_", help="Decoy prefix to skip (default: DECOY_)") -@click.option("--min-len", default=7, type=int, help="Minimum peptide length (default: 7)") -@click.option("--max-len", default=40, type=int, help="Maximum peptide length (default: 40)") -@click.option("--missed-cleavages", default=2, type=int, help="Missed cleavages (default: 2)") -def digest_mutant_protein(input_file, fasta_file, output_file, prefix, min_len, max_len, missed_cleavages): - digest_mutant_proteins(input_file, fasta_file, output_file, prefix, min_len, max_len, missed_cleavages) -``` - -**Step 3: Register in cli.py** - -Add to `pgatk/cli.py`: -```python -from pgatk.commands.digest_mutant_protein import digest_mutant_protein -cli.add_command(digest_mutant_protein) -``` - -**Step 4: Run CLI help to verify registration** - -Run: `cd /Users/yperez/work/proteogenomics/pgatk && python -m pgatk.cli digest-mutant-protein --help` -Expected: Help text with all options displayed - -**Step 5: Run full test suite** - -Run: `cd /Users/yperez/work/proteogenomics/pgatk && python -m pytest pgatk/tests/ -v --timeout=120 -x` -Expected: All PASS - -**Step 6: Commit** - -```bash -git add pgatk/db/digest_mutant_protein.py pgatk/commands/digest_mutant_protein.py pgatk/cli.py -git commit -m "refactor: convert digest_mutant_protein from standalone script to Click CLI command" -``` - ---- - -## Task 10: Convert db/map_peptide2genome.py to proper CLI command - -**Files:** -- Modify: `pgatk/db/map_peptide2genome.py` (wrap in functions) -- Create: `pgatk/commands/map_peptide2genome.py` (Click wrapper) -- Modify: `pgatk/cli.py` (register new command) - -**Step 1: Refactor the script into functions** - -Same pattern as Task 9. Wrap all logic in a function: - -```python -# pgatk/db/map_peptide2genome.py -import re -import logging -from typing import Optional - -from Bio import SeqIO - -logger = logging.getLogger(__name__) - - -class EXON: - """Represents an exon with genomic coordinates.""" - # (existing EXON class, unchanged except add type hints) - ... - - -def cal_trans_pos(exon_list: list) -> list: - # (existing function, unchanged) - ... - - -def get_pep_cor(exon_object_list: list, n1: int, n2: int) -> list: - # (existing function, unchanged) - ... - - -def parse_gtf(infile: str) -> dict: - # (existing function, unchanged) - ... - - -def map_peptides_to_genome( - input_file: str, - gtf_file: str, - fasta_file: str, - idmap_file: str, - output_file: str, - pep_col: int = 0, - prot_col: int = 1, -) -> None: - """Map identified peptides to genomic coordinates. - - Reads peptide identifications, maps them to transcript coordinates using - a GTF file and protein-transcript ID mapping, outputs GFF3 format. - """ - # (existing script logic from lines 146-261, using function parameters) - ... -``` - -Remove all `sys.argv`, `getopt`, `sys.exit()` code. - -**Step 2: Create Click command wrapper** - -```python -# pgatk/commands/map_peptide2genome.py -import click - -from pgatk.db.map_peptide2genome import map_peptides_to_genome - - -@click.command("map-peptide2genome", short_help="Map peptides to genomic coordinates (GFF3 output)") -@click.option("-i", "--input", "input_file", required=True, help="Input peptide identification TSV") -@click.option("-g", "--gtf", "gtf_file", required=True, help="GTF gene annotation file") -@click.option("-f", "--fasta", "fasta_file", required=True, help="Protein FASTA file") -@click.option("-m", "--idmap", "idmap_file", required=True, help="Protein-to-transcript ID mapping file") -@click.option("-o", "--output", "output_file", required=True, help="Output GFF3 file") -@click.option("--pep-col", default=0, type=int, help="Peptide column index (default: 0)") -@click.option("--prot-col", default=1, type=int, help="Protein column index (default: 1)") -def map_peptide2genome(input_file, gtf_file, fasta_file, idmap_file, output_file, pep_col, prot_col): - map_peptides_to_genome(input_file, gtf_file, fasta_file, idmap_file, output_file, pep_col, prot_col) -``` - -**Step 3: Register in cli.py** - -Add to `pgatk/cli.py`: -```python -from pgatk.commands.map_peptide2genome import map_peptide2genome -cli.add_command(map_peptide2genome) -``` - -**Step 4: Run CLI help to verify registration** - -Run: `cd /Users/yperez/work/proteogenomics/pgatk && python -m pgatk.cli map-peptide2genome --help` -Expected: Help text with all options displayed - -**Step 5: Run full test suite** - -Run: `cd /Users/yperez/work/proteogenomics/pgatk && python -m pytest pgatk/tests/ -v --timeout=120 -x` -Expected: All PASS - -**Step 6: Commit** - -```bash -git add pgatk/db/map_peptide2genome.py pgatk/commands/map_peptide2genome.py pgatk/cli.py -git commit -m "refactor: convert map_peptide2genome from standalone script to Click CLI command" -``` - ---- - -## Task 11: Use pathlib.Path for file operations - -**Files:** -- Modify: `pgatk/ensembl/ensembl.py` (5 sites) -- Modify: `pgatk/cgenomes/cgenomes_proteindb.py` (2 sites) -- Modify: `pgatk/proteomics/db/protein_database_decoy.py` (1 site) - -**Step 1: Add pathlib import and replace string path operations** - -**`ensembl/ensembl.py`** — add `from pathlib import Path` to imports: - -- Line 417: `os.path.abspath(vcf_file.split('/')[-1].replace('.vcf', ''))` → `str(Path(vcf_file).resolve().stem)` -- Line 419: `annotated_vcf + '_all.bed'` → `str(Path(annotated_vcf).with_name(Path(annotated_vcf).name + '_all.bed'))` - Or simpler: keep `annotated_vcf` as a Path stem and use f-strings: `f"{annotated_vcf}_all.bed"` -- Line 508: `gene_annotations_gtf.replace('.gtf', '.db')` → `str(Path(gene_annotations_gtf).with_suffix('.db'))` - -**`cgenomes/cgenomes_proteindb.py`** — add `from pathlib import Path`: - -- Line 260: `self._local_output_file.replace('.fa', '') + '_' + ...` → `str(Path(self._local_output_file).with_suffix('')) + '_' + ...` (or use Path stem) -- Line 461: same pattern - -**`protein_database_decoy.py`** — add `from pathlib import Path`: - -- Line 479: `self._output_file.replace('.fa', '') + '_noAlternative.fa'` → `str(Path(self._output_file).with_suffix('').with_name(Path(self._output_file).stem + '_noAlternative.fa'))` - -**Step 2: Run full test suite** - -Run: `cd /Users/yperez/work/proteogenomics/pgatk && python -m pytest pgatk/tests/ -v --timeout=120 -x` -Expected: All PASS - -**Step 3: Commit** - -```bash -git add pgatk/ensembl/ensembl.py pgatk/cgenomes/cgenomes_proteindb.py pgatk/proteomics/db/protein_database_decoy.py -git commit -m "refactor: use pathlib.Path for file path operations" -``` - ---- - -## Task 12: Add type hints to public APIs - -**Files:** -- Modify: `pgatk/toolbox/general.py` -- Modify: `pgatk/ensembl/ensembl.py` -- Modify: `pgatk/cgenomes/cgenomes_proteindb.py` -- Modify: `pgatk/proteogenomics/blast_get_position.py` -- Modify: `pgatk/proteogenomics/spectrumai.py` -- Modify: `pgatk/proteomics/db/protein_database_decoy.py` - -**Step 1: Add type hints to each file's public methods** - -Work through each file adding type annotations to all public methods (non-underscore-prefixed). Use `from __future__ import annotations` at the top of each modified file for forward references. - -Key patterns: -```python -from __future__ import annotations -from typing import Optional -from pathlib import Path -import logging -import pandas as pd -import gffutils -``` - -Examples of what to add: - -```python -# ensembl.py -def vcf_to_proteindb(self, vcf_file: str, input_fasta: str, gene_annotations_gtf: str) -> str: ... -def dnaseq_to_proteindb(self, input_fasta: str) -> str: ... -def check_proteindb(self, input_fasta: str, add_stop_codon: bool = True, num_aa: int = 6) -> None: ... - -@staticmethod -def get_altseq(ref_seq: str, ref_allele: str, var_allele: str, var_pos: int, - strand: str, features_info: list, cds_info: Optional[list] = None) -> tuple: ... - -@staticmethod -def parse_gtf(gene_annotations_gtf: str, gtf_db_file: str) -> gffutils.FeatureDB: ... -``` - -```python -# blast_get_position.py -def blast(self, input_psm_to_blast: str, output_psm: str) -> None: ... - -@staticmethod -def get_details(fasta: str, peptide: str) -> list[str]: ... - -@staticmethod -def peptide_blast_protein(fasta: str, peptide: str) -> Optional[list]: ... -``` - -**Step 2: Run mypy (optional, non-blocking) and tests** - -Run: `cd /Users/yperez/work/proteogenomics/pgatk && python -m pytest pgatk/tests/ -v --timeout=120 -x` -Expected: All PASS - -**Step 3: Commit** - -```bash -git add pgatk/toolbox/general.py pgatk/ensembl/ensembl.py pgatk/cgenomes/cgenomes_proteindb.py pgatk/proteogenomics/blast_get_position.py pgatk/proteogenomics/spectrumai.py pgatk/proteomics/db/protein_database_decoy.py -git commit -m "refactor: add type hints to all public API methods" -``` - ---- - -## Task 13: Expand unit test coverage for core functions - -**Files:** -- Create: `pgatk/tests/test_ensembl_core.py` -- Create: `pgatk/tests/test_cosmic_core.py` -- Create: `pgatk/tests/test_decoy_core.py` - -**Step 1: Write unit tests for get_altseq()** - -```python -# pgatk/tests/test_ensembl_core.py -"""Unit tests for EnsemblDataService core functions.""" -from pgatk.ensembl.ensembl import EnsemblDataService - - -class TestGetAltseq: - """Test the static method that computes alternative sequences from variants.""" - - def test_snp_plus_strand(self): - """Simple SNP on plus strand should substitute one base.""" - ref_seq = "ATGCGATCGA" - # SNP at position 3 (0-indexed in CDS): G -> T - alt_seq = EnsemblDataService.get_altseq( - ref_seq=ref_seq, ref_allele="G", var_allele="T", - var_pos=4, strand="+", - features_info=[(0, len(ref_seq))], # single exon covering full CDS - ) - assert alt_seq is not None - # The exact assertion depends on what get_altseq returns — - # inspect the return value and assert accordingly - - def test_insertion_plus_strand(self): - """Insertion on plus strand should add bases.""" - ref_seq = "ATGCGATCGA" - alt_seq = EnsemblDataService.get_altseq( - ref_seq=ref_seq, ref_allele="G", var_allele="GAA", - var_pos=4, strand="+", - features_info=[(0, len(ref_seq))], - ) - assert alt_seq is not None - - def test_deletion_plus_strand(self): - """Deletion on plus strand should remove bases.""" - ref_seq = "ATGCGATCGA" - alt_seq = EnsemblDataService.get_altseq( - ref_seq=ref_seq, ref_allele="CGA", var_allele="C", - var_pos=4, strand="+", - features_info=[(0, len(ref_seq))], - ) - assert alt_seq is not None -``` - -**Step 2: Write unit tests for get_mut_pro_seq()** - -```python -# pgatk/tests/test_cosmic_core.py -"""Unit tests for CancerGenomesService core mutation functions.""" -from pgatk.cgenomes.cgenomes_proteindb import CancerGenomesService -from pgatk.cgenomes.models import SNP - - -class TestGetMutProSeq: - """Test the static method that applies COSMIC mutations to protein sequences.""" - - def test_substitution_missense(self): - """A single AA substitution should change one residue.""" - snp = SNP(gene="TEST", aa_mut="p.A10V", mutation_type="Substitution - Missense") - ref_seq = "MAAAAAAAAAAAAAAAAA" # A at position 10 - result = CancerGenomesService.get_mut_pro_seq(snp, ref_seq) - # A at position 10 (1-indexed) should become V - assert result is not None - assert "V" in result - - def test_nonsense(self): - """Nonsense mutation should truncate at the stop.""" - snp = SNP(gene="TEST", aa_mut="p.A10*", mutation_type="Substitution - Nonsense") - ref_seq = "MAAAAAAAAAAAAAAAAA" - result = CancerGenomesService.get_mut_pro_seq(snp, ref_seq) - assert result is not None -``` - -**Step 3: Write unit tests for decoy revswitch()** - -```python -# pgatk/tests/test_decoy_core.py -"""Unit tests for ProteinDBDecoyService core functions.""" -from pgatk.proteomics.db.protein_database_decoy import ProteinDBDecoyService - - -class TestRevswitch: - """Test the reverse-with-KR-switch function.""" - - def test_revswitch_preserves_length(self): - """Reversed sequence should have the same length.""" - seq = "ABCDEFGHIJK" - result = ProteinDBDecoyService.revswitch(seq) - assert len(result) == len(seq) - - def test_revswitch_kr_handling(self): - """K and R at cleavage sites should be preserved at the correct positions.""" - seq = "PEPTIDEK" - result = ProteinDBDecoyService.revswitch(seq) - assert isinstance(result, str) - assert len(result) == len(seq) -``` - -**Step 4: Run all new tests** - -Run: `cd /Users/yperez/work/proteogenomics/pgatk && python -m pytest pgatk/tests/test_ensembl_core.py pgatk/tests/test_cosmic_core.py pgatk/tests/test_decoy_core.py -v` -Expected: All PASS (adjust assertions based on actual function return values) - -**Step 5: Run full test suite** - -Run: `cd /Users/yperez/work/proteogenomics/pgatk && python -m pytest pgatk/tests/ -v --timeout=120 -x` -Expected: All PASS - -**Step 6: Commit** - -```bash -git add pgatk/tests/test_ensembl_core.py pgatk/tests/test_cosmic_core.py pgatk/tests/test_decoy_core.py -git commit -m "test: add unit tests for get_altseq, get_mut_pro_seq, and revswitch" -``` - ---- - -## Task Summary - -| Task | Description | Files | Depends On | -|------|-------------|-------|------------| -| 1 | Add conftest.py with fixtures | tests/conftest.py | — | -| 2 | Fix variable shadowing bug | protein_database_decoy.py | — | -| 3 | Convert SNP to dataclass | cgenomes/models.py, cgenomes_proteindb.py | — | -| 4 | Replace pathos in blast_get_position | blast_get_position.py | — | -| 5 | Replace pathos in spectrumai | spectrumai.py | — | -| 6 | Replace print() with logger | 5 files | — | -| 7 | Replace broad except blocks | 3 files | Task 6 | -| 8 | Standardize config fallback | toolbox/general.py + 4 files | — | -| 9 | Convert digest_mutant_protein to CLI | db/digest_mutant_protein.py, cli.py | — | -| 10 | Convert map_peptide2genome to CLI | db/map_peptide2genome.py, cli.py | — | -| 11 | Use pathlib.Path | 3 files | — | -| 12 | Add type hints to public APIs | 6 files | Tasks 1-11 | -| 13 | Expand unit test coverage | 3 test files | Tasks 1-11 | - -**Parallelizable groups:** -- Tasks 1-5 can run in parallel (no shared file dependencies) -- Task 6 can start after Tasks 2, 4, 5 (they touch overlapping files) -- Tasks 7 depends on Task 6 (print→logger before except→specific) -- Tasks 9-10 are independent of each other -- Tasks 12-13 should run last (they depend on all prior cleanups) diff --git a/docs/plans/2026-03-02-clinvar-ncbi-implementation.md b/docs/plans/2026-03-02-clinvar-ncbi-implementation.md deleted file mode 100644 index efe7a4aa..00000000 --- a/docs/plans/2026-03-02-clinvar-ncbi-implementation.md +++ /dev/null @@ -1,1652 +0,0 @@ -# ClinVar/NCBI RefSeq Protein Database Generation — Implementation Plan - -> **For Claude:** REQUIRED SUB-SKILL: Use superpowers:executing-plans to implement this plan task-by-task. - -**Goal:** Add CLI commands to generate variant protein databases from ClinVar VCF files using NCBI RefSeq GTF annotations, without requiring VEP. - -**Architecture:** New `pgatk/clinvar/` module with chromosome mapper, ClinVar VCF→protein pipeline, NCBI data downloader. Shared VCF→protein functions (`get_altseq`, `get_orfs_vcf`, etc.) are extracted from `pgatk/ensembl/ensembl.py` into `pgatk/toolbox/vcf_utils.py` and imported by both pipelines. - -**Tech Stack:** Python 3.9+, gffutils (GTF indexing), pybedtools (interval overlap), BioPython (Seq, SeqIO), pandas, Click (CLI), PyYAML (config), tqdm (progress), urllib (downloads). - ---- - -## Task 1: Extract Shared VCF→Protein Functions to `pgatk/toolbox/vcf_utils.py` - -These static methods in `pgatk/ensembl/ensembl.py` are generic (no Ensembl-specific logic) and will be reused by the ClinVar pipeline. Extract them to a shared module. - -**Files:** -- Create: `pgatk/toolbox/vcf_utils.py` -- Modify: `pgatk/ensembl/ensembl.py` -- Test: `pgatk/tests/test_vcf_utils.py` - -**Step 1: Write the failing test** - -Create `pgatk/tests/test_vcf_utils.py`: - -```python -"""Tests for shared VCF utility functions extracted from EnsemblDataService.""" - -from Bio.Seq import Seq - -from pgatk.toolbox.vcf_utils import check_overlap, get_altseq, get_orfs_vcf, write_output - - -class TestCheckOverlap: - """Tests for check_overlap().""" - - def test_variant_fully_within_feature(self): - features = [[100, 200, 'CDS']] - assert check_overlap(150, 160, features) is True - - def test_variant_outside_feature(self): - features = [[100, 200, 'CDS']] - assert check_overlap(50, 60, features) is False - - def test_variant_start_neg1_always_overlaps(self): - assert check_overlap(-1, 10, [[100, 200, 'CDS']]) is True - - def test_variant_partial_overlap_start(self): - features = [[100, 200, 'CDS']] - assert check_overlap(90, 110, features) is True - - -class TestGetAltseq: - """Tests for get_altseq().""" - - def test_snp_plus_strand(self): - ref_seq = Seq("ATGCCCGGG") - features = [[10, 18, 'exon']] - ref, alt = get_altseq(ref_seq, Seq("C"), Seq("T"), 13, '+', features) - assert len(alt) > 0 - assert ref != alt - - def test_returns_tuple(self): - ref_seq = Seq("ATGCCC") - features = [[10, 15, 'exon']] - result = get_altseq(ref_seq, Seq("C"), Seq("T"), 12, '+', features) - assert isinstance(result, tuple) - assert len(result) == 2 - - -class TestGetOrfsVcf: - """Tests for get_orfs_vcf().""" - - def test_single_orf(self): - ref = Seq("ATGCCCGGG") - alt = Seq("ATGTCCGGG") - ref_orfs, alt_orfs = get_orfs_vcf(ref, alt, 1, num_orfs=1) - assert len(ref_orfs) == 1 - assert len(alt_orfs) == 1 - - def test_three_orfs(self): - ref = Seq("ATGCCCGGG") - alt = Seq("ATGTCCGGG") - ref_orfs, alt_orfs = get_orfs_vcf(ref, alt, 1, num_orfs=3) - assert len(ref_orfs) == 3 - assert len(alt_orfs) == 3 - - -class TestWriteOutput: - """Tests for write_output().""" - - def test_writes_fasta(self, tmp_path): - out_file = tmp_path / "test.fa" - with open(out_file, 'w') as f: - write_output("seq1", "description", ["MPKF"], f) - content = out_file.read_text() - assert ">seq1 description" in content - assert "MPKF" in content -``` - -**Step 2: Run test to verify it fails** - -Run: `cd /Users/yperez/work/proteogenomics/pgatk && python -m pytest pgatk/tests/test_vcf_utils.py -v` -Expected: FAIL with `ModuleNotFoundError: No module named 'pgatk.toolbox.vcf_utils'` - -**Step 3: Create `pgatk/toolbox/vcf_utils.py`** - -Extract these 4 static methods from `pgatk/ensembl/ensembl.py` (lines 142-166, 168-227, 280-296, 826-849) as standalone functions: - -```python -"""Shared VCF-to-protein utility functions. - -Extracted from pgatk.ensembl.ensembl.EnsemblDataService so that both the -Ensembl and ClinVar pipelines can reuse the same core algorithms. -""" -from __future__ import annotations - -import logging -from typing import Any, Optional - -from Bio.Seq import Seq - -logger = logging.getLogger(__name__) - - -def check_overlap(var_start: int, var_end: int, features_info: Optional[list] = None) -> bool: - """Return True when the variant overlaps any of the features. - - :param var_start: Variant start position (genomic). -1 means always overlap. - :param var_end: Variant end position (genomic). - :param features_info: List of [start, end, type] for each feature. - """ - if features_info is None: - features_info = [[0, 1, 'type']] - if var_start == -1: - return True - for feature_pos in features_info: - pep_start = feature_pos[0] - pep_end = feature_pos[1] - if var_start <= pep_start <= var_end: - return True - elif var_start <= pep_end <= var_end: - return True - elif pep_start <= var_start and pep_end >= var_end: - return True - return False - - -def get_altseq( - ref_seq: Seq, - ref_allele: Seq, - var_allele: Seq, - var_pos: int, - strand: str, - features_info: list, - cds_info: Optional[list] = None, -) -> tuple: - """Modify a reference sequence based on a variant allele. - - Handles strand orientation, CDS trimming, and exon-based coordinate - calculation. Works identically with Ensembl and RefSeq annotations. - - :param ref_seq: Full transcript sequence (all exons concatenated). - :param ref_allele: Reference allele (Seq object). - :param var_allele: Alternate allele (Seq object). - :param var_pos: Genomic position of the variant. - :param strand: '+' or '-'. - :param features_info: List of [start, end, type] per exon/CDS. - :param cds_info: Optional [cds_start, cds_end] for CDS trimming. - :return: (ref_coding_seq, alt_coding_seq) tuple. - """ - if cds_info is None: - cds_info = [] - alt_seq = "" - if len(cds_info) == 2: - start_coding_index = cds_info[0] - 1 - stop_coding_index = cds_info[1] - else: - start_coding_index = 0 - total_len = 0 - for x in features_info: - total_len += x[1] - x[0] + 1 - stop_coding_index = total_len - - if strand == '-': - ref_seq = ref_seq[::-1] - ref_allele = ref_allele.complement() - var_allele = var_allele.complement() - - if strand == '-' and len(cds_info) == 2: - n = len(ref_seq) - ref_seq = ref_seq[n - stop_coding_index:n - start_coding_index] - else: - ref_seq = ref_seq[start_coding_index:stop_coding_index] - - nc_index = 0 - if len(ref_allele) == len(var_allele) or ref_allele[0] == var_allele[0]: - for feature in features_info: - if var_pos in range(feature[0], feature[1] + 1): - var_index_in_cds = nc_index + (var_pos - feature[0]) - c = len(ref_allele) - alt_seq = ref_seq[0:var_index_in_cds] + var_allele + ref_seq[var_index_in_cds + c::] - if strand == '-': - return ref_seq[::-1], alt_seq[::-1] - else: - return ref_seq, alt_seq - nc_index += (feature[1] - feature[0] + 1) - - return ref_seq, alt_seq - - -def get_orfs_vcf( - ref_seq: Seq, - alt_seq: Seq, - translation_table: int, - num_orfs: int = 1, -) -> tuple[list, list]: - """Translate reference and alternate sequences into ORFs. - - :param ref_seq: Reference coding sequence. - :param alt_seq: Alternate coding sequence with variant applied. - :param translation_table: NCBI translation table number. - :param num_orfs: Number of reading frames to generate (1 or 3). - :return: (ref_orfs, alt_orfs) lists of translated protein sequences. - """ - ref_orfs = [] - alt_orfs = [] - for n in range(0, num_orfs): - ref_orfs.append(ref_seq[n::].translate(translation_table)) - alt_orfs.append(alt_seq[n::].translate(translation_table)) - return ref_orfs, alt_orfs - - -def write_output( - seq_id: str, - desc: str, - seqs: list, - prots_fn: Any, - seqs_filter: Optional[list] = None, -) -> None: - """Write ORFs to a FASTA output file. - - :param seq_id: Sequence accession/ID for the FASTA header. - :param desc: Description string for the FASTA header. - :param seqs: List of protein sequences (ORFs) to write. - :param prots_fn: Open file handle for writing. - :param seqs_filter: If provided, skip ORFs that match reference (used to - write only variant-specific proteins). - """ - if seqs_filter is None: - seqs_filter = [] - write_i = False - if len(seqs) > 1: - write_i = True - - formatted_desc = " " + desc if desc else "" - for i, orf in enumerate(seqs): - if orf in seqs_filter: - continue - if write_i: - prots_fn.write('>{}{}\n{}\n'.format(seq_id + "_" + str(i + 1), formatted_desc, orf)) - else: - prots_fn.write('>{}{}\n{}\n'.format(seq_id, formatted_desc, orf)) -``` - -**Step 4: Run test to verify it passes** - -Run: `cd /Users/yperez/work/proteogenomics/pgatk && python -m pytest pgatk/tests/test_vcf_utils.py -v` -Expected: All PASS - -**Step 5: Update `pgatk/ensembl/ensembl.py` to import from shared module** - -Replace the 4 static method bodies in `EnsemblDataService` with thin wrappers that delegate to the shared functions. This preserves backward compatibility for any code calling `EnsemblDataService.check_overlap(...)` etc. - -In `pgatk/ensembl/ensembl.py`: -- Add import at the top: `from pgatk.toolbox.vcf_utils import check_overlap as _check_overlap, get_altseq as _get_altseq, get_orfs_vcf as _get_orfs_vcf, write_output as _write_output` -- Replace method bodies of `check_overlap` (lines 142-165), `get_altseq` (lines 168-227), `get_orfs_vcf` (lines 280-296), `write_output` (lines 826-849) with delegation calls: - -```python -@staticmethod -def check_overlap(var_start, var_end, features_info=None): - return _check_overlap(var_start, var_end, features_info) - -@staticmethod -def get_altseq(ref_seq, ref_allele, var_allele, var_pos, strand, features_info, cds_info=None): - return _get_altseq(ref_seq, ref_allele, var_allele, var_pos, strand, features_info, cds_info) - -@staticmethod -def get_orfs_vcf(ref_seq, alt_seq, translation_table, num_orfs=1): - return _get_orfs_vcf(ref_seq, alt_seq, translation_table, num_orfs) - -@staticmethod -def write_output(seq_id, desc, seqs, prots_fn, seqs_filter=None): - _write_output(seq_id, desc, seqs, prots_fn, seqs_filter) -``` - -**Step 6: Run ALL existing tests to verify no regressions** - -Run: `cd /Users/yperez/work/proteogenomics/pgatk && python -m pytest pgatk/tests/ -v` -Expected: All 66+ tests PASS (existing tests still call `EnsemblDataService.get_altseq(...)` etc., which now delegate to the shared module) - -**Step 7: Commit** - -```bash -git add pgatk/toolbox/vcf_utils.py pgatk/tests/test_vcf_utils.py pgatk/ensembl/ensembl.py -git commit -m "refactor: extract shared VCF utility functions to toolbox/vcf_utils.py - -Extract check_overlap, get_altseq, get_orfs_vcf, write_output from -EnsemblDataService to pgatk.toolbox.vcf_utils for reuse by ClinVar pipeline. -EnsemblDataService methods now delegate to shared functions." -``` - ---- - -## Task 2: Chromosome Mapper - -Build the chromosome name mapper that converts between NC_ accessions (RefSeq GTF), numeric names (ClinVar VCF), and UCSC chr-prefixed names. - -**Files:** -- Create: `pgatk/clinvar/__init__.py` -- Create: `pgatk/clinvar/chromosome_mapper.py` -- Create: `pgatk/testdata/clinvar/mini_assembly_report.txt` -- Test: `pgatk/tests/test_clinvar/__init__.py` -- Test: `pgatk/tests/test_clinvar/test_chromosome_mapper.py` - -**Step 1: Create test data** - -Create `pgatk/testdata/clinvar/mini_assembly_report.txt` — a subset of the real NCBI assembly report format. The file has comment lines starting with `#`, then a header line, then TSV data. The columns we care about are: col 0 = Sequence-Name (e.g. "1"), col 6 = RefSeq-Accn (e.g. "NC_000001.11"), col 9 = UCSC-style-name (e.g. "chr1"): - -``` -# Assembly name: GRCh38.p14 -# Description: Genome Reference Consortium Human Build 38 patch release 14 -# Organism name: Homo sapiens -# Sequence-Name Sequence-Role Assigned-Molecule Assigned-Molecule-loc/type GenBank-Accn Relationship RefSeq-Accn Assembly-Unit Sequence-Length UCSC-style-name -1 assembled-molecule 1 Chromosome CM000663.2 = NC_000001.11 Primary Assembly 248956422 chr1 -2 assembled-molecule 2 Chromosome CM000664.2 = NC_000002.12 Primary Assembly 242193529 chr2 -X assembled-molecule X Chromosome CM000685.2 = NC_000023.11 Primary Assembly 156040895 chrX -Y assembled-molecule Y Chromosome CM000686.2 = NC_000024.10 Primary Assembly 57227415 chrY -MT assembled-molecule MT Mitochondria J01415.2 = NC_012920.1 non-nuclear 16569 chrM -``` - -**Step 2: Write the failing test** - -Create `pgatk/tests/test_clinvar/__init__.py` (empty file). - -Create `pgatk/tests/test_clinvar/test_chromosome_mapper.py`: - -```python -"""Tests for pgatk.clinvar.chromosome_mapper.""" - -from pathlib import Path - -import pytest - -from pgatk.clinvar.chromosome_mapper import ChromosomeMapper - - -TESTDATA_DIR = Path(__file__).resolve().parent.parent.parent / "testdata" / "clinvar" -ASSEMBLY_REPORT = TESTDATA_DIR / "mini_assembly_report.txt" - - -class TestChromosomeMapper: - """Tests for ChromosomeMapper.""" - - @pytest.fixture - def mapper(self): - return ChromosomeMapper.from_assembly_report(str(ASSEMBLY_REPORT)) - - def test_numeric_to_refseq(self, mapper): - assert mapper.map_chrom("1", "refseq") == "NC_000001.11" - - def test_refseq_to_numeric(self, mapper): - assert mapper.map_chrom("NC_000001.11", "numeric") == "1" - - def test_numeric_to_ucsc(self, mapper): - assert mapper.map_chrom("1", "ucsc") == "chr1" - - def test_ucsc_to_numeric(self, mapper): - assert mapper.map_chrom("chr1", "numeric") == "1" - - def test_refseq_to_ucsc(self, mapper): - assert mapper.map_chrom("NC_000001.11", "ucsc") == "chr1" - - def test_x_chromosome(self, mapper): - assert mapper.map_chrom("X", "refseq") == "NC_000023.11" - assert mapper.map_chrom("NC_000023.11", "numeric") == "X" - - def test_mt_chromosome(self, mapper): - assert mapper.map_chrom("MT", "refseq") == "NC_012920.1" - assert mapper.map_chrom("NC_012920.1", "ucsc") == "chrM" - - def test_unknown_chromosome_returns_input(self, mapper): - assert mapper.map_chrom("UNKNOWN_CONTIG", "refseq") == "UNKNOWN_CONTIG" - - def test_invalid_convention_raises(self, mapper): - with pytest.raises(ValueError, match="convention"): - mapper.map_chrom("1", "invalid_convention") - - def test_chr_prefixed_input(self, mapper): - """Input with chr prefix should be recognized as UCSC convention.""" - assert mapper.map_chrom("chrX", "numeric") == "X" - assert mapper.map_chrom("chrX", "refseq") == "NC_000023.11" -``` - -**Step 3: Run test to verify it fails** - -Run: `cd /Users/yperez/work/proteogenomics/pgatk && python -m pytest pgatk/tests/test_clinvar/test_chromosome_mapper.py -v` -Expected: FAIL with `ModuleNotFoundError: No module named 'pgatk.clinvar'` - -**Step 4: Implement chromosome mapper** - -Create `pgatk/clinvar/__init__.py` (empty file). - -Create `pgatk/clinvar/chromosome_mapper.py`: - -```python -"""Bidirectional chromosome name mapping: RefSeq (NC_) <-> numeric <-> UCSC (chr). - -Parses an NCBI assembly report to build the mapping tables. -""" -from __future__ import annotations - -import logging -from pathlib import Path - -logger = logging.getLogger(__name__) - -# Valid target convention names -_VALID_CONVENTIONS = {"numeric", "refseq", "ucsc"} - - -class ChromosomeMapper: - """Maps chromosome names between naming conventions. - - Supports three conventions: - - 'numeric': e.g. '1', '2', 'X', 'Y', 'MT' - - 'refseq': e.g. 'NC_000001.11' - - 'ucsc': e.g. 'chr1', 'chrX', 'chrM' - """ - - def __init__( - self, - numeric_to_refseq: dict[str, str], - numeric_to_ucsc: dict[str, str], - ) -> None: - self._num_to_ref = numeric_to_refseq - self._num_to_ucsc = numeric_to_ucsc - - # Build reverse maps - self._ref_to_num = {v: k for k, v in numeric_to_refseq.items()} - self._ucsc_to_num = {v: k for k, v in numeric_to_ucsc.items()} - - @classmethod - def from_assembly_report(cls, report_path: str) -> ChromosomeMapper: - """Parse an NCBI assembly report file and build chromosome maps. - - The assembly report is a TSV with comment lines starting with '#'. - Relevant columns (0-indexed): - - 0: Sequence-Name (e.g. '1', 'X', 'MT') - - 6: RefSeq-Accn (e.g. 'NC_000001.11') - - 9: UCSC-style-name (e.g. 'chr1') - """ - numeric_to_refseq: dict[str, str] = {} - numeric_to_ucsc: dict[str, str] = {} - - with open(report_path, "r", encoding="utf-8") as fh: - for line in fh: - line = line.strip() - if line.startswith("#") or not line: - continue - cols = line.split("\t") - if len(cols) < 10: - continue - seq_name = cols[0] # numeric name - refseq_accn = cols[6] # NC_ accession - ucsc_name = cols[9] # chr-prefixed name - - if refseq_accn and refseq_accn != "na": - numeric_to_refseq[seq_name] = refseq_accn - if ucsc_name and ucsc_name != "na": - numeric_to_ucsc[seq_name] = ucsc_name - - logger.info( - "Loaded chromosome mapping: %d entries from %s", - len(numeric_to_refseq), - Path(report_path).name, - ) - return cls(numeric_to_refseq, numeric_to_ucsc) - - def _to_numeric(self, name: str) -> str | None: - """Convert any convention to numeric. Returns None if unknown.""" - # Already numeric? - if name in self._num_to_ref or name in self._num_to_ucsc: - return name - # RefSeq? - if name in self._ref_to_num: - return self._ref_to_num[name] - # UCSC? - if name in self._ucsc_to_num: - return self._ucsc_to_num[name] - return None - - def map_chrom(self, name: str, target_convention: str) -> str: - """Map a chromosome name to the target convention. - - :param name: Input chromosome name in any convention. - :param target_convention: One of 'numeric', 'refseq', 'ucsc'. - :return: Mapped name, or the original name if mapping is unknown. - :raises ValueError: If target_convention is not valid. - """ - if target_convention not in _VALID_CONVENTIONS: - raise ValueError( - f"Invalid convention '{target_convention}'. " - f"Must be one of: {', '.join(sorted(_VALID_CONVENTIONS))}" - ) - - numeric = self._to_numeric(name) - if numeric is None: - logger.warning("Unknown chromosome name: %s — returning as-is", name) - return name - - if target_convention == "numeric": - return numeric - elif target_convention == "refseq": - return self._num_to_ref.get(numeric, name) - elif target_convention == "ucsc": - return self._num_to_ucsc.get(numeric, name) - return name # unreachable but satisfies linters -``` - -**Step 5: Run test to verify it passes** - -Run: `cd /Users/yperez/work/proteogenomics/pgatk && python -m pytest pgatk/tests/test_clinvar/test_chromosome_mapper.py -v` -Expected: All PASS - -**Step 6: Commit** - -```bash -git add pgatk/clinvar/__init__.py pgatk/clinvar/chromosome_mapper.py \ - pgatk/testdata/clinvar/mini_assembly_report.txt \ - pgatk/tests/test_clinvar/__init__.py \ - pgatk/tests/test_clinvar/test_chromosome_mapper.py -git commit -m "feat: add ChromosomeMapper for NC_/numeric/UCSC chromosome name conversion - -Parses NCBI assembly report to build bidirectional chromosome name maps. -Supports RefSeq (NC_000001.11), numeric (1), and UCSC (chr1) conventions." -``` - ---- - -## Task 3: ClinVar Config and Config Registry Integration - -Add ClinVar-specific configuration defaults and register them in the config registry. - -**Files:** -- Create: `pgatk/config/clinvar_config.yaml` -- Modify: `pgatk/config/registry.py` - -**Step 1: Create `pgatk/config/clinvar_config.yaml`** - -```yaml -clinvar_translation: - translation_table: 1 - mito_translation_table: 2 - protein_prefix: "clinvar_" - proteindb_output_file: "clinvar-peptide-database.fa" - report_ref_seq: false - num_orfs: 1 - - # ClinVar-specific filtering - clinical_significance_exclude: - - "Benign" - - "Likely_benign" - - "Benign/Likely_benign" - - # NCBI RefSeq biotypes to include - include_biotypes: "protein_coding" - - # Molecular consequences to include (from ClinVar MC field) - include_consequences: - - "missense_variant" - - "nonsense" - - "frameshift_variant" - - "inframe_insertion" - - "inframe_deletion" - - "stop_gained" - - "stop_lost" - - "start_lost" - - "splice_donor_variant" - - "splice_acceptor_variant" - - # NCBI FTP URLs - ncbi_refseq_ftp: "https://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/annotation/GRCh38_latest/refseq_identifiers/" - clinvar_ftp: "https://ftp.ncbi.nlm.nih.gov/pub/clinvar/vcf_GRCh38/" - - logger: - formatters: - DEBUG: "%(asctime)s [%(levelname)7s][%(name)48s][%(module)32s, %(lineno)4s] %(message)s" - INFO: "%(asctime)s [%(levelname)7s][%(name)48s] %(message)s" - loglevel: DEBUG -``` - -**Step 2: Register in `pgatk/config/registry.py`** - -Add `"clinvar": "clinvar_config.yaml"` to the `COMMAND_CONFIGS` dict at line 16 of `pgatk/config/registry.py`: - -```python -COMMAND_CONFIGS = { - "ensembl_downloader": "ensembl_downloader_config.yaml", - "ensembl_config": "ensembl_config.yaml", - "cosmic": "cosmic_config.yaml", - "cbioportal": "cbioportal_config.yaml", - "protein_decoy": "protein_decoy.yaml", - "clinvar": "clinvar_config.yaml", -} -``` - -**Step 3: Verify config loads** - -Run: `cd /Users/yperez/work/proteogenomics/pgatk && python -c "from pgatk.config.registry import load_config; c = load_config('clinvar'); print(c['clinvar_translation']['clinical_significance_exclude'])"` -Expected: `['Benign', 'Likely_benign', 'Benign/Likely_benign']` - -**Step 4: Commit** - -```bash -git add pgatk/config/clinvar_config.yaml pgatk/config/registry.py -git commit -m "feat: add ClinVar config with CLNSIG filtering and NCBI FTP URLs" -``` - ---- - -## Task 4: ClinVar Service — Core Pipeline - -The main ClinVar VCF-to-protein pipeline. This is the largest task. - -**Files:** -- Create: `pgatk/clinvar/clinvar_service.py` -- Create: `pgatk/testdata/clinvar/mini_clinvar.vcf` -- Create: `pgatk/testdata/clinvar/mini_refseq.gtf` -- Create: `pgatk/testdata/clinvar/mini_refseq_protein.faa` -- Test: `pgatk/tests/test_clinvar/test_clinvar_service.py` - -**Step 1: Create test data files** - -These are minimal hand-crafted files that exercise the key code paths. - -Create `pgatk/testdata/clinvar/mini_clinvar.vcf`: -``` -##fileformat=VCFv4.1 -##INFO= -##INFO= -##INFO= -##INFO= -#CHROM POS ID REF ALT QUAL FILTER INFO -1 100015 rs001 C T . . CLNSIG=Pathogenic;GENEINFO=TESTGENE1:1234;MC=SO:0001583|missense_variant -1 100025 rs002 G A . . CLNSIG=Benign;GENEINFO=TESTGENE1:1234;MC=SO:0001583|missense_variant -2 200010 rs003 A G . . CLNSIG=Likely_pathogenic;GENEINFO=TESTGENE2:5678;MC=SO:0001587|stop_gained -``` - -Create `pgatk/testdata/clinvar/mini_refseq.gtf` — RefSeq-style GTF with NC_ chromosome accessions. Note: features must overlap the VCF variant positions after chromosome mapping. - -``` -NC_000001.11 BestRefSeq transcript 100001 100090 . + . gene_id "TESTGENE1"; transcript_id "NM_000001.1"; gene_biotype "protein_coding"; -NC_000001.11 BestRefSeq exon 100001 100030 . + . gene_id "TESTGENE1"; transcript_id "NM_000001.1"; -NC_000001.11 BestRefSeq CDS 100001 100030 . + 0 gene_id "TESTGENE1"; transcript_id "NM_000001.1"; -NC_000001.11 BestRefSeq exon 100051 100090 . + . gene_id "TESTGENE1"; transcript_id "NM_000001.1"; -NC_000001.11 BestRefSeq CDS 100051 100090 . + 2 gene_id "TESTGENE1"; transcript_id "NM_000001.1"; -NC_000002.12 BestRefSeq transcript 200001 200060 . + . gene_id "TESTGENE2"; transcript_id "NM_000002.1"; gene_biotype "protein_coding"; -NC_000002.12 BestRefSeq exon 200001 200060 . + . gene_id "TESTGENE2"; transcript_id "NM_000002.1"; -NC_000002.12 BestRefSeq CDS 200001 200060 . + 0 gene_id "TESTGENE2"; transcript_id "NM_000002.1"; -``` - -Create `pgatk/testdata/clinvar/mini_refseq_protein.faa` — transcript sequences for the transcripts referenced in the GTF. Sequences must be nucleotide, matching the CDS lengths: - -``` ->NM_000001.1 CDS=1-70 -ATGCCCGGGAAATTTCCCGGGAAATTTCCCATGCCCGGGAAATTTCCCGGGAAATTTCCCGGGAAATTTCC ->NM_000002.1 CDS=1-60 -ATGCCCGGGAAATTTCCCGGGAAATTTCCCGGGAAATTTCCCGGGAAATTTCCCGGGAAA -``` - -**Step 2: Write the failing test** - -Create `pgatk/tests/test_clinvar/test_clinvar_service.py`: - -```python -"""Tests for ClinVar VCF-to-protein pipeline.""" - -from pathlib import Path - -import pytest - -from pgatk.clinvar.clinvar_service import ClinVarService - - -TESTDATA_DIR = Path(__file__).resolve().parent.parent.parent / "testdata" / "clinvar" - - -class TestClinSigFiltering: - """Tests for CLNSIG filtering logic.""" - - def test_pathogenic_passes(self): - exclude = ["Benign", "Likely_benign", "Benign/Likely_benign"] - assert ClinVarService.passes_clnsig_filter("Pathogenic", exclude) is True - - def test_benign_excluded(self): - exclude = ["Benign", "Likely_benign", "Benign/Likely_benign"] - assert ClinVarService.passes_clnsig_filter("Benign", exclude) is False - - def test_likely_benign_excluded(self): - exclude = ["Benign", "Likely_benign", "Benign/Likely_benign"] - assert ClinVarService.passes_clnsig_filter("Likely_benign", exclude) is False - - def test_uncertain_passes(self): - exclude = ["Benign", "Likely_benign", "Benign/Likely_benign"] - assert ClinVarService.passes_clnsig_filter("Uncertain_significance", exclude) is True - - def test_empty_clnsig_passes(self): - exclude = ["Benign", "Likely_benign"] - assert ClinVarService.passes_clnsig_filter("", exclude) is True - - -class TestMolecularConsequenceParser: - """Tests for MC field parsing.""" - - def test_parse_missense(self): - mc_field = "SO:0001583|missense_variant" - assert ClinVarService.parse_mc_consequence(mc_field) == "missense_variant" - - def test_parse_multiple_consequences(self): - mc_field = "SO:0001583|missense_variant,SO:0001587|stop_gained" - result = ClinVarService.parse_mc_consequences(mc_field) - assert "missense_variant" in result - assert "stop_gained" in result - - def test_parse_empty_mc(self): - assert ClinVarService.parse_mc_consequence("") == "" - - -class TestGeneInfoParser: - """Tests for GENEINFO field parsing.""" - - def test_parse_single_gene(self): - gene_symbol, gene_id = ClinVarService.parse_geneinfo("BRCA1:672") - assert gene_symbol == "BRCA1" - assert gene_id == "672" - - def test_parse_multi_gene(self): - gene_symbol, gene_id = ClinVarService.parse_geneinfo("BRCA1:672|TP53:7157") - assert gene_symbol == "BRCA1" # first gene - - -class TestClinVarPipeline: - """Integration test for ClinVar-to-proteindb pipeline.""" - - def test_pipeline_produces_output(self, tmp_path): - output_file = tmp_path / "output.fa" - service = ClinVarService( - vcf_file=str(TESTDATA_DIR / "mini_clinvar.vcf"), - gtf_file=str(TESTDATA_DIR / "mini_refseq.gtf"), - fasta_file=str(TESTDATA_DIR / "mini_refseq_protein.faa"), - assembly_report=str(TESTDATA_DIR / "mini_assembly_report.txt"), - output_file=str(output_file), - ) - service.run() - assert output_file.exists() - content = output_file.read_text() - # Should contain at least one protein entry - assert ">" in content - - def test_benign_variants_excluded(self, tmp_path): - output_file = tmp_path / "output.fa" - service = ClinVarService( - vcf_file=str(TESTDATA_DIR / "mini_clinvar.vcf"), - gtf_file=str(TESTDATA_DIR / "mini_refseq.gtf"), - fasta_file=str(TESTDATA_DIR / "mini_refseq_protein.faa"), - assembly_report=str(TESTDATA_DIR / "mini_assembly_report.txt"), - output_file=str(output_file), - ) - service.run() - content = output_file.read_text() - # rs002 is Benign and should NOT appear - assert "rs002" not in content -``` - -**Step 3: Run test to verify it fails** - -Run: `cd /Users/yperez/work/proteogenomics/pgatk && python -m pytest pgatk/tests/test_clinvar/test_clinvar_service.py -v` -Expected: FAIL with `ModuleNotFoundError: No module named 'pgatk.clinvar.clinvar_service'` - -**Step 4: Implement `pgatk/clinvar/clinvar_service.py`** - -```python -"""ClinVar VCF-to-protein database pipeline. - -Processes ClinVar VCF records against NCBI RefSeq GTF annotations to produce -a FASTA file of variant protein sequences. Does NOT require VEP — uses -BedTools interval overlap to find affected transcripts internally. -""" -from __future__ import annotations - -import logging -import sqlite3 -from pathlib import Path -from typing import Optional - -import gffutils -import pandas as pd -from Bio import SeqIO -from Bio.Seq import Seq -from pybedtools import BedTool - -from pgatk.clinvar.chromosome_mapper import ChromosomeMapper -from pgatk.toolbox.vcf_utils import check_overlap, get_altseq, get_orfs_vcf, write_output - -logger = logging.getLogger(__name__) - -# Default CLNSIG values to exclude -_DEFAULT_CLNSIG_EXCLUDE = ["Benign", "Likely_benign", "Benign/Likely_benign"] - -# Default molecular consequences to include -_DEFAULT_CONSEQUENCES = [ - "missense_variant", "nonsense", "frameshift_variant", - "inframe_insertion", "inframe_deletion", "stop_gained", - "stop_lost", "start_lost", "splice_donor_variant", "splice_acceptor_variant", -] - - -class ClinVarService: - """Generate variant protein databases from ClinVar VCF + RefSeq GTF.""" - - def __init__( - self, - vcf_file: str, - gtf_file: str, - fasta_file: str, - assembly_report: str, - output_file: str, - clnsig_exclude: Optional[list[str]] = None, - consequences: Optional[list[str]] = None, - translation_table: int = 1, - mito_translation_table: int = 2, - protein_prefix: str = "clinvar_", - num_orfs: int = 1, - report_ref_seq: bool = False, - ) -> None: - self._vcf_file = vcf_file - self._gtf_file = gtf_file - self._fasta_file = fasta_file - self._assembly_report = assembly_report - self._output_file = output_file - self._clnsig_exclude = clnsig_exclude or _DEFAULT_CLNSIG_EXCLUDE - self._consequences = consequences or _DEFAULT_CONSEQUENCES - self._translation_table = translation_table - self._mito_translation_table = mito_translation_table - self._protein_prefix = protein_prefix - self._num_orfs = num_orfs - self._report_ref_seq = report_ref_seq - - # ------------------------------------------------------------------ - # Static helper methods for parsing ClinVar INFO fields - # ------------------------------------------------------------------ - - @staticmethod - def passes_clnsig_filter(clnsig: str, exclude_list: list[str]) -> bool: - """Return True if the clinical significance is NOT in the exclude list.""" - if not clnsig: - return True - return clnsig not in exclude_list - - @staticmethod - def parse_mc_consequence(mc_field: str) -> str: - """Parse first molecular consequence from the MC INFO field. - - MC format: 'SO:0001583|missense_variant' - """ - if not mc_field: - return "" - first_mc = mc_field.split(",")[0] - parts = first_mc.split("|") - return parts[1] if len(parts) > 1 else "" - - @staticmethod - def parse_mc_consequences(mc_field: str) -> list[str]: - """Parse all molecular consequences from the MC field.""" - if not mc_field: - return [] - consequences = [] - for entry in mc_field.split(","): - parts = entry.strip().split("|") - if len(parts) > 1: - consequences.append(parts[1]) - return consequences - - @staticmethod - def parse_geneinfo(geneinfo: str) -> tuple[str, str]: - """Parse GENEINFO field. Format: 'SYMBOL:GENEID[|SYMBOL2:GENEID2]'. - - Returns (gene_symbol, gene_id) for the first gene. - """ - if not geneinfo: - return "", "" - first_gene = geneinfo.split("|")[0] - parts = first_gene.split(":") - return (parts[0], parts[1]) if len(parts) == 2 else (geneinfo, "") - - @staticmethod - def _get_info_field(info_str: str, key: str) -> str: - """Extract a value from the VCF INFO column by key.""" - for field in info_str.split(";"): - if field.startswith(key + "="): - return field.split("=", 1)[1] - return "" - - # ------------------------------------------------------------------ - # GTF parsing (reuses gffutils, same pattern as EnsemblDataService) - # ------------------------------------------------------------------ - - @staticmethod - def _parse_gtf(gtf_file: str) -> gffutils.FeatureDB: - """Create or load a gffutils database from a GTF file.""" - db_file = str(Path(gtf_file).with_suffix(".db")) - try: - gffutils.create_db( - gtf_file, db_file, - merge_strategy="create_unique", - keep_order=True, - disable_infer_transcripts=True, - disable_infer_genes=True, - verbose=False, - force=False, - ) - except (ValueError, sqlite3.OperationalError): - logger.info("GTF database already exists: %s", db_file) - - return gffutils.FeatureDB(db_file) - - @staticmethod - def _get_features( - db: gffutils.FeatureDB, - feature_id: str, - feature_types: Optional[list[str]] = None, - ) -> tuple: - """Retrieve chromosome, strand, and coding features for a transcript.""" - if feature_types is None: - feature_types = ["exon"] - try: - feature = db[feature_id] - except gffutils.exceptions.FeatureNotFoundError: - try: - feature = db[feature_id.split(".")[0]] - except gffutils.exceptions.FeatureNotFoundError: - logger.warning("Feature %s not found in GTF database", feature_id) - return None, None, None - - coding_features = [] - for f in db.children(feature, featuretype=feature_types, order_by="end"): - coding_features.append([f.start, f.end, f.featuretype]) - return feature.chrom, feature.strand, coding_features - - # ------------------------------------------------------------------ - # VCF reading (same pandas approach as EnsemblDataService) - # ------------------------------------------------------------------ - - @staticmethod - def _read_vcf(vcf_file: str) -> tuple[list[str], pd.DataFrame]: - """Read a VCF file into metadata headers + DataFrame.""" - metadata = [] - records = [] - with open(vcf_file, "r", encoding="utf-8") as fh: - for line in fh: - line = line.strip() - if line.startswith("#"): - metadata.append(line) - continue - cols = line.split("\t") - if len(cols) < 8: - continue - records.append({ - "CHROM": cols[0], - "POS": cols[1], - "ID": cols[2], - "REF": cols[3], - "ALT": cols[4], - "QUAL": cols[5], - "FILTER": cols[6], - "INFO": cols[7], - }) - df = pd.DataFrame(records) if records else pd.DataFrame( - columns=["CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO"] - ) - return metadata, df - - # ------------------------------------------------------------------ - # Transcript overlap via BedTools - # ------------------------------------------------------------------ - - @staticmethod - def _annotate_vcf_with_transcripts( - vcf_file: str, - gtf_file: str, - chrom_mapper: ChromosomeMapper, - ) -> dict[str, list[str]]: - """Find overlapping transcripts for each VCF record using BedTools. - - Returns a dict mapping 'CHROM:POS:REF:ALT' -> [transcript_id, ...]. - Chromosome names are normalized to RefSeq (NC_) for GTF matching. - """ - overlaps: dict[str, list[str]] = {} - - # Read VCF records and convert to BED format in RefSeq chromosome names - vcf_bed_entries = [] - _, vcf_df = ClinVarService._read_vcf(vcf_file) - - for _, row in vcf_df.iterrows(): - chrom_refseq = chrom_mapper.map_chrom(str(row.CHROM), "refseq") - start = int(row.POS) - 1 # BED is 0-based - end = start + len(str(row.REF)) - key = f"{row.CHROM}:{row.POS}:{row.REF}:{row.ALT}" - vcf_bed_entries.append(f"{chrom_refseq}\t{start}\t{end}\t{key}") - - if not vcf_bed_entries: - return overlaps - - vcf_bed = BedTool("\n".join(vcf_bed_entries), from_string=True) - - # Intersect with GTF — extract CDS features only - try: - gtf_bed = BedTool(gtf_file) - intersection = gtf_bed.intersect(vcf_bed, wo=True) - except Exception as e: - logger.error("BedTools intersection failed: %s", e) - return overlaps - - for feature in intersection: - fields = str(feature).strip().split("\t") - # GTF feature type is at index 2 - if fields[2] != "CDS": - continue - # Parse transcript_id from GTF attributes (index 8) - attrs = fields[8] - transcript_id = "" - for attr in attrs.split(";"): - attr = attr.strip() - if attr.startswith("transcript_id"): - transcript_id = attr.split('"')[1] if '"' in attr else attr.split(" ")[1] - break - if not transcript_id: - continue - - # The key is in the last few columns (from the VCF BED name field) - # BedTool intersect with wo appends the second file fields - variant_key = fields[-2] # the name column from the VCF BED - overlaps.setdefault(variant_key, []) - if transcript_id not in overlaps[variant_key]: - overlaps[variant_key].append(transcript_id) - - return overlaps - - # ------------------------------------------------------------------ - # Main pipeline - # ------------------------------------------------------------------ - - def run(self) -> str: - """Execute the ClinVar-to-proteindb pipeline. Returns output path.""" - - # 1. Load chromosome mapper - chrom_mapper = ChromosomeMapper.from_assembly_report(self._assembly_report) - - # 2. Parse GTF and index transcripts - db = self._parse_gtf(self._gtf_file) - - # 3. Load transcript FASTA sequences - transcripts_dict = SeqIO.index(self._fasta_file, "fasta", - key_function=lambda h: h.split("|")[0].split(" ")[0]) - transcript_id_mapping = {k.split(".")[0]: k for k in transcripts_dict.keys()} - - # 4. Find overlapping transcripts for all VCF records - variant_transcripts = self._annotate_vcf_with_transcripts( - self._vcf_file, self._gtf_file, chrom_mapper, - ) - - # 5. Process VCF records - _, vcf_df = self._read_vcf(self._vcf_file) - - stats = { - "variants_processed": 0, - "variants_excluded_clnsig": 0, - "variants_no_overlap": 0, - "variants_translated": 0, - "transcript_not_found": 0, - } - - with open(self._output_file, "w", encoding="utf-8") as out_fh: - for _, record in vcf_df.iterrows(): - stats["variants_processed"] += 1 - - # Validate REF/ALT - ref = str(record.REF) - alts = [a for a in str(record.ALT).split(",") - if a and all(c in "ACGT" for c in a)] - if not all(c in "ACGT" for c in ref) or not alts: - continue - - # Filter by CLNSIG - clnsig = self._get_info_field(str(record.INFO), "CLNSIG") - if not self.passes_clnsig_filter(clnsig, self._clnsig_exclude): - stats["variants_excluded_clnsig"] += 1 - continue - - # Translation table - trans_table = self._translation_table - chrom_str = str(record.CHROM).lstrip("chr").upper() - if chrom_str in ("M", "MT"): - trans_table = self._mito_translation_table - - # Get overlapping transcripts - variant_key = f"{record.CHROM}:{record.POS}:{record.REF}:{record.ALT}" - transcript_ids = variant_transcripts.get(variant_key, []) - if not transcript_ids: - stats["variants_no_overlap"] += 1 - continue - - for transcript_id in transcript_ids: - # Resolve transcript version - try: - transcript_id_v = transcript_id_mapping.get( - transcript_id.split(".")[0], transcript_id - ) - except (AttributeError, KeyError): - transcript_id_v = transcript_id - - # Get sequence from FASTA - try: - row = transcripts_dict[transcript_id_v] - ref_seq = row.seq - desc = str(row.description) - except KeyError: - stats["transcript_not_found"] += 1 - continue - - # Determine feature types and CDS info - feature_types = ["exon"] - cds_info: list[int] = [] - num_orfs = 3 - if "CDS=" in desc: - try: - cds_info = [int(x) for x in desc.split(" ")[1].split("=")[1].split("-")] - feature_types = ["CDS", "stop_codon"] - num_orfs = 1 - except (ValueError, IndexError): - pass - - # Override with configured num_orfs - if self._num_orfs: - num_orfs = self._num_orfs - - # Get genomic features from GTF - chrom, strand, features_info = self._get_features( - db, transcript_id_v, feature_types, - ) - if chrom is None: - continue - - # Verify chromosome match (after mapping) - record_chrom_refseq = chrom_mapper.map_chrom(str(record.CHROM), "refseq") - if chrom != record_chrom_refseq: - continue - - for alt in alts: - # Check overlap - var_pos = int(record.POS) - if not check_overlap(var_pos, var_pos + len(ref) - 1, features_info): - continue - - # Apply variant - coding_ref, coding_alt = get_altseq( - ref_seq, Seq(ref), Seq(alt), var_pos, - strand, features_info, cds_info, - ) - - if coding_alt != "": - ref_orfs, alt_orfs = get_orfs_vcf( - coding_ref, coding_alt, trans_table, num_orfs, - ) - - # Build header with ClinVar metadata - gene_symbol, _ = self.parse_geneinfo( - self._get_info_field(str(record.INFO), "GENEINFO"), - ) - record_id = str(record.ID) if record.ID and record.ID != "." else "" - seq_id = "_".join(filter(None, [ - self._protein_prefix + record_id, - f"{record.CHROM}.{record.POS}.{ref}.{alt}", - transcript_id_v, - ])) - - desc_str = f"{clnsig}|{gene_symbol}" if gene_symbol else clnsig - - write_output( - seq_id=seq_id, - desc=desc_str, - seqs=alt_orfs, - prots_fn=out_fh, - seqs_filter=ref_orfs, - ) - stats["variants_translated"] += 1 - - if self._report_ref_seq: - write_output( - seq_id=transcript_id_v, - desc="", - seqs=ref_orfs, - prots_fn=out_fh, - ) - - logger.info("ClinVar pipeline summary: %s", stats) - return self._output_file -``` - -**Step 5: Run test to verify it passes** - -Run: `cd /Users/yperez/work/proteogenomics/pgatk && python -m pytest pgatk/tests/test_clinvar/test_clinvar_service.py -v` -Expected: All PASS - -Note: The integration tests (`TestClinVarPipeline`) may need test data adjustments. The test data sequences and positions must be consistent — CDS positions in the GTF must overlap VCF variant positions, and FASTA sequences must match CDS lengths. Debug and adjust the test data files if positions don't align. The important thing is: Pathogenic/Likely_pathogenic variants get translated, Benign variants are excluded. - -**Step 6: Run all tests** - -Run: `cd /Users/yperez/work/proteogenomics/pgatk && python -m pytest pgatk/tests/ -v` -Expected: All tests PASS - -**Step 7: Commit** - -```bash -git add pgatk/clinvar/clinvar_service.py \ - pgatk/testdata/clinvar/mini_clinvar.vcf \ - pgatk/testdata/clinvar/mini_refseq.gtf \ - pgatk/testdata/clinvar/mini_refseq_protein.faa \ - pgatk/tests/test_clinvar/test_clinvar_service.py -git commit -m "feat: add ClinVarService pipeline for VCF-to-protein generation - -Processes ClinVar VCF against RefSeq GTF using BedTools overlap detection. -Filters by CLNSIG, reuses shared get_altseq/get_orfs_vcf for variant -application and translation." -``` - ---- - -## Task 5: NCBI Data Downloader - -Download NCBI RefSeq and ClinVar files from FTP. - -**Files:** -- Create: `pgatk/clinvar/data_downloader.py` -- Test: `pgatk/tests/test_clinvar/test_data_downloader.py` - -**Step 1: Write the failing test** - -Create `pgatk/tests/test_clinvar/test_data_downloader.py`: - -```python -"""Tests for NCBI data downloader (no actual downloads).""" - -from pathlib import Path - -import pytest - -from pgatk.clinvar.data_downloader import NcbiDataDownloader - - -class TestNcbiDataDownloader: - """Tests for NcbiDataDownloader.""" - - def test_build_refseq_urls(self): - downloader = NcbiDataDownloader(output_dir="/tmp/test") - urls = downloader.get_refseq_urls() - assert any("GRCh38_latest_genomic.gtf.gz" in u for u in urls) - assert any("GRCh38_latest_protein.faa.gz" in u for u in urls) - assert any("assembly_report.txt" in u for u in urls) - - def test_build_clinvar_urls(self): - downloader = NcbiDataDownloader(output_dir="/tmp/test") - urls = downloader.get_clinvar_urls() - assert any("clinvar.vcf.gz" in u for u in urls) - - def test_expected_files_list(self): - downloader = NcbiDataDownloader(output_dir="/tmp/test") - files = downloader.expected_files() - assert len(files) >= 4 # gtf, protein fasta, assembly report, clinvar vcf - - def test_output_dir_created(self, tmp_path): - out_dir = tmp_path / "ncbi_data" - downloader = NcbiDataDownloader(output_dir=str(out_dir)) - downloader.ensure_output_dir() - assert out_dir.exists() -``` - -**Step 2: Run test to verify it fails** - -Run: `cd /Users/yperez/work/proteogenomics/pgatk && python -m pytest pgatk/tests/test_clinvar/test_data_downloader.py -v` -Expected: FAIL with `ModuleNotFoundError` - -**Step 3: Implement `pgatk/clinvar/data_downloader.py`** - -```python -"""Download NCBI RefSeq and ClinVar reference files. - -Uses urllib for FTP/HTTPS downloads. Skips existing files unless force=True. -""" -from __future__ import annotations - -import logging -import os -from pathlib import Path - -from pgatk.toolbox.general import download_file, check_create_folders - -logger = logging.getLogger(__name__) - -_DEFAULT_REFSEQ_BASE = ( - "https://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/annotation/" - "GRCh38_latest/refseq_identifiers/" -) -_DEFAULT_CLINVAR_BASE = ( - "https://ftp.ncbi.nlm.nih.gov/pub/clinvar/vcf_GRCh38/" -) - -_REFSEQ_FILES = [ - "GRCh38_latest_genomic.gtf.gz", - "GRCh38_latest_protein.faa.gz", - "GRCh38_latest_assembly_report.txt", -] -_CLINVAR_FILES = [ - "clinvar.vcf.gz", - "clinvar.vcf.gz.tbi", -] - - -class NcbiDataDownloader: - """Download NCBI RefSeq and ClinVar reference data.""" - - def __init__( - self, - output_dir: str, - refseq_base_url: str = _DEFAULT_REFSEQ_BASE, - clinvar_base_url: str = _DEFAULT_CLINVAR_BASE, - ) -> None: - self._output_dir = output_dir - self._refseq_base = refseq_base_url - self._clinvar_base = clinvar_base_url - - def ensure_output_dir(self) -> None: - """Create output directory if it doesn't exist.""" - check_create_folders([self._output_dir]) - - def get_refseq_urls(self) -> list[str]: - """Return list of RefSeq file URLs to download.""" - return [self._refseq_base + f for f in _REFSEQ_FILES] - - def get_clinvar_urls(self) -> list[str]: - """Return list of ClinVar file URLs to download.""" - return [self._clinvar_base + f for f in _CLINVAR_FILES] - - def expected_files(self) -> list[str]: - """Return list of expected local file paths after download.""" - all_files = _REFSEQ_FILES + _CLINVAR_FILES - return [os.path.join(self._output_dir, f) for f in all_files] - - def download_all(self, force: bool = False) -> list[str]: - """Download all required files. Returns list of downloaded file paths. - - :param force: If True, re-download even if files exist. - """ - self.ensure_output_dir() - downloaded = [] - - all_urls = self.get_refseq_urls() + self.get_clinvar_urls() - all_names = _REFSEQ_FILES + _CLINVAR_FILES - - for url, name in zip(all_urls, all_names): - local_path = os.path.join(self._output_dir, name) - - if os.path.exists(local_path) and not force: - logger.info("File already exists, skipping: %s", local_path) - downloaded.append(local_path) - continue - - logger.info("Downloading %s -> %s", url, local_path) - result = download_file(url, local_path, logger) - if result: - downloaded.append(result) - else: - logger.error("Failed to download: %s", url) - - return downloaded -``` - -**Step 4: Run test to verify it passes** - -Run: `cd /Users/yperez/work/proteogenomics/pgatk && python -m pytest pgatk/tests/test_clinvar/test_data_downloader.py -v` -Expected: All PASS - -**Step 5: Commit** - -```bash -git add pgatk/clinvar/data_downloader.py pgatk/tests/test_clinvar/test_data_downloader.py -git commit -m "feat: add NcbiDataDownloader for RefSeq/ClinVar file downloads - -Downloads GTF, protein FASTA, assembly report, and ClinVar VCF from NCBI FTP. -Skips existing files, supports --force re-download." -``` - ---- - -## Task 6: CLI Commands and CLI Registration - -Wire up the two new CLI commands and register them in `cli.py`. - -**Files:** -- Create: `pgatk/commands/clinvar_to_proteindb.py` -- Create: `pgatk/commands/ncbi_downloader.py` -- Modify: `pgatk/cli.py` - -**Step 1: Create `pgatk/commands/clinvar_to_proteindb.py`** - -```python -import click - -from pgatk.clinvar.clinvar_service import ClinVarService -from pgatk.config.registry import load_config - - -@click.command("clinvar-to-proteindb", short_help="Generate protein database from ClinVar VCF + RefSeq GTF") -@click.option("-c", "--config_file", help="Configuration YAML file (optional)") -@click.option("-v", "--vcf", required=True, help="ClinVar VCF file path") -@click.option("-g", "--gtf", required=True, help="NCBI RefSeq GTF file path") -@click.option("-f", "--fasta", required=True, help="RefSeq protein/transcript FASTA file path") -@click.option("-a", "--assembly-report", required=True, help="NCBI assembly report file path") -@click.option("-o", "--output", required=True, help="Output protein FASTA file path") -@click.option("--clnsig-exclude", default=None, - help="Comma-separated CLNSIG values to exclude (default: Benign,Likely_benign,Benign/Likely_benign)") -@click.option("--consequences", default=None, - help="Comma-separated molecular consequences to include") -@click.option("-t", "--translation-table", type=int, default=None, help="Translation table (default: 1)") -@click.option("-p", "--protein-prefix", default=None, help="Prefix for variant protein IDs (default: clinvar_)") -@click.option("--report-ref-seq", is_flag=True, default=False, - help="Also report reference protein sequences") -@click.pass_context -def clinvar_to_proteindb(ctx, config_file, vcf, gtf, fasta, assembly_report, output, - clnsig_exclude, consequences, translation_table, protein_prefix, - report_ref_seq): - """Generate a variant protein database from ClinVar VCF and NCBI RefSeq GTF. - - This command does NOT require VEP annotations. It uses BedTools interval - overlap to find transcripts affected by each ClinVar variant, then applies - the variant and translates to protein. - """ - config = load_config("clinvar", config_file) - clinvar_cfg = config.get("clinvar_translation", {}) - - # Build kwargs with config defaults, CLI overrides - kwargs = { - "vcf_file": vcf, - "gtf_file": gtf, - "fasta_file": fasta, - "assembly_report": assembly_report, - "output_file": output, - "translation_table": translation_table or clinvar_cfg.get("translation_table", 1), - "mito_translation_table": clinvar_cfg.get("mito_translation_table", 2), - "protein_prefix": protein_prefix or clinvar_cfg.get("protein_prefix", "clinvar_"), - "num_orfs": clinvar_cfg.get("num_orfs", 1), - "report_ref_seq": report_ref_seq or clinvar_cfg.get("report_ref_seq", False), - } - - if clnsig_exclude: - kwargs["clnsig_exclude"] = [x.strip() for x in clnsig_exclude.split(",")] - elif "clinical_significance_exclude" in clinvar_cfg: - kwargs["clnsig_exclude"] = clinvar_cfg["clinical_significance_exclude"] - - if consequences: - kwargs["consequences"] = [x.strip() for x in consequences.split(",")] - elif "include_consequences" in clinvar_cfg: - kwargs["consequences"] = clinvar_cfg["include_consequences"] - - service = ClinVarService(**kwargs) - service.run() -``` - -**Step 2: Create `pgatk/commands/ncbi_downloader.py`** - -```python -import click - -from pgatk.clinvar.data_downloader import NcbiDataDownloader -from pgatk.config.registry import load_config - - -@click.command("ncbi-downloader", short_help="Download NCBI RefSeq and ClinVar reference files") -@click.option("-c", "--config_file", help="Configuration YAML file (optional)") -@click.option("-o", "--output-dir", required=True, help="Output directory for downloaded files") -@click.option("--force", is_flag=True, default=False, help="Re-download files even if they exist") -@click.pass_context -def ncbi_downloader(ctx, config_file, output_dir, force): - """Download NCBI RefSeq GTF, protein FASTA, assembly report, and ClinVar VCF. - - Files are downloaded to the specified output directory. Existing files - are skipped unless --force is used. - """ - config = load_config("clinvar", config_file) - clinvar_cfg = config.get("clinvar_translation", {}) - - refseq_base = clinvar_cfg.get("ncbi_refseq_ftp", - "https://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/annotation/GRCh38_latest/refseq_identifiers/") - clinvar_base = clinvar_cfg.get("clinvar_ftp", - "https://ftp.ncbi.nlm.nih.gov/pub/clinvar/vcf_GRCh38/") - - downloader = NcbiDataDownloader( - output_dir=output_dir, - refseq_base_url=refseq_base, - clinvar_base_url=clinvar_base, - ) - downloaded = downloader.download_all(force=force) - click.echo(f"Downloaded {len(downloaded)} files to {output_dir}") -``` - -**Step 3: Register commands in `pgatk/cli.py`** - -Add at line 30 (after the existing imports): - -```python -from pgatk.commands import clinvar_to_proteindb as clinvar_to_proteindb_cmd -from pgatk.commands import ncbi_downloader as ncbi_downloader_cmd -``` - -Add at line 56 (after the last `cli.add_command`): - -```python -cli.add_command(clinvar_to_proteindb_cmd.clinvar_to_proteindb) -cli.add_command(ncbi_downloader_cmd.ncbi_downloader) -``` - -**Step 4: Verify CLI help works** - -Run: `cd /Users/yperez/work/proteogenomics/pgatk && python -m pgatk.cli --help` -Expected: Output should include `clinvar-to-proteindb` and `ncbi-downloader` commands - -Run: `cd /Users/yperez/work/proteogenomics/pgatk && python -m pgatk.cli clinvar-to-proteindb --help` -Expected: Shows help text with all options - -**Step 5: Run all tests** - -Run: `cd /Users/yperez/work/proteogenomics/pgatk && python -m pytest pgatk/tests/ -v` -Expected: All tests PASS - -**Step 6: Commit** - -```bash -git add pgatk/commands/clinvar_to_proteindb.py \ - pgatk/commands/ncbi_downloader.py \ - pgatk/cli.py -git commit -m "feat: add clinvar-to-proteindb and ncbi-downloader CLI commands - -Wire up ClinVar pipeline and NCBI downloader as Click commands. -Register both in the main CLI entry point." -``` - ---- - -## Task 7: Final Integration Test and Cleanup - -End-to-end test running the full CLI pipeline with mini test data. - -**Files:** -- Create: `pgatk/tests/test_clinvar/test_clinvar_integration.py` - -**Step 1: Write integration test** - -Create `pgatk/tests/test_clinvar/test_clinvar_integration.py`: - -```python -"""End-to-end integration test for the ClinVar pipeline.""" - -from pathlib import Path - -from click.testing import CliRunner - -from pgatk.cli import cli - - -TESTDATA_DIR = Path(__file__).resolve().parent.parent.parent / "testdata" / "clinvar" - - -class TestClinVarCLIIntegration: - """Test the clinvar-to-proteindb CLI command end-to-end.""" - - def test_cli_produces_output(self, tmp_path): - output_file = tmp_path / "clinvar_output.fa" - runner = CliRunner() - result = runner.invoke(cli, [ - "clinvar-to-proteindb", - "--vcf", str(TESTDATA_DIR / "mini_clinvar.vcf"), - "--gtf", str(TESTDATA_DIR / "mini_refseq.gtf"), - "--fasta", str(TESTDATA_DIR / "mini_refseq_protein.faa"), - "--assembly-report", str(TESTDATA_DIR / "mini_assembly_report.txt"), - "--output", str(output_file), - ]) - assert result.exit_code == 0, f"CLI failed: {result.output}\n{result.exception}" - assert output_file.exists() - - def test_cli_help_shows_options(self): - runner = CliRunner() - result = runner.invoke(cli, ["clinvar-to-proteindb", "--help"]) - assert result.exit_code == 0 - assert "clinvar" in result.output.lower() - assert "--vcf" in result.output - assert "--assembly-report" in result.output - - def test_ncbi_downloader_help(self): - runner = CliRunner() - result = runner.invoke(cli, ["ncbi-downloader", "--help"]) - assert result.exit_code == 0 - assert "--output-dir" in result.output - assert "--force" in result.output -``` - -**Step 2: Run integration test** - -Run: `cd /Users/yperez/work/proteogenomics/pgatk && python -m pytest pgatk/tests/test_clinvar/test_clinvar_integration.py -v` -Expected: All PASS - -**Step 3: Run full test suite** - -Run: `cd /Users/yperez/work/proteogenomics/pgatk && python -m pytest pgatk/tests/ -v` -Expected: All tests PASS (existing + new) - -**Step 4: Commit** - -```bash -git add pgatk/tests/test_clinvar/test_clinvar_integration.py -git commit -m "test: add end-to-end CLI integration tests for ClinVar pipeline" -``` - -**Step 5: Clean up any gffutils .db files created during tests** - -If gffutils created `.db` files in testdata during test runs, add them to `.gitignore`: - -```bash -echo "*.db" >> pgatk/testdata/clinvar/.gitignore -git add pgatk/testdata/clinvar/.gitignore -git commit -m "chore: gitignore gffutils .db files in ClinVar testdata" -``` diff --git a/docs/plans/2026-03-02-clinvar-ncbi-support-design.md b/docs/plans/2026-03-02-clinvar-ncbi-support-design.md deleted file mode 100644 index f9c4c6b8..00000000 --- a/docs/plans/2026-03-02-clinvar-ncbi-support-design.md +++ /dev/null @@ -1,208 +0,0 @@ -# ClinVar/NCBI RefSeq Protein Database Generation — Design - -**Goal:** Add support for generating variant protein databases from ClinVar VCF files using NCBI RefSeq GTF annotations, independent of VEP/Ensembl. - -**Architecture:** New `pgatk/clinvar/` module with ClinVar-specific pipeline, chromosome mapper, and NCBI data downloader. Shared algorithms (`get_altseq`, `get_orfs_vcf`) are reused from existing code. - -**Tech Stack:** Python, pysam (VCF), gffutils (GTF), pyBedTools (interval overlap), pyfaidx (FASTA), tqdm, PyYAML. - ---- - -## 1. Architecture Overview - -### New module: `pgatk/clinvar/` - -``` -pgatk/clinvar/ - __init__.py - clinvar_service.py # Main pipeline: ClinVar VCF + RefSeq GTF -> protein FASTA - chromosome_mapper.py # NC_000001.11 <-> 1 <-> chr1 mapping - data_downloader.py # Download NCBI RefSeq + ClinVar files -``` - -### New CLI commands - -``` -pgatk/commands/ - clinvar_to_proteindb.py # clinvar-to-proteindb command - ncbi_downloader.py # ncbi-downloader command -``` - -### Shared utilities - -Extract reusable VCF-to-protein functions into `pgatk/toolbox/vcf_utils.py`: -- `get_altseq()` — apply variant to transcript sequence -- `get_orfs_vcf()` — translate modified transcript to protein -- `three_frame_translation()` — 3-frame translation helper - -These are currently in `pgatk/ensembl/ensembl.py` and will be imported by both the Ensembl and ClinVar pipelines. - -### New configuration - -``` -pgatk/config/clinvar_config.yaml -``` - ---- - -## 2. Pipeline Data Flow - -### Stage 1: Chromosome Mapping - -Parse NCBI assembly report (`GRCh38_latest_assembly_report.txt`) to build bidirectional chromosome name maps: -- `NC_000001.11` (RefSeq accession in GTF) -- `1` (numeric, used in ClinVar VCF) -- `chr1` (UCSC-style) - -The `chromosome_mapper.py` module exposes `map_chrom(name, target_convention)` for converting between conventions. - -### Stage 2: GTF Indexing - -Use gffutils to build a database from the RefSeq GTF: -- Filter to `protein_coding` biotype transcripts (`NM_*` and optionally `XM_*`) -- Index CDS features per transcript -- Build transcript-to-gene mapping from `gene_id` attribute (gene symbol in RefSeq) -- Extract transcript CDS sequences from protein FASTA (or reconstruct from genome FASTA) - -### Stage 3: ClinVar VCF Processing - -For each VCF record: -1. **Filter by CLNSIG:** Exclude Benign, Likely_benign, Benign/Likely_benign -2. **Filter by MC (molecular consequence):** Keep missense, nonsense, frameshift, indels, splice variants -3. **Map chromosome:** Convert VCF chromosome (numeric) to GTF chromosome (NC_) using chromosome mapper -4. **Find overlapping transcripts:** Use BedTools/interval tree to find transcripts whose CDS region overlaps the variant position -5. **Apply variant:** For each overlapping transcript, use `get_altseq()` to modify the transcript sequence -6. **Translate:** Use `get_orfs_vcf()` to translate modified sequence to protein -7. **Collect unique variant proteins** - -### Stage 4: Output - -Write FASTA with ClinVar metadata headers: -``` ->clinvar|RCV000012345|NM_000123|BRCA1|Pathogenic|missense_variant VARIANT_DESCRIPTION -MPROTEINSEQUENCE... -``` - ---- - -## 3. NCBI Downloader - -### Files downloaded - -| File | Source | Purpose | -|------|--------|---------| -| `GRCh38_latest_genomic.gtf.gz` | RefSeq FTP | Gene annotations | -| `GRCh38_latest_protein.faa.gz` | RefSeq FTP | Protein sequences | -| `GRCh38_latest_assembly_report.txt` | RefSeq FTP | Chromosome name mapping | -| `clinvar.vcf.gz` + `.tbi` | ClinVar FTP | Variant calls | - -### Implementation (`pgatk/clinvar/data_downloader.py`) - -- Uses `urllib.request` for FTP/HTTPS downloads (no extra dependencies) -- Downloads to user-specified `--output-dir` -- Skips existing files unless `--force` is passed -- Parses assembly report to produce chromosome mapping JSON -- Progress reporting via `tqdm` - -### CLI - -``` -pgatk ncbi-downloader --output-dir ./ncbi_data [--genome-build GRCh38] [--force] -``` - -### Chromosome Mapper (`pgatk/clinvar/chromosome_mapper.py`) - -- Reads assembly report TSV (columns: Sequence-Name, Sequence-Role, Assigned-Molecule, GenBank-Accn, RefSeq-Accn, UCSC-style-name) -- Builds bidirectional maps: `NC_000001.11 <-> 1 <-> chr1` -- Exposes `map_chrom(name, target_convention)` for any format conversion -- Handles unknown chromosomes gracefully (returns input unchanged + warning) - ---- - -## 4. Configuration - -### `pgatk/config/clinvar_config.yaml` - -```yaml -# ClinVar-specific defaults -clinical_significance_exclude: - - "Benign" - - "Likely_benign" - - "Benign/Likely_benign" - -# NCBI RefSeq biotypes to include -biotypes: - - "protein_coding" - -# Molecular consequences to include (from ClinVar MC field) -consequences: - - "missense_variant" - - "nonsense" - - "frameshift_variant" - - "inframe_insertion" - - "inframe_deletion" - - "stop_gained" - - "stop_lost" - - "start_lost" - - "splice_donor_variant" - - "splice_acceptor_variant" - -# FTP URLs -ncbi_ftp_base: "https://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/annotation/GRCh38_latest/refseq_identifiers/" -clinvar_ftp_base: "https://ftp.ncbi.nlm.nih.gov/pub/clinvar/vcf_GRCh38/" -``` - -### CLI parameters for `clinvar-to-proteindb` - -``` -pgatk clinvar-to-proteindb \ - --vcf clinvar.vcf.gz \ - --gtf GRCh38_latest_genomic.gtf \ - --fasta GRCh38_latest_protein.faa \ - --assembly-report GRCh38_latest_assembly_report.txt \ - --output clinvar_proteins.fa \ - [--clnsig-exclude "Benign,Likely_benign"] \ - [--consequences "missense_variant,stop_gained,..."] \ - [--biotypes "protein_coding"] \ - [--af-field AF_EXAC] -``` - -All filtering parameters have sensible defaults from the config file and can be overridden via CLI. - ---- - -## 5. Testing Strategy - -### Test data (committed to repo) - -- `testdata/clinvar/mini_clinvar.vcf` — ~20 hand-picked ClinVar records: SNPs (missense, nonsense, stop_lost), indels, Benign + Pathogenic entries, multiple chromosomes -- `testdata/clinvar/mini_refseq.gtf` — GTF subset with ~5 transcripts matching the VCF variants -- `testdata/clinvar/mini_refseq_protein.faa` — Matching protein FASTA for the 5 transcripts -- `testdata/clinvar/mini_assembly_report.txt` — Assembly report subset for tested chromosomes -- `testdata/clinvar/expected_output.fa` — Expected output protein FASTA for validation - -### Unit tests (`pgatk/tests/test_clinvar/`) - -1. **`test_chromosome_mapper.py`** — NC_ <-> numeric <-> chr mapping, unknown chromosome handling, assembly report parsing -2. **`test_clinvar_service.py`** — CLNSIG filtering, transcript overlap detection, variant application (reusing `get_altseq`/`get_orfs_vcf`), end-to-end mini pipeline -3. **`test_data_downloader.py`** — Assembly report parsing (no actual FTP downloads), chromosome map JSON generation - -### Integration test - -- **`test_clinvar_integration.py`** — Full pipeline run with mini test data, compare output to expected FASTA - -### Reuse validation - -Shared functions (`get_altseq`, `get_orfs_vcf`) are already tested in `test_ensembl_core.py`. ClinVar tests focus on new code paths: chromosome mapping, CLNSIG filtering, transcript overlap, and CLI wiring. - ---- - -## Design Decisions - -1. **New module vs. extending Ensembl:** ClinVar/RefSeq is fundamentally different from the Ensembl+VEP pipeline (no CSQ annotations, different chromosome naming, different GTF structure). A separate module avoids coupling and keeps both pipelines clean. - -2. **Internal transcript mapping vs. VEP:** ClinVar VCF lacks transcript-level annotations. Rather than requiring users to run VEP first, we use BedTools interval overlap to find transcripts internally. This is simpler for users and keeps the pipeline self-contained. - -3. **Shared algorithm extraction:** `get_altseq` and `get_orfs_vcf` are generic enough to work with any VCF+GTF combination. Extracting them to `toolbox/vcf_utils.py` enables reuse without code duplication. - -4. **Assembly report for chromosome mapping:** NCBI's assembly report is the authoritative source for chromosome name mappings. Parsing it once is more reliable than hardcoding mappings. diff --git a/docs/plans/2026-03-03-clinvar-pipeline-hardening.md b/docs/plans/2026-03-03-clinvar-pipeline-hardening.md deleted file mode 100644 index abae8c95..00000000 --- a/docs/plans/2026-03-03-clinvar-pipeline-hardening.md +++ /dev/null @@ -1,618 +0,0 @@ -# ClinVar Pipeline Hardening Implementation Plan - -> **For Claude:** REQUIRED SUB-SKILL: Use superpowers:executing-plans to implement this plan task-by-task. - -**Goal:** Fix the three weaknesses in the ClinVar pipeline: activate MC consequence filtering, eliminate double VCF read, and add biotype filtering. - -**Architecture:** Refactor `ClinVarService.run()` to read the VCF once into a DataFrame, build the BedTools overlap map from the DataFrame (not a second file read), apply MC and biotype filters early to skip irrelevant variants, and add a duplicate variant-transcript guard. - -**Tech Stack:** Python, pandas, pybedtools, gffutils, pytest - ---- - -### Task 1: Activate MC Consequence Filtering - -The `include_consequences` config is defined in `clinvar_config.yaml` but never used. -The ClinVar VCF already has an `MC` field (e.g., `MC=SO:0001583|missense_variant`) that -tells us the variant consequence. We need to parse it and filter variants whose MC -does not match any entry in `include_consequences`. - -**Files:** -- Modify: `pgatk/clinvar/clinvar_service.py:52-80` (add `_include_consequences` to `__init__`) -- Modify: `pgatk/clinvar/clinvar_service.py:340-532` (add MC filter in `run()`) -- Modify: `pgatk/config/clinvar_config.yaml:18-29` (update defaults — remove splice variants we can't model) -- Test: `pgatk/tests/test_clinvar/test_clinvar_service.py` -- Modify: `pgatk/testdata/clinvar/mini_clinvar.vcf` (add test records with different MC values) - -**Step 1: Write the failing tests** - -Add to `pgatk/tests/test_clinvar/test_clinvar_service.py`: - -```python -class TestMCFiltering: - """Tests for MC consequence filtering.""" - - def test_passes_mc_filter_matching_consequence(self): - """A variant with missense_variant should pass when it's in the include list.""" - include = ["missense_variant", "stop_gained"] - mc = "SO:0001583|missense_variant" - assert ClinVarService.passes_mc_filter(mc, include) is True - - def test_passes_mc_filter_no_match(self): - """A variant with synonymous_variant should NOT pass.""" - include = ["missense_variant", "stop_gained"] - mc = "SO:0001819|synonymous_variant" - assert ClinVarService.passes_mc_filter(mc, include) is False - - def test_passes_mc_filter_multi_consequence_any_match(self): - """If ANY consequence matches the include list, the variant passes.""" - include = ["missense_variant", "stop_gained"] - mc = "SO:0001819|synonymous_variant,SO:0001587|stop_gained" - assert ClinVarService.passes_mc_filter(mc, include) is True - - def test_passes_mc_filter_empty_mc_passes(self): - """Variants without MC field should pass (no info to filter on).""" - include = ["missense_variant"] - assert ClinVarService.passes_mc_filter("", include) is True - - def test_passes_mc_filter_all_keyword(self): - """When include list is ['all'], everything passes.""" - mc = "SO:0001819|synonymous_variant" - assert ClinVarService.passes_mc_filter(mc, ["all"]) is True -``` - -**Step 2: Run tests to verify they fail** - -Run: `cd /Users/yperez/work/proteogenomics/pgatk && python -m pytest pgatk/tests/test_clinvar/test_clinvar_service.py::TestMCFiltering -v` -Expected: FAIL with `AttributeError: type object 'ClinVarService' has no attribute 'passes_mc_filter'` - -**Step 3: Implement `passes_mc_filter` and wire into `__init__` and `run()`** - -In `pgatk/clinvar/clinvar_service.py`: - -1. Add `passes_mc_filter` static method (after `passes_clnsig_filter`): - -```python -@staticmethod -def passes_mc_filter(mc_field: str, include_list: list[str]) -> bool: - """Return True when *mc_field* contains at least one consequence in *include_list*. - - An empty MC field always passes (no information to filter on). - The special value ``'all'`` in include_list disables filtering. - """ - if not mc_field: - return True - if "all" in include_list: - return True - consequences = ClinVarService.parse_mc_consequences(mc_field) - return any(c in include_list for c in consequences) -``` - -2. In `__init__`, after `self._clnsig_exclude = ...` (line 80), add: - -```python -self._include_consequences = self._cfg.get( - "include_consequences", ["all"] -) -``` - -3. In `run()`, after the CLNSIG filter block (after line 401), add: - -```python -# --- MC consequence filter --- -mc_field = self._get_info_field(info, "MC") -if not self.passes_mc_filter(mc_field, self._include_consequences): - stats["variants_filtered_mc"] += 1 - continue -``` - -4. Add `"variants_filtered_mc": 0` to the `stats` dict (line 373). - -**Step 4: Update config defaults** - -In `pgatk/config/clinvar_config.yaml`, update the `include_consequences` list. -Remove splice variants (we can't model their protein consequence — that would -require exon-skipping prediction). Add a comment explaining why: - -```yaml - # Molecular consequences to include (from ClinVar MC field). - # Only consequences that produce modelable protein changes are included. - # Splice variants are excluded because the pipeline cannot predict - # exon-skipping outcomes — use VEP for those. - include_consequences: - - "missense_variant" - - "nonsense" - - "frameshift_variant" - - "inframe_insertion" - - "inframe_deletion" - - "stop_gained" - - "stop_lost" - - "start_lost" -``` - -**Step 5: Update test data** - -Add a synonymous variant to `pgatk/testdata/clinvar/mini_clinvar.vcf` (after line 9): - -``` -1 1054 rs00004 G A . . GENEINFO=GeneA:1234;CLNSIG=Pathogenic;MC=SO:0001819|synonymous_variant -``` - -This is Pathogenic but synonymous — it should pass CLNSIG but fail MC filter. - -**Step 6: Add integration test for MC filtering** - -Add to `TestClinVarPipeline` in `test_clinvar_service.py`: - -```python -def test_synonymous_variant_excluded(self, tmp_path): - """Pathogenic synonymous variant (rs00004) should not appear in output.""" - output_file = str(tmp_path / "output.fa") - service = ClinVarService( - vcf_file=MINI_VCF, - gtf_file=MINI_GTF, - fasta_file=MINI_FASTA, - assembly_report=ASSEMBLY_REPORT, - output_file=output_file, - ) - service.run() - with open(output_file, "r") as f: - content = f.read() - assert "rs00004" not in content -``` - -**Step 7: Run all tests to verify** - -Run: `cd /Users/yperez/work/proteogenomics/pgatk && python -m pytest pgatk/tests/test_clinvar/ -v` -Expected: ALL PASS - -**Step 8: Commit** - -```bash -git add pgatk/clinvar/clinvar_service.py pgatk/config/clinvar_config.yaml pgatk/tests/test_clinvar/test_clinvar_service.py pgatk/testdata/clinvar/mini_clinvar.vcf -git commit -m "feat(clinvar): activate MC consequence filtering from config" -``` - ---- - -### Task 2: Single-Pass VCF Reading - -Currently the VCF is read twice: once in `_annotate_vcf_with_transcripts()` (to build -the BED for BedTools) and once in `_read_vcf()` (to get the DataFrame for processing). -Refactor so the VCF is read once into a DataFrame, and the BED is built from the -DataFrame rows. - -**Files:** -- Modify: `pgatk/clinvar/clinvar_service.py:246-334` (refactor `_annotate_vcf_with_transcripts`) -- Modify: `pgatk/clinvar/clinvar_service.py:340-380` (refactor `run()` to read once) -- Test: `pgatk/tests/test_clinvar/test_clinvar_service.py` - -**Step 1: Write the failing test** - -Add to `test_clinvar_service.py`: - -```python -class TestBuildOverlapMap: - """Tests for _build_overlap_map (DataFrame-based BedTools annotation).""" - - @pytest.fixture(autouse=True) - def _cleanup_db(self): - db_path = Path(MINI_GTF).with_suffix(".db") - if db_path.exists(): - db_path.unlink() - yield - if db_path.exists(): - db_path.unlink() - - def test_build_overlap_map_from_dataframe(self): - """_build_overlap_map should accept a DataFrame and return overlap dict.""" - from pgatk.clinvar.chromosome_mapper import ChromosomeMapper - chrom_mapper = ChromosomeMapper.from_assembly_report(ASSEMBLY_REPORT) - _meta, vcf_df = ClinVarService._read_vcf(MINI_VCF) - overlap_map = ClinVarService._build_overlap_map(vcf_df, MINI_GTF, chrom_mapper) - assert isinstance(overlap_map, dict) - # rs00001 at chr1:1006 should overlap NM_000001.1 CDS (1000-1299) - assert any("1:1006:" in k for k in overlap_map) - - def test_build_overlap_map_matches_old_method(self): - """_build_overlap_map should produce same results as _annotate_vcf_with_transcripts.""" - from pgatk.clinvar.chromosome_mapper import ChromosomeMapper - chrom_mapper = ChromosomeMapper.from_assembly_report(ASSEMBLY_REPORT) - _meta, vcf_df = ClinVarService._read_vcf(MINI_VCF) - new_result = ClinVarService._build_overlap_map(vcf_df, MINI_GTF, chrom_mapper) - old_result = ClinVarService._annotate_vcf_with_transcripts(MINI_VCF, MINI_GTF, chrom_mapper) - assert new_result == old_result -``` - -**Step 2: Run tests to verify they fail** - -Run: `cd /Users/yperez/work/proteogenomics/pgatk && python -m pytest pgatk/tests/test_clinvar/test_clinvar_service.py::TestBuildOverlapMap -v` -Expected: FAIL with `AttributeError: ... has no attribute '_build_overlap_map'` - -**Step 3: Implement `_build_overlap_map`** - -Add new method to `ClinVarService` (replace or supplement `_annotate_vcf_with_transcripts`): - -```python -@staticmethod -def _build_overlap_map( - vcf_df: pd.DataFrame, - gtf_file: str, - chrom_mapper: ChromosomeMapper, -) -> dict[str, list[str]]: - """Find transcripts overlapping each VCF variant via BedTools. - - Unlike ``_annotate_vcf_with_transcripts``, this method builds the BED - from an already-loaded DataFrame, avoiding a second file read. - - Returns a dict mapping ``"CHROM:POS:REF:ALT"`` variant keys to lists - of overlapping transcript IDs. - """ - bed_lines: list[str] = [] - for _, row in vcf_df.iterrows(): - ref = str(row.REF) - if any(c not in "ACGT" for c in ref): - continue - chrom_numeric = str(row.CHROM) - pos = int(row.POS) - alt_field = str(row.ALT) - chrom_refseq = chrom_mapper.map_chrom(chrom_numeric, "refseq") - start = pos - 1 # BED is 0-based half-open - end = start + len(ref) - for alt in alt_field.split(","): - alt = alt.strip() - if not alt or not all(c in "ACGT" for c in alt): - continue - variant_key = f"{chrom_numeric}:{pos}:{ref}:{alt}" - bed_lines.append(f"{chrom_refseq}\t{start}\t{end}\t{variant_key}\n") - - if not bed_lines: - return {} - - with tempfile.NamedTemporaryFile(mode="w", suffix=".bed", delete=False) as tmp: - tmp.writelines(bed_lines) - tmp_bed_path = tmp.name - - try: - vcf_bed = BedTool(tmp_bed_path) - gtf_bed = BedTool(gtf_file) - intersection = vcf_bed.intersect(gtf_bed, wo=True) - - result: dict[str, list[str]] = {} - for feature in intersection: - fields = str(feature).strip().split("\t") - variant_key = fields[3] - gtf_type_idx = 4 + 2 - if len(fields) <= gtf_type_idx: - continue - if fields[gtf_type_idx] != "CDS": - continue - gtf_attrs_idx = 4 + 8 - if len(fields) <= gtf_attrs_idx: - continue - transcript_id = _extract_transcript_id(fields[gtf_attrs_idx]) - if transcript_id: - result.setdefault(variant_key, []) - if transcript_id not in result[variant_key]: - result[variant_key].append(transcript_id) - return result - finally: - Path(tmp_bed_path).unlink(missing_ok=True) -``` - -**Step 4: Refactor `run()` to use single-pass reading** - -In `run()`, replace the two calls: - -```python -# OLD (two reads): -overlap_map = self._annotate_vcf_with_transcripts( - self._vcf_file, self._gtf_file, chrom_mapper -) -_metadata, vcf_df = self._read_vcf(self._vcf_file) - -# NEW (one read): -_metadata, vcf_df = self._read_vcf(self._vcf_file) -overlap_map = self._build_overlap_map(vcf_df, self._gtf_file, chrom_mapper) -``` - -**Step 5: Remove `_annotate_vcf_with_transcripts`** - -Delete the old method entirely (lines 246-334). It is no longer called. - -**Step 6: Run all tests** - -Run: `cd /Users/yperez/work/proteogenomics/pgatk && python -m pytest pgatk/tests/test_clinvar/ -v` -Expected: ALL PASS - -**Step 7: Commit** - -```bash -git add pgatk/clinvar/clinvar_service.py pgatk/tests/test_clinvar/test_clinvar_service.py -git commit -m "refactor(clinvar): single-pass VCF reading — build overlap map from DataFrame" -``` - ---- - -### Task 3: Biotype Filtering from GTF - -The config has `include_biotypes: "protein_coding"` but it's never applied. When -processing a transcript, check its `gene_biotype` attribute from the gffutils DB. -Skip transcripts that don't match the include list. - -**Files:** -- Modify: `pgatk/clinvar/clinvar_service.py:52-80` (add `_include_biotypes` to `__init__`) -- Modify: `pgatk/clinvar/clinvar_service.py` in `run()` (add biotype check in transcript loop) -- Test: `pgatk/tests/test_clinvar/test_clinvar_service.py` -- Modify: `pgatk/testdata/clinvar/mini_refseq.gtf` (add `gene_biotype` attribute to existing records) - -**Step 1: Update test data** - -Add `gene_biotype "protein_coding";` to all GTF lines in `mini_refseq.gtf`. -For example, the first transcript line becomes: - -``` -NC_000001.11 BestRefSeq transcript 1000 1299 . + . gene_id "GeneA"; transcript_id "NM_000001.1"; gene_biotype "protein_coding"; -``` - -Do this for ALL lines (transcript, exon, CDS) in the GTF. - -**Step 2: Write the failing test** - -Add to `test_clinvar_service.py`: - -```python -class TestBiotypeFiltering: - """Tests for biotype filtering from GTF.""" - - def test_get_biotype_from_db(self): - """_get_transcript_biotype should extract gene_biotype from gffutils DB.""" - from pgatk.clinvar.clinvar_service import ClinVarService - db = ClinVarService._parse_gtf(MINI_GTF) - biotype = ClinVarService._get_transcript_biotype(db, "NM_000001.1") - assert biotype == "protein_coding" - - def test_get_biotype_missing_returns_empty(self): - """Missing biotype returns empty string (passes filter).""" - from pgatk.clinvar.clinvar_service import ClinVarService - db = ClinVarService._parse_gtf(MINI_GTF) - biotype = ClinVarService._get_transcript_biotype(db, "NONEXISTENT") - assert biotype == "" -``` - -**Step 3: Run tests to verify they fail** - -Run: `cd /Users/yperez/work/proteogenomics/pgatk && python -m pytest pgatk/tests/test_clinvar/test_clinvar_service.py::TestBiotypeFiltering -v` -Expected: FAIL with `AttributeError` - -**Step 4: Implement biotype filtering** - -1. Add `_get_transcript_biotype` static method: - -```python -@staticmethod -def _get_transcript_biotype(db: gffutils.FeatureDB, transcript_id: str) -> str: - """Extract gene_biotype from a gffutils transcript feature. - - Returns empty string if the transcript or attribute is not found. - """ - try: - feature = db[transcript_id] - except gffutils.exceptions.FeatureNotFoundError: - try: - feature = db[transcript_id.split(".")[0]] - except gffutils.exceptions.FeatureNotFoundError: - return "" - try: - return feature.attributes.get("gene_biotype", [""])[0] - except (IndexError, AttributeError): - return "" -``` - -2. In `__init__`, load the biotype config: - -```python -biotypes_raw = self._cfg.get("include_biotypes", "all") -if isinstance(biotypes_raw, str): - self._include_biotypes = [b.strip() for b in biotypes_raw.split(",")] -else: - self._include_biotypes = list(biotypes_raw) -``` - -3. In `run()`, inside the transcript loop (after looking up the transcript in -FASTA, before getting features), add: - -```python -# --- Biotype filter --- -if self._include_biotypes != ["all"]: - biotype = self._get_transcript_biotype(db, tid) - if biotype and biotype not in self._include_biotypes: - stats["transcripts_filtered_biotype"] += 1 - continue -``` - -4. Add `"transcripts_filtered_biotype": 0` to `stats`. - -**Step 5: Run all tests** - -Run: `cd /Users/yperez/work/proteogenomics/pgatk && python -m pytest pgatk/tests/test_clinvar/ -v` -Expected: ALL PASS - -**Step 6: Commit** - -```bash -git add pgatk/clinvar/clinvar_service.py pgatk/tests/test_clinvar/test_clinvar_service.py pgatk/testdata/clinvar/mini_refseq.gtf -git commit -m "feat(clinvar): add biotype filtering from GTF gene_biotype attribute" -``` - ---- - -### Task 4: Duplicate Variant-Transcript Guard and Stats Cleanup - -The Ensembl pipeline tracks processed `(transcript_id, ref, alt)` combinations to -avoid processing the same variant-transcript pair twice. The ClinVar pipeline lacks -this. Also clean up the `_read_vcf` HEADERS dict (deferred I3 from code review). - -**Files:** -- Modify: `pgatk/clinvar/clinvar_service.py` in `run()` (add duplicate guard) -- Modify: `pgatk/clinvar/clinvar_service.py:213-240` (clean up `_read_vcf`) -- Test: `pgatk/tests/test_clinvar/test_clinvar_service.py` - -**Step 1: Write the failing test** - -```python -class TestDuplicateGuard: - """Pipeline should not process the same variant-transcript pair twice.""" - - @pytest.fixture(autouse=True) - def _cleanup_db(self): - db_path = Path(MINI_GTF).with_suffix(".db") - if db_path.exists(): - db_path.unlink() - yield - if db_path.exists(): - db_path.unlink() - - def test_no_duplicate_fasta_entries(self, tmp_path): - """Each variant-transcript combo should appear at most once in output.""" - output_file = str(tmp_path / "output.fa") - service = ClinVarService( - vcf_file=MINI_VCF, - gtf_file=MINI_GTF, - fasta_file=MINI_FASTA, - assembly_report=ASSEMBLY_REPORT, - output_file=output_file, - ) - service.run() - with open(output_file, "r") as f: - headers = [line.strip() for line in f if line.startswith(">")] - # No duplicate headers - assert len(headers) == len(set(headers)), ( - f"Duplicate FASTA entries found: {headers}" - ) -``` - -**Step 2: Implement duplicate guard in `run()`** - -Before the main `for _, record in vcf_df.iterrows():` loop, add: - -```python -processed_pairs: set[str] = set() -``` - -Inside the transcript loop, before the `# Resolve transcript in FASTA` block, add: - -```python -pair_key = f"{variant_key}:{transcript_id}" -if pair_key in processed_pairs: - continue -processed_pairs.add(pair_key) -``` - -**Step 3: Clean up `_read_vcf` HEADERS** - -Replace the `HEADERS` dict with a simple column name list (remove the misleading -type values that are never applied): - -```python -@staticmethod -def _read_vcf(vcf_file: str) -> tuple[list, pd.DataFrame]: - """Read a VCF file and return metadata lines and a DataFrame of records.""" - COLUMNS = ["CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO"] - - metadata: list[str] = [] - data: list[list[str]] = [] - with open(vcf_file, "r", encoding="utf-8") as fh: - for line in fh: - line = line.strip() - if not line: - continue - if line.startswith("#"): - metadata.append(line) - else: - data.append(line.split("\t")[0:8]) - - vcf_df = pd.DataFrame(data, columns=COLUMNS) - return metadata, vcf_df -``` - -**Step 4: Run all tests** - -Run: `cd /Users/yperez/work/proteogenomics/pgatk && python -m pytest pgatk/tests/test_clinvar/ -v` -Expected: ALL PASS - -**Step 5: Commit** - -```bash -git add pgatk/clinvar/clinvar_service.py pgatk/tests/test_clinvar/test_clinvar_service.py -git commit -m "fix(clinvar): add duplicate variant-transcript guard, clean up _read_vcf" -``` - ---- - -### Task 5: Comprehensive Integration Test Updates - -Verify the full pipeline with the new filters active. Ensure test data exercises -all filter paths (CLNSIG, MC, biotype). Verify the single-pass refactor doesn't -change output. - -**Files:** -- Modify: `pgatk/tests/test_clinvar/test_clinvar_integration.py` -- Modify: `pgatk/tests/test_clinvar/test_clinvar_service.py` - -**Step 1: Add integration tests** - -Add to `TestClinVarCLIIntegration` in `test_clinvar_integration.py`: - -```python -def test_cli_filters_synonymous_variants(self, tmp_path, clinvar_testdata): - """Synonymous variants should be filtered by MC even if Pathogenic.""" - output_file = tmp_path / "clinvar_output.fa" - runner = CliRunner() - result = runner.invoke(cli, [ - "clinvar-to-proteindb", - "--vcf", str(clinvar_testdata / "mini_clinvar.vcf"), - "--gtf", str(clinvar_testdata / "mini_refseq.gtf"), - "--fasta", str(clinvar_testdata / "mini_refseq_protein.faa"), - "--assembly-report", str(clinvar_testdata / "mini_assembly_report.txt"), - "--output", str(output_file), - ]) - assert result.exit_code == 0 - content = output_file.read_text() - # rs00004 is Pathogenic but synonymous — should NOT appear - assert "rs00004" not in content - # rs00001 is Pathogenic missense — SHOULD appear - assert "rs00001" in content -``` - -Add a test that existing variants still produce correct output: - -```python -def test_pipeline_output_contains_clnsig_in_header(self, tmp_path, clinvar_testdata): - """Output FASTA headers should contain CLNSIG and gene symbol.""" - output_file = tmp_path / "clinvar_output.fa" - runner = CliRunner() - result = runner.invoke(cli, [ - "clinvar-to-proteindb", - "--vcf", str(clinvar_testdata / "mini_clinvar.vcf"), - "--gtf", str(clinvar_testdata / "mini_refseq.gtf"), - "--fasta", str(clinvar_testdata / "mini_refseq_protein.faa"), - "--assembly-report", str(clinvar_testdata / "mini_assembly_report.txt"), - "--output", str(output_file), - ]) - assert result.exit_code == 0 - content = output_file.read_text() - assert "Pathogenic" in content -``` - -**Step 2: Run full test suite** - -Run: `cd /Users/yperez/work/proteogenomics/pgatk && python -m pytest pgatk/tests/ -v` -Expected: ALL PASS (including existing Ensembl tests and VCF utils tests) - -**Step 3: Commit** - -```bash -git add pgatk/tests/test_clinvar/ -git commit -m "test(clinvar): add integration tests for MC filtering and output validation" -``` diff --git a/docs/use-cases.md b/docs/use-cases.md index 5af6de50..bd3cf206 100644 --- a/docs/use-cases.md +++ b/docs/use-cases.md @@ -183,11 +183,11 @@ pgatk ensembl-downloader \ -o ensembl_human \ --skip_protein \ --skip_ncrna \ - --skip_cdna \ - --skip_dna + --skip_cdna ``` -This downloads the gene annotation GTF and the VCF file with known variants. +This downloads the gene annotation GTF, VCF file with known variants, and the +genome FASTA (needed by gffread to extract transcript sequences). ### Step 2 -- Generate transcript sequences @@ -797,8 +797,7 @@ pgatk supports any species available in ENSEMBL. For example, rice ```bash pgatk ensembl-downloader \ -t 39947 \ - -o ensembl_rice \ - --skip_dna + -o ensembl_rice ``` ### Step 2 -- Generate transcript sequences and canonical proteome @@ -860,7 +859,7 @@ For model organisms like mouse (*Mus musculus*, taxonomy 10090): ```bash # Download -pgatk ensembl-downloader -t 10090 -o ensembl_mouse --skip_dna +pgatk ensembl-downloader -t 10090 -o ensembl_mouse # Generate transcript sequences gffread -F -w ensembl_mouse/transcripts.fa \ diff --git a/pgatk/clinvar/clinvar_service.py b/pgatk/clinvar/clinvar_service.py index b5760cbd..ee9a2cc6 100644 --- a/pgatk/clinvar/clinvar_service.py +++ b/pgatk/clinvar/clinvar_service.py @@ -122,9 +122,12 @@ def passes_clnsig_filter(clnsig: str, exclude_list: list[str]) -> bool: return False # Split compound values and check each component. parts = re.split(r"[/,]", clnsig) - return not all(p.strip().replace("_", " ").lower() in + non_empty = [p.strip() for p in parts if p.strip()] + if not non_empty: + return True # delimiter-only or whitespace-only → treat as empty + return not all(p.replace("_", " ").lower() in [e.replace("_", " ").lower() for e in exclude_list] - for p in parts if p.strip()) + for p in non_empty) @staticmethod def passes_mc_filter(mc_field: str, include_list: list[str]) -> bool: diff --git a/pgatk/tests/test_clinvar/test_clinvar_service.py b/pgatk/tests/test_clinvar/test_clinvar_service.py index eb62ee05..5f5e84bd 100644 --- a/pgatk/tests/test_clinvar/test_clinvar_service.py +++ b/pgatk/tests/test_clinvar/test_clinvar_service.py @@ -71,6 +71,13 @@ def test_compound_risk_factor_excluded(self): # '_risk_factor' is not in exclude list, so it's not an excluded component assert ClinVarService.passes_clnsig_filter("Benign,_risk_factor", exclude) is True + def test_delimiter_only_clnsig_passes(self): + """CLNSIG with only delimiters (e.g. '/') should pass like empty.""" + exclude = ["Benign", "Likely_benign"] + assert ClinVarService.passes_clnsig_filter("/", exclude) is True + assert ClinVarService.passes_clnsig_filter(",", exclude) is True + assert ClinVarService.passes_clnsig_filter("/,/", exclude) is True + # --------------------------------------------------------------------------- # TestMolecularConsequenceParser diff --git a/pgatk/tests/test_vcf_utils.py b/pgatk/tests/test_vcf_utils.py index 70b7250c..92ca86cd 100644 --- a/pgatk/tests/test_vcf_utils.py +++ b/pgatk/tests/test_vcf_utils.py @@ -110,6 +110,54 @@ def test_snp_minus_strand(self): assert str(coding_ref) == "ATGCATGCAT" assert str(coding_alt) == "ATGCATGCAG" + def test_multibase_ref_extending_past_exon_clips_alt(self): + """When REF extends past the exon boundary, ALT should also be clipped. + + Exon: positions 10-14 (5 bases). Variant at pos 13 with REF=ACGT (4 bases, + extends to pos 16 -- 2 bases past exon end). Only 2 of 4 REF bases are + exonic. ALT=TT (2 bases) should be clipped by the same 2 intronic bases, + leaving 0 ALT bases (pure deletion of the 2 exonic REF bases). + """ + # positions: 10 11 12 13 14 + ref_seq = Seq("ATGCA") # 5 bases, exon is 10-14 + features_info = [[10, 14, 'exon']] + # REF=ACGT at pos 13: exonic portion is AC (pos 13-14), intronic is GT (pos 15-16) + # ALT=TT: clipped by 2 (intronic REF bases) → empty + coding_ref, coding_alt = get_altseq( + ref_seq, Seq("ACGT"), Seq("TT"), 13, '+', features_info + ) + # With ALT clipped to empty, result should be ref minus the 2 exonic bases + assert str(coding_ref) == "ATGCA" + assert str(coding_alt) == "ATG" # first 3 bases preserved, last 2 deleted + + def test_multibase_ref_extending_past_exon_partial_alt_clip(self): + """ALT longer than intronic portion retains some bases after clipping. + + Exon: positions 10-14. Variant at pos 13 with REF=ACG (extends 1 past exon). + Intronic REF = 1 base. ALT=TTT (3 bases) clipped by 1 → TT (2 bases). + """ + ref_seq = Seq("ATGCA") # exon 10-14 + features_info = [[10, 14, 'exon']] + # REF=ACG at pos 13: exonic = AC (2 bases), intronic = G (1 base) + # ALT=TTT clipped by 1 → TT + coding_ref, coding_alt = get_altseq( + ref_seq, Seq("ACG"), Seq("TTT"), 13, '+', features_info + ) + assert str(coding_ref) == "ATGCA" + # ref_seq[0:3] + "TT" + ref_seq[3+2:] = "ATG" + "TT" + "" = "ATGTT" + assert str(coding_alt) == "ATGTT" + + def test_ref_within_exon_no_clipping(self): + """When REF is entirely within the exon, no clipping occurs.""" + ref_seq = Seq("ATGCATGCAT") + features_info = [[10, 19, 'exon']] + # REF=GC at pos 12: entirely within exon 10-19 + coding_ref, coding_alt = get_altseq( + ref_seq, Seq("GC"), Seq("TT"), 12, '+', features_info + ) + assert str(coding_ref) == "ATGCATGCAT" + assert str(coding_alt) == "ATTTATGCAT" + # --------------------------------------------------------------------------- # get_orfs_vcf diff --git a/pgatk/toolbox/vcf_utils.py b/pgatk/toolbox/vcf_utils.py index d371e5df..61fe4db6 100644 --- a/pgatk/toolbox/vcf_utils.py +++ b/pgatk/toolbox/vcf_utils.py @@ -116,11 +116,33 @@ def get_altseq( var_end_genomic = var_pos + len(ref_allele) - 1 exonic_ref_len = min(var_end_genomic, feature[1]) - var_pos + 1 c = max(exonic_ref_len, 0) + # Clip the ALT allele by the same number of trailing bases that + # were removed from REF. VCF variants are left-aligned, so the + # trailing bases correspond to the intronic portion. + intronic_ref_len = len(ref_allele) - c + if intronic_ref_len > 0: + exonic_alt_len = max(len(var_allele) - intronic_ref_len, 0) + var_allele = var_allele[:exonic_alt_len] + logger.debug( + "Variant at %d extends %d bases past exon boundary " + "(feature %d-%d); clipped REF to %d and ALT to %d bases", + var_pos, intronic_ref_len, feature[0], feature[1], + c, exonic_alt_len, + ) alt_seq = ref_seq[0:var_index_in_cds] + var_allele + ref_seq[var_index_in_cds + c::] if strand == '-': return ref_seq[::-1], alt_seq[::-1] else: return ref_seq, alt_seq + else: + # Log when variant starts outside this feature (intron-start) + var_end_genomic = var_pos + len(ref_allele) - 1 + if var_end_genomic >= feature[0] and var_pos < feature[0]: + logger.debug( + "Variant at %d-%d starts before feature %d-%d " + "(intron-start); skipping this feature", + var_pos, var_end_genomic, feature[0], feature[1], + ) nc_index += (feature[1] - feature[0] + 1)