From 397f02034d44147a42f72a578b081a044c79cc4b Mon Sep 17 00:00:00 2001
From: Yasset Perez-Riverol
Date: Tue, 3 Mar 2026 14:26:27 +0000
Subject: [PATCH 1/7] minor changes - fix bugs in algorithms

---
 pgatk/clinvar/clinvar_service.py             | 18 ++++++++++++---
 pgatk/clinvar/data_downloader.py             |  2 +-
 pgatk/ensembl/ensembl.py                     | 16 ++++++-------
 .../testdata/clinvar/mini_refseq_protein.faa |  4 ++--
 pgatk/testdata/proteindb_from_custom_VCF.fa  |  8 +++----
 .../test_clinvar/test_clinvar_service.py     | 17 ++++++++++++++
 .../test_clinvar/test_data_downloader.py     |  2 +-
 pgatk/toolbox/vcf_utils.py                   | 23 +++++++++----------
 8 files changed, 59 insertions(+), 31 deletions(-)

diff --git a/pgatk/clinvar/clinvar_service.py b/pgatk/clinvar/clinvar_service.py
index 2726c58..2600477 100644
--- a/pgatk/clinvar/clinvar_service.py
+++ b/pgatk/clinvar/clinvar_service.py
@@ -7,6 +7,7 @@ from __future__ import annotations
 
 import logging
+import re
 import sqlite3
 import tempfile
 from pathlib import Path
@@ -109,11 +110,21 @@ def _get_info_field(info_str: str, key: str) -> str:
     def passes_clnsig_filter(clnsig: str, exclude_list: list[str]) -> bool:
         """Return True when *clnsig* should **not** be filtered out.
 
-        An empty or missing CLNSIG always passes.
+        An empty or missing CLNSIG always passes. Compound values like
+        ``Benign/Likely_benign`` or ``Benign,_risk_factor`` are split on
+        ``/`` and ``,`` so that each component is checked individually.
         """
         if not clnsig:
             return True
-        return clnsig not in exclude_list
+        # Exact match first (covers simple values and compound entries
+        # that are explicitly listed, e.g. "Benign/Likely_benign").
+        if clnsig in exclude_list:
+            return False
+        # Split compound values and check each component.
+        parts = re.split(r"[/,]", clnsig)
+        return not all(p.strip().replace("_", " ").lower() in
+                       [e.replace("_", " ").lower() for e in exclude_list]
+                       for p in parts if p.strip())
 
     @staticmethod
     def passes_mc_filter(mc_field: str, include_list: list[str]) -> bool:
@@ -434,7 +445,8 @@ def run(self) -> str:
 
             # Translation table (mito vs standard)
             trans_table = self._translation_table
-            if chrom.upper() in ("M", "MT"):
+            chrom_bare = chrom.lstrip("chr").upper()
+            if chrom_bare in ("M", "MT"):
                 trans_table = self._mito_translation_table
 
             for alt in alts:
diff --git a/pgatk/clinvar/data_downloader.py b/pgatk/clinvar/data_downloader.py
index 2a34e03..0e4261f 100644
--- a/pgatk/clinvar/data_downloader.py
+++ b/pgatk/clinvar/data_downloader.py
@@ -21,7 +21,7 @@
 _REFSEQ_FILES = [
     "GRCh38_latest_genomic.gtf.gz",
-    "GRCh38_latest_protein.faa.gz",
+    "GRCh38_latest_rna.fna.gz",
     "GRCh38_latest_assembly_report.txt",
 ]
 _CLINVAR_FILES = [
diff --git a/pgatk/ensembl/ensembl.py b/pgatk/ensembl/ensembl.py
index 343e57f..a9a6d7e 100644
--- a/pgatk/ensembl/ensembl.py
+++ b/pgatk/ensembl/ensembl.py
@@ -517,7 +517,7 @@ def vcf_to_proteindb(self, vcf_file: str, input_fasta: str, gene_annotations_gtf
             if str(record.CHROM).lstrip('chr').upper() in ['M', 'MT']:
                 trans_table = self._mito_translation_table
 
-            processed_transcript_allele = []
+            processed_transcript_allele = set()
             transcript_records = []
             try:
                 transcript_records = \
@@ -531,7 +531,7 @@ def vcf_to_proteindb(self, vcf_file: str, input_fasta: str, gene_annotations_gtf
             for transcript_record in transcript_records.split(','):
                 transcript_info = transcript_record.split('|')
 
-                if consequence_index:
+                if consequence_index is not None:
                     try:
                         consequence = transcript_info[consequence_index]
                     except IndexError:
@@ -542,7 +542,7 @@ def vcf_to_proteindb(self, vcf_file: str, input_fasta: str,
gene_annotations_gtf continue except TypeError: pass - if biotype_index: + if biotype_index is not None: try: biotype = transcript_info[biotype_index] except IndexError: @@ -599,24 +599,24 @@ def vcf_to_proteindb(self, vcf_file: str, input_fasta: str, gene_annotations_gtf if chrom is None: # the record info was not found continue # skip transcripts with unwanted consequences - if consequence_index: + if consequence_index is not None: if (consequence in self._exclude_consequences or (consequence not in self._include_consequences and self._include_consequences != ['all'])): continue # skip transcripts with unwanted biotypes - if biotype_index: + if biotype_index is not None: if (biotype in self._exclude_biotypes or (biotype not in self._include_biotypes and self._include_biotypes != ['all'])): continue for alt in alts: - if transcript_id + str(record.REF) + str( - alt) in processed_transcript_allele: # because VEP reports affected transcripts per alt allele + dedup_key = transcript_id + str(record.REF) + str(alt) + if dedup_key in processed_transcript_allele: continue - processed_transcript_allele.append(transcript_id + str(record.REF) + str(alt)) + processed_transcript_allele.add(dedup_key) # for non-CDSs, only consider the exon that actually overlaps the variant try: diff --git a/pgatk/testdata/clinvar/mini_refseq_protein.faa b/pgatk/testdata/clinvar/mini_refseq_protein.faa index 0cebe8d..4f63132 100644 --- a/pgatk/testdata/clinvar/mini_refseq_protein.faa +++ b/pgatk/testdata/clinvar/mini_refseq_protein.faa @@ -1,4 +1,4 @@ >NM_000001.1 CDS=1-300 -ATGAAGCCCAATAAACCACTCTGACTGGCCGAATAGGGATATAGGCAACGCCATGTGCGGCGACCCTTGCGACAGTGACGCTTTCGCCGTTGCCTAAACCGATTTGAAGGAGTCTAGCAGCCGCAGTAAGGCACAATACCTCGTCCGTGTTACCAGACCAAACAAGACGTCCTCTTCAATGTTTAAATGACCCTCTCGTCATAAAACCTTTCTACTATGTGTTCCGCAAGAATCAACAACTACAATGGCGCGTCGTGAATAACGCGACGGCTGAGACGAACGGCGCGTGAATGAAGCGCT +ATGGGACCTAACGTTCACATTATGAGAGTTACGGGTGTTTTAGAGACCGCCCGTGAGAACACCATCATGGAAGCGAACGATATAGTCGGCGTAGAGCGGATGCTAGCCCACTCCTGGAAATACTCCAGGGTACGTCCCCACAGCATCTACCCCACGACCCGAACGCCGTGCCCGGCGCAATCCAAGGTGCTCGAGACTTTGCGAACCGATCAGTCTGGAGCTTGTTGGCCTGCAATAGTCACAAAGGGGATGTATCAGACCTGCATGTGGACGCGACACCTCGGATCTCCTAGGCCTTAA >NM_000002.1 CDS=1-300 -ATGTAAACAGCTCAGGAGCCAGTCCCCTACGTCGCATATCCTGGCCACTGCAGGTGAAGCGAATGGTATCGATACGTAGGAGGTGTGCCTTCGTAGGCTGTTTCTCAGGACGCCCAACTATTCTTTCCAATCCTACATCTGTTTCTTGCGTCGTAGCGGGACCCTCCATTGTTACTTATTAGGTTCTCGTTATGTCTCATAATCTCAGTGCTGGTGTGATAAGCAAACCACCCTACTGGCACGAAGTTCACAGAAGTGAGATTATGTCTCGTTTGGCAGTCTTGATGCTCGGGGGACACT +ATGCCGATCGGGCAAGTATTGGGTGGCACAGCGGGAAGGGAGGTGATTAGGCTCCGACACTTGGGAGTAGATATGGGTCCATCTTATTATAATATGTCGAAGTCCCCACGCCACACAATCTTCTTTGCATGTGTCCCAATCGGCCTTCGCTGTTTCGGCCTCAGCCAAAGAATTGTTGATGAGCAAGTTGCCCGTTTAGCCCGCCCTATGAGAGAACTTACCTACAATTGGACTAGCGGAAGGTCAGGTCGTGCGACACGACGAGCGCTCGACCAAGATTGGTTTAAAGGTGTGACTTAG diff --git a/pgatk/testdata/proteindb_from_custom_VCF.fa b/pgatk/testdata/proteindb_from_custom_VCF.fa index 22f87e0..5325f01 100644 --- a/pgatk/testdata/proteindb_from_custom_VCF.fa +++ b/pgatk/testdata/proteindb_from_custom_VCF.fa @@ -1,11 +1,11 @@ ->varsample_rs78350717_22.15528913.C.A_ENST00000643195 -MNVSEPNSSFAFVNEFILQGFSCEWTIQIFLFSLFTTTYALTITGNGAIAFVLWCDRRLHTPMYMFLGNFSFLEIWYVSSTVPKMLVNFLSEKKNISFAGCFLQFYFFFSLGTSECLLLTVMAFDQYLAICRPLLYPNIMTGHLYAKLVILCWVCGFLWFLIPIVLISQMPFCGPNIIDHVVCDPGPRFALDCVSAPRIQLFCYTLSSLVIFGNFLFIIGSYTLVLKAMLGMPSSTGRHKDFSTCGSHLAVVSLCYSSLMVMYVSPGLGHSTGMQKIETLFYAMVTPLFNPLIYSLQNKEIKAALRKVLGSSNII* ->varsample_rs78350717_22.15528913.C.T_ENST00000643195 
-MNVSEPNSSFAFVNEFILQGFSCEWTIQIFLFSLFTTTYALTITGNGAIAFVLWCDRRLHTPMYMFLGNFSFLEIWYVSSTVPKMLVNFLSEKKNISFAGCFLQFYFFFSLGTSECLLLTVMAFDQYLAICRPLLYPNIMTGHLYAKLVILCWVCGFLWFLIPIVLISQMPFCGPNIIDHVVCDPGPRFALDCVSAPRIQLFCYTLSSLVIFGNFLFIIGSYTLVLKAMLGMPSSTGRHKVFSTCGSHLAVVSLCYSSLMVMYVSPGLGHSTGMQKIETLFYAMVTPLFNPLIYSLQNKEIKAALRKVLGSSNII*
 >varsample_rs78350717_22.15528913.C.A_ENST00000252835
 MCPLTLQVTGLMNVSEPNSSFAFVNEFILQGFSCEWTIQIFLFSLFTTTYALTITGNGAIAFVLWCDRRLHTPMYMFLGNFSFLEIWYVSSTVPKMLVNFLSEKKNISFAGCFLQFYFFFSLGTSECLLLTVMAFDQYLAICRPLLYPNIMTGHLYAKLVILCWVCGFLWFLIPIVLISQMPFCGPNIIDHVVCDPGPRFALDCVSAPRIQLFCYTLSSLVIFGNFLFIIGSYTLVLKAMLGMPSSTGRHKDFSTCGSHLAVVSLCYSSLMVMYVSPGLGHSTGMQKIETLFYAMVTPLFNPLIYSLQNKEIKAALRKVLGSSNII*
 >varsample_rs78350717_22.15528913.C.T_ENST00000252835
 MCPLTLQVTGLMNVSEPNSSFAFVNEFILQGFSCEWTIQIFLFSLFTTTYALTITGNGAIAFVLWCDRRLHTPMYMFLGNFSFLEIWYVSSTVPKMLVNFLSEKKNISFAGCFLQFYFFFSLGTSECLLLTVMAFDQYLAICRPLLYPNIMTGHLYAKLVILCWVCGFLWFLIPIVLISQMPFCGPNIIDHVVCDPGPRFALDCVSAPRIQLFCYTLSSLVIFGNFLFIIGSYTLVLKAMLGMPSSTGRHKVFSTCGSHLAVVSLCYSSLMVMYVSPGLGHSTGMQKIETLFYAMVTPLFNPLIYSLQNKEIKAALRKVLGSSNII*
+>varsample_rs78350717_22.15528913.C.A_ENST00000643195
+MNVSEPNSSFAFVNEFILQGFSCEWTIQIFLFSLFTTTYALTITGNGAIAFVLWCDRRLHTPMYMFLGNFSFLEIWYVSSTVPKMLVNFLSEKKNISFAGCFLQFYFFFSLGTSECLLLTVMAFDQYLAICRPLLYPNIMTGHLYAKLVILCWVCGFLWFLIPIVLISQMPFCGPNIIDHVVCDPGPRFALDCVSAPRIQLFCYTLSSLVIFGNFLFIIGSYTLVLKAMLGMPSSTGRHKDFSTCGSHLAVVSLCYSSLMVMYVSPGLGHSTGMQKIETLFYAMVTPLFNPLIYSLQNKEIKAALRKVLGSSNII*
+>varsample_rs78350717_22.15528913.C.T_ENST00000643195
+MNVSEPNSSFAFVNEFILQGFSCEWTIQIFLFSLFTTTYALTITGNGAIAFVLWCDRRLHTPMYMFLGNFSFLEIWYVSSTVPKMLVNFLSEKKNISFAGCFLQFYFFFSLGTSECLLLTVMAFDQYLAICRPLLYPNIMTGHLYAKLVILCWVCGFLWFLIPIVLISQMPFCGPNIIDHVVCDPGPRFALDCVSAPRIQLFCYTLSSLVIFGNFLFIIGSYTLVLKAMLGMPSSTGRHKVFSTCGSHLAVVSLCYSSLMVMYVSPGLGHSTGMQKIETLFYAMVTPLFNPLIYSLQNKEIKAALRKVLGSSNII*
 >varsample_rs147461488_22.17740102.GGCCACGCTCAACT.G_ENST00000342111
 MDCEVNNGSSLRDECITNLLVFGFLQSCSDNSFRRELDALGHELPVLAPQWEGYDELQTDGNRSSHSRLGRIEAGASDNNTASAEEETEAAGSVAFMELRQ*F*KSRRHHPEYCQAPRPGRGQHGP*HPSGPG
 >varsample_rs147461488_22.17740102.GGCCACGCTCAACT.G_ENST00000617586
diff --git a/pgatk/tests/test_clinvar/test_clinvar_service.py b/pgatk/tests/test_clinvar/test_clinvar_service.py
index 405416c..eb62ee0 100644
--- a/pgatk/tests/test_clinvar/test_clinvar_service.py
+++ b/pgatk/tests/test_clinvar/test_clinvar_service.py
@@ -54,6 +54,23 @@ def test_likely_pathogenic_passes(self):
         exclude = ["Benign", "Likely_benign", "Benign/Likely_benign"]
         assert ClinVarService.passes_clnsig_filter("Likely_pathogenic", exclude) is True
 
+    def test_compound_all_benign_excluded(self):
+        """Compound value with all-benign components should be excluded."""
+        exclude = ["Benign", "Likely_benign", "Benign/Likely_benign"]
+        # Both components are in the exclude list → excluded
+        assert ClinVarService.passes_clnsig_filter("Benign,Likely_benign", exclude) is False
+
+    def test_compound_mixed_passes(self):
+        """Compound value with a non-benign component should pass."""
+        exclude = ["Benign", "Likely_benign", "Benign/Likely_benign"]
+        assert ClinVarService.passes_clnsig_filter("Pathogenic/Likely_benign", exclude) is True
+
+    def test_compound_risk_factor_passes(self):
+        """Benign with a risk_factor modifier: not all components excluded → passes."""
+        exclude = ["Benign", "Likely_benign"]
+        # '_risk_factor' is not in exclude list, so it's not an excluded component
+        assert ClinVarService.passes_clnsig_filter("Benign,_risk_factor", exclude) is True
+
 
 # ---------------------------------------------------------------------------
# TestMolecularConsequenceParser
diff --git a/pgatk/tests/test_clinvar/test_data_downloader.py b/pgatk/tests/test_clinvar/test_data_downloader.py
index 3d89b80..e7b59ab 100644
--- a/pgatk/tests/test_clinvar/test_data_downloader.py
+++ b/pgatk/tests/test_clinvar/test_data_downloader.py
@@ -15,7 +15,7 @@ def test_build_refseq_urls(self):
         urls = downloader.get_refseq_urls()
         assert len(urls) == 3
         assert any("GRCh38_latest_genomic.gtf.gz" in u for u in urls)
-        assert any("GRCh38_latest_protein.faa.gz" in u for u in urls)
+        assert any("GRCh38_latest_rna.fna.gz" in u for u in urls)
         assert any("assembly_report.txt" in u for u in urls)
 
     def test_build_clinvar_urls(self):
diff --git a/pgatk/toolbox/vcf_utils.py b/pgatk/toolbox/vcf_utils.py
index db62340..248cc08 100644
--- a/pgatk/toolbox/vcf_utils.py
+++ b/pgatk/toolbox/vcf_utils.py
@@ -106,18 +106,17 @@ def get_altseq(
     ref_seq = ref_seq[start_coding_index:stop_coding_index]
 
     nc_index = 0
-    if len(ref_allele) == len(var_allele) or ref_allele[0] == var_allele[0]:
-        for feature in features_info:
-            if var_pos in range(feature[0], feature[1] + 1):
-                var_index_in_cds = nc_index + (var_pos - feature[0])
-                c = len(ref_allele)
-                alt_seq = ref_seq[0:var_index_in_cds] + var_allele + ref_seq[var_index_in_cds + c::]
-                if strand == '-':
-                    return ref_seq[::-1], alt_seq[::-1]
-                else:
-                    return ref_seq, alt_seq
-
-            nc_index += (feature[1] - feature[0] + 1)
+    for feature in features_info:
+        if var_pos in range(feature[0], feature[1] + 1):
+            var_index_in_cds = nc_index + (var_pos - feature[0])
+            c = len(ref_allele)
+            alt_seq = ref_seq[0:var_index_in_cds] + var_allele + ref_seq[var_index_in_cds + c::]
+            if strand == '-':
+                return ref_seq[::-1], alt_seq[::-1]
+            else:
+                return ref_seq, alt_seq
+
+        nc_index += (feature[1] - feature[0] + 1)
 
     return ref_seq, alt_seq

From 80a4cc971bfa2848fcc5ed388546b54b654c25d2 Mon Sep 17 00:00:00 2001
From: Yasset Perez-Riverol
Date: Tue, 3 Mar 2026 15:48:06 +0000
Subject: [PATCH 2/7] minor changes and bug fixes.

---
 pgatk/cgenomes/cgenomes_proteindb.py | 147 +++++++++++++++++++++------
 1 file changed, 115 insertions(+), 32 deletions(-)

diff --git a/pgatk/cgenomes/cgenomes_proteindb.py b/pgatk/cgenomes/cgenomes_proteindb.py
index 8b188d5..455c5c0 100644
--- a/pgatk/cgenomes/cgenomes_proteindb.py
+++ b/pgatk/cgenomes/cgenomes_proteindb.py
@@ -1,16 +1,52 @@
 from __future__ import annotations
 
+import gzip
 import logging
 import re
 from pathlib import Path
 from typing import Any, Optional
 
 from Bio import SeqIO
+from Bio.Data.IUPACData import protein_letters_3to1
+from Bio.Seq import Seq
 
 from pgatk.cgenomes.models import SNP
 from pgatk.toolbox.general import ParameterConfiguration
 
 
+def _open_text(path: str, mode: str = 'r', encoding: str = 'utf-8', **kwargs):
+    """Open a file for text I/O, decompressing automatically if path ends with .gz."""
+    enc = encoding or "utf-8"
+    if path.endswith('.gz'):
+        text_mode = mode if 't' in mode else mode + 't'
+        return gzip.open(path, text_mode, encoding=enc, **kwargs)
+    return open(path, mode, encoding=enc, **kwargs)
+
+
+def _three_to_one(aa_str: str) -> str:
+    """Convert a string of 3-letter amino acid codes to 1-letter codes.
+
+    If the string is already in 1-letter codes (all single uppercase letters),
+    it is returned unchanged. For 3-letter codes (e.g. ``AlaGlyVal``), each
+    triplet is converted using Biopython's mapping.
+    """
+    # Heuristic: 3-letter codes contain lowercase characters (e.g. "Ala"),
+    # whereas a 1-letter sequence is all uppercase, so only strings that
+    # contain a lowercase letter are treated as 3-letter codes below.
+ if len(aa_str) >= 3 and any(c.islower() for c in aa_str): + result = [] + for i in range(0, len(aa_str), 3): + triplet = aa_str[i:i + 3] + # Title-case the triplet for lookup (e.g. "ala" -> "Ala") + triplet_title = triplet.capitalize() + if triplet_title in protein_letters_3to1: + result.append(protein_letters_3to1[triplet_title]) + else: + return aa_str # not a valid 3-letter sequence, return as-is + return "".join(result) + return aa_str + + class CancerGenomesService(ParameterConfiguration): CONFIG_CANCER_GENOMES_MUTATION_FILE = 'mutation_file' CONFIG_COMPLETE_GENES_FILE = "all_cds_genes_file" @@ -88,10 +124,11 @@ def get_multiple_options(options_str: str) -> list[str]: return list(map(lambda x: x.strip(), options_str.split(","))) @staticmethod - def get_mut_pro_seq(snp: SNP, seq: str) -> Optional[str]: + def get_mut_pro_seq(snp: SNP, seq: Seq) -> Optional[str]: nucleotide = ["A", "T", "C", "G"] mut_pro_seq = "" - if "?" not in snp.dna_mut and snp.aa_mut != 'p.?': # unambiguous DNA change known in CDS sequence + if (snp.dna_mut is not None and "?" not in snp.dna_mut and + snp.aa_mut is not None and snp.aa_mut != 'p.?'): # unambiguous DNA change known in CDS sequence positions = re.findall(r'\d+', snp.dna_mut) if ">" in snp.dna_mut and len(positions) == 1: # Substitution tmplist = snp.dna_mut.split(">") @@ -100,27 +137,41 @@ def get_mut_pro_seq(snp: SNP, seq: str) -> Optional[str]: index = int(positions[0]) - 1 if ref_dna == str(seq[index]).upper() and mut_dna in nucleotide: # seq_mut = seq[:index] + mut_dna + seq[index + 1:] - mut_pro_seq = seq_mut.translate(to_stop=False) + mut_pro_seq = str(seq_mut.translate(to_stop=False)) + elif "delins" in snp.dna_mut: + # Deletion-insertion: delete range then insert new bases + insert_dna = snp.dna_mut.split("delins")[1] + if insert_dna.isalpha() and len(positions) >= 2: + del_index1 = int(positions[0]) - 1 + del_index2 = int(positions[1]) + seq_mut = seq[:del_index1] + insert_dna + seq[del_index2:] + mut_pro_seq = str(seq_mut.translate(to_stop=False)) + elif insert_dna.isalpha() and len(positions) == 1: + # Single-position delins: replace one base + del_index1 = int(positions[0]) - 1 + seq_mut = seq[:del_index1] + insert_dna + seq[del_index1 + 1:] + mut_pro_seq = str(seq_mut.translate(to_stop=False)) + elif "ins" in snp.dna_mut: index = snp.dna_mut.index("ins") insert_dna = snp.dna_mut[index + 3:] if insert_dna.isalpha(): ins_index1 = int(positions[0]) seq_mut = seq[:ins_index1] + insert_dna + seq[ins_index1:] - mut_pro_seq = seq_mut.translate(to_stop=False) + mut_pro_seq = str(seq_mut.translate(to_stop=False)) elif "del" in snp.dna_mut: if len(positions) == 2: del_index1 = int(positions[0]) - 1 del_index2 = int(positions[1]) seq_mut = seq[:del_index1] + seq[del_index2:] - mut_pro_seq = seq_mut.translate(to_stop=False) + mut_pro_seq = str(seq_mut.translate(to_stop=False)) elif len(positions) == 1: del_index1 = int(positions[0]) - 1 seq_mut = seq[:del_index1] + seq[del_index1 + 1:] - mut_pro_seq = seq_mut.translate(to_stop=False) + mut_pro_seq = str(seq_mut.translate(to_stop=False)) else: - if "?" not in snp.aa_mut: # unambiguous aa change known in protein sequence + if snp.aa_mut is not None and "?" 
not in snp.aa_mut: # unambiguous aa change known in protein sequence positions = re.findall(r'\d+', snp.aa_mut) protein_seq = str(seq.translate(to_stop=False)) @@ -164,8 +215,8 @@ def get_mut_pro_seq(snp: SNP, seq: str) -> Optional[str]: mut_pro_seq = protein_seq[:del_index1] + mut_aa + protein_seq[del_index2:] elif "insertion" in snp.mutation_type: - ins_index1 = int(positions[0]) - 1 - mut_pro_seq = protein_seq[:ins_index1] + mut_aa + protein_seq[ins_index1 + 1:] + ins_index1 = int(positions[0]) + mut_pro_seq = protein_seq[:ins_index1] + mut_aa + protein_seq[ins_index1:] elif "compound substitution" in snp.mutation_type: if "*" not in mut_aa: del_index1 = int(positions[0]) - 1 @@ -185,11 +236,12 @@ def cosmic_to_proteindb(self) -> None: """ self.get_logger().debug("Starting reading All cosmic genes") COSMIC_CDS_DB = {} - for record in SeqIO.parse(self._local_complete_genes, 'fasta'): - try: - COSMIC_CDS_DB[record.id].append(record) - except KeyError: - COSMIC_CDS_DB[record.id] = [record] + with _open_text(self._local_complete_genes, encoding='utf-8') as genes_handle: + for record in SeqIO.parse(genes_handle, 'fasta'): + try: + COSMIC_CDS_DB[record.id].append(record) + except KeyError: + COSMIC_CDS_DB[record.id] = [record] regex = re.compile('[^a-zA-Z]') mutation_dic = {} @@ -197,17 +249,38 @@ def cosmic_to_proteindb(self) -> None: self.get_logger().debug("Reading input CosmicMutantExport.tsv ...") line_counter = 1 - with open(self._local_mutation_file, encoding="latin-1") as cosmic_input, \ + required_columns = ["Gene name", "Accession Number", "Mutation CDS", "Mutation AA", "Mutation Description"] + with _open_text(self._local_mutation_file, encoding="latin-1") as cosmic_input, \ open(self._local_output_file, 'w', encoding='utf-8') as output: - header = cosmic_input.readline().split("\t") - gene_col = header.index("Gene name") - enst_col = header.index("Accession Number") - cds_col = header.index("Mutation CDS") - aa_col = header.index("Mutation AA") - muttype_col = header.index("Mutation Description") + header = cosmic_input.readline().strip().split("\t") + try: + gene_col = header.index("Gene name") + enst_col = header.index("Accession Number") + cds_col = header.index("Mutation CDS") + aa_col = header.index("Mutation AA") + muttype_col = header.index("Mutation Description") + except ValueError as e: + self.get_logger().error( + "COSMIC file missing required columns. Expected %s, got: %s", + required_columns, header + ) + raise ValueError( + f"COSMIC mutation file missing required column: {e}. " + f"Expected columns include: {required_columns}" + ) from e filter_col = None if self._filter_column: - filter_col = header.index(self._filter_column) + try: + filter_col = header.index(self._filter_column) + except ValueError: + self.get_logger().warning( + "Filter column '%s' not found in COSMIC header: %s. 
Filtering disabled.", + self._filter_column, header + ) + + max_col = max(gene_col, enst_col, cds_col, aa_col, muttype_col) + if filter_col is not None: + max_col = max(max_col, filter_col) for line in cosmic_input: if line_counter % 10000 == 0: @@ -215,6 +288,9 @@ def cosmic_to_proteindb(self) -> None: self.get_logger().debug(msg) line_counter += 1 row = line.strip().split("\t") + if len(row) <= max_col: + self.get_logger().debug("Skipping malformed row (insufficient columns) at line %s: %s", line_counter, row[:5]) + continue # filter out mutations from unspecified groups if filter_col is not None: if row[filter_col] not in self._accepted_values and self._accepted_values != ['all']: @@ -270,12 +346,12 @@ def cosmic_to_proteindb(self) -> None: self.get_logger().debug("COSMIC contains in total {} non redundant mutations".format(len(mutation_dic))) @staticmethod - def get_sample_headers(header_line: list, filter_coumn: str) -> tuple[Optional[int], Optional[int]]: + def get_sample_headers(header_line: list, filter_column: str) -> tuple[Optional[int], Optional[int]]: _logger = logging.getLogger(__name__) try: - filter_col = header_line.index(filter_coumn) + filter_col = header_line.index(filter_column) except ValueError: - _logger.warning('%s was not found in the header row: %s', filter_coumn, header_line) + _logger.warning('%s was not found in the header row: %s', filter_column, header_line) return None, None try: sample_id_col = header_line.index('SAMPLE_ID') @@ -296,8 +372,11 @@ def get_value_per_sample(self, local_clinical_sample_file: str, filter_column: s # check for header and re-assign columns if 'SAMPLE_ID' in sl and filter_column in sl: filter_column_col, sample_id_col = self.get_sample_headers(sl, filter_column) + # Skip adding the header row itself to sample_value + continue if filter_column_col is not None and sample_id_col is not None: - sample_value[sl[sample_id_col]] = sl[filter_column_col].strip().replace(' ', '_') + if (sample_id_col < len(sl) and filter_column_col < len(sl)): + sample_value[sl[sample_id_col]] = sl[filter_column_col].strip().replace(' ', '_') else: self.get_logger().warning("No column was found for %s, %s in %s", filter_column, 'SAMPLE_ID', local_clinical_sample_file) @@ -320,11 +399,11 @@ def cbioportal_to_proteindb(self) -> None: group_mutations_dict = {} seq_dic = {} - fafile = SeqIO.parse(self._local_complete_genes, "fasta") - for record in fafile: - newacc = record.id.split(".")[0] - if newacc not in seq_dic: - seq_dic[newacc] = record.seq + with _open_text(self._local_complete_genes, encoding='utf-8') as genes_handle: + for record in SeqIO.parse(genes_handle, "fasta"): + newacc = record.id.split(".")[0] + if newacc not in seq_dic: + seq_dic[newacc] = record.seq header_cols = {"HGVSc": None, "Transcript_ID": None, "Variant_Classification": None, "Variant_Type": None, "HGVSp_Short": None, 'Tumor_Sample_Barcode': None} @@ -343,7 +422,8 @@ def cbioportal_to_proteindb(self) -> None: self.get_logger().warning('No clinical sample file is given therefore no filter could be applied.') return - with open(self._local_mutation_file, "r", encoding='utf-8') as mutfile, open(self._local_output_file, "w", encoding='utf-8') as output: + with _open_text(self._local_mutation_file, encoding='utf-8') as mutfile, \ + open(self._local_output_file, "w", encoding='utf-8') as output: for i, line in enumerate(mutfile): row = line.strip().split("\t") if row[0] == '#': @@ -406,6 +486,9 @@ def cbioportal_to_proteindb(self) -> None: except IndexError: 
self.get_logger().warning("Incorrect SNP format or record %s %s %s", i, pos, line) continue + if ">" not in pos: + self.get_logger().warning("SNP position string missing '>' (line %s): %s", i, pos) + continue idx = pos.index(">") ref_dna = pos[idx - 1] mut_dna = pos[idx + 1] From 2bd43fe0a380a1e415220ea1afbfb47f71abfdca Mon Sep 17 00:00:00 2001 From: Yasset Perez-Riverol Date: Tue, 3 Mar 2026 15:57:38 +0000 Subject: [PATCH 3/7] minor change --- pgatk/cgenomes/cgenomes_proteindb.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/pgatk/cgenomes/cgenomes_proteindb.py b/pgatk/cgenomes/cgenomes_proteindb.py index 455c5c0..b0a97df 100644 --- a/pgatk/cgenomes/cgenomes_proteindb.py +++ b/pgatk/cgenomes/cgenomes_proteindb.py @@ -20,7 +20,12 @@ def _open_text(path: str, mode: str = 'r', encoding: str = 'utf-8', **kwargs): if path.endswith('.gz'): text_mode = mode if 't' in mode else mode + 't' return gzip.open(path, text_mode, encoding=enc, **kwargs) - return open(path, mode, encoding=enc, **kwargs) + if enc == "utf-8": + return open(path, mode, encoding="utf-8", **kwargs) + if enc == "latin-1": + return open(path, mode, encoding="latin-1", **kwargs) + # Only utf-8 and latin-1 are used; fallback to utf-8 for any other value + return open(path, mode, encoding="utf-8", **kwargs) def _three_to_one(aa_str: str) -> str: From dd4ee5754407447c31f00def45c82da40c67ed91 Mon Sep 17 00:00:00 2001 From: Yasset Perez-Riverol Date: Tue, 3 Mar 2026 18:27:48 +0000 Subject: [PATCH 4/7] minor gubs fixing. --- pgatk/cgenomes/cgenomes_proteindb.py | 16 +++++-- pgatk/clinvar/clinvar_service.py | 4 +- pgatk/ensembl/ensembl.py | 50 ++++++++++++++------- pgatk/testdata/proteindb_from_gnomad_VCF.fa | 2 - pgatk/toolbox/vcf_utils.py | 8 +++- 5 files changed, 55 insertions(+), 25 deletions(-) diff --git a/pgatk/cgenomes/cgenomes_proteindb.py b/pgatk/cgenomes/cgenomes_proteindb.py index b0a97df..f42d7b8 100644 --- a/pgatk/cgenomes/cgenomes_proteindb.py +++ b/pgatk/cgenomes/cgenomes_proteindb.py @@ -181,8 +181,12 @@ def get_mut_pro_seq(snp: SNP, seq: Seq) -> Optional[str]: protein_seq = str(seq.translate(to_stop=False)) if "Missense" in snp.mutation_type: - mut_aa = snp.aa_mut[-1] - if not mut_aa.isalpha(): + # Extract the mutant residue from HGVS like p.V600E (1-letter) + # or p.Val600Glu (3-letter). For 1-letter the last char is the + # AA; for 3-letter we need to convert the last triplet. 
+ aa_suffix = re.sub(r'^p\..*\d+', '', snp.aa_mut) + mut_aa = _three_to_one(aa_suffix) if aa_suffix else '' + if not mut_aa or len(mut_aa) != 1 or not mut_aa.isalpha(): return '' index = int(positions[0]) - 1 mut_pro_seq = protein_seq[:index] + mut_aa + protein_seq[index + 1:] @@ -194,7 +198,8 @@ def get_mut_pro_seq(snp: SNP, seq: Seq) -> Optional[str]: index = snp.aa_mut.index("ins") except ValueError: return '' - insert_aa = snp.aa_mut[index + 3:] + insert_aa_raw = snp.aa_mut[index + 3:] + insert_aa = _three_to_one(insert_aa_raw) if insert_aa.isalpha(): ins_index1 = int(positions[0]) mut_pro_seq = protein_seq[:ins_index1] + insert_aa + protein_seq[ins_index1:] @@ -211,7 +216,10 @@ def get_mut_pro_seq(snp: SNP, seq: Seq) -> Optional[str]: index = snp.aa_mut.index(">") except ValueError: return '' - mut_aa = snp.aa_mut[index + 1:] + mut_aa_raw = snp.aa_mut[index + 1:] + mut_aa = _three_to_one(mut_aa_raw.replace('*', '')) + if '*' in mut_aa_raw: + mut_aa += '*' if not mut_aa.replace('*','').isalpha(): return '' if "deletion" in snp.mutation_type: diff --git a/pgatk/clinvar/clinvar_service.py b/pgatk/clinvar/clinvar_service.py index 2600477..b5760cb 100644 --- a/pgatk/clinvar/clinvar_service.py +++ b/pgatk/clinvar/clinvar_service.py @@ -493,9 +493,9 @@ def run(self) -> str: if "CDS=" in desc: try: cds_str = [ - p + p.strip("[]") for p in desc.split() - if p.startswith("CDS=") + if p.strip("[]").startswith("CDS=") ][0] cds_info = [ int(x) diff --git a/pgatk/ensembl/ensembl.py b/pgatk/ensembl/ensembl.py index a9a6d7e..e1d8b7e 100644 --- a/pgatk/ensembl/ensembl.py +++ b/pgatk/ensembl/ensembl.py @@ -328,7 +328,9 @@ def annoate_vcf(vcf_file: str, gtf_file: str, BedTool(gtf_file).intersect(BedTool(vcf_file), wo=True).saveas(f"{vcf_stem}_all.bed") - muts_dict = {} + muts_dict: dict[str, list[str]] = {} + # VCF columns start after the GTF columns (gene_info_index + 1) + vcf_start_col = gene_info_index + 1 with open(f"{vcf_stem}_all.bed", 'r', encoding='utf-8') as an: for line in an.readlines(): sl = line.strip().split('\t') @@ -345,11 +347,17 @@ def annoate_vcf(vcf_file: str, gtf_file: str, if transcript_id == 'NO_OVERLAP': continue - # write the mutation line as a key and set the overlapping transcriptID as its value(s) + # Use canonical positional key (CHROM:POS:REF:ALT) for reliable + # matching between BED output and raw VCF lines. 
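+                # For example, a record on chromosome 1 at position 69511 with
+                # REF "A" and ALT "G" yields the key "1\t69511\tA\tG", the same
+                # string rebuilt from VCF columns 0, 1, 3 and 4 when the raw
+                # file is re-read further below.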
try: - muts_dict['\t'.join(sl[gene_info_index + 1:-1])].append(transcript_id) - except KeyError: - muts_dict['\t'.join(sl[gene_info_index + 1:-1])] = [transcript_id] + vcf_chrom = sl[vcf_start_col] + vcf_pos = sl[vcf_start_col + 1] + vcf_ref = sl[vcf_start_col + 3] + vcf_alt = sl[vcf_start_col + 4] + key = f"{vcf_chrom}\t{vcf_pos}\t{vcf_ref}\t{vcf_alt}" + except IndexError: + continue + muts_dict.setdefault(key, []).append(transcript_id) with open(f"{vcf_stem}_annotated.vcf", 'w', encoding='utf-8') as ann, open(vcf_file, 'r', encoding='utf-8') as v: # write vcf headers to the output file @@ -358,13 +366,17 @@ def annoate_vcf(vcf_file: str, gtf_file: str, ann.write(line) else: # write the mutations and their overlapping transcript to output file - try: - sl = line.strip().split('\t') + sl = line.strip().split('\t') + if len(sl) < 8: + ann.write(line) + continue + key = f"{sl[0]}\t{sl[1]}\t{sl[3]}\t{sl[4]}" + if key in muts_dict: sl[vcf_info_field_index] = '{};{}={}'.format( sl[vcf_info_field_index].strip(), annotation_str, - ','.join(set(muts_dict[line.strip()]))) + ','.join(set(muts_dict[key]))) ann.write('\t'.join(sl) + '\n') - except KeyError: + else: ann.write(line) return f"{vcf_stem}_annotated.vcf" @@ -503,7 +515,7 @@ def vcf_to_proteindb(self, vcf_file: str, input_fasta: str, gene_annotations_gtf if self._af_field: # get AF from the INFO field try: - af = float([x.split('=')[1] for x in record.INFO.split(';') if x.startswith(self._af_field)][0]) + af = float([x.split('=', 1)[1] for x in record.INFO.split(';') if x.split('=', 1)[0] == self._af_field][0]) except (ValueError, IndexError): invalid_records['# variants with invalid record'] += 1 continue @@ -521,8 +533,8 @@ def vcf_to_proteindb(self, vcf_file: str, input_fasta: str, gene_annotations_gtf transcript_records = [] try: transcript_records = \ - [x.split('=')[1] for x in record.INFO.split(';') if x.startswith(self._annotation_field_name)][ - 0] + [x.split('=', 1)[1] for x in record.INFO.split(';') + if x.split('=', 1)[0] == self._annotation_field_name][0] except IndexError: # no overlapping feature was found invalid_records['# variants with invalid record'] += 1 msg = "skipped record {}, no annotation feature was found".format(record) @@ -586,11 +598,17 @@ def vcf_to_proteindb(self, vcf_file: str, input_fasta: str, gene_annotations_gtf num_orfs = 3 if 'CDS=' in desc: try: - cds_info = [int(x) for x in desc.split(' ')[1].split('=')[1].split('-')] - feature_types = ['CDS', 'stop_codon'] - num_orfs = 1 + cds_token = next( + (t.strip('[]') for t in desc.split(' ') + if t.strip('[]').startswith('CDS=')), + None, + ) + if cds_token: + cds_info = [int(x) for x in cds_token.split('=')[1].split('-')] + feature_types = ['CDS', 'stop_codon'] + num_orfs = 1 except (ValueError, IndexError): - msg = "Could not extra cds position from fasta header for: {}".format(desc) + msg = "Could not extract cds position from fasta header for: {}".format(desc) self.get_logger().debug(msg) chrom, strand, features_info = self.get_features(db, diff --git a/pgatk/testdata/proteindb_from_gnomad_VCF.fa b/pgatk/testdata/proteindb_from_gnomad_VCF.fa index 3f008d7..9397434 100644 --- a/pgatk/testdata/proteindb_from_gnomad_VCF.fa +++ b/pgatk/testdata/proteindb_from_gnomad_VCF.fa @@ -1,5 +1,3 @@ ->gnomvar_rs760117505_1.69337.A.C_ENST00000335137.3 
-MVTEFIFLGLSDSQELQTFLFMLFFVFYGGIVFGNLLIVITVVSDSHLHSPMYFLLANLSLIDLSLSSVTAPKMITDFFSQRQVISFKGCLVQIFLLHFFGGSEMVILIAMGFDRYIAICKPLHYTTIMCGNACVGIMAVTWGIGFLHSVSQLAFAVHLLFCGPNEVDSFYCDLPRVIKLACTDTYRLDIMVIANSGVLTVCSFVLLIISYTIILMTIQHRPLDKSSKALSTLTAHITVVLLFFGPCVFIYAWPFPIKSLDKFLAVFYSVITPLLNPIIYTLRNKDMKTAIRQLRKWDAHSSVKF* >gnomvar_rs2691305_1.69511.A.G_ENST00000335137.3 MVTEFIFLGLSDSQELQTFLFMLFFVFYGGIVFGNLLIVITVVSDSHLHSPMYFLLANLSLIDLSLSSVTAPKMITDFFSQRKVISFKGCLVQIFLLHFFGGSEMVILIAMGFDRYIAICKPLHYTTIMCGNACVGIMAVAWGIGFLHSVSQLAFAVHLLFCGPNEVDSFYCDLPRVIKLACTDTYRLDIMVIANSGVLTVCSFVLLIISYTIILMTIQHRPLDKSSKALSTLTAHITVVLLFFGPCVFIYAWPFPIKSLDKFLAVFYSVITPLLNPIIYTLRNKDMKTAIRQLRKWDAHSSVKF* >gnomvar_rs200505207_1.69761.A.T_ENST00000335137.3 diff --git a/pgatk/toolbox/vcf_utils.py b/pgatk/toolbox/vcf_utils.py index 248cc08..d371e5d 100644 --- a/pgatk/toolbox/vcf_utils.py +++ b/pgatk/toolbox/vcf_utils.py @@ -109,7 +109,13 @@ def get_altseq( for feature in features_info: if var_pos in range(feature[0], feature[1] + 1): var_index_in_cds = nc_index + (var_pos - feature[0]) - c = len(ref_allele) + # Clip the ref allele length to the portion that falls within + # this feature. When a multi-base variant extends from an exon + # into an intron, the intronic bases are absent from the + # transcript and must not be counted. + var_end_genomic = var_pos + len(ref_allele) - 1 + exonic_ref_len = min(var_end_genomic, feature[1]) - var_pos + 1 + c = max(exonic_ref_len, 0) alt_seq = ref_seq[0:var_index_in_cds] + var_allele + ref_seq[var_index_in_cds + c::] if strand == '-': return ref_seq[::-1], alt_seq[::-1] From 6dcf9dcffd46c8775d05e52a909d5ac163700b09 Mon Sep 17 00:00:00 2001 From: Yasset Perez-Riverol Date: Tue, 3 Mar 2026 18:46:24 +0000 Subject: [PATCH 5/7] minor changes in help --- docs/index.md | 1 + docs/pgatk-cli.md | 163 ++++++++++++ docs/use-cases.md | 619 ++++++++++++++++++++++++++++++++++++++++++++++ mkdocs.yml | 1 + 4 files changed, 784 insertions(+) create mode 100644 docs/use-cases.md diff --git a/docs/index.md b/docs/index.md index d710453..f6e58a2 100644 --- a/docs/index.md +++ b/docs/index.md @@ -23,6 +23,7 @@ See the [Installation](installation.md) page for more options (Bioconda, Docker, | [Introduction](introduction.md) | Overview of the proteogenomics field | | [Installation](installation.md) | How to install pgatk (pip, Bioconda, Docker, source) | | [pgatk CLI](pgatk-cli.md) | Full command-line reference for all tools | +| [Use Cases](use-cases.md) | End-to-end workflows and recipes for common scenarios | | [File Formats](formats.md) | BED, GTF, GCT format specifications | | [Changelog](changelog.md) | Version history and release notes | | [Support](support.md) | Contributors and how to get help | diff --git a/docs/pgatk-cli.md b/docs/pgatk-cli.md index 2bda0be..f36971a 100644 --- a/docs/pgatk-cli.md +++ b/docs/pgatk-cli.md @@ -15,12 +15,16 @@ Options: Commands: cbioportal-downloader Command to download the the cbioportal studies cbioportal-to-proteindb Command to translate cbioportal mutation data into proteindb + clinvar-to-proteindb Generate protein database from ClinVar VCF + RefSeq GTF cosmic-downloader Command to download the cosmic mutation database cosmic-to-proteindb Command to translate Cosmic mutation data into proteindb + digest-mutant-protein Digest mutant proteins and filter against canonical proteome dnaseq-to-proteindb Generate peptides based on DNA sequences ensembl-check Command to check ensembl database for stop codons, gaps ensembl-downloader Command to download the ensembl information 
generate-decoy Create decoy protein sequences using multiple methods + map-peptide2genome Map peptides to genomic coordinates (GFF3 output) + ncbi-downloader Download NCBI RefSeq and ClinVar reference files threeframe-translation Command to perform 3'frame translation vcf-to-proteindb Generate peptides based on DNA variants VCF files ``` @@ -171,6 +175,44 @@ git lfs pull -I public --include "data_clinical_sample.txt" git lfs pull -I public --include "data_mutations_mskcc.txt" ``` +### Downloading NCBI / ClinVar Data + +Downloading NCBI RefSeq annotations and ClinVar variants for human (GRCh38) is performed using the command `ncbi-downloader`. The tool downloads four files: + +- RefSeq gene annotations (GTF) +- RefSeq transcript nucleotide sequences (FASTA) +- Assembly report (chromosome name mapping) +- ClinVar variant calls (VCF) + +#### Command Options + +```bash +$ pgatk ncbi-downloader -h +Usage: pgatk ncbi-downloader [OPTIONS] + + Required parameters: + -o, --output-dir TEXT Output directory for downloaded files + + Optional parameters: + -c, --config_file TEXT Configuration YAML file + --force Re-download files even if they exist + -h, --help Show this message and exit. +``` + +#### Examples + +- Download all NCBI/ClinVar reference files: + + ```bash + pgatk ncbi-downloader -o ncbi_data + ``` + +- Force re-download of all files: + + ```bash + pgatk ncbi-downloader -o ncbi_data --force + ``` + ## Generate Protein Databases The **pgatk** framework provides a set of tools to generate protein databases in `FASTA` format from DNA sequences, variants, and mutations. Multiple commands are available depending on the data type provided by the user and the public data providers (cBioPortal, COSMIC and ENSEMBL). @@ -355,6 +397,43 @@ The output of the tool is a protein fasta file written to the path specified by --output_proteindb var_peptides.fa ``` +### ClinVar Variants to Protein Sequences + +[ClinVar](https://www.ncbi.nlm.nih.gov/clinvar/) is NCBI's public archive of reports on the relationships among human variations and phenotypes. The command `clinvar-to-proteindb` generates a variant protein database from ClinVar VCF files using NCBI RefSeq gene models. Unlike `vcf-to-proteindb`, this command does **not** require VEP annotations. + +#### Command Options + +```bash +$ pgatk clinvar-to-proteindb -h +Usage: pgatk clinvar-to-proteindb [OPTIONS] + + Required parameters: + -v, --vcf TEXT ClinVar VCF file path + -g, --gtf TEXT NCBI RefSeq GTF file path + -f, --fasta TEXT RefSeq transcript nucleotide FASTA file path + -a, --assembly-report TEXT NCBI assembly report file path + -o, --output TEXT Output protein FASTA file path + + Optional parameters: + -c, --config_file TEXT Configuration YAML file + -h, --help Show this message and exit. +``` + +The input files can be downloaded using the [ncbi-downloader](#downloading-ncbi--clinvar-data) command. + +#### Examples + +- Generate a protein database from ClinVar variants: + + ```bash + pgatk clinvar-to-proteindb \ + --vcf ncbi_data/clinvar.vcf.gz \ + --gtf ncbi_data/GRCh38_latest_genomic.gtf.gz \ + --fasta ncbi_data/GRCh38_latest_rna.fna.gz \ + --assembly-report ncbi_data/GRCh38_latest_assembly_report.txt \ + --output clinvar_proteins.fa + ``` + ### Transcripts (DNA) to Protein Sequences DNA sequences given in a FASTA format can be translated using the `dnaseq-to-proteindb` tool. This tool allows for translation of all kinds of transcripts (coding and noncoding) by specifying the desired biotypes. 
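+
+For instance, a minimal noncoding invocation looks like this (a sketch
+assuming ENSEMBL biotype names; file names are placeholders, and the full
+option set is documented below):
+
+```bash
+pgatk dnaseq-to-proteindb \
+  --input_fasta transcripts.fa \
+  --output_proteindb lincRNA_proteins.fa \
+  --include_biotypes lincRNA \
+  --num_orfs 3 \
+  --skip_including_all_cds
+```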
@@ -489,3 +568,87 @@ Usage: pgatk generate-decoy [OPTIONS] ```bash pgatk generate-decoy -c config/protein_decoy.yaml --input proteindb.fa --output decoy_proteindb.fa ``` + +## Post-Processing Utilities + +### Digest and Filter Variant Peptides + +The `digest-mutant-protein` command performs *in silico* tryptic digestion of mutant proteins and filters out any peptides that also appear in a canonical reference proteome. The result is a compact FASTA of variant-specific peptides only. + +#### Command Options + +```bash +$ pgatk digest-mutant-protein -h +Usage: pgatk digest-mutant-protein [OPTIONS] + + Required parameters: + -i, --input TEXT Input mutant protein FASTA file(s), comma-separated + -f, --fasta TEXT Reference canonical protein FASTA + -o, --output TEXT Output file for unique variant peptides + + Optional parameters: + --prefix TEXT Header prefix for output entries (default: Mutation) + --min-len INTEGER Minimum peptide length (default: 7) + --max-len INTEGER Maximum peptide length (default: 40) + --missed-cleavages INTEGER Number of missed cleavages (default: 0) + -h, --help Show this message and exit. +``` + +#### Examples + +- Digest variant proteins and keep only variant-specific peptides: + + ```bash + pgatk digest-mutant-protein \ + --input variant_proteins.fa \ + --fasta canonical_proteins.fa \ + --output unique_variant_peptides.fa \ + --min-len 7 \ + --max-len 40 \ + --missed-cleavages 2 + ``` + +- Combine and filter multiple variant sources: + + ```bash + pgatk digest-mutant-protein \ + --input cosmic_proteins.fa,clinvar_proteins.fa,ensembl_variants.fa \ + --fasta canonical_proteins.fa \ + --output combined_variant_peptides.fa + ``` + +### Map Peptides to Genomic Coordinates + +The `map-peptide2genome` command maps proteomics-identified peptides to genomic coordinates and produces a GFF3 file suitable for visualization in genome browsers (UCSC, IGV, Ensembl). + +#### Command Options + +```bash +$ pgatk map-peptide2genome -h +Usage: pgatk map-peptide2genome [OPTIONS] + + Required parameters: + -i, --input TEXT Input peptide identification TSV + -g, --gtf TEXT GTF gene annotation file + -f, --fasta TEXT Protein FASTA file + -m, --idmap TEXT Protein-to-transcript ID mapping file + -o, --output TEXT Output GFF3 file + + Optional parameters: + --pep-col INTEGER Peptide column index, 0-based (default: 0) + --prot-col INTEGER Protein column index, 0-based (default: 1) + -h, --help Show this message and exit. +``` + +#### Examples + +- Map identified peptides to genomic coordinates: + + ```bash + pgatk map-peptide2genome \ + --input peptide_identifications.tsv \ + --gtf genes.gtf \ + --fasta proteins.fa \ + --idmap protein_to_transcript.tsv \ + --output peptides.gff3 + ``` diff --git a/docs/use-cases.md b/docs/use-cases.md new file mode 100644 index 0000000..2622a97 --- /dev/null +++ b/docs/use-cases.md @@ -0,0 +1,619 @@ +# Use Cases and Recipes + +This page provides end-to-end workflows for common proteogenomics scenarios. +Each recipe shows the exact commands to run, from downloading data to producing +a search-ready protein database. + +--- + +## 1. Human Variant Protein Database from ENSEMBL + +Build a variant protein database using ENSEMBL population variants (common SNPs +and indels) for human proteogenomics searches. 
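+
+Each entry in the resulting variant database is named after the variant and
+the affected transcript. As a quick sanity check after Step 3 below, list the
+headers (the `varsample_` prefix shown here comes from pgatk's test data; the
+actual prefix follows `--var_prefix`):
+
+```bash
+grep '^>' ensembl_human/variant_proteins.fa | head
+# e.g. >varsample_rs78350717_22.15528913.C.A_ENST00000252835
+```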
+ +### Step 1 -- Download ENSEMBL data + +Download the GTF, CDS, and VCF files for *Homo sapiens* (taxonomy 9606): + +```bash +pgatk ensembl-downloader \ + -t 9606 \ + -o ensembl_human \ + --skip_protein \ + --skip_ncrna \ + --skip_cdna \ + --skip_dna +``` + +This downloads the gene annotation GTF and the VCF file with known variants. + +### Step 2 -- Generate transcript sequences + +Use [gffread](http://ccb.jhu.edu/software/stringtie/gff.shtml#gffread) to +extract transcript sequences from the GTF and genome FASTA: + +```bash +gffread -F -w ensembl_human/transcripts.fa \ + -g ensembl_human/genome.fa \ + ensembl_human/Homo_sapiens.GRCh38.*.gtf.gz +``` + +### Step 3 -- Generate the variant protein database + +Translate all ENSEMBL variants that affect protein-coding transcripts: + +```bash +pgatk vcf-to-proteindb \ + --vcf ensembl_human/homo_sapiens_incl_consequences.vcf.gz \ + --input_fasta ensembl_human/transcripts.fa \ + --gene_annotations_gtf ensembl_human/Homo_sapiens.GRCh38.*.gtf.gz \ + --output_proteindb ensembl_human/variant_proteins.fa +``` + +### Step 4 -- Generate the canonical protein database + +Translate canonical protein-coding transcripts: + +```bash +pgatk dnaseq-to-proteindb \ + --input_fasta ensembl_human/transcripts.fa \ + --output_proteindb ensembl_human/canonical_proteins.fa +``` + +### Step 5 -- Combine and add decoy sequences + +Merge canonical and variant databases, then generate decoys: + +```bash +cat ensembl_human/canonical_proteins.fa \ + ensembl_human/variant_proteins.fa \ + > ensembl_human/target.fa + +pgatk generate-decoy \ + --input ensembl_human/target.fa \ + --output ensembl_human/target_decoy.fa \ + --method decoypyrat \ + --decoy_prefix DECOY_ +``` + +The file `target_decoy.fa` is ready for database searching. + +--- + +## 2. High-Frequency Variant Database (AF Filtering) + +Build a protein database that only includes common population variants above a +given allele frequency threshold. + +```bash +pgatk vcf-to-proteindb \ + --vcf homo_sapiens_incl_consequences.vcf.gz \ + --input_fasta transcripts.fa \ + --gene_annotations_gtf genes.gtf \ + --af_field MAF \ + --af_threshold 0.01 \ + --output_proteindb common_variant_proteins.fa +``` + +To restrict to specific consequence types (e.g. only missense variants): + +```bash +pgatk vcf-to-proteindb \ + --vcf homo_sapiens_incl_consequences.vcf.gz \ + --input_fasta transcripts.fa \ + --gene_annotations_gtf genes.gtf \ + --af_field MAF \ + --af_threshold 0.05 \ + --include_consequences missense_variant \ + --output_proteindb missense_common_proteins.fa +``` + +--- + +## 3. ClinVar Clinical Variant Database + +Build a protein database from ClinVar pathogenic and likely pathogenic variants +using the NCBI RefSeq gene models. 
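+
+Before building the database it can be useful to see which clinical
+significance classes are present in the VCF (`CLNSIG` is the INFO key the
+filter operates on). A plain shell pass such as this works, assuming the
+file downloaded in Step 1 below:
+
+```bash
+zcat ncbi_data/clinvar.vcf.gz | grep -v '^#' \
+  | sed -n 's/.*CLNSIG=\([^;]*\).*/\1/p' \
+  | sort | uniq -c | sort -rn | head
+```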
+ +### Step 1 -- Download NCBI / ClinVar files + +```bash +pgatk ncbi-downloader -o ncbi_data +``` + +This downloads four files to `ncbi_data/`: + +- `GRCh38_latest_genomic.gtf.gz` -- RefSeq gene annotations +- `GRCh38_latest_rna.fna.gz` -- RefSeq transcript nucleotide sequences +- `GRCh38_latest_assembly_report.txt` -- Chromosome name mapping +- `clinvar.vcf.gz` -- ClinVar variant calls + +### Step 2 -- Generate the ClinVar protein database + +```bash +pgatk clinvar-to-proteindb \ + --vcf ncbi_data/clinvar.vcf.gz \ + --gtf ncbi_data/GRCh38_latest_genomic.gtf.gz \ + --fasta ncbi_data/GRCh38_latest_rna.fna.gz \ + --assembly-report ncbi_data/GRCh38_latest_assembly_report.txt \ + --output clinvar_proteins.fa +``` + +### Step 3 -- Add decoy sequences + +```bash +pgatk generate-decoy \ + --input clinvar_proteins.fa \ + --output clinvar_target_decoy.fa \ + --method decoypyrat +``` + +!!! tip + The ClinVar pipeline does **not** require VEP annotations. It uses + BedTools interval overlap to find transcripts affected by each variant, + then applies the variant to the transcript sequence and translates it. + +--- + +## 4. Cancer-Specific Database from COSMIC + +Generate tissue-specific protein databases from COSMIC somatic mutations. + +### Step 1 -- Download COSMIC data + +```bash +pgatk cosmic-downloader \ + -u your_email@example.com \ + -p your_password \ + -o cosmic_data +``` + +!!! note + A COSMIC account is required. Register at + [https://cancer.sanger.ac.uk/cosmic/register](https://cancer.sanger.ac.uk/cosmic/register). + +### Step 2a -- Single database from all mutations + +```bash +pgatk cosmic-to-proteindb \ + --input_mutation cosmic_data/CosmicMutantExport.tsv.gz \ + --input_genes cosmic_data/All_COSMIC_Genes.fasta.gz \ + --output_db cosmic_all_proteins.fa +``` + +### Step 2b -- One database per cancer type + +Split mutations by primary tissue site, creating one FASTA per cancer type: + +```bash +pgatk cosmic-to-proteindb \ + --input_mutation cosmic_data/CosmicMutantExport.tsv.gz \ + --input_genes cosmic_data/All_COSMIC_Genes.fasta.gz \ + --output_db cosmic_proteins.fa \ + --split_by_filter_column +``` + +This produces files named `cosmic_proteins_.fa` for each tissue type +(e.g. `cosmic_proteins_lung.fa`, `cosmic_proteins_breast.fa`). + +### Step 2c -- Database for a specific cancer type + +Limit to a single tissue type: + +```bash +pgatk cosmic-to-proteindb \ + --input_mutation cosmic_data/CosmicMutantExport.tsv.gz \ + --input_genes cosmic_data/All_COSMIC_Genes.fasta.gz \ + --output_db cosmic_lung_proteins.fa \ + --accepted_values "lung" +``` + +### Step 2d -- Cell-line specific databases + +Use cell-line data split by sample name: + +```bash +pgatk cosmic-to-proteindb \ + --input_mutation cosmic_data/CosmicCLP_MutantExport.tsv.gz \ + --input_genes cosmic_data/All_CellLines_Genes.fasta.gz \ + --output_db cosmic_cellline_proteins.fa \ + --filter_column "Sample name" \ + --split_by_filter_column +``` + +--- + +## 5. Cancer Study from cBioPortal + +Generate a protein database from cancer mutation data hosted in cBioPortal. 
+ +### Step 1 -- List available studies + +```bash +pgatk cbioportal-downloader --list_studies +``` + +### Step 2 -- Download a specific study + +```bash +pgatk cbioportal-downloader \ + -d blca_mskcc_solit_2014 \ + -o cbioportal_data +``` + +### Step 3 -- Download ENSEMBL CDS (hg19 assembly) + +cBioPortal mutations are aligned to hg19 (GRCh37): + +```bash +pgatk ensembl-downloader \ + -t 9606 \ + --grch37 \ + -o ensembl_hg19 \ + --skip_vcf \ + --skip_gtf \ + --skip_protein \ + --skip_ncrna \ + --skip_cdna \ + --skip_dna +``` + +### Step 4 -- Translate mutations to protein sequences + +```bash +pgatk cbioportal-to-proteindb \ + --input_mutation cbioportal_data/data_mutations_mskcc.txt \ + --input_cds ensembl_hg19/Homo_sapiens.GRCh37.cds.all.fa.gz \ + --output_db bladder_cancer_proteins.fa +``` + +To generate tissue-type-specific databases using the clinical sample file: + +```bash +pgatk cbioportal-to-proteindb \ + --input_mutation cbioportal_data/data_mutations_mskcc.txt \ + --input_cds ensembl_hg19/Homo_sapiens.GRCh37.cds.all.fa.gz \ + --clinical_sample_file cbioportal_data/data_clinical_sample.txt \ + --output_db cbioportal_proteins.fa \ + --split_by_filter_column +``` + +--- + +## 6. Non-Coding RNA Protein Database + +Translate non-coding RNA transcripts to search for novel peptides from +supposedly non-coding regions. + +### lincRNA database + +```bash +pgatk dnaseq-to-proteindb \ + --input_fasta transcripts.fa \ + --output_proteindb lincRNA_proteins.fa \ + --var_prefix lincRNA_ \ + --include_biotypes lincRNA \ + --num_orfs 3 \ + --skip_including_all_cds +``` + +### Pseudogene database + +```bash +pgatk dnaseq-to-proteindb \ + --input_fasta transcripts.fa \ + --output_proteindb pseudogene_proteins.fa \ + --var_prefix pseudogene_ \ + --include_biotypes processed_pseudogene,transcribed_processed_pseudogene,translated_processed_pseudogene \ + --num_orfs 3 \ + --skip_including_all_cds +``` + +### Alternative ORFs from protein-coding transcripts + +Translate non-canonical reading frames of protein-coding genes: + +```bash +pgatk dnaseq-to-proteindb \ + --input_fasta transcripts.fa \ + --output_proteindb altorf_proteins.fa \ + --var_prefix altorf_ \ + --include_biotypes altORFs \ + --skip_including_all_cds +``` + +--- + +## 7. Six-Frame Genome Translation + +Perform a full six-frame translation of a genome assembly. This is useful as +an unbiased search space but produces very large databases. + +```bash +pgatk dnaseq-to-proteindb \ + --input_fasta genome.fa \ + --output_proteindb genome_6frame.fa \ + --biotype_str '' \ + --num_orfs 3 \ + --num_orfs_complement 3 +``` + +!!! warning + Six-frame translation of a full mammalian genome produces a very large + database. Consider translating individual chromosomes or using it in + combination with a smaller targeted database search. + +--- + +## 8. gnomAD Population Variant Database + +Build a protein database from gnomAD variants using GENCODE annotations. + +```bash +pgatk vcf-to-proteindb \ + --vcf gnomad_genome.vcf.gz \ + --input_fasta gencode_transcripts.fa \ + --gene_annotations_gtf gencode.v38.annotation.gtf.gz \ + --annotation_field_name vep \ + --af_field controls_AF \ + --af_threshold 0.01 \ + --include_consequences missense_variant,inframe_insertion,inframe_deletion,frameshift_variant \ + --biotype_str transcript_type \ + --output_proteindb gnomad_proteins.fa +``` + +!!! 
tip "gnomAD-specific parameters" + - `--annotation_field_name vep` -- gnomAD uses `vep` instead of `CSQ` + - `--af_field controls_AF` -- Use control allele frequency + - `--biotype_str transcript_type` -- GENCODE uses `transcript_type` instead + of ENSEMBL's `transcript_biotype` + +--- + +## 9. Sample-Specific Database from WGS/WES + +Translate variants from a whole-genome or whole-exome sequencing VCF of an +individual sample. These VCFs typically lack VEP annotations. + +```bash +pgatk vcf-to-proteindb \ + --vcf sample_variants.vcf \ + --input_fasta transcripts.fa \ + --gene_annotations_gtf genes.gtf \ + --annotation_field_name '' \ + --ignore_filters \ + --output_proteindb sample_proteins.fa +``` + +- `--annotation_field_name ''` tells the tool to skip parsing VEP/CSQ + annotations (not present in raw variant callers like GATK HaplotypeCaller) +- `--ignore_filters` includes all variants regardless of the FILTER column + +To include only variants that passed quality filtering: + +```bash +pgatk vcf-to-proteindb \ + --vcf sample_variants.vcf \ + --input_fasta transcripts.fa \ + --gene_annotations_gtf genes.gtf \ + --annotation_field_name '' \ + --accepted_filters PASS \ + --output_proteindb sample_proteins_pass.fa +``` + +--- + +## 10. Digest and Filter Variant Peptides + +After generating a variant protein database you may want to digest the proteins +*in silico* and keep only peptides that differ from the canonical proteome. +This produces a compact FASTA of variant-specific peptides. + +```bash +pgatk digest-mutant-protein \ + --input variant_proteins.fa \ + --fasta canonical_proteins.fa \ + --output unique_variant_peptides.fa \ + --min-len 7 \ + --max-len 40 \ + --missed-cleavages 2 +``` + +To combine multiple variant sources (e.g. COSMIC + ClinVar): + +```bash +pgatk digest-mutant-protein \ + --input cosmic_proteins.fa,clinvar_proteins.fa \ + --fasta canonical_proteins.fa \ + --output combined_variant_peptides.fa +``` + +--- + +## 11. Map Identified Peptides to Genomic Coordinates + +After a proteomics search, map the identified peptides back to genomic +coordinates for visualization in genome browsers (e.g. UCSC, IGV). + +```bash +pgatk map-peptide2genome \ + --input peptide_identifications.tsv \ + --gtf genes.gtf \ + --fasta proteins.fa \ + --idmap protein_to_transcript.tsv \ + --output peptides.gff3 +``` + +The input TSV should contain at least peptide sequence and protein accession +columns (configurable with `--pep-col` and `--prot-col`). + +--- + +## 12. Complete Proteogenomics Workflow + +A full end-to-end workflow combining canonical proteins, population variants, +clinical variants, cancer mutations, and non-coding translations into a single +search-ready database. 
+ +### Step 1 -- Download all data sources + +```bash +# ENSEMBL (canonical + population variants) +pgatk ensembl-downloader -t 9606 -o ensembl_data + +# NCBI / ClinVar +pgatk ncbi-downloader -o ncbi_data + +# COSMIC (requires account) +pgatk cosmic-downloader -u user@example.com -p password -o cosmic_data +``` + +### Step 2 -- Prepare transcript sequences + +```bash +gffread -F -w ensembl_data/transcripts.fa \ + -g ensembl_data/genome.fa \ + ensembl_data/Homo_sapiens.GRCh38.*.gtf.gz +``` + +### Step 3 -- Generate all protein databases + +```bash +# Canonical proteins +pgatk dnaseq-to-proteindb \ + --input_fasta ensembl_data/transcripts.fa \ + --output_proteindb canonical.fa + +# ENSEMBL population variants +pgatk vcf-to-proteindb \ + --vcf ensembl_data/homo_sapiens_incl_consequences.vcf.gz \ + --input_fasta ensembl_data/transcripts.fa \ + --gene_annotations_gtf ensembl_data/Homo_sapiens.GRCh38.*.gtf.gz \ + --output_proteindb ensembl_variants.fa + +# ClinVar clinical variants +pgatk clinvar-to-proteindb \ + --vcf ncbi_data/clinvar.vcf.gz \ + --gtf ncbi_data/GRCh38_latest_genomic.gtf.gz \ + --fasta ncbi_data/GRCh38_latest_rna.fna.gz \ + --assembly-report ncbi_data/GRCh38_latest_assembly_report.txt \ + --output clinvar_variants.fa + +# COSMIC somatic mutations +pgatk cosmic-to-proteindb \ + --input_mutation cosmic_data/CosmicMutantExport.tsv.gz \ + --input_genes cosmic_data/All_COSMIC_Genes.fasta.gz \ + --output_db cosmic_variants.fa + +# Non-coding RNA (lincRNA) +pgatk dnaseq-to-proteindb \ + --input_fasta ensembl_data/transcripts.fa \ + --output_proteindb lincRNA.fa \ + --var_prefix lincRNA_ \ + --include_biotypes lincRNA \ + --num_orfs 3 \ + --skip_including_all_cds +``` + +### Step 4 -- Combine all databases + +```bash +cat canonical.fa \ + ensembl_variants.fa \ + clinvar_variants.fa \ + cosmic_variants.fa \ + lincRNA.fa \ + > combined_target.fa +``` + +### Step 5 -- Add decoy sequences + +```bash +pgatk generate-decoy \ + --input combined_target.fa \ + --output proteogenomics_target_decoy.fa \ + --method decoypyrat \ + --decoy_prefix DECOY_ +``` + +### Step 6 (optional) -- Extract unique variant peptides + +```bash +pgatk digest-mutant-protein \ + --input ensembl_variants.fa,clinvar_variants.fa,cosmic_variants.fa \ + --fasta canonical.fa \ + --output unique_variant_peptides.fa \ + --min-len 7 \ + --max-len 40 \ + --missed-cleavages 2 +``` + +The file `proteogenomics_target_decoy.fa` is a comprehensive search database, +and `unique_variant_peptides.fa` provides a focused list of variant-specific +peptides for validation. + +--- + +## 13. Non-Human Species + +pgatk supports any species available in ENSEMBL. 
Here is an example for mouse +(*Mus musculus*, taxonomy 10090): + +```bash +# Download +pgatk ensembl-downloader -t 10090 -o ensembl_mouse --skip_dna + +# Generate transcript sequences +gffread -F -w ensembl_mouse/transcripts.fa \ + -g ensembl_mouse/genome.fa \ + ensembl_mouse/Mus_musculus.GRCm39.*.gtf.gz + +# Canonical proteins +pgatk dnaseq-to-proteindb \ + --input_fasta ensembl_mouse/transcripts.fa \ + --output_proteindb mouse_canonical.fa + +# Variant proteins +pgatk vcf-to-proteindb \ + --vcf ensembl_mouse/mus_musculus_incl_consequences.vcf.gz \ + --input_fasta ensembl_mouse/transcripts.fa \ + --gene_annotations_gtf ensembl_mouse/Mus_musculus.GRCm39.*.gtf.gz \ + --output_proteindb mouse_variants.fa + +# Combine and generate decoy +cat mouse_canonical.fa mouse_variants.fa > mouse_target.fa +pgatk generate-decoy \ + --input mouse_target.fa \ + --output mouse_target_decoy.fa \ + --method decoypyrat +``` + +--- + +## 14. Three-Frame Translation + +Perform a simple three-frame translation of any nucleotide FASTA file: + +```bash +pgatk threeframe-translation \ + --input_fasta input_sequences.fa \ + --output translated_proteins.fa +``` + +--- + +## 15. Quality Check: Validate Protein Database + +Before using a protein database, check it for internal stop codons and short +sequences: + +```bash +pgatk ensembl-check \ + --input_fasta protein_database.fa \ + --output validated_database.fa \ + --num_aa 6 +``` + +This filters out sequences shorter than 6 amino acids. Use `--add_stop_codons` +to include proteins that contain internal stop codons (useful for +proteogenomics where frameshifts may introduce premature stops). diff --git a/mkdocs.yml b/mkdocs.yml index aa2dab9..13bee38 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -95,6 +95,7 @@ nav: - Introduction: introduction.md - Installation: installation.md - pgatk CLI: pgatk-cli.md + - Use Cases: use-cases.md - File Formats: formats.md - Changelog: changelog.md - Support: support.md From 78fecd885f4c747a42822943d2f08e1d47b28129 Mon Sep 17 00:00:00 2001 From: Yasset Perez-Riverol Date: Tue, 3 Mar 2026 19:24:55 +0000 Subject: [PATCH 6/7] improve docs --- docs/use-cases.md | 849 +++++++++++++++++++++++++++++++++++----------- 1 file changed, 654 insertions(+), 195 deletions(-) diff --git a/docs/use-cases.md b/docs/use-cases.md index 2622a97..5af6de5 100644 --- a/docs/use-cases.md +++ b/docs/use-cases.md @@ -6,10 +6,172 @@ a search-ready protein database. --- +## Cell-Type Specific Non-Canonical Peptide Discovery + +!!! abstract "Featured workflow" + This workflow reproduces the analysis from [Umer et al., *Bioinformatics* 2022](https://doi.org/10.1093/bioinformatics/btab838), + where cell-type specific databases were generated for 65 cell lines, + revealing non-canonical and cryptic peptides amounting to **>5% of total + peptide identifications**. + +The key insight is to build **cell-type specific** proteogenomics databases +that combine canonical proteins with non-canonical translations (pseudogenes, +lncRNAs, alternative ORFs) and variant sequences from multiple genomic sources. +Searching with a database tailored to the cell type of interest maximizes +discovery while keeping the search space focused. 
+ +### Step 1 -- Download ENSEMBL data + +```bash +pgatk ensembl-downloader \ + -t 9606 \ + -o ensembl_human +``` + +### Step 2 -- Generate transcript sequences from GTF + +```bash +gffread -F -w ensembl_human/transcripts.fa \ + -g ensembl_human/genome.fa \ + ensembl_human/Homo_sapiens.GRCh38.*.gtf.gz +``` + +### Step 3 -- Canonical protein-coding sequences + +Translate all protein-coding transcripts using their annotated CDS: + +```bash +pgatk dnaseq-to-proteindb \ + --input_fasta ensembl_human/transcripts.fa \ + --output_proteindb canonical.fa +``` + +### Step 4 -- Non-canonical translations + +#### Pseudogenes (three-frame) + +Pseudogenes were traditionally considered non-functional, but some produce +detectable peptides. Translate them in three reading frames: + +```bash +pgatk dnaseq-to-proteindb \ + --input_fasta ensembl_human/transcripts.fa \ + --output_proteindb pseudogene.fa \ + --var_prefix pseudo_ \ + --include_biotypes processed_pseudogene,unprocessed_pseudogene,transcribed_processed_pseudogene,transcribed_unprocessed_pseudogene,translated_processed_pseudogene \ + --num_orfs 3 \ + --skip_including_all_cds +``` + +#### lncRNAs (three-frame) + +Long non-coding RNAs can harbor small open reading frames encoding +micropeptides: + +```bash +pgatk dnaseq-to-proteindb \ + --input_fasta ensembl_human/transcripts.fa \ + --output_proteindb lncrna.fa \ + --var_prefix lncrna_ \ + --include_biotypes lincRNA,antisense,sense_intronic,sense_overlapping \ + --num_orfs 3 \ + --skip_including_all_cds +``` + +#### Alternative ORFs (exonic out-of-frame translation) + +Non-canonical reading frames of protein-coding mRNAs can produce cryptic +peptides: + +```bash +pgatk dnaseq-to-proteindb \ + --input_fasta ensembl_human/transcripts.fa \ + --output_proteindb altorf.fa \ + --var_prefix altorf_ \ + --include_biotypes altORFs \ + --skip_including_all_cds +``` + +### Step 5 -- Population variant proteins + +Include common human variants from ENSEMBL: + +```bash +pgatk vcf-to-proteindb \ + --vcf ensembl_human/homo_sapiens_incl_consequences.vcf.gz \ + --input_fasta ensembl_human/transcripts.fa \ + --gene_annotations_gtf ensembl_human/Homo_sapiens.GRCh38.*.gtf.gz \ + --output_proteindb ensembl_variants.fa +``` + +### Step 6 -- COSMIC somatic mutations (optional, for cancer cell lines) + +For cancer cell-line studies, add cell-line-specific somatic mutations: + +```bash +pgatk cosmic-downloader \ + -u your_email@example.com \ + -p your_password \ + -o cosmic_data + +pgatk cosmic-to-proteindb \ + --input_mutation cosmic_data/CosmicCLP_MutantExport.tsv.gz \ + --input_genes cosmic_data/All_CellLines_Genes.fasta.gz \ + --output_db cosmic_cellline.fa \ + --filter_column "Sample name" \ + --split_by_filter_column +``` + +### Step 7 -- Combine and generate target-decoy database + +```bash +# Combine all components +# For a specific cell line, use the matching COSMIC file if available +cat canonical.fa \ + pseudogene.fa \ + lncrna.fa \ + altorf.fa \ + ensembl_variants.fa \ + > cell_type_target.fa + +# Generate decoy sequences +pgatk generate-decoy \ + --input cell_type_target.fa \ + --output cell_type_target_decoy.fa \ + --method decoypyrat \ + --decoy_prefix DECOY_ +``` + +### Step 8 -- Extract non-canonical unique peptides + +After database searching with your search engine, you can also pre-compute the +set of non-canonical peptides unique to these novel sources: + +```bash +pgatk digest-mutant-protein \ + --input pseudogene.fa,lncrna.fa,altorf.fa,ensembl_variants.fa \ + --fasta canonical.fa \ + --output 
non_canonical_peptides.fa \ + --min-len 7 \ + --max-len 40 \ + --missed-cleavages 2 +``` + +!!! tip "Expected results" + In the original study, this workflow applied to 65 cell lines across six + PRIDE datasets identified non-canonical peptides amounting to >5% of + total identifications. The most common non-canonical sources were + pseudogenes and lncRNAs, followed by alternative ORFs and variant + peptides. + +--- + ## 1. Human Variant Protein Database from ENSEMBL Build a variant protein database using ENSEMBL population variants (common SNPs -and indels) for human proteogenomics searches. +and indels) for human proteogenomics searches. This is the most common starting +point for proteogenomics experiments -- augmenting the canonical proteome with +known variant peptides that would otherwise be missed. ### Step 1 -- Download ENSEMBL data @@ -80,10 +242,16 @@ The file `target_decoy.fa` is ready for database searching. --- -## 2. High-Frequency Variant Database (AF Filtering) +## 2. Population-Specific Variant Database + +Population-level genetic variants cause amino acid changes that are invisible +to standard reference database searches. Studies have shown that incorporating +common missense variants can increase peptide identifications by 3--5% and is +especially important when analyzing samples from underrepresented ancestries. -Build a protein database that only includes common population variants above a -given allele frequency threshold. +### Common variants above a frequency threshold + +Include only variants present in at least 1% of the population: ```bash pgatk vcf-to-proteindb \ @@ -95,7 +263,10 @@ pgatk vcf-to-proteindb \ --output_proteindb common_variant_proteins.fa ``` -To restrict to specific consequence types (e.g. only missense variants): +### Missense-only variant database + +Restrict to single amino acid variants (SAAVs), the most common variant type +detectable by mass spectrometry: ```bash pgatk vcf-to-proteindb \ @@ -108,12 +279,42 @@ pgatk vcf-to-proteindb \ --output_proteindb missense_common_proteins.fa ``` +### gnomAD ancestry-stratified database + +gnomAD provides allele frequencies stratified by ancestry (African, East Asian, +South Asian, European, Latino, etc.). Build a database using variants common in +a specific population: + +```bash +pgatk vcf-to-proteindb \ + --vcf gnomad.exomes.v4.1.sites.vcf.bgz \ + --input_fasta gencode_transcripts.fa \ + --gene_annotations_gtf gencode.v44.annotation.gtf.gz \ + --annotation_field_name vep \ + --af_field AF_afr \ + --af_threshold 0.01 \ + --include_consequences missense_variant,inframe_insertion,inframe_deletion \ + --biotype_str transcript_type \ + --output_proteindb gnomad_afr_proteins.fa +``` + +!!! tip "gnomAD-specific parameters" + - `--annotation_field_name vep` -- gnomAD uses `vep` instead of `CSQ` + - `--af_field` -- Use population-specific AF fields: `AF_afr` (African), + `AF_eas` (East Asian), `AF_sas` (South Asian), `AF_nfe` (Non-Finnish European), + `AF_amr` (Latino), or `controls_AF` (all controls) + - `--biotype_str transcript_type` -- GENCODE uses `transcript_type` instead + of ENSEMBL's `transcript_biotype` + --- ## 3. ClinVar Clinical Variant Database -Build a protein database from ClinVar pathogenic and likely pathogenic variants -using the NCBI RefSeq gene models. +[ClinVar](https://www.ncbi.nlm.nih.gov/clinvar/) catalogs the relationship +between human variants and clinical phenotypes. 
Building a ClinVar-derived +protein database enables detection of clinically annotated variant peptides by +mass spectrometry -- useful for clinical proteomics, pharmacoproteomics, and +validating pathogenic variants at the protein level. ### Step 1 -- Download NCBI / ClinVar files @@ -155,37 +356,28 @@ pgatk generate-decoy \ --- -## 4. Cancer-Specific Database from COSMIC +## 4. Tumor-Specific Databases for Cancer Proteogenomics -Generate tissue-specific protein databases from COSMIC somatic mutations. +Cancer proteogenomics studies (such as those from CPTAC) build +tumor-specific protein databases to detect somatic mutant peptides, understand +therapy resistance, and prioritize neoantigen candidates. The databases combine +somatic mutations from cancer-specific sources (COSMIC, cBioPortal) and/or +patient-matched whole-exome sequencing. -### Step 1 -- Download COSMIC data +### 4a. COSMIC somatic mutations by cancer type + +Generate one protein database per primary tissue site. This is the standard +approach for large-scale cancer proteogenomics when patient-level WES is not +available: ```bash +# Download COSMIC data (requires account) pgatk cosmic-downloader \ -u your_email@example.com \ -p your_password \ -o cosmic_data -``` - -!!! note - A COSMIC account is required. Register at - [https://cancer.sanger.ac.uk/cosmic/register](https://cancer.sanger.ac.uk/cosmic/register). -### Step 2a -- Single database from all mutations - -```bash -pgatk cosmic-to-proteindb \ - --input_mutation cosmic_data/CosmicMutantExport.tsv.gz \ - --input_genes cosmic_data/All_COSMIC_Genes.fasta.gz \ - --output_db cosmic_all_proteins.fa -``` - -### Step 2b -- One database per cancer type - -Split mutations by primary tissue site, creating one FASTA per cancer type: - -```bash +# Generate per-tissue databases pgatk cosmic-to-proteindb \ --input_mutation cosmic_data/CosmicMutantExport.tsv.gz \ --input_genes cosmic_data/All_COSMIC_Genes.fasta.gz \ @@ -193,12 +385,12 @@ pgatk cosmic-to-proteindb \ --split_by_filter_column ``` -This produces files named `cosmic_proteins_.fa` for each tissue type -(e.g. `cosmic_proteins_lung.fa`, `cosmic_proteins_breast.fa`). +This produces files like `cosmic_proteins_lung.fa`, `cosmic_proteins_breast.fa`, +etc. Use the tissue-matched database for your cancer type of interest. -### Step 2c -- Database for a specific cancer type +### 4b. Single cancer type -Limit to a single tissue type: +Build a focused database for one cancer type (e.g. lung): ```bash pgatk cosmic-to-proteindb \ @@ -208,9 +400,10 @@ pgatk cosmic-to-proteindb \ --accepted_values "lung" ``` -### Step 2d -- Cell-line specific databases +### 4c. Cell-line proteogenomics -Use cell-line data split by sample name: +When analyzing cell-line proteomes, use cell-line-specific mutations. COSMIC +provides a dedicated cell-line export with mutations annotated per sample: ```bash pgatk cosmic-to-proteindb \ @@ -221,71 +414,133 @@ pgatk cosmic-to-proteindb \ --split_by_filter_column ``` ---- - -## 5. Cancer Study from cBioPortal - -Generate a protein database from cancer mutation data hosted in cBioPortal. +### 4d. cBioPortal study-specific database -### Step 1 -- List available studies +cBioPortal hosts mutation data from thousands of cancer genomics studies. 
+Generate a protein database from a specific study: ```bash +# List available studies pgatk cbioportal-downloader --list_studies + +# Download a study +pgatk cbioportal-downloader \ + -d brca_tcga_pan_can_atlas_2018 \ + -o cbioportal_data + +# Download ENSEMBL CDS (hg19 -- cBioPortal uses GRCh37) +pgatk ensembl-downloader \ + -t 9606 --grch37 -o ensembl_hg19 \ + --skip_vcf --skip_gtf --skip_protein \ + --skip_ncrna --skip_cdna --skip_dna + +# Translate mutations +pgatk cbioportal-to-proteindb \ + --input_mutation cbioportal_data/data_mutations_mskcc.txt \ + --input_cds ensembl_hg19/Homo_sapiens.GRCh37.cds.all.fa.gz \ + --output_db brca_tcga_proteins.fa ``` -### Step 2 -- Download a specific study +### 4e. Combined cancer database for immunopeptidomics + +For HLA immunopeptidomics / neoantigen discovery, a broad mutation database +maximizes the chance of detecting mutant HLA-presented peptides. Studies have +shown that COSMIC-derived databases can identify 5x more mutant immunopeptides +than patient WES alone. Combine COSMIC with ClinVar: ```bash -pgatk cbioportal-downloader \ - -d blca_mskcc_solit_2014 \ - -o cbioportal_data +# Generate COSMIC mutations +pgatk cosmic-to-proteindb \ + --input_mutation cosmic_data/CosmicMutantExport.tsv.gz \ + --input_genes cosmic_data/All_COSMIC_Genes.fasta.gz \ + --output_db cosmic_proteins.fa + +# Generate ClinVar mutations +pgatk clinvar-to-proteindb \ + --vcf ncbi_data/clinvar.vcf.gz \ + --gtf ncbi_data/GRCh38_latest_genomic.gtf.gz \ + --fasta ncbi_data/GRCh38_latest_rna.fna.gz \ + --assembly-report ncbi_data/GRCh38_latest_assembly_report.txt \ + --output clinvar_proteins.fa + +# Extract variant-unique peptides (undigested -- HLA peptides are not tryptic) +# For immunopeptidomics, use the full-length variant proteins directly: +cat cosmic_proteins.fa clinvar_proteins.fa > neoantigen_candidates.fa + +pgatk generate-decoy \ + --input neoantigen_candidates.fa \ + --output neoantigen_target_decoy.fa \ + --method decoypyrat ``` -### Step 3 -- Download ENSEMBL CDS (hg19 assembly) +--- + +## 5. Patient-Specific Database from WGS/WES + +When matched whole-genome or whole-exome sequencing data is available for +a sample, build a personalized protein database from the patient's own +variants. This is the gold standard for precision proteogenomics as used in +CPTAC studies. -cBioPortal mutations are aligned to hg19 (GRCh37): +### From a raw VCF (no VEP annotations) + +VCF files from variant callers (e.g. 
GATK HaplotypeCaller, Strelka, MuTect2) +typically lack VEP consequence annotations: ```bash -pgatk ensembl-downloader \ - -t 9606 \ - --grch37 \ - -o ensembl_hg19 \ - --skip_vcf \ - --skip_gtf \ - --skip_protein \ - --skip_ncrna \ - --skip_cdna \ - --skip_dna +pgatk vcf-to-proteindb \ + --vcf patient_somatic.vcf \ + --input_fasta transcripts.fa \ + --gene_annotations_gtf genes.gtf \ + --annotation_field_name '' \ + --ignore_filters \ + --output_proteindb patient_proteins.fa ``` -### Step 4 -- Translate mutations to protein sequences +- `--annotation_field_name ''` skips VEP/CSQ parsing +- `--ignore_filters` includes all called variants + +### From a VEP-annotated VCF + +If the VCF has been annotated with Ensembl VEP, you can filter by consequence +and transcript biotype: ```bash -pgatk cbioportal-to-proteindb \ - --input_mutation cbioportal_data/data_mutations_mskcc.txt \ - --input_cds ensembl_hg19/Homo_sapiens.GRCh37.cds.all.fa.gz \ - --output_db bladder_cancer_proteins.fa +pgatk vcf-to-proteindb \ + --vcf patient_somatic.vep.vcf \ + --input_fasta transcripts.fa \ + --gene_annotations_gtf genes.gtf \ + --include_consequences missense_variant,frameshift_variant,stop_gained,inframe_insertion,inframe_deletion \ + --output_proteindb patient_proteins.fa ``` -To generate tissue-type-specific databases using the clinical sample file: +### Only PASS-filtered variants + +Include only high-confidence variant calls: ```bash -pgatk cbioportal-to-proteindb \ - --input_mutation cbioportal_data/data_mutations_mskcc.txt \ - --input_cds ensembl_hg19/Homo_sapiens.GRCh37.cds.all.fa.gz \ - --clinical_sample_file cbioportal_data/data_clinical_sample.txt \ - --output_db cbioportal_proteins.fa \ - --split_by_filter_column +pgatk vcf-to-proteindb \ + --vcf patient_somatic.vcf \ + --input_fasta transcripts.fa \ + --gene_annotations_gtf genes.gtf \ + --annotation_field_name '' \ + --accepted_filters PASS \ + --output_proteindb patient_pass_proteins.fa ``` --- -## 6. Non-Coding RNA Protein Database +## 6. Novel ORF and Micropeptide Discovery + +Proteogenomics is a key approach for discovering novel coding regions: small +open reading frames (smORFs), micropeptides from lncRNAs, pseudogene-encoded +proteins, and alternative reading frames. Studies have found that non-canonical +peptides can account for over 5% of total identifications. -Translate non-coding RNA transcripts to search for novel peptides from -supposedly non-coding regions. +### lincRNA-derived proteins -### lincRNA database +Long intergenic non-coding RNAs (lincRNAs) can encode small proteins. 
Translate +them in three reading frames: ```bash pgatk dnaseq-to-proteindb \ @@ -297,7 +552,9 @@ pgatk dnaseq-to-proteindb \ --skip_including_all_cds ``` -### Pseudogene database +### Pseudogene-derived proteins + +Some pseudogenes are transcribed and may produce functional peptides: ```bash pgatk dnaseq-to-proteindb \ @@ -311,7 +568,8 @@ pgatk dnaseq-to-proteindb \ ### Alternative ORFs from protein-coding transcripts -Translate non-canonical reading frames of protein-coding genes: +Translate non-canonical reading frames of known protein-coding genes to +discover upstream ORFs (uORFs), overlapping ORFs, and downstream ORFs: ```bash pgatk dnaseq-to-proteindb \ @@ -322,12 +580,60 @@ pgatk dnaseq-to-proteindb \ --skip_including_all_cds ``` +### Antisense transcripts + +```bash +pgatk dnaseq-to-proteindb \ + --input_fasta transcripts.fa \ + --output_proteindb antisense_proteins.fa \ + --var_prefix antisense_ \ + --include_biotypes antisense,antisense_RNA \ + --num_orfs 3 \ + --skip_including_all_cds +``` + +### Combined non-canonical database + +Combine all non-canonical sources with the canonical proteome: + +```bash +cat canonical_proteins.fa \ + lincRNA_proteins.fa \ + pseudogene_proteins.fa \ + altorf_proteins.fa \ + antisense_proteins.fa \ + > novel_orf_target.fa + +pgatk generate-decoy \ + --input novel_orf_target.fa \ + --output novel_orf_target_decoy.fa \ + --method decoypyrat +``` + +!!! tip + After searching, use `digest-mutant-protein` to extract only the novel + peptides that do not appear in the canonical proteome: + + ```bash + pgatk digest-mutant-protein \ + --input lincRNA_proteins.fa,pseudogene_proteins.fa,altorf_proteins.fa \ + --fasta canonical_proteins.fa \ + --output novel_unique_peptides.fa + ``` + --- -## 7. Six-Frame Genome Translation +## 7. Genome Annotation Refinement + +Proteogenomics is used to validate and refine genome annotations by mapping +proteomics-identified peptides back to genomic coordinates. This has been +applied to improve annotations for organisms from wheat (validating 33,612 +gene models) to human (discovering novel splice junctions and exons). -Perform a full six-frame translation of a genome assembly. This is useful as -an unbiased search space but produces very large databases. +### Six-frame genome translation + +Translate the entire genome in all six reading frames to create an unbiased +search space: ```bash pgatk dnaseq-to-proteindb \ @@ -341,119 +647,284 @@ pgatk dnaseq-to-proteindb \ !!! warning Six-frame translation of a full mammalian genome produces a very large database. Consider translating individual chromosomes or using it in - combination with a smaller targeted database search. + combination with a smaller targeted database for a two-pass search strategy. ---- +### Three-frame transcriptome translation -## 8. gnomAD Population Variant Database +A more focused approach: translate all annotated transcripts in three reading +frames to capture alternative ORFs without the full genome search space: + +```bash +pgatk threeframe-translation \ + --input_fasta transcripts.fa \ + --output translated_transcripts.fa +``` -Build a protein database from gnomAD variants using GENCODE annotations. 
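+
+As with the other component databases in this guide, the three-frame
+translation should be combined with the canonical proteome and given decoys
+before searching -- a minimal sketch, reusing the `generate-decoy` invocation
+shown in the sections above (the `canonical_proteins.fa` file name assumes
+the canonical database built earlier):
+
+```bash
+cat canonical_proteins.fa translated_transcripts.fa > annotation_target.fa
+
+pgatk generate-decoy \
+    --input annotation_target.fa \
+    --output annotation_target_decoy.fa \
+    --method decoypyrat
+```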
+### Map identified peptides back to the genome + +After database searching, map novel peptide identifications to genomic +coordinates for annotation and genome browser visualization: ```bash -pgatk vcf-to-proteindb \ - --vcf gnomad_genome.vcf.gz \ - --input_fasta gencode_transcripts.fa \ - --gene_annotations_gtf gencode.v38.annotation.gtf.gz \ - --annotation_field_name vep \ - --af_field controls_AF \ - --af_threshold 0.01 \ - --include_consequences missense_variant,inframe_insertion,inframe_deletion,frameshift_variant \ - --biotype_str transcript_type \ - --output_proteindb gnomad_proteins.fa +pgatk map-peptide2genome \ + --input novel_peptide_identifications.tsv \ + --gtf genes.gtf \ + --fasta proteins.fa \ + --idmap protein_to_transcript.tsv \ + --output novel_peptides.gff3 ``` -!!! tip "gnomAD-specific parameters" - - `--annotation_field_name vep` -- gnomAD uses `vep` instead of `CSQ` - - `--af_field controls_AF` -- Use control allele frequency - - `--biotype_str transcript_type` -- GENCODE uses `transcript_type` instead - of ENSEMBL's `transcript_biotype` +The GFF3 output can be loaded into UCSC Genome Browser, IGV, or Ensembl for +visual inspection alongside gene models, RNA-seq coverage, and conservation +tracks. --- -## 9. Sample-Specific Database from WGS/WES +## 8. Metaproteomics: Microbiome and Environmental Samples + +Metaproteomics characterizes the functional activity of microbial communities. +The central challenge is building the protein search database, since communities +can contain hundreds to thousands of species. When matched metagenomic +sequencing data is available, six-frame translation of assembled contigs +provides a sample-matched database. -Translate variants from a whole-genome or whole-exome sequencing VCF of an -individual sample. These VCFs typically lack VEP annotations. +### Six-frame translation of metagenome assembly + +After assembling metagenomic reads into contigs (e.g. with megahit or +metaSPAdes), translate them: ```bash -pgatk vcf-to-proteindb \ - --vcf sample_variants.vcf \ - --input_fasta transcripts.fa \ - --gene_annotations_gtf genes.gtf \ - --annotation_field_name '' \ - --ignore_filters \ - --output_proteindb sample_proteins.fa +pgatk dnaseq-to-proteindb \ + --input_fasta metagenome_contigs.fa \ + --output_proteindb metagenome_6frame.fa \ + --biotype_str '' \ + --num_orfs 3 \ + --num_orfs_complement 3 \ + --protein_prefix meta_ ``` -- `--annotation_field_name ''` tells the tool to skip parsing VEP/CSQ - annotations (not present in raw variant callers like GATK HaplotypeCaller) -- `--ignore_filters` includes all variants regardless of the FILTER column +### Three-frame translation of predicted ORFs -To include only variants that passed quality filtering: +If gene prediction has been run on the metagenome assembly (e.g. Prodigal, +MetaGeneAnnotator), translate the predicted CDS regions: ```bash -pgatk vcf-to-proteindb \ - --vcf sample_variants.vcf \ - --input_fasta transcripts.fa \ - --gene_annotations_gtf genes.gtf \ - --annotation_field_name '' \ - --accepted_filters PASS \ - --output_proteindb sample_proteins_pass.fa +pgatk dnaseq-to-proteindb \ + --input_fasta predicted_genes.fna \ + --output_proteindb metagenome_proteins.fa \ + --biotype_str '' \ + --protein_prefix meta_ ``` +### Add decoy for metaproteomics search + +```bash +pgatk generate-decoy \ + --input metagenome_proteins.fa \ + --output metagenome_target_decoy.fa \ + --method decoypyrat +``` + +!!! 
note + For very large metaproteomics databases (millions of sequences), + consider using `--memory_save` with `generate-decoy` to reduce memory + usage, or splitting the database by contig/bin before decoy generation. + --- -## 10. Digest and Filter Variant Peptides +## 9. Long-Read Transcriptomics + Proteogenomics -After generating a variant protein database you may want to digest the proteins -*in silico* and keep only peptides that differ from the canonical proteome. -This produces a compact FASTA of variant-specific peptides. +Long-read RNA sequencing (PacBio Iso-Seq, Oxford Nanopore) captures full-length +transcript isoforms without assembly, enabling unambiguous protein isoform +prediction. Studies have shown that 25% of long-read-detected protein isoforms +may be absent from reference annotations. + +### Translate long-read transcript sequences + +After processing long reads through an isoform classification pipeline (e.g. +SQANTI3), translate the high-quality transcript sequences: + +```bash +# Translate all long-read transcripts, including novel isoforms +pgatk dnaseq-to-proteindb \ + --input_fasta long_read_transcripts.fa \ + --output_proteindb long_read_proteins.fa \ + --biotype_str '' \ + --protein_prefix lr_ +``` + +### Combine with canonical proteome + +```bash +cat canonical_proteins.fa long_read_proteins.fa > lr_target.fa + +pgatk generate-decoy \ + --input lr_target.fa \ + --output lr_target_decoy.fa \ + --method decoypyrat +``` + +### Extract novel isoform peptides + +After searching, identify peptides unique to long-read isoforms that are not +present in the reference: ```bash pgatk digest-mutant-protein \ - --input variant_proteins.fa \ + --input long_read_proteins.fa \ --fasta canonical_proteins.fa \ - --output unique_variant_peptides.fa \ + --output novel_isoform_peptides.fa \ --min-len 7 \ --max-len 40 \ --missed-cleavages 2 ``` -To combine multiple variant sources (e.g. COSMIC + ClinVar): +--- + +## 10. Plant and Non-Model Organism Proteogenomics + +For non-model organisms with draft or incomplete genome annotations, +proteogenomics improves gene model quality. This approach has been particularly +impactful for crop species -- for example, a wheat proteogenomics study +validated over 33,000 gene models and recommended promoting 3,700 +low-confidence genes. + +### Step 1 -- Download ENSEMBL data for the species + +pgatk supports any species available in ENSEMBL. For example, rice +(*Oryza sativa*, taxonomy 39947): ```bash -pgatk digest-mutant-protein \ - --input cosmic_proteins.fa,clinvar_proteins.fa \ - --fasta canonical_proteins.fa \ - --output combined_variant_peptides.fa +pgatk ensembl-downloader \ + -t 39947 \ + -o ensembl_rice \ + --skip_dna ``` ---- +### Step 2 -- Generate transcript sequences and canonical proteome + +```bash +gffread -F -w ensembl_rice/transcripts.fa \ + -g ensembl_rice/genome.fa \ + ensembl_rice/Oryza_sativa.IRGSP-1.0.*.gtf.gz + +pgatk dnaseq-to-proteindb \ + --input_fasta ensembl_rice/transcripts.fa \ + --output_proteindb rice_canonical.fa +``` + +### Step 3 -- Three-frame translation for novel gene discovery + +```bash +pgatk threeframe-translation \ + --input_fasta ensembl_rice/transcripts.fa \ + --output rice_3frame.fa +``` + +### Step 4 -- Six-frame genome translation (unbiased) -## 11. 
Map Identified Peptides to Genomic Coordinates +```bash +pgatk dnaseq-to-proteindb \ + --input_fasta ensembl_rice/genome.fa \ + --output_proteindb rice_genome_6frame.fa \ + --biotype_str '' \ + --num_orfs 3 \ + --num_orfs_complement 3 +``` + +### Step 5 -- Combine databases + +```bash +cat rice_canonical.fa rice_3frame.fa > rice_target.fa + +pgatk generate-decoy \ + --input rice_target.fa \ + --output rice_target_decoy.fa \ + --method decoypyrat +``` -After a proteomics search, map the identified peptides back to genomic -coordinates for visualization in genome browsers (e.g. UCSC, IGV). +### Step 6 -- Map identified novel peptides to the genome ```bash pgatk map-peptide2genome \ - --input peptide_identifications.tsv \ - --gtf genes.gtf \ - --fasta proteins.fa \ + --input rice_novel_peptides.tsv \ + --gtf ensembl_rice/Oryza_sativa.IRGSP-1.0.*.gtf.gz \ + --fasta rice_target.fa \ --idmap protein_to_transcript.tsv \ - --output peptides.gff3 + --output rice_novel_peptides.gff3 ``` -The input TSV should contain at least peptide sequence and protein accession -columns (configurable with `--pep-col` and `--prot-col`). +### Mouse example + +For model organisms like mouse (*Mus musculus*, taxonomy 10090): + +```bash +# Download +pgatk ensembl-downloader -t 10090 -o ensembl_mouse --skip_dna + +# Generate transcript sequences +gffread -F -w ensembl_mouse/transcripts.fa \ + -g ensembl_mouse/genome.fa \ + ensembl_mouse/Mus_musculus.GRCm39.*.gtf.gz + +# Canonical + variant proteins +pgatk dnaseq-to-proteindb \ + --input_fasta ensembl_mouse/transcripts.fa \ + --output_proteindb mouse_canonical.fa + +pgatk vcf-to-proteindb \ + --vcf ensembl_mouse/mus_musculus_incl_consequences.vcf.gz \ + --input_fasta ensembl_mouse/transcripts.fa \ + --gene_annotations_gtf ensembl_mouse/Mus_musculus.GRCm39.*.gtf.gz \ + --output_proteindb mouse_variants.fa + +# Combine and generate decoy +cat mouse_canonical.fa mouse_variants.fa > mouse_target.fa +pgatk generate-decoy \ + --input mouse_target.fa \ + --output mouse_target_decoy.fa \ + --method decoypyrat +``` + +--- + +## 11. Digest and Filter Variant Peptides + +After generating variant protein databases from any source, *in silico* +digestion and filtering against the canonical proteome extracts only the +peptides that are unique to the variant sequences. This compact peptide list +is useful for targeted validation, spectral library building, and +quantification of variant peptides. + +### Single source + +```bash +pgatk digest-mutant-protein \ + --input variant_proteins.fa \ + --fasta canonical_proteins.fa \ + --output unique_variant_peptides.fa \ + --min-len 7 \ + --max-len 40 \ + --missed-cleavages 2 +``` + +### Combine multiple variant sources + +```bash +pgatk digest-mutant-protein \ + --input cosmic_proteins.fa,clinvar_proteins.fa,ensembl_variants.fa \ + --fasta canonical_proteins.fa \ + --output combined_variant_peptides.fa +``` --- -## 12. Complete Proteogenomics Workflow +## 12. Complete Multi-Source Proteogenomics Workflow A full end-to-end workflow combining canonical proteins, population variants, clinical variants, cancer mutations, and non-coding translations into a single -search-ready database. +search-ready database. This is the approach used in large-scale studies that aim +to maximize the discovery of non-canonical peptides. 
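+
+Because this workflow chains several long-running download and translation
+steps, a quick preflight check that the two required executables (`pgatk` and
+`gffread`) are on `PATH` can save an aborted run -- a minimal sketch using
+only standard shell built-ins:
+
+```bash
+for tool in pgatk gffread; do
+    command -v "$tool" >/dev/null || echo "missing required tool: $tool"
+done
+```
+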
### Step 1 -- Download all data sources @@ -513,6 +984,15 @@ pgatk dnaseq-to-proteindb \ --include_biotypes lincRNA \ --num_orfs 3 \ --skip_including_all_cds + +# Pseudogenes +pgatk dnaseq-to-proteindb \ + --input_fasta ensembl_data/transcripts.fa \ + --output_proteindb pseudogene.fa \ + --var_prefix pseudogene_ \ + --include_biotypes processed_pseudogene,transcribed_processed_pseudogene \ + --num_orfs 3 \ + --skip_including_all_cds ``` ### Step 4 -- Combine all databases @@ -523,14 +1003,22 @@ cat canonical.fa \ clinvar_variants.fa \ cosmic_variants.fa \ lincRNA.fa \ + pseudogene.fa \ > combined_target.fa ``` -### Step 5 -- Add decoy sequences +### Step 5 -- Quality check and decoy generation ```bash +# Filter short sequences and validate +pgatk ensembl-check \ + --input_fasta combined_target.fa \ + --output validated_target.fa \ + --num_aa 6 + +# Add decoy sequences pgatk generate-decoy \ - --input combined_target.fa \ + --input validated_target.fa \ --output proteogenomics_target_decoy.fa \ --method decoypyrat \ --decoy_prefix DECOY_ @@ -540,7 +1028,7 @@ pgatk generate-decoy \ ```bash pgatk digest-mutant-protein \ - --input ensembl_variants.fa,clinvar_variants.fa,cosmic_variants.fa \ + --input ensembl_variants.fa,clinvar_variants.fa,cosmic_variants.fa,lincRNA.fa,pseudogene.fa \ --fasta canonical.fa \ --output unique_variant_peptides.fa \ --min-len 7 \ @@ -549,63 +1037,21 @@ pgatk digest-mutant-protein \ ``` The file `proteogenomics_target_decoy.fa` is a comprehensive search database, -and `unique_variant_peptides.fa` provides a focused list of variant-specific +and `unique_variant_peptides.fa` provides a focused list of non-canonical peptides for validation. ---- - -## 13. Non-Human Species - -pgatk supports any species available in ENSEMBL. Here is an example for mouse -(*Mus musculus*, taxonomy 10090): - -```bash -# Download -pgatk ensembl-downloader -t 10090 -o ensembl_mouse --skip_dna - -# Generate transcript sequences -gffread -F -w ensembl_mouse/transcripts.fa \ - -g ensembl_mouse/genome.fa \ - ensembl_mouse/Mus_musculus.GRCm39.*.gtf.gz - -# Canonical proteins -pgatk dnaseq-to-proteindb \ - --input_fasta ensembl_mouse/transcripts.fa \ - --output_proteindb mouse_canonical.fa - -# Variant proteins -pgatk vcf-to-proteindb \ - --vcf ensembl_mouse/mus_musculus_incl_consequences.vcf.gz \ - --input_fasta ensembl_mouse/transcripts.fa \ - --gene_annotations_gtf ensembl_mouse/Mus_musculus.GRCm39.*.gtf.gz \ - --output_proteindb mouse_variants.fa - -# Combine and generate decoy -cat mouse_canonical.fa mouse_variants.fa > mouse_target.fa -pgatk generate-decoy \ - --input mouse_target.fa \ - --output mouse_target_decoy.fa \ - --method decoypyrat -``` +!!! tip "Database size considerations" + Larger proteogenomic databases improve coverage but increase the FDR + penalty. Consider a two-pass strategy: first search against a focused + database (canonical + variants), then a broader database (adding + non-coding ORFs and six-frame translations) for discovery. --- -## 14. Three-Frame Translation +## 13. Quality Check: Validate Protein Database -Perform a simple three-frame translation of any nucleotide FASTA file: - -```bash -pgatk threeframe-translation \ - --input_fasta input_sequences.fa \ - --output translated_proteins.fa -``` - ---- - -## 15. 
Quality Check: Validate Protein Database - -Before using a protein database, check it for internal stop codons and short -sequences: +Before using a protein database for searching, validate it for internal stop +codons and short sequences: ```bash pgatk ensembl-check \ @@ -617,3 +1063,16 @@ pgatk ensembl-check \ This filters out sequences shorter than 6 amino acids. Use `--add_stop_codons` to include proteins that contain internal stop codons (useful for proteogenomics where frameshifts may introduce premature stops). + +--- + +## References + +If you use pgatk in your research, please cite: + +> Husen M Umer, Enrique Audain, Yafeng Zhu, Julianus Pfeuffer, Timo +> Sachsenberg, Janne Lehtiö, Rui M Branca, Yasset Perez-Riverol. +> **Generation of ENSEMBL-based proteogenomics databases boosts the +> identification of non-canonical peptides.** +> *Bioinformatics*, Volume 38, Issue 5, 1 March 2022, Pages 1470--1472. +> [https://doi.org/10.1093/bioinformatics/btab838](https://doi.org/10.1093/bioinformatics/btab838) From 047f026e184467cb1df602c5ec9cfb0813667603 Mon Sep 17 00:00:00 2001 From: Yasset Perez-Riverol Date: Tue, 3 Mar 2026 21:34:05 +0000 Subject: [PATCH 7/7] minor changes to the algorithms and bugs --- ...6-03-01-pgatk-rename-refactoring-design.md | 127 -- ...026-03-01-pgatk-rename-refactoring-plan.md | 817 -------- .../plans/2026-03-01-phase0-infrastructure.md | 1199 ------------ .../2026-03-02-clinvar-ncbi-implementation.md | 1652 ----------------- .../2026-03-02-clinvar-ncbi-support-design.md | 208 --- .../2026-03-03-clinvar-pipeline-hardening.md | 618 ------ docs/use-cases.md | 11 +- pgatk/clinvar/clinvar_service.py | 7 +- .../test_clinvar/test_clinvar_service.py | 7 + pgatk/tests/test_vcf_utils.py | 48 + pgatk/toolbox/vcf_utils.py | 22 + 11 files changed, 87 insertions(+), 4629 deletions(-) delete mode 100644 docs/plans/2026-03-01-pgatk-rename-refactoring-design.md delete mode 100644 docs/plans/2026-03-01-pgatk-rename-refactoring-plan.md delete mode 100644 docs/plans/2026-03-01-phase0-infrastructure.md delete mode 100644 docs/plans/2026-03-02-clinvar-ncbi-implementation.md delete mode 100644 docs/plans/2026-03-02-clinvar-ncbi-support-design.md delete mode 100644 docs/plans/2026-03-03-clinvar-pipeline-hardening.md diff --git a/docs/plans/2026-03-01-pgatk-rename-refactoring-design.md b/docs/plans/2026-03-01-pgatk-rename-refactoring-design.md deleted file mode 100644 index 399edd3..0000000 --- a/docs/plans/2026-03-01-pgatk-rename-refactoring-design.md +++ /dev/null @@ -1,127 +0,0 @@ -# Design: py-pgatk → pgatk Rename & Refactoring - -**Date:** 2026-03-01 -**Status:** Approved - -## Context - -The project is currently named `py-pgatk` with a Python package `pypgatk`. We are doing a full cleanup: rename, config system overhaul, bug fixes, and dead code removal. - -## Decisions - -| Decision | Choice | Rationale | -|---|---|---| -| Scope | Full cleanup | Rename + config + bugs + dead code in one effort | -| Delivery | 4 separate PRs | Reviewable, independent after PR 1 | -| Backward compat | Clean break | No shim package, rename everything | -| Config approach | Config Registry | Central mapping, auto-load bundled defaults | -| CLI entry point | `pgatk` | Matches project name | - -## PR 1: Rename (branch: `refactor/rename`) - -**Rename `pypgatk/` → `pgatk/`** across the entire codebase. 
- -### Changes - -- Rename directory: `pypgatk/` → `pgatk/` -- Rename CLI module: `pypgatkc.py` → `cli.py` -- Update `pyproject.toml`: - - `name = "pgatk"` - - Entry point: `pgatk = "pgatk.cli:main"` - - Package find: `include = ["pgatk*"]` - - Package data: `pgatk = ["config/*.yaml", "config/*.json"]` - - URLs: `bigbio/pgatk` -- Update all internal imports: `from pypgatk.xxx` → `from pgatk.xxx` -- Update docs references (mkdocs.yml, README.md, doc pages) -- Update GitHub Actions workflows -- Update test imports - -### Files affected - -- Every `.py` file with `pypgatk` imports (~44 files) -- `pyproject.toml` -- `mkdocs.yml` -- `README.md` -- `.github/workflows/*.yml` -- `docs/*.md` - -## PR 2: Config Registry (branch: `refactor/config-registry`) - -**Auto-load bundled config defaults so `-c` flag becomes optional.** - -### New module: `pgatk/config/registry.py` - -```python -import os -from pgatk.toolbox.general import read_yaml_from_file - -CONFIG_DIR = os.path.dirname(__file__) - -COMMAND_CONFIGS = { - "ensembl_downloader": "ensembl_downloader_config.yaml", - "ensembl_config": "ensembl_config.yaml", - "cosmic": "cosmic_config.yaml", - "cbioportal": "cbioportal_config.yaml", - "protein_decoy": "protein_decoy.yaml", - "openms_analysis": "openms_analysis.yaml", -} - -def load_config(config_key, user_config_file=None): - """Load config: user file if provided, otherwise bundled default.""" - if user_config_file is not None: - return read_yaml_from_file(user_config_file) - default_file = COMMAND_CONFIGS.get(config_key) - if default_file is None: - return {} - return read_yaml_from_file(os.path.join(CONFIG_DIR, default_file)) -``` - -### Command changes - -Replace the 3-line boilerplate in every command: - -```python -# Before -config_data = None -if config_file is not None: - config_data = read_yaml_from_file(config_file) - -# After -from pgatk.config.registry import load_config -config_data = load_config("ensembl_downloader", config_file) -``` - -### Files affected - -- New: `pgatk/config/registry.py` -- All 14 command files in `pgatk/commands/` - -## PR 3: Bug Fixes (branch: `fix/bugs`) - -| Bug | File | Fix | -|---|---|---| -| `_filter_write_idxml_with_df` writes unfiltered data | `proteomics/openms.py` | Write only filtered entries | -| ISO-8859 encoding typo | `cgenomes/cosmic_downloader.py` | `'ISO-8859'` → `'ISO-8859-1'` | -| Deprecated `Bio.pairwise2` | `proteomics/openms.py` | Replace with `Bio.Align.PairwiseAligner` | -| Enzyme name typos | `proteogenomics/models.py` | `'ArgC/P'` → `'Arg-C/P'` etc. | -| `print_help()` for required args | `commands/*.py` | Use Click's `required=True` | -| `download_file` log overwrite | `toolbox/general.py:193` | `if log is not None:` → `if log is None:` | - -## PR 4: Cleanup (branch: `refactor/cleanup`) - -| Item | File | Action | -|---|---|---| -| Dead `Species`/`SpeciesService` classes | `ensembl/ensembl.py` | Remove if unused | -| Commented-out code after return | `toolbox/general.py:241-253` | Delete | -| Hardcoded version `'0.0.26'` | `cli.py` | Use `importlib.metadata.version("pgatk")` | -| Log files created in CWD | `toolbox/general.py` | Log to stderr only by default | - -## Dependency Order - -``` -PR 1 (Rename) ──► PR 2 (Config Registry) - ──► PR 3 (Bug Fixes) - ──► PR 4 (Cleanup) -``` - -PR 1 must merge first. PRs 2, 3, 4 are independent of each other. 
diff --git a/docs/plans/2026-03-01-pgatk-rename-refactoring-plan.md b/docs/plans/2026-03-01-pgatk-rename-refactoring-plan.md deleted file mode 100644 index 11252f2..0000000 --- a/docs/plans/2026-03-01-pgatk-rename-refactoring-plan.md +++ /dev/null @@ -1,817 +0,0 @@ -# pgatk Rename & Refactoring Implementation Plan - -> **For Claude:** REQUIRED SUB-SKILL: Use superpowers:executing-plans to implement this plan task-by-task. - -**Goal:** Rename `py-pgatk`/`pypgatk` to `pgatk` everywhere, add config auto-loading, fix bugs, and remove dead code. - -**Architecture:** 4 independent PRs — PR 1 (rename) merges first, PRs 2-4 are independent of each other. Each PR gets its own branch off `dev`. - -**Tech Stack:** Python 3.9+, Click CLI, PyYAML, BioPython, pyOpenMS, MkDocs Material - ---- - -## PR 1: Rename `pypgatk` → `pgatk` (branch: `refactor/rename`) - -### Task 1.1: Rename the package directory and CLI module - -**Files:** -- Rename: `pypgatk/` → `pgatk/` (entire directory tree) -- Rename: `pgatk/pypgatkc.py` → `pgatk/cli.py` - -**Step 1: Rename the directory** - -```bash -cd /Users/yperez/work/proteogenomics/pgatk -git mv pypgatk pgatk -``` - -**Step 2: Rename the CLI module** - -```bash -git mv pgatk/pypgatkc.py pgatk/cli.py -``` - -**Step 3: Verify the rename** - -```bash -ls pgatk/ -# Should show: __init__.py cli.py cgenomes/ commands/ config/ db/ ensembl/ logs/ proteogenomics/ proteomics/ test_all.bed test_annotated.vcf testdata/ tests/ toolbox/ -``` - ---- - -### Task 1.2: Update `__init__.py` - -**Files:** -- Modify: `pgatk/__init__.py` - -**Step 1: Update the package name** - -Change `name = "pypgatk"` to `name = "pgatk"`. - ---- - -### Task 1.3: Update `cli.py` (formerly `pypgatkc.py`) - -**Files:** -- Modify: `pgatk/cli.py` - -**Step 1: Update all imports from `pypgatk` to `pgatk`** - -Replace all 14 import lines: -```python -from pgatk.commands import ensembl_downloader as ensembl_downloader_cmd -from pgatk.commands import ensembl_database as ensembl_database_cmd -from pgatk.commands import cosmic_downloader as cosmic_downloader_cmd -from pgatk.commands import cbioportal_downloader as cbioportal_downloader_cmd -from pgatk.commands import cosmic_to_proteindb as cosmic_to_proteindb_cmd -from pgatk.commands import cbioportal_to_proteindb as cbioportal_to_proteindb_cmd -from pgatk.commands import threeframe_translation as threeframe_translation_cmd -from pgatk.commands import vcf_to_proteindb as vcf_to_proteindb_cmd -from pgatk.commands import dnaseq_to_proteindb as dnase_to_proteindb_cmd -from pgatk.commands import proteindb_decoy as proteindb_decoy_cmd -from pgatk.commands import peptide_class_fdr as peptide_class_fdr_cmd -from pgatk.commands import validate_peptides as validate_peptides_cmd -from pgatk.commands import mztab_class_fdr as mztab_class_fdr_cmd -from pgatk.commands import blast_get_position as blast_get_position_cmd -``` - -**Step 2: Update docstring** - -Change `"py-pgatk"` references to `"pgatk"` in the module docstring and CLI help text. 
- ---- - -### Task 1.4: Update all Python imports across all modules - -**Files (all files under `pgatk/` that import from `pypgatk`):** - -Use a global find-and-replace: `from pypgatk.` → `from pgatk.` - -These files need updating: -- `pgatk/commands/blast_get_position.py` -- `pgatk/commands/cbioportal_downloader.py` -- `pgatk/commands/cbioportal_to_proteindb.py` -- `pgatk/commands/cosmic_downloader.py` -- `pgatk/commands/cosmic_to_proteindb.py` -- `pgatk/commands/dnaseq_to_proteindb.py` -- `pgatk/commands/ensembl_database.py` -- `pgatk/commands/ensembl_downloader.py` -- `pgatk/commands/mztab_class_fdr.py` -- `pgatk/commands/peptide_class_fdr.py` -- `pgatk/commands/proteindb_decoy.py` -- `pgatk/commands/threeframe_translation.py` -- `pgatk/commands/validate_peptides.py` -- `pgatk/commands/vcf_to_proteindb.py` -- `pgatk/cgenomes/cbioportal_downloader.py` -- `pgatk/cgenomes/cgenomes_proteindb.py` -- `pgatk/cgenomes/cosmic_downloader.py` -- `pgatk/ensembl/data_downloader.py` -- `pgatk/ensembl/ensembl.py` -- `pgatk/ensembl/exceptions.py` -- `pgatk/proteogenomics/blast_get_position.py` -- `pgatk/proteogenomics/mztab_class_fdr.py` -- `pgatk/proteogenomics/spectrumai.py` -- `pgatk/proteomics/openms.py` -- `pgatk/proteomics/db/protein_database_decoy.py` -- `pgatk/toolbox/general.py` - -**Step 1: Run a sed-like replacement across all `.py` files** - -In every `.py` file under `pgatk/`, replace `from pypgatk.` with `from pgatk.` and `import pypgatk` with `import pgatk`. - -**Step 2: Verify no remaining references** - -```bash -grep -r "pypgatk" pgatk/ --include="*.py" -# Should return nothing (except possibly string literals in help text) -``` - ---- - -### Task 1.5: Update test files - -**Files:** -- Modify: `pgatk/tests/pypgatk_tests.py` -- Modify: `pgatk/tests/classes_tests.py` - -**Step 1: Update test imports** - -In `pypgatk_tests.py`: -```python -# Before -from pypgatk.pypgatkc import cli -# After -from pgatk.cli import cli -``` - -In `classes_tests.py`: -```python -# Before -from pypgatk.proteomics.models import PYGPATK_ENZYMES -# After -from pgatk.proteomics.models import PYGPATK_ENZYMES -``` - -**Step 2: Rename the constant `PYGPATK_ENZYMES` → `PGATK_ENZYMES`** - -In `pgatk/proteomics/models.py`, rename the constant: -```python -# Before -PYGPATK_ENZYMES = ... -# After -PGATK_ENZYMES = ... -``` - -Update all references (in `models.py`, `classes_tests.py`, `protein_database_decoy.py`, and any command files). 
- ---- - -### Task 1.6: Update `pyproject.toml` - -**Files:** -- Modify: `pyproject.toml` - -**Step 1: Update all references** - -```toml -[project] -name = "pgatk" - -[project.scripts] -pgatk = "pgatk.cli:main" - -[project.urls] -Homepage = "http://github.com/bigbio/pgatk" -Repository = "http://github.com/bigbio/pgatk" -Issues = "http://github.com/bigbio/pgatk/issues" - -[tool.setuptools.packages.find] -include = ["pgatk*"] - -[tool.setuptools.package-data] -pgatk = ["config/*.yaml", "config/*.json"] -``` - ---- - -### Task 1.7: Update non-Python files - -**Files:** -- Modify: `MANIFEST.in` -- Modify: `Dockerfile` -- Modify: `.gitignore` -- Modify: `conda-enviroment.yaml` - -**Step 1: Update MANIFEST.in** - -``` -recursive-include pgatk *.json -recursive-include pgatk *.yaml -``` - -**Step 2: Update Dockerfile** - -Replace `pypgatk` references with `pgatk`: -- `LABEL software="pgatk"`, `container="pgatk"` -- `ENV PATH=$PATH:/tool/source/pgatk/` -- `RUN chmod +x /tool/source/pgatk/cli.py` -- Update git clone URL to `bigbio/pgatk` - -**Step 3: Update .gitignore** - -Replace `pypgatk/testdata/` paths with `pgatk/testdata/`. - -**Step 4: Update conda-enviroment.yaml** - -Change `name: py-pgatk` to `name: pgatk`. - ---- - -### Task 1.8: Update GitHub Actions workflows - -**Files:** -- Modify: `.github/workflows/pythonapp.yml` -- Modify: `.github/workflows/pythonpackage.yml` - -**Step 1: Update test directory references** - -In both files, change: -```yaml -# Before -cd pypgatk -# After -cd pgatk -``` - ---- - -### Task 1.9: Update documentation files - -**Files:** -- Modify: `mkdocs.yml` -- Modify: `README.md` -- Modify: `docs/index.md` -- Modify: `docs/installation.md` -- Modify: `docs/pypgatk.md` (rename to `docs/pgatk-cli.md`) -- Modify: `docs/changelog.md` - -**Step 1: Update mkdocs.yml** - -```yaml -repo_name: bigbio/pgatk -repo_url: https://github.com/bigbio/pgatk -# Update nav -nav: - - Home: index.md - - Introduction: introduction.md - - Installation: installation.md - - pgatk CLI: pgatk-cli.md - - File Formats: formats.md - - Changelog: changelog.md - - Support: support.md -# Update social links -- icon: fontawesome/brands/github - link: https://github.com/bigbio/pgatk -- icon: fontawesome/brands/python - link: https://pypi.org/project/pgatk/ -``` - -**Step 2: Rename docs/pypgatk.md → docs/pgatk-cli.md** - -```bash -git mv docs/pypgatk.md docs/pgatk-cli.md -``` - -**Step 3: Update all doc content** - -In all doc files, replace: -- `pypgatk` (the CLI command) → `pgatk` -- `py-pgatk` (repo name) → `pgatk` -- `pip install pypgatk` → `pip install pgatk` -- Badge URLs: `bigbio/py-pgatk` → `bigbio/pgatk` -- PyPI badges: `pypgatk` → `pgatk` - ---- - -### Task 1.10: Run tests and verify - -**Step 1: Install in editable mode** - -```bash -cd /Users/yperez/work/proteogenomics/pgatk -pip install -e . -``` - -**Step 2: Run tests** - -```bash -cd pgatk -python -m unittest discover -s tests -p "*_tests.py" -v -``` - -Expected: All tests pass. 
- -**Step 3: Verify CLI entry point** - -```bash -pgatk --version -# Expected: pgatk, version 0.0.26 -``` - -**Step 4: Commit** - -```bash -git add -A -git commit -m "refactor: rename pypgatk to pgatk - -- Rename package directory pypgatk/ → pgatk/ -- Rename CLI module pypgatkc.py → cli.py -- Update all Python imports from pypgatk to pgatk -- Rename constant PYGPATK_ENZYMES → PGATK_ENZYMES -- Update pyproject.toml (name, entry point, URLs, package config) -- Update Dockerfile, MANIFEST.in, .gitignore, conda env -- Update GitHub Actions workflows -- Update MkDocs documentation -- CLI command: pypgatk → pgatk -- PyPI package: pypgatk → pgatk" -``` - ---- - -## PR 2: Config Registry (branch: `refactor/config-registry`) - -### Task 2.1: Create `pgatk/config/registry.py` - -**Files:** -- Create: `pgatk/config/registry.py` - -**Step 1: Write the registry module** - -```python -""" -Central config registry: maps config keys to bundled default YAML files. -Enables auto-loading of defaults when user doesn't pass -c. -""" -import os - -from pgatk.toolbox.general import read_yaml_from_file - -CONFIG_DIR = os.path.dirname(__file__) - -COMMAND_CONFIGS = { - "ensembl_downloader": "ensembl_downloader_config.yaml", - "ensembl_config": "ensembl_config.yaml", - "cosmic": "cosmic_config.yaml", - "cbioportal": "cbioportal_config.yaml", - "protein_decoy": "protein_decoy.yaml", - "openms_analysis": "openms_analysis.yaml", -} - - -def load_config(config_key, user_config_file=None): - """ - Load configuration for a command. - - If user_config_file is provided, loads that file. - Otherwise, loads the bundled default config for the given key. - - :param config_key: key in COMMAND_CONFIGS - :param user_config_file: optional path to user-provided config file - :return: parsed YAML config dict, or empty dict if no config found - """ - if user_config_file is not None: - return read_yaml_from_file(user_config_file) - - default_file = COMMAND_CONFIGS.get(config_key) - if default_file is None: - return {} - return read_yaml_from_file(os.path.join(CONFIG_DIR, default_file)) -``` - ---- - -### Task 2.2: Update command files to use `load_config` - -**Files (all 14 command files in `pgatk/commands/`):** - -For each command file, replace: - -```python -# Before -from pgatk.toolbox.general import read_yaml_from_file - -# ... in the command function: -config_data = None -if config_file is not None: - config_data = read_yaml_from_file(config_file) -``` - -With: - -```python -# After -from pgatk.config.registry import load_config - -# ... 
in the command function: -config_data = load_config("", config_file) -``` - -**Config key mapping per command file:** - -| Command file | Config key | -|---|---| -| `ensembl_downloader.py` | `"ensembl_downloader"` | -| `ensembl_database.py` | `"ensembl_config"` | -| `vcf_to_proteindb.py` | `"ensembl_config"` | -| `dnaseq_to_proteindb.py` | `"ensembl_config"` | -| `threeframe_translation.py` | `"ensembl_config"` | -| `cosmic_downloader.py` | `"cosmic"` | -| `cosmic_to_proteindb.py` | `"cosmic"` | -| `cbioportal_downloader.py` | `"cbioportal"` | -| `cbioportal_to_proteindb.py` | `"cbioportal"` | -| `proteindb_decoy.py` | `"protein_decoy"` | -| `peptide_class_fdr.py` | `"openms_analysis"` | -| `validate_peptides.py` | `"ensembl_config"` | -| `mztab_class_fdr.py` | `"ensembl_config"` | -| `blast_get_position.py` | (no config needed — uses None) | - ---- - -### Task 2.3: Update tests that pass `--config_file` - -**Files:** -- Modify: `pgatk/tests/pypgatk_tests.py` - -Some tests currently pass `--config_file` explicitly. These should still work (user override). -Add a new test that runs a command without `--config_file` to verify auto-loading: - -```python -def test_generate_decoy_database_autoconfig(self): - """Test that generate-decoy works without explicit --config_file (auto-loads bundled default).""" - runner = CliRunner() - result = runner.invoke(cli, - ['generate-decoy', '-in', 'testdata/test_db.fa', - '-out', 'testdata/output_decoy.fa', - '--method', 'protein-reverse']) - self.assertEqual(result.exit_code, 0) -``` - -Note: `test_generate_decoy_database_noconfig` already tests this case — verify it still passes. - -**Step 1: Run tests** - -```bash -cd pgatk -python -m unittest discover -s tests -p "*_tests.py" -v -``` - -**Step 2: Commit** - -```bash -git add pgatk/config/registry.py pgatk/commands/ -git commit -m "refactor: add config registry for auto-loading bundled defaults - -- Add pgatk/config/registry.py with central COMMAND_CONFIGS mapping -- Update all 14 command files to use load_config() -- -c flag is now optional: bundled defaults are loaded automatically -- User can still override with -c path/to/custom_config.yaml" -``` - ---- - -## PR 3: Bug Fixes (branch: `fix/bugs`) - -### Task 3.1: Fix `_filter_write_idxml_with_df` writing unfiltered data - -**Files:** -- Modify: `pgatk/proteomics/openms.py:437` - -**Step 1: Fix the store call** - -```python -# Before (line 437) -idxml_parser().store(output_file, prot_ids, pep_ids) -# After -idxml_parser().store(output_file, prot_ids, new_pep_ids) -``` - ---- - -### Task 3.2: Fix ISO-8859 encoding typo - -**Files:** -- Modify: `pgatk/cgenomes/cosmic_downloader.py:156` - -**Step 1: Fix the encoding name** - -The en-dash (U+2013) in `'ISO-8859–1'` must be replaced with a regular hyphen: - -```python -# Before (line 156) -outfile.write(gzip.decompress(open(local_file, 'rb').read()).decode('ISO-8859–1')) -# After -outfile.write(gzip.decompress(open(local_file, 'rb').read()).decode('ISO-8859-1')) -``` - ---- - -### Task 3.3: Fix `download_file` log inversion bug - -**Files:** -- Modify: `pgatk/toolbox/general.py:192-193` - -**Step 1: Fix the condition** - -```python -# Before (line 192-193) -if log is not None: - log = logging -# After -if log is None: - log = logging -``` - ---- - -### Task 3.4: Fix enzyme name typos in models.py - -**Files:** -- Modify: `pgatk/proteomics/models.py:21-22` - -**Step 1: Fix the typos** - -```python -# Before (lines 21-22 — "cleavege" is misspelled) -'Trypsin/P': {'cleavage rule': '(?<=[KRX])', 'PSIID': 
'MS:1001313', 'cleavege sites': 'KR'}, -'V8-DE': {'cleavage rule': '(?<=[DBEZX])(?!P)', 'PSIID': 'MS:1001314', 'cleavege sites': 'DBEZX'}, -# After -'Trypsin/P': {'cleavage rule': '(?<=[KRX])', 'PSIID': 'MS:1001313', 'cleavage sites': 'KR'}, -'V8-DE': {'cleavage rule': '(?<=[DBEZX])(?!P)', 'PSIID': 'MS:1001314', 'cleavage sites': 'DBEZX'}, -``` - -**Important:** Check if `'cleavege sites'` is used as a dictionary key anywhere — if so, update those references too. Search for `cleavege` across the codebase. - ---- - -### Task 3.5: Replace `print_help()` with Click's `required=True` - -**Files (11 command files):** - -For each command file listed below, add `required=True` to the relevant `@click.option()` decorators and remove the `print_help()` check. - -| File | Options to make required | -|---|---| -| `ensembl_database.py` | `--input_fasta` | -| `vcf_to_proteindb.py` | `--input_fasta`, `--vcf`, `--gene_annotations_gtf` | -| `cbioportal_to_proteindb.py` | `--input_mutation`, `--input_cds`, `--output_db` | -| `cosmic_to_proteindb.py` | `--input_mutation`, `--input_genes`, `--output_db` | -| `threeframe_translation.py` | `--input_fasta` | -| `proteindb_decoy.py` | `--input_database` | -| `peptide_class_fdr.py` | `--input_file`, `--output_file` | -| `dnaseq_to_proteindb.py` | `--input_fasta` | -| `blast_get_position.py` | `--input_psm_to_blast`, `--input_reference_database`, `--output_psm` | -| `mztab_class_fdr.py` | `--input_mztab`, `--outfile_name` | - -**Special case: `ensembl_downloader.py`** — This command requires `--taxonomy` OR `--ensembl_name` (either one). Can't use `required=True` directly. Use a Click callback or keep a validation check (but raise `click.UsageError` instead of `print_help()`): - -```python -if taxonomy is None and ensembl_name is None: - raise click.UsageError("Either --taxonomy or --ensembl_name is required.") -``` - -**Special case: `validate_peptides.py`** — Review the specific validation logic before converting. - -**Step 1: For each file, add `required=True` and remove `print_help()` import and call** - -Example pattern: -```python -# Before -from pgatk.commands.utils import print_help - -@click.option('-in', '--input_fasta', help='...') -def command(ctx, config_file, input_fasta, ...): - if input_fasta is None: - print_help() - -# After (remove print_help import, add required=True) -@click.option('-in', '--input_fasta', help='...', required=True) -def command(ctx, config_file, input_fasta, ...): - # print_help() call removed -``` - -**Step 2: Remove `print_help` from `pgatk/commands/utils.py`** if no longer used anywhere. 
- -**Step 3: Run tests** - -```bash -cd pgatk -python -m unittest discover -s tests -p "*_tests.py" -v -``` - ---- - -### Task 3.6: Replace deprecated `Bio.pairwise2` - -**Files:** -- Modify: `pgatk/proteogenomics/blast_get_position.py` - -**Step 1: Replace import** - -```python -# Before -from Bio import pairwise2, SeqIO -# After -from Bio import SeqIO -from Bio.Align import PairwiseAligner -``` - -**Step 2: Replace usage in `peptide_blast_protein` function** - -```python -# Before -score = pairwise2.align.localms(sequenceA=fasta, sequenceB=peptide, - match=1, mismatch=0, open=-2, extend=-2, score_only=True) -if score == length-1: - alignment = pairwise2.align.localms(sequenceA=fasta, sequenceB=peptide, - match=1, mismatch=0, open=-2, extend=-2)[0] - -# After -aligner = PairwiseAligner() -aligner.mode = 'local' -aligner.match_score = 1 -aligner.mismatch_score = 0 -aligner.open_gap_score = -2 -aligner.extend_gap_score = -2 -alignments = aligner.align(fasta, peptide) -score = alignments.score if alignments else 0 -if score == length - 1: - alignment = alignments[0] - # Update attribute access: alignment.aligned gives coordinate pairs - # Need to adapt get_details() calls to work with new alignment format -``` - -**Note:** The `PairwiseAligner` API differs from `pairwise2`. The alignment object attributes change: -- `alignment.seqA` / `alignment.seqB` → access via `alignment.target` / `alignment.query` -- `alignment.start` / `alignment.end` → access via `alignment.aligned` coordinate arrays - -Carefully test the `get_details()` function calls to ensure they still work with the new alignment format. - -**Step 3: Commit** - -```bash -git add -A -git commit -m "fix: resolve multiple bugs - -- Fix _filter_write_idxml_with_df writing unfiltered data (openms.py) -- Fix ISO-8859 encoding en-dash typo (cosmic_downloader.py) -- Fix download_file log inversion bug (general.py) -- Fix enzyme name typos 'cleavege' → 'cleavage' (models.py) -- Replace print_help() with Click required=True (11 command files) -- Replace deprecated Bio.pairwise2 with PairwiseAligner (blast_get_position.py)" -``` - ---- - -## PR 4: Cleanup (branch: `refactor/cleanup`) - -### Task 4.1: Remove dead `Species` and `SpeciesService` classes - -**Files:** -- Modify: `pgatk/ensembl/models.py` - -**Step 1: Verify they are unused** - -```bash -grep -r "Species\|SpeciesService" pgatk/ --include="*.py" -l -# Should only show pgatk/ensembl/models.py -``` - -**Step 2: Remove both classes** (lines 9-180 approximately). Keep any other code in the file (e.g., imports used elsewhere). - ---- - -### Task 4.2: Remove dead code after return statement - -**Files:** -- Modify: `pgatk/toolbox/general.py` - -**Step 1: Delete commented-out code** at lines 241-253 (after `return downloaded_file` in `download_file()`). 
- ---- - -### Task 4.3: Dynamic version from package metadata - -**Files:** -- Modify: `pgatk/cli.py` - -**Step 1: Use importlib.metadata** - -```python -# Before -@click.version_option(version='0.0.26') -def cli(): - -# After -from importlib.metadata import version, PackageNotFoundError - -try: - __version__ = version("pgatk") -except PackageNotFoundError: - __version__ = "dev" - -@click.version_option(version=__version__) -def cli(): -``` - ---- - -### Task 4.4: Fix CWD log file creation - -**Files:** -- Modify: `pgatk/toolbox/general.py` - -**Step 1: Replace FileHandler with StreamHandler** - -In `ParameterConfiguration.__init__`, replace the file-based logging with stderr-based logging: - -```python -# Before (lines 72-89) -self._log_handlers = [] -log_handlers_prefix = self._ROOT_CONFIG_NAME + '-' -log_handlers_extension = '.log' - -self._logger = logging.getLogger(__name__) -self._logger.setLevel(getattr(logging, self._log_level)) -self._log_files = [] -for llevel, lformat in self._logger_formatters.items(): - logfile = os.path.join(log_handlers_prefix + llevel.lower() + log_handlers_extension) - lformatter = logging.Formatter(lformat) - lhandler = logging.FileHandler(logfile, mode='w') - lhandler.setLevel(getattr(logging, llevel)) - lhandler.setFormatter(lformatter) - self._log_handlers.append(lhandler) - self._logger.addHandler(lhandler) - self._log_files.append(logfile) - -# After -self._log_handlers = [] -self._logger = logging.getLogger(__name__) -self._logger.setLevel(getattr(logging, self._log_level)) -self._log_files = [] - -# Log to stderr instead of creating files in CWD -for llevel, lformat in self._logger_formatters.items(): - lformatter = logging.Formatter(lformat) - lhandler = logging.StreamHandler() - lhandler.setLevel(getattr(logging, llevel)) - lhandler.setFormatter(lformatter) - self._log_handlers.append(lhandler) - self._logger.addHandler(lhandler) -``` - -**Step 2: Clean up `get_session_log_files()`** - -This method returns `self._log_files` which will now be empty. Keep the method (in case downstream code calls it) but it will return `[]`. - ---- - -### Task 4.5: Run full test suite and commit - -**Step 1: Run tests** - -```bash -cd pgatk -python -m unittest discover -s tests -p "*_tests.py" -v -``` - -**Step 2: Commit** - -```bash -git add -A -git commit -m "refactor: remove dead code and improve logging - -- Remove unused Species and SpeciesService classes (ensembl/models.py) -- Remove commented-out dead code after return (toolbox/general.py) -- Use dynamic version from package metadata instead of hardcoded '0.0.26' -- Log to stderr instead of creating log files in CWD" -``` - ---- - -## Summary: Branch and PR Strategy - -| PR | Branch | Base | Description | -|---|---|---|---| -| 1 | `refactor/rename` | `dev` | Full rename: pypgatk → pgatk | -| 2 | `refactor/config-registry` | `refactor/rename` | Auto-load bundled config defaults | -| 3 | `fix/bugs` | `refactor/rename` | Fix 6 bugs | -| 4 | `refactor/cleanup` | `refactor/rename` | Remove dead code, fix logging | - -PR 1 merges first → then PRs 2, 3, 4 can merge in any order. - -## Post-Merge (Manual) - -1. Rename GitHub repo: Settings → Repository name → `pgatk` -2. Publish new `pgatk` package to PyPI -3. 
Update Bioconda recipe diff --git a/docs/plans/2026-03-01-phase0-infrastructure.md b/docs/plans/2026-03-01-phase0-infrastructure.md deleted file mode 100644 index d70a21d..0000000 --- a/docs/plans/2026-03-01-phase0-infrastructure.md +++ /dev/null @@ -1,1199 +0,0 @@ -# Phase 0: Infrastructure & Code Quality — Implementation Plan - -> **For Claude:** REQUIRED SUB-SKILL: Use superpowers:executing-plans to implement this plan task-by-task. - -**Goal:** Fix existing bugs, modernize code patterns, and expand test coverage so the graph engine (Phase 1) is built on solid ground. - -**Architecture:** No architectural changes — this is a cleanup phase touching existing files. All changes are backward-compatible. The test suite must pass before and after each task. - -**Tech Stack:** Python 3.9+, pytest, dataclasses, pathlib, concurrent.futures, logging - ---- - -## Task 1: Add conftest.py with proper fixtures and absolute paths - -**Files:** -- Create: `pgatk/tests/conftest.py` -- Modify: `pgatk/tests/pgatk_tests.py` -- Modify: `pgatk/tests/classes_tests.py` - -**Step 1: Create conftest.py with path fixtures** - -```python -# pgatk/tests/conftest.py -import os -from pathlib import Path - -import pytest - -# Absolute path to testdata/ directory, works regardless of CWD -TESTDATA_DIR = Path(__file__).resolve().parent.parent / "testdata" -CONFIG_DIR = Path(__file__).resolve().parent.parent / "config" - - -@pytest.fixture -def testdata_dir(): - """Absolute path to the testdata directory.""" - return TESTDATA_DIR - - -@pytest.fixture -def config_dir(): - """Absolute path to the config directory.""" - return CONFIG_DIR - - -@pytest.fixture(autouse=True) -def change_to_package_dir(tmp_path, monkeypatch): - """Ensure tests run from a consistent directory (the pgatk package root).""" - monkeypatch.chdir(Path(__file__).resolve().parent.parent) -``` - -**Step 2: Run existing tests to verify they still pass** - -Run: `cd /Users/yperez/work/proteogenomics/pgatk && python -m pytest pgatk/tests/ -v --timeout=120 -x` -Expected: All existing tests pass (the `autouse` fixture changes CWD to the package root, preserving relative path behavior). 
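-
-As a usage sketch, any test can now take the fixtures as arguments and get CWD-independent paths (the test below is hypothetical and not part of the plan's required tests; the config file name is taken from the bundled configs):
-
-```python
-# Hypothetical example of consuming the new fixtures
-def test_fixture_paths_resolve(testdata_dir, config_dir):
-    assert testdata_dir.is_dir()
-    assert (config_dir / "ensembl_config.yaml").is_file()
-```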
- -**Step 3: Commit** - -```bash -git add pgatk/tests/conftest.py -git commit -m "test: add conftest.py with testdata/config path fixtures" -``` - ---- - -## Task 2: Fix variable shadowing bug in protein_database_decoy.py - -**Files:** -- Modify: `pgatk/proteomics/db/protein_database_decoy.py:170,180,203` -- Create: `pgatk/tests/test_decoy_shadowing.py` - -**Step 1: Write the failing test** - -```python -# pgatk/tests/test_decoy_shadowing.py -"""Verify that the decoy_sequence import is not shadowed by local variables.""" -from pyteomics.fasta import decoy_sequence as pyteomics_decoy_sequence -from pgatk.proteomics.db import protein_database_decoy - - -def test_decoy_sequence_import_not_shadowed(): - """The module-level import of decoy_sequence must remain callable after - print_target_decoy_composition defines a local variable of the same name.""" - # Verify the import at module level is still the pyteomics function - module_ref = protein_database_decoy.decoy_sequence - assert callable(module_ref), ( - "decoy_sequence at module level should be the pyteomics function, " - f"got {type(module_ref)}" - ) - assert module_ref is pyteomics_decoy_sequence -``` - -**Step 2: Run test to verify it passes (this is a module-level check, so it should pass)** - -Run: `cd /Users/yperez/work/proteogenomics/pgatk && python -m pytest pgatk/tests/test_decoy_shadowing.py -v` -Expected: PASS (the shadow is local to the method, not module-level — but we fix it for safety) - -**Step 3: Rename the shadowing variable** - -In `pgatk/proteomics/db/protein_database_decoy.py`, rename the local variable `decoy_sequence` to `decoy_seq_accumulated` at lines 170, 180, and 203: - -- Line 170: `decoy_sequence = ''` → `decoy_seq_accumulated = ''` -- Line 180: `decoy_sequence = decoy_sequence + str(record.seq)` → `decoy_seq_accumulated = decoy_seq_accumulated + str(record.seq)` -- Line 203: `decoy_aa_composition = self.count_aa_in_dictionary(decoy_aa_composition, decoy_sequence)` → `decoy_aa_composition = self.count_aa_in_dictionary(decoy_aa_composition, decoy_seq_accumulated)` - -**Step 4: Run all tests to verify nothing broke** - -Run: `cd /Users/yperez/work/proteogenomics/pgatk && python -m pytest pgatk/tests/ -v --timeout=120 -x` -Expected: All PASS - -**Step 5: Commit** - -```bash -git add pgatk/proteomics/db/protein_database_decoy.py pgatk/tests/test_decoy_shadowing.py -git commit -m "fix: rename shadowing variable decoy_sequence in print_target_decoy_composition" -``` - ---- - -## Task 3: Convert SNP class to dataclass - -**Files:** -- Modify: `pgatk/cgenomes/models.py` -- Modify: `pgatk/cgenomes/cgenomes_proteindb.py` (update `snp.type` → `snp.mutation_type`) -- Create: `pgatk/tests/test_models.py` - -**Step 1: Write the failing test** - -```python -# pgatk/tests/test_models.py -import dataclasses -from pgatk.cgenomes.models import SNP - - -def test_snp_is_dataclass(): - assert dataclasses.is_dataclass(SNP) - - -def test_snp_defaults(): - snp = SNP() - assert snp.gene is None - assert snp.mrna is None - assert snp.dna_mut is None - assert snp.aa_mut is None - assert snp.mutation_type is None - - -def test_snp_with_values(): - snp = SNP(gene="BRCA1", mrna="NM_007294", dna_mut="c.5266dupC", - aa_mut="p.Q1756fs", mutation_type="Insertion") - assert snp.gene == "BRCA1" - assert snp.mutation_type == "Insertion" - - -def test_snp_no_type_builtin_shadow(): - """The old SNP class stored mutation_type as self.type, shadowing the builtin.""" - snp = SNP(mutation_type="Substitution") - assert not hasattr(snp, 'type') or 
snp.mutation_type == "Substitution" -``` - -**Step 2: Run test to verify it fails** - -Run: `cd /Users/yperez/work/proteogenomics/pgatk && python -m pytest pgatk/tests/test_models.py -v` -Expected: FAIL — `test_snp_is_dataclass` fails because SNP is not a dataclass yet - -**Step 3: Convert SNP to dataclass** - -Replace the entire content of `pgatk/cgenomes/models.py`: - -```python -from dataclasses import dataclass -from typing import Optional - - -@dataclass -class SNP: - """Represents a single nucleotide polymorphism or mutation.""" - gene: Optional[str] = None - mrna: Optional[str] = None - dna_mut: Optional[str] = None - aa_mut: Optional[str] = None - mutation_type: Optional[str] = None -``` - -**Step 4: Update references from `snp.type` to `snp.mutation_type`** - -In `pgatk/cgenomes/cgenomes_proteindb.py`, find all uses of `.type` on SNP objects: -- Search for `snp.type` or `.type =` in the context of SNP objects -- Line where SNP is constructed: `snp = SNP()` followed by `snp.type = ...` → change to `snp.mutation_type = ...` -- Any reads of `snp.type` → `snp.mutation_type` - -Specifically, look for patterns like: -```python -snp.type = mutation_type # → snp.mutation_type = mutation_type -``` -And reads like: -```python -snp.type # → snp.mutation_type -``` - -**Step 5: Run tests to verify they pass** - -Run: `cd /Users/yperez/work/proteogenomics/pgatk && python -m pytest pgatk/tests/test_models.py pgatk/tests/ -v --timeout=120 -x` -Expected: All PASS - -**Step 6: Commit** - -```bash -git add pgatk/cgenomes/models.py pgatk/cgenomes/cgenomes_proteindb.py pgatk/tests/test_models.py -git commit -m "refactor: convert SNP to dataclass, rename .type to .mutation_type" -``` - ---- - -## Task 4: Replace pathos with concurrent.futures in blast_get_position.py - -**Files:** -- Modify: `pgatk/proteogenomics/blast_get_position.py:1-10,85,151-155` -- Modify: `pyproject.toml` (remove pathos from dependencies after both files are migrated) - -**Step 1: Write the test** - -```python -# pgatk/tests/test_blast_imports.py -"""Verify blast_get_position uses stdlib concurrent.futures, not pathos.""" -import importlib - - -def test_no_pathos_import_in_blast(): - """blast_get_position should not import pathos.""" - import pgatk.proteogenomics.blast_get_position as mod - source_file = mod.__file__ - with open(source_file, 'r') as f: - source = f.read() - assert 'pathos' not in source, "blast_get_position.py still imports pathos" - assert 'concurrent.futures' in source or 'ProcessPoolExecutor' in source -``` - -**Step 2: Run test to verify it fails** - -Run: `cd /Users/yperez/work/proteogenomics/pgatk && python -m pytest pgatk/tests/test_blast_imports.py -v` -Expected: FAIL — pathos is still imported - -**Step 3: Replace pathos with concurrent.futures** - -In `pgatk/proteogenomics/blast_get_position.py`: - -Replace imports (lines 1-9): -```python -import pandas as pd -from Bio import SeqIO -import datetime -from concurrent.futures import ProcessPoolExecutor -from tqdm import tqdm -import ahocorasick - -from pgatk.toolbox.general import ParameterConfiguration -``` - -Remove `Manager` import and usage. 
In `__init__`, replace line 85: -```python -# OLD: self.blast_dict = Manager().dict() -self.blast_dict = {} -``` - -Replace the pool usage (lines 151-155) in `blast()`: -```python -# OLD: -# pool = Pool(int(self._number_of_processes)) -# list(tqdm(pool.imap(self._result, seq_list), total=len(seq_list), desc="Blast", unit="peptide")) -# pool.close() -# pool.join() - -# NEW: -with ProcessPoolExecutor(max_workers=int(self._number_of_processes)) as executor: - list(tqdm(executor.map(self._result, seq_list, chunksize=100), - total=len(seq_list), desc="Blast", unit="peptide")) -``` - -Note: Since `self._result` writes to `self.blast_dict` which is a shared Manager dict, and we're removing Manager, we need to refactor `_result()` to return results instead of writing to shared state. Then collect results in the main process: - -```python -# Refactor _result to return data instead of writing to shared dict -def _result(self, sequence): - """Process a single peptide and return (peptide, details) or None.""" - result = peptide_blast_protein(self.fasta_dict, sequence) - if result: - return (sequence, result) - return None - -# In blast(): -with ProcessPoolExecutor(max_workers=int(self._number_of_processes)) as executor: - results = list(tqdm(executor.map(self._result, seq_list, chunksize=100), - total=len(seq_list), desc="Blast", unit="peptide")) -for r in results: - if r is not None: - self.blast_dict[r[0]] = r[1] -``` - -**Step 4: Run the existing blast test** - -Run: `cd /Users/yperez/work/proteogenomics/pgatk && python -m pytest pgatk/tests/pgatk_tests.py::PgatkRunnerTests::test_blast -v --timeout=120` -Expected: PASS - -**Step 5: Run import test** - -Run: `cd /Users/yperez/work/proteogenomics/pgatk && python -m pytest pgatk/tests/test_blast_imports.py -v` -Expected: PASS - -**Step 6: Commit** - -```bash -git add pgatk/proteogenomics/blast_get_position.py pgatk/tests/test_blast_imports.py -git commit -m "refactor: replace pathos with concurrent.futures in blast_get_position" -``` - ---- - -## Task 5: Replace pathos with concurrent.futures in spectrumai.py - -**Files:** -- Modify: `pgatk/proteogenomics/spectrumai.py:1-10,47,290,314-318` - -**Step 1: Write the test** - -```python -# pgatk/tests/test_spectrumai_imports.py -"""Verify spectrumai uses stdlib concurrent.futures, not pathos.""" - - -def test_no_pathos_import_in_spectrumai(): - import pgatk.proteogenomics.spectrumai as mod - with open(mod.__file__, 'r') as f: - source = f.read() - assert 'pathos' not in source, "spectrumai.py still imports pathos" -``` - -**Step 2: Run test to verify it fails** - -Run: `cd /Users/yperez/work/proteogenomics/pgatk && python -m pytest pgatk/tests/test_spectrumai_imports.py -v` -Expected: FAIL - -**Step 3: Replace pathos with concurrent.futures** - -In `pgatk/proteogenomics/spectrumai.py`: - -Replace imports (lines 1-10): -```python -import datetime -import os.path -import re -import pandas as pd -from concurrent.futures import ProcessPoolExecutor -from pyopenms import (TheoreticalSpectrumGenerator, MSSpectrum, - AASequence, Param, MzMLFile, MSExperiment, SpectrumLookup) -from tqdm import tqdm -from pgatk.toolbox.general import ParameterConfiguration -``` - -Remove Manager usage. 
In `__init__`, replace line 47: -```python -# OLD: self.df_list = Manager().list() -# Remove this line entirely — we'll collect results from executor.map() -``` - -Refactor `_multiprocess_inspect_spectrum` (around line 290) to return the result instead of appending to shared list: -```python -def _multiprocess_inspect_spectrum(self, df): - """Process a single mzML group and return the annotated DataFrame.""" - return self._inspect_spectrum(df, self._mzml_path, self._mzml_files) -``` - -Replace pool usage (lines 314-318) in `validate()`: -```python -# OLD: -# pool = Pool(int(self._number_of_processes)) -# list(tqdm(pool.imap(self._multiprocess_inspect_spectrum, list_of_dfs), ...)) -# pool.close() -# pool.join() - -# NEW: -with ProcessPoolExecutor(max_workers=int(self._number_of_processes)) as executor: - results = list(tqdm( - executor.map(self._multiprocess_inspect_spectrum, list_of_dfs), - total=len(list_of_dfs), desc="Validate By Each mzML", unit="mzML" - )) - -# Replace the line that concatenates self.df_list: -# OLD: df_output = pd.concat(self.df_list, axis=0, ignore_index=True) -# NEW: -df_output = pd.concat(results, axis=0, ignore_index=True) -``` - -**Step 4: Run test to verify it passes** - -Run: `cd /Users/yperez/work/proteogenomics/pgatk && python -m pytest pgatk/tests/test_spectrumai_imports.py -v` -Expected: PASS - -**Step 5: Now remove pathos from pyproject.toml** - -In `pyproject.toml`, remove the `pathos` dependency line: -``` -# Remove: "pathos>=0.2.8,<1.0", -``` -Also remove from `requirements.txt` if present. - -**Step 6: Run full test suite** - -Run: `cd /Users/yperez/work/proteogenomics/pgatk && python -m pytest pgatk/tests/ -v --timeout=120 -x` -Expected: All PASS - -**Step 7: Commit** - -```bash -git add pgatk/proteogenomics/spectrumai.py pgatk/tests/test_spectrumai_imports.py pyproject.toml requirements.txt -git commit -m "refactor: replace pathos with concurrent.futures in spectrumai, remove pathos dependency" -``` - ---- - -## Task 6: Replace print() with logger across all modules - -This is a large task covering many files. Work through them one file at a time. 
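-
-The substitution pattern, shown on a representative call (the message and variable are illustrative; logger access follows the rules listed in Step 3 below):
-
-```python
-# Before
-print("Total number of proteins: " + str(count))
-
-# After, inside an instance method (classes get their logger via ParameterConfiguration.get_logger())
-self.get_logger().info("Total number of proteins: %s", count)
-
-# After, in a @staticmethod or module-level function
-import logging
-logging.getLogger(__name__).info("Total number of proteins: %s", count)
-```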
- -**Files:** -- Modify: `pgatk/proteomics/db/protein_database_decoy.py` (14 print sites) -- Modify: `pgatk/ensembl/ensembl.py` (9 print sites) -- Modify: `pgatk/cgenomes/cgenomes_proteindb.py` (18 print sites) -- Modify: `pgatk/proteogenomics/blast_get_position.py` (8 print sites) -- Modify: `pgatk/proteogenomics/spectrumai.py` (6 print sites) - -**Step 1: Write a test that checks for print() usage in source files** - -```python -# pgatk/tests/test_no_print_statements.py -"""Verify that production code uses logger instead of print().""" -import ast -from pathlib import Path - -PGATK_ROOT = Path(__file__).resolve().parent.parent - -# Files that should not contain print() calls (excluding tests and __init__) -CHECKED_FILES = [ - "proteomics/db/protein_database_decoy.py", - "ensembl/ensembl.py", - "cgenomes/cgenomes_proteindb.py", - "proteogenomics/blast_get_position.py", - "proteogenomics/spectrumai.py", -] - - -def _count_print_calls(filepath: Path) -> list[int]: - """Return line numbers of print() function calls in a Python file.""" - source = filepath.read_text() - tree = ast.parse(source) - lines = [] - for node in ast.walk(tree): - if isinstance(node, ast.Call): - func = node.func - if isinstance(func, ast.Name) and func.id == "print": - lines.append(node.lineno) - return lines - - -def test_no_print_in_production_code(): - violations = {} - for rel_path in CHECKED_FILES: - filepath = PGATK_ROOT / rel_path - if filepath.exists(): - lines = _count_print_calls(filepath) - if lines: - violations[rel_path] = lines - assert not violations, ( - f"print() calls found in production code (use logger instead):\n" - + "\n".join(f" {f}: lines {lines}" for f, lines in violations.items()) - ) -``` - -**Step 2: Run test to verify it fails** - -Run: `cd /Users/yperez/work/proteogenomics/pgatk && python -m pytest pgatk/tests/test_no_print_statements.py -v` -Expected: FAIL with list of all print() locations - -**Step 3: Replace print() calls file by file** - -For each file, the pattern is: -1. Ensure the file has a logger (most already do via `ParameterConfiguration.get_logger()`) -2. Replace `print(...)` with `self.get_logger().info(...)` for informational output -3. Replace `print("ERROR: ...")` with `self.get_logger().error(...)` -4. Replace `print("Warning: ...")` with `self.get_logger().warning(...)` -5. 
For static methods or module-level functions without `self`, use `logging.getLogger(__name__)` - -**Key replacements by file:** - -**protein_database_decoy.py** — all `print()` inside instance methods → `self.get_logger().info()`: -- Lines 158-160, 193, 195, 197, 199-201: diagnostic stats → `self.get_logger().info(...)` -- Lines 261, 262, 279, 282, 286, 320, 367: decoypyrat progress → `self.get_logger().info(...)` -- Lines 481, 483-484: final stats → `self.get_logger().info(...)` - -**ensembl.py** — mixed instance and static methods: -- Line 131: `print("skip entries...")` → `self.get_logger().warning("skip entries...")` -- Line 248: `print("Database already exists...")` → `logging.getLogger(__name__).debug("Database already exists: %s %s", e, gtf_db_file)` -- Line 272-274: `print("""Feature {} ...""")` → `logging.getLogger(__name__).warning(...)` -- Line 400: `print("Could not extra cds...")` → `self.get_logger().warning(...)` -- Line 755: redundant `print(msg)` after `logger` call → remove the print -- Lines 823-829: `print(" total number...")` → `self.get_logger().info(...)` -- Line 857-858: `print("ERROR: This script...")` → `logging.getLogger(__name__).error(...)` - -**cgenomes_proteindb.py** — add `import logging` at top, use `self.get_logger()` in instance methods and `logging.getLogger(__name__)` in static methods. Replace all 18 print calls. - -**blast_get_position.py** — replace 8 print calls with `self.get_logger()`. - -**spectrumai.py** — replace 6 print calls with `self.get_logger()`. - -**Step 4: Run the no-print test** - -Run: `cd /Users/yperez/work/proteogenomics/pgatk && python -m pytest pgatk/tests/test_no_print_statements.py -v` -Expected: PASS - -**Step 5: Run full test suite** - -Run: `cd /Users/yperez/work/proteogenomics/pgatk && python -m pytest pgatk/tests/ -v --timeout=120 -x` -Expected: All PASS - -**Step 6: Commit** - -```bash -git add pgatk/proteomics/db/protein_database_decoy.py pgatk/ensembl/ensembl.py pgatk/cgenomes/cgenomes_proteindb.py pgatk/proteogenomics/blast_get_position.py pgatk/proteogenomics/spectrumai.py pgatk/tests/test_no_print_statements.py -git commit -m "refactor: replace print() with logger across all production modules" -``` - ---- - -## Task 7: Replace broad except Exception with specific exceptions - -**Files:** -- Modify: `pgatk/toolbox/general.py:163,229,342` -- Modify: `pgatk/ensembl/ensembl.py:247` -- Modify: `pgatk/proteogenomics/spectrumai.py:163,180` - -**Step 1: Fix each broad except block** - -**`toolbox/general.py` line 163:** -```python -# OLD: -except Exception as e: - raise ToolBoxException(str(e)) -# NEW: -except (OSError, PermissionError) as e: - raise ToolBoxException(str(e)) from e -``` - -**`toolbox/general.py` line 229 (download_file retry):** -```python -# OLD: -except Exception as error: - remaining_download_tries -= 1 - log.error("Error code: " + str(error)) -# NEW: -except (URLError, ContentTooShortError, OSError, HTTPError) as error: - remaining_download_tries -= 1 - log.error("Download error: %s", error) -``` - -**`toolbox/general.py` line 342 (gunzip_files):** -```python -# OLD: -except Exception as e: - err_msg = "UNKNOWN ERROR uncompressing file..." -# NEW: -except (OSError, subprocess.CalledProcessError) as e: - err_msg = "Error uncompressing file..." 
-``` - -**`ensembl/ensembl.py` line 247 (parse_gtf):** -```python -# OLD: -except Exception as e: - print("Database already exists: " + str(e), gtf_db_file) -# NEW: -except (gffutils.FeatureNotFoundError, ValueError, sqlite3.OperationalError) as e: - logging.getLogger(__name__).debug("GTF database already exists: %s %s", e, gtf_db_file) -``` -Add `import sqlite3` to the imports. - -**`spectrumai.py` line 163 (mzML loading):** -```python -# OLD: -except Exception as e: - print(mzml_file + " has ERROR!") -# NEW: -except (OSError, RuntimeError, ValueError) as e: - self.get_logger().error("Error loading mzML file %s: %s", mzml_file, e) -``` - -**`spectrumai.py` line 180 (scan lookup):** -```python -# OLD: -except Exception as e: - print("ERROR: ...") -# NEW: -except (RuntimeError, IndexError, KeyError) as e: - self.get_logger().error("Scan lookup error in %s scan %s: %s", mzml_file, scan_num, e) -``` - -**Step 2: Run full test suite** - -Run: `cd /Users/yperez/work/proteogenomics/pgatk && python -m pytest pgatk/tests/ -v --timeout=120 -x` -Expected: All PASS - -**Step 3: Commit** - -```bash -git add pgatk/toolbox/general.py pgatk/ensembl/ensembl.py pgatk/proteogenomics/spectrumai.py -git commit -m "refactor: replace broad except Exception with specific exception types" -``` - ---- - -## Task 8: Standardize config fallback pattern with helper method - -**Files:** -- Modify: `pgatk/toolbox/general.py` (add `_get_config_value` helper to `ParameterConfiguration`) -- Modify: `pgatk/ensembl/ensembl.py` (simplify `__init__` config loading) -- Modify: `pgatk/cgenomes/cgenomes_proteindb.py` (simplify `get_mutations_default_options`) -- Modify: `pgatk/proteogenomics/blast_get_position.py` (simplify `get_blast_parameters`) -- Modify: `pgatk/proteogenomics/spectrumai.py` (simplify parameter loading) - -**Step 1: Write the test** - -```python -# pgatk/tests/test_parameter_configuration.py -"""Test the ParameterConfiguration config fallback helper.""" -from pgatk.toolbox.general import ParameterConfiguration - - -def test_get_config_value_from_pipeline_params(): - """Pipeline params should take precedence.""" - config = ParameterConfiguration.__new__(ParameterConfiguration) - config._pipeline_params = {"root": {"key1": "from_pipeline"}} - config._default_params = {"root": {"key1": "from_default"}} - config._ROOT_CONFIG_NAME = "root" - result = config.get_config_value("key1", "fallback") - assert result == "from_pipeline" - - -def test_get_config_value_from_defaults(): - """Default params used when pipeline params missing.""" - config = ParameterConfiguration.__new__(ParameterConfiguration) - config._pipeline_params = {"root": {}} - config._default_params = {"root": {"key1": "from_default"}} - config._ROOT_CONFIG_NAME = "root" - result = config.get_config_value("key1", "fallback") - assert result == "from_default" - - -def test_get_config_value_fallback(): - """Fallback returned when neither source has the key.""" - config = ParameterConfiguration.__new__(ParameterConfiguration) - config._pipeline_params = {"root": {}} - config._default_params = {"root": {}} - config._ROOT_CONFIG_NAME = "root" - result = config.get_config_value("missing_key", "fallback_value") - assert result == "fallback_value" -``` - -**Step 2: Run test to verify it fails** - -Run: `cd /Users/yperez/work/proteogenomics/pgatk && python -m pytest pgatk/tests/test_parameter_configuration.py -v` -Expected: FAIL — `get_config_value` does not exist yet - -**Step 3: Add the helper method to ParameterConfiguration** - -In 
`pgatk/toolbox/general.py`, add to the `ParameterConfiguration` class: - -```python -def get_config_value(self, key: str, default=None): - """Get a configuration value with standard 3-layer fallback. - - Checks pipeline_params first, then default_params, then returns default. - """ - root = self._ROOT_CONFIG_NAME - # Check pipeline params first - if (self._pipeline_params is not None - and root in self._pipeline_params - and key in self._pipeline_params[root]): - return self._pipeline_params[root][key] - # Then check default config params - if (self._default_params is not None - and root in self._default_params - and key in self._default_params[root]): - return self._default_params[root][key] - return default -``` - -**Step 4: Run test to verify it passes** - -Run: `cd /Users/yperez/work/proteogenomics/pgatk && python -m pytest pgatk/tests/test_parameter_configuration.py -v` -Expected: PASS - -**Step 5: Refactor subclasses to use the new helper** - -Replace the repetitive fallback patterns in each subclass with `self.get_config_value(key, default)`. For example, in `blast_get_position.py`, the `get_blast_parameters()` method: - -```python -# OLD pattern (repeated everywhere): -def get_blast_parameters(self, variable: str, default_value): - value = default_value - if self._pipeline_params is not None and self._ROOT_CONFIG_NAME in self._pipeline_params: - if variable in self._pipeline_params[self._ROOT_CONFIG_NAME]: - value = self._pipeline_params[self._ROOT_CONFIG_NAME][variable] - elif self._default_params is not None and self._ROOT_CONFIG_NAME in self._default_params: - if variable in self._default_params[self._ROOT_CONFIG_NAME]: - value = self._default_params[self._ROOT_CONFIG_NAME][variable] - return value - -# NEW: -def get_blast_parameters(self, variable: str, default_value): - return self.get_config_value(variable, default_value) -``` - -Apply the same simplification to equivalent methods in `cgenomes_proteindb.py`, `spectrumai.py`, and `ensembl.py`. - -**Step 6: Run full test suite** - -Run: `cd /Users/yperez/work/proteogenomics/pgatk && python -m pytest pgatk/tests/ -v --timeout=120 -x` -Expected: All PASS - -**Step 7: Commit** - -```bash -git add pgatk/toolbox/general.py pgatk/ensembl/ensembl.py pgatk/cgenomes/cgenomes_proteindb.py pgatk/proteogenomics/blast_get_position.py pgatk/proteogenomics/spectrumai.py pgatk/tests/test_parameter_configuration.py -git commit -m "refactor: add get_config_value helper, DRY up config fallback pattern" -``` - ---- - -## Task 9: Convert db/digest_mutant_protein.py to proper CLI command - -**Files:** -- Modify: `pgatk/db/digest_mutant_protein.py` (wrap in Click command) -- Modify: `pgatk/cli.py` (register new command) -- Create: `pgatk/commands/digest_mutant_protein.py` (Click wrapper) - -**Step 1: Refactor the script into a function** - -Wrap all logic in `pgatk/db/digest_mutant_protein.py` inside a function with proper parameters: - -```python -# pgatk/db/digest_mutant_protein.py -import re -import logging -from collections import OrderedDict -from typing import Optional - -from Bio import SeqIO - -logger = logging.getLogger(__name__) - - -def trypsin_cleavage(proseq: str, miss_cleavage: int) -> list: - """Perform in-silico tryptic digestion.""" - # (existing trypsin_cleavage function body, unchanged) - ... 
- - -def digest_mutant_proteins( - input_file: str, - fasta_file: str, - output_file: str, - decoy_prefix: str = "DECOY_", - min_length: int = 7, - max_length: int = 40, - missed_cleavages: int = 2, -) -> None: - """Digest mutant proteins and filter against canonical peptidome. - - Reads mutant protein sequences, performs tryptic digestion, removes peptides - found in the canonical proteome, and writes unique variant peptides. - """ - # (existing script logic from lines 83-139, using function parameters - # instead of getopt-parsed variables) - ... -``` - -Remove all `sys.argv`, `getopt`, and `sys.exit()` code. Remove the import-time execution. - -**Step 2: Create Click command wrapper** - -```python -# pgatk/commands/digest_mutant_protein.py -import click - -from pgatk.db.digest_mutant_protein import digest_mutant_proteins - - -@click.command("digest-mutant-protein", short_help="Digest mutant proteins and filter against canonical proteome") -@click.option("-i", "--input", "input_file", required=True, help="Input mutant protein FASTA") -@click.option("-f", "--fasta", "fasta_file", required=True, help="Reference canonical protein FASTA") -@click.option("-o", "--output", "output_file", required=True, help="Output file for unique variant peptides") -@click.option("--prefix", default="DECOY_", help="Decoy prefix to skip (default: DECOY_)") -@click.option("--min-len", default=7, type=int, help="Minimum peptide length (default: 7)") -@click.option("--max-len", default=40, type=int, help="Maximum peptide length (default: 40)") -@click.option("--missed-cleavages", default=2, type=int, help="Missed cleavages (default: 2)") -def digest_mutant_protein(input_file, fasta_file, output_file, prefix, min_len, max_len, missed_cleavages): - digest_mutant_proteins(input_file, fasta_file, output_file, prefix, min_len, max_len, missed_cleavages) -``` - -**Step 3: Register in cli.py** - -Add to `pgatk/cli.py`: -```python -from pgatk.commands.digest_mutant_protein import digest_mutant_protein -cli.add_command(digest_mutant_protein) -``` - -**Step 4: Run CLI help to verify registration** - -Run: `cd /Users/yperez/work/proteogenomics/pgatk && python -m pgatk.cli digest-mutant-protein --help` -Expected: Help text with all options displayed - -**Step 5: Run full test suite** - -Run: `cd /Users/yperez/work/proteogenomics/pgatk && python -m pytest pgatk/tests/ -v --timeout=120 -x` -Expected: All PASS - -**Step 6: Commit** - -```bash -git add pgatk/db/digest_mutant_protein.py pgatk/commands/digest_mutant_protein.py pgatk/cli.py -git commit -m "refactor: convert digest_mutant_protein from standalone script to Click CLI command" -``` - ---- - -## Task 10: Convert db/map_peptide2genome.py to proper CLI command - -**Files:** -- Modify: `pgatk/db/map_peptide2genome.py` (wrap in functions) -- Create: `pgatk/commands/map_peptide2genome.py` (Click wrapper) -- Modify: `pgatk/cli.py` (register new command) - -**Step 1: Refactor the script into functions** - -Same pattern as Task 9. Wrap all logic in a function: - -```python -# pgatk/db/map_peptide2genome.py -import re -import logging -from typing import Optional - -from Bio import SeqIO - -logger = logging.getLogger(__name__) - - -class EXON: - """Represents an exon with genomic coordinates.""" - # (existing EXON class, unchanged except add type hints) - ... - - -def cal_trans_pos(exon_list: list) -> list: - # (existing function, unchanged) - ... - - -def get_pep_cor(exon_object_list: list, n1: int, n2: int) -> list: - # (existing function, unchanged) - ... 
- - -def parse_gtf(infile: str) -> dict: - # (existing function, unchanged) - ... - - -def map_peptides_to_genome( - input_file: str, - gtf_file: str, - fasta_file: str, - idmap_file: str, - output_file: str, - pep_col: int = 0, - prot_col: int = 1, -) -> None: - """Map identified peptides to genomic coordinates. - - Reads peptide identifications, maps them to transcript coordinates using - a GTF file and protein-transcript ID mapping, outputs GFF3 format. - """ - # (existing script logic from lines 146-261, using function parameters) - ... -``` - -Remove all `sys.argv`, `getopt`, `sys.exit()` code. - -**Step 2: Create Click command wrapper** - -```python -# pgatk/commands/map_peptide2genome.py -import click - -from pgatk.db.map_peptide2genome import map_peptides_to_genome - - -@click.command("map-peptide2genome", short_help="Map peptides to genomic coordinates (GFF3 output)") -@click.option("-i", "--input", "input_file", required=True, help="Input peptide identification TSV") -@click.option("-g", "--gtf", "gtf_file", required=True, help="GTF gene annotation file") -@click.option("-f", "--fasta", "fasta_file", required=True, help="Protein FASTA file") -@click.option("-m", "--idmap", "idmap_file", required=True, help="Protein-to-transcript ID mapping file") -@click.option("-o", "--output", "output_file", required=True, help="Output GFF3 file") -@click.option("--pep-col", default=0, type=int, help="Peptide column index (default: 0)") -@click.option("--prot-col", default=1, type=int, help="Protein column index (default: 1)") -def map_peptide2genome(input_file, gtf_file, fasta_file, idmap_file, output_file, pep_col, prot_col): - map_peptides_to_genome(input_file, gtf_file, fasta_file, idmap_file, output_file, pep_col, prot_col) -``` - -**Step 3: Register in cli.py** - -Add to `pgatk/cli.py`: -```python -from pgatk.commands.map_peptide2genome import map_peptide2genome -cli.add_command(map_peptide2genome) -``` - -**Step 4: Run CLI help to verify registration** - -Run: `cd /Users/yperez/work/proteogenomics/pgatk && python -m pgatk.cli map-peptide2genome --help` -Expected: Help text with all options displayed - -**Step 5: Run full test suite** - -Run: `cd /Users/yperez/work/proteogenomics/pgatk && python -m pytest pgatk/tests/ -v --timeout=120 -x` -Expected: All PASS - -**Step 6: Commit** - -```bash -git add pgatk/db/map_peptide2genome.py pgatk/commands/map_peptide2genome.py pgatk/cli.py -git commit -m "refactor: convert map_peptide2genome from standalone script to Click CLI command" -``` - ---- - -## Task 11: Use pathlib.Path for file operations - -**Files:** -- Modify: `pgatk/ensembl/ensembl.py` (5 sites) -- Modify: `pgatk/cgenomes/cgenomes_proteindb.py` (2 sites) -- Modify: `pgatk/proteomics/db/protein_database_decoy.py` (1 site) - -**Step 1: Add pathlib import and replace string path operations** - -**`ensembl/ensembl.py`** — add `from pathlib import Path` to imports: - -- Line 417: `os.path.abspath(vcf_file.split('/')[-1].replace('.vcf', ''))` → `str(Path(vcf_file).resolve().stem)` -- Line 419: `annotated_vcf + '_all.bed'` → `str(Path(annotated_vcf).with_name(Path(annotated_vcf).name + '_all.bed'))` - Or simpler: keep `annotated_vcf` as a Path stem and use f-strings: `f"{annotated_vcf}_all.bed"` -- Line 508: `gene_annotations_gtf.replace('.gtf', '.db')` → `str(Path(gene_annotations_gtf).with_suffix('.db'))` - -**`cgenomes/cgenomes_proteindb.py`** — add `from pathlib import Path`: - -- Line 260: `self._local_output_file.replace('.fa', '') + '_' + ...` → 
`str(Path(self._local_output_file).with_suffix('')) + '_' + ...` (or use Path stem) -- Line 461: same pattern - -**`protein_database_decoy.py`** — add `from pathlib import Path`: - -- Line 479: `self._output_file.replace('.fa', '') + '_noAlternative.fa'` → `str(Path(self._output_file).with_suffix('').with_name(Path(self._output_file).stem + '_noAlternative.fa'))` - -**Step 2: Run full test suite** - -Run: `cd /Users/yperez/work/proteogenomics/pgatk && python -m pytest pgatk/tests/ -v --timeout=120 -x` -Expected: All PASS - -**Step 3: Commit** - -```bash -git add pgatk/ensembl/ensembl.py pgatk/cgenomes/cgenomes_proteindb.py pgatk/proteomics/db/protein_database_decoy.py -git commit -m "refactor: use pathlib.Path for file path operations" -``` - ---- - -## Task 12: Add type hints to public APIs - -**Files:** -- Modify: `pgatk/toolbox/general.py` -- Modify: `pgatk/ensembl/ensembl.py` -- Modify: `pgatk/cgenomes/cgenomes_proteindb.py` -- Modify: `pgatk/proteogenomics/blast_get_position.py` -- Modify: `pgatk/proteogenomics/spectrumai.py` -- Modify: `pgatk/proteomics/db/protein_database_decoy.py` - -**Step 1: Add type hints to each file's public methods** - -Work through each file adding type annotations to all public methods (non-underscore-prefixed). Use `from __future__ import annotations` at the top of each modified file for forward references. - -Key patterns: -```python -from __future__ import annotations -from typing import Optional -from pathlib import Path -import logging -import pandas as pd -import gffutils -``` - -Examples of what to add: - -```python -# ensembl.py -def vcf_to_proteindb(self, vcf_file: str, input_fasta: str, gene_annotations_gtf: str) -> str: ... -def dnaseq_to_proteindb(self, input_fasta: str) -> str: ... -def check_proteindb(self, input_fasta: str, add_stop_codon: bool = True, num_aa: int = 6) -> None: ... - -@staticmethod -def get_altseq(ref_seq: str, ref_allele: str, var_allele: str, var_pos: int, - strand: str, features_info: list, cds_info: Optional[list] = None) -> tuple: ... - -@staticmethod -def parse_gtf(gene_annotations_gtf: str, gtf_db_file: str) -> gffutils.FeatureDB: ... -``` - -```python -# blast_get_position.py -def blast(self, input_psm_to_blast: str, output_psm: str) -> None: ... - -@staticmethod -def get_details(fasta: str, peptide: str) -> list[str]: ... - -@staticmethod -def peptide_blast_protein(fasta: str, peptide: str) -> Optional[list]: ... 
-``` - -**Step 2: Run mypy (optional, non-blocking) and tests** - -Run: `cd /Users/yperez/work/proteogenomics/pgatk && python -m pytest pgatk/tests/ -v --timeout=120 -x` -Expected: All PASS - -**Step 3: Commit** - -```bash -git add pgatk/toolbox/general.py pgatk/ensembl/ensembl.py pgatk/cgenomes/cgenomes_proteindb.py pgatk/proteogenomics/blast_get_position.py pgatk/proteogenomics/spectrumai.py pgatk/proteomics/db/protein_database_decoy.py -git commit -m "refactor: add type hints to all public API methods" -``` - ---- - -## Task 13: Expand unit test coverage for core functions - -**Files:** -- Create: `pgatk/tests/test_ensembl_core.py` -- Create: `pgatk/tests/test_cosmic_core.py` -- Create: `pgatk/tests/test_decoy_core.py` - -**Step 1: Write unit tests for get_altseq()** - -```python -# pgatk/tests/test_ensembl_core.py -"""Unit tests for EnsemblDataService core functions.""" -from pgatk.ensembl.ensembl import EnsemblDataService - - -class TestGetAltseq: - """Test the static method that computes alternative sequences from variants.""" - - def test_snp_plus_strand(self): - """Simple SNP on plus strand should substitute one base.""" - ref_seq = "ATGCGATCGA" - # SNP at position 3 (0-indexed in CDS): G -> T - alt_seq = EnsemblDataService.get_altseq( - ref_seq=ref_seq, ref_allele="G", var_allele="T", - var_pos=4, strand="+", - features_info=[(0, len(ref_seq))], # single exon covering full CDS - ) - assert alt_seq is not None - # The exact assertion depends on what get_altseq returns — - # inspect the return value and assert accordingly - - def test_insertion_plus_strand(self): - """Insertion on plus strand should add bases.""" - ref_seq = "ATGCGATCGA" - alt_seq = EnsemblDataService.get_altseq( - ref_seq=ref_seq, ref_allele="G", var_allele="GAA", - var_pos=4, strand="+", - features_info=[(0, len(ref_seq))], - ) - assert alt_seq is not None - - def test_deletion_plus_strand(self): - """Deletion on plus strand should remove bases.""" - ref_seq = "ATGCGATCGA" - alt_seq = EnsemblDataService.get_altseq( - ref_seq=ref_seq, ref_allele="CGA", var_allele="C", - var_pos=4, strand="+", - features_info=[(0, len(ref_seq))], - ) - assert alt_seq is not None -``` - -**Step 2: Write unit tests for get_mut_pro_seq()** - -```python -# pgatk/tests/test_cosmic_core.py -"""Unit tests for CancerGenomesService core mutation functions.""" -from pgatk.cgenomes.cgenomes_proteindb import CancerGenomesService -from pgatk.cgenomes.models import SNP - - -class TestGetMutProSeq: - """Test the static method that applies COSMIC mutations to protein sequences.""" - - def test_substitution_missense(self): - """A single AA substitution should change one residue.""" - snp = SNP(gene="TEST", aa_mut="p.A10V", mutation_type="Substitution - Missense") - ref_seq = "MAAAAAAAAAAAAAAAAA" # A at position 10 - result = CancerGenomesService.get_mut_pro_seq(snp, ref_seq) - # A at position 10 (1-indexed) should become V - assert result is not None - assert "V" in result - - def test_nonsense(self): - """Nonsense mutation should truncate at the stop.""" - snp = SNP(gene="TEST", aa_mut="p.A10*", mutation_type="Substitution - Nonsense") - ref_seq = "MAAAAAAAAAAAAAAAAA" - result = CancerGenomesService.get_mut_pro_seq(snp, ref_seq) - assert result is not None -``` - -**Step 3: Write unit tests for decoy revswitch()** - -```python -# pgatk/tests/test_decoy_core.py -"""Unit tests for ProteinDBDecoyService core functions.""" -from pgatk.proteomics.db.protein_database_decoy import ProteinDBDecoyService - - -class TestRevswitch: - """Test the 
reverse-with-KR-switch function.""" - - def test_revswitch_preserves_length(self): - """Reversed sequence should have the same length.""" - seq = "ABCDEFGHIJK" - result = ProteinDBDecoyService.revswitch(seq) - assert len(result) == len(seq) - - def test_revswitch_kr_handling(self): - """K and R at cleavage sites should be preserved at the correct positions.""" - seq = "PEPTIDEK" - result = ProteinDBDecoyService.revswitch(seq) - assert isinstance(result, str) - assert len(result) == len(seq) -``` - -**Step 4: Run all new tests** - -Run: `cd /Users/yperez/work/proteogenomics/pgatk && python -m pytest pgatk/tests/test_ensembl_core.py pgatk/tests/test_cosmic_core.py pgatk/tests/test_decoy_core.py -v` -Expected: All PASS (adjust assertions based on actual function return values) - -**Step 5: Run full test suite** - -Run: `cd /Users/yperez/work/proteogenomics/pgatk && python -m pytest pgatk/tests/ -v --timeout=120 -x` -Expected: All PASS - -**Step 6: Commit** - -```bash -git add pgatk/tests/test_ensembl_core.py pgatk/tests/test_cosmic_core.py pgatk/tests/test_decoy_core.py -git commit -m "test: add unit tests for get_altseq, get_mut_pro_seq, and revswitch" -``` - ---- - -## Task Summary - -| Task | Description | Files | Depends On | -|------|-------------|-------|------------| -| 1 | Add conftest.py with fixtures | tests/conftest.py | — | -| 2 | Fix variable shadowing bug | protein_database_decoy.py | — | -| 3 | Convert SNP to dataclass | cgenomes/models.py, cgenomes_proteindb.py | — | -| 4 | Replace pathos in blast_get_position | blast_get_position.py | — | -| 5 | Replace pathos in spectrumai | spectrumai.py | — | -| 6 | Replace print() with logger | 5 files | — | -| 7 | Replace broad except blocks | 3 files | Task 6 | -| 8 | Standardize config fallback | toolbox/general.py + 4 files | — | -| 9 | Convert digest_mutant_protein to CLI | db/digest_mutant_protein.py, cli.py | — | -| 10 | Convert map_peptide2genome to CLI | db/map_peptide2genome.py, cli.py | — | -| 11 | Use pathlib.Path | 3 files | — | -| 12 | Add type hints to public APIs | 6 files | Tasks 1-11 | -| 13 | Expand unit test coverage | 3 test files | Tasks 1-11 | - -**Parallelizable groups:** -- Tasks 1-5 can run in parallel (no shared file dependencies) -- Task 6 can start after Tasks 2, 4, 5 (they touch overlapping files) -- Tasks 7 depends on Task 6 (print→logger before except→specific) -- Tasks 9-10 are independent of each other -- Tasks 12-13 should run last (they depend on all prior cleanups) diff --git a/docs/plans/2026-03-02-clinvar-ncbi-implementation.md b/docs/plans/2026-03-02-clinvar-ncbi-implementation.md deleted file mode 100644 index efe7a4a..0000000 --- a/docs/plans/2026-03-02-clinvar-ncbi-implementation.md +++ /dev/null @@ -1,1652 +0,0 @@ -# ClinVar/NCBI RefSeq Protein Database Generation — Implementation Plan - -> **For Claude:** REQUIRED SUB-SKILL: Use superpowers:executing-plans to implement this plan task-by-task. - -**Goal:** Add CLI commands to generate variant protein databases from ClinVar VCF files using NCBI RefSeq GTF annotations, without requiring VEP. - -**Architecture:** New `pgatk/clinvar/` module with chromosome mapper, ClinVar VCF→protein pipeline, NCBI data downloader. Shared VCF→protein functions (`get_altseq`, `get_orfs_vcf`, etc.) are extracted from `pgatk/ensembl/ensembl.py` into `pgatk/toolbox/vcf_utils.py` and imported by both pipelines. 
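-
-As a sketch of the intended reuse, both pipelines end up importing the same helpers; the toy values below mirror the unit tests written in Task 1:
-
-```python
-from Bio.Seq import Seq
-
-from pgatk.toolbox.vcf_utils import get_altseq, get_orfs_vcf
-
-# Apply a toy SNP (C -> T at genomic position 13 of a single 9-bp exon)
-ref, alt = get_altseq(Seq("ATGCCCGGG"), Seq("C"), Seq("T"), 13, '+', [[10, 18, 'exon']])
-# Translate both coding sequences: MPG (reference) vs MSG (variant)
-ref_orfs, alt_orfs = get_orfs_vcf(ref, alt, translation_table=1)
-```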
- -**Tech Stack:** Python 3.9+, gffutils (GTF indexing), pybedtools (interval overlap), BioPython (Seq, SeqIO), pandas, Click (CLI), PyYAML (config), tqdm (progress), urllib (downloads). - ---- - -## Task 1: Extract Shared VCF→Protein Functions to `pgatk/toolbox/vcf_utils.py` - -These static methods in `pgatk/ensembl/ensembl.py` are generic (no Ensembl-specific logic) and will be reused by the ClinVar pipeline. Extract them to a shared module. - -**Files:** -- Create: `pgatk/toolbox/vcf_utils.py` -- Modify: `pgatk/ensembl/ensembl.py` -- Test: `pgatk/tests/test_vcf_utils.py` - -**Step 1: Write the failing test** - -Create `pgatk/tests/test_vcf_utils.py`: - -```python -"""Tests for shared VCF utility functions extracted from EnsemblDataService.""" - -from Bio.Seq import Seq - -from pgatk.toolbox.vcf_utils import check_overlap, get_altseq, get_orfs_vcf, write_output - - -class TestCheckOverlap: - """Tests for check_overlap().""" - - def test_variant_fully_within_feature(self): - features = [[100, 200, 'CDS']] - assert check_overlap(150, 160, features) is True - - def test_variant_outside_feature(self): - features = [[100, 200, 'CDS']] - assert check_overlap(50, 60, features) is False - - def test_variant_start_neg1_always_overlaps(self): - assert check_overlap(-1, 10, [[100, 200, 'CDS']]) is True - - def test_variant_partial_overlap_start(self): - features = [[100, 200, 'CDS']] - assert check_overlap(90, 110, features) is True - - -class TestGetAltseq: - """Tests for get_altseq().""" - - def test_snp_plus_strand(self): - ref_seq = Seq("ATGCCCGGG") - features = [[10, 18, 'exon']] - ref, alt = get_altseq(ref_seq, Seq("C"), Seq("T"), 13, '+', features) - assert len(alt) > 0 - assert ref != alt - - def test_returns_tuple(self): - ref_seq = Seq("ATGCCC") - features = [[10, 15, 'exon']] - result = get_altseq(ref_seq, Seq("C"), Seq("T"), 12, '+', features) - assert isinstance(result, tuple) - assert len(result) == 2 - - -class TestGetOrfsVcf: - """Tests for get_orfs_vcf().""" - - def test_single_orf(self): - ref = Seq("ATGCCCGGG") - alt = Seq("ATGTCCGGG") - ref_orfs, alt_orfs = get_orfs_vcf(ref, alt, 1, num_orfs=1) - assert len(ref_orfs) == 1 - assert len(alt_orfs) == 1 - - def test_three_orfs(self): - ref = Seq("ATGCCCGGG") - alt = Seq("ATGTCCGGG") - ref_orfs, alt_orfs = get_orfs_vcf(ref, alt, 1, num_orfs=3) - assert len(ref_orfs) == 3 - assert len(alt_orfs) == 3 - - -class TestWriteOutput: - """Tests for write_output().""" - - def test_writes_fasta(self, tmp_path): - out_file = tmp_path / "test.fa" - with open(out_file, 'w') as f: - write_output("seq1", "description", ["MPKF"], f) - content = out_file.read_text() - assert ">seq1 description" in content - assert "MPKF" in content -``` - -**Step 2: Run test to verify it fails** - -Run: `cd /Users/yperez/work/proteogenomics/pgatk && python -m pytest pgatk/tests/test_vcf_utils.py -v` -Expected: FAIL with `ModuleNotFoundError: No module named 'pgatk.toolbox.vcf_utils'` - -**Step 3: Create `pgatk/toolbox/vcf_utils.py`** - -Extract these 4 static methods from `pgatk/ensembl/ensembl.py` (lines 142-166, 168-227, 280-296, 826-849) as standalone functions: - -```python -"""Shared VCF-to-protein utility functions. - -Extracted from pgatk.ensembl.ensembl.EnsemblDataService so that both the -Ensembl and ClinVar pipelines can reuse the same core algorithms. 
-""" -from __future__ import annotations - -import logging -from typing import Any, Optional - -from Bio.Seq import Seq - -logger = logging.getLogger(__name__) - - -def check_overlap(var_start: int, var_end: int, features_info: Optional[list] = None) -> bool: - """Return True when the variant overlaps any of the features. - - :param var_start: Variant start position (genomic). -1 means always overlap. - :param var_end: Variant end position (genomic). - :param features_info: List of [start, end, type] for each feature. - """ - if features_info is None: - features_info = [[0, 1, 'type']] - if var_start == -1: - return True - for feature_pos in features_info: - pep_start = feature_pos[0] - pep_end = feature_pos[1] - if var_start <= pep_start <= var_end: - return True - elif var_start <= pep_end <= var_end: - return True - elif pep_start <= var_start and pep_end >= var_end: - return True - return False - - -def get_altseq( - ref_seq: Seq, - ref_allele: Seq, - var_allele: Seq, - var_pos: int, - strand: str, - features_info: list, - cds_info: Optional[list] = None, -) -> tuple: - """Modify a reference sequence based on a variant allele. - - Handles strand orientation, CDS trimming, and exon-based coordinate - calculation. Works identically with Ensembl and RefSeq annotations. - - :param ref_seq: Full transcript sequence (all exons concatenated). - :param ref_allele: Reference allele (Seq object). - :param var_allele: Alternate allele (Seq object). - :param var_pos: Genomic position of the variant. - :param strand: '+' or '-'. - :param features_info: List of [start, end, type] per exon/CDS. - :param cds_info: Optional [cds_start, cds_end] for CDS trimming. - :return: (ref_coding_seq, alt_coding_seq) tuple. - """ - if cds_info is None: - cds_info = [] - alt_seq = "" - if len(cds_info) == 2: - start_coding_index = cds_info[0] - 1 - stop_coding_index = cds_info[1] - else: - start_coding_index = 0 - total_len = 0 - for x in features_info: - total_len += x[1] - x[0] + 1 - stop_coding_index = total_len - - if strand == '-': - ref_seq = ref_seq[::-1] - ref_allele = ref_allele.complement() - var_allele = var_allele.complement() - - if strand == '-' and len(cds_info) == 2: - n = len(ref_seq) - ref_seq = ref_seq[n - stop_coding_index:n - start_coding_index] - else: - ref_seq = ref_seq[start_coding_index:stop_coding_index] - - nc_index = 0 - if len(ref_allele) == len(var_allele) or ref_allele[0] == var_allele[0]: - for feature in features_info: - if var_pos in range(feature[0], feature[1] + 1): - var_index_in_cds = nc_index + (var_pos - feature[0]) - c = len(ref_allele) - alt_seq = ref_seq[0:var_index_in_cds] + var_allele + ref_seq[var_index_in_cds + c::] - if strand == '-': - return ref_seq[::-1], alt_seq[::-1] - else: - return ref_seq, alt_seq - nc_index += (feature[1] - feature[0] + 1) - - return ref_seq, alt_seq - - -def get_orfs_vcf( - ref_seq: Seq, - alt_seq: Seq, - translation_table: int, - num_orfs: int = 1, -) -> tuple[list, list]: - """Translate reference and alternate sequences into ORFs. - - :param ref_seq: Reference coding sequence. - :param alt_seq: Alternate coding sequence with variant applied. - :param translation_table: NCBI translation table number. - :param num_orfs: Number of reading frames to generate (1 or 3). - :return: (ref_orfs, alt_orfs) lists of translated protein sequences. 
- """ - ref_orfs = [] - alt_orfs = [] - for n in range(0, num_orfs): - ref_orfs.append(ref_seq[n::].translate(translation_table)) - alt_orfs.append(alt_seq[n::].translate(translation_table)) - return ref_orfs, alt_orfs - - -def write_output( - seq_id: str, - desc: str, - seqs: list, - prots_fn: Any, - seqs_filter: Optional[list] = None, -) -> None: - """Write ORFs to a FASTA output file. - - :param seq_id: Sequence accession/ID for the FASTA header. - :param desc: Description string for the FASTA header. - :param seqs: List of protein sequences (ORFs) to write. - :param prots_fn: Open file handle for writing. - :param seqs_filter: If provided, skip ORFs that match reference (used to - write only variant-specific proteins). - """ - if seqs_filter is None: - seqs_filter = [] - write_i = False - if len(seqs) > 1: - write_i = True - - formatted_desc = " " + desc if desc else "" - for i, orf in enumerate(seqs): - if orf in seqs_filter: - continue - if write_i: - prots_fn.write('>{}{}\n{}\n'.format(seq_id + "_" + str(i + 1), formatted_desc, orf)) - else: - prots_fn.write('>{}{}\n{}\n'.format(seq_id, formatted_desc, orf)) -``` - -**Step 4: Run test to verify it passes** - -Run: `cd /Users/yperez/work/proteogenomics/pgatk && python -m pytest pgatk/tests/test_vcf_utils.py -v` -Expected: All PASS - -**Step 5: Update `pgatk/ensembl/ensembl.py` to import from shared module** - -Replace the 4 static method bodies in `EnsemblDataService` with thin wrappers that delegate to the shared functions. This preserves backward compatibility for any code calling `EnsemblDataService.check_overlap(...)` etc. - -In `pgatk/ensembl/ensembl.py`: -- Add import at the top: `from pgatk.toolbox.vcf_utils import check_overlap as _check_overlap, get_altseq as _get_altseq, get_orfs_vcf as _get_orfs_vcf, write_output as _write_output` -- Replace method bodies of `check_overlap` (lines 142-165), `get_altseq` (lines 168-227), `get_orfs_vcf` (lines 280-296), `write_output` (lines 826-849) with delegation calls: - -```python -@staticmethod -def check_overlap(var_start, var_end, features_info=None): - return _check_overlap(var_start, var_end, features_info) - -@staticmethod -def get_altseq(ref_seq, ref_allele, var_allele, var_pos, strand, features_info, cds_info=None): - return _get_altseq(ref_seq, ref_allele, var_allele, var_pos, strand, features_info, cds_info) - -@staticmethod -def get_orfs_vcf(ref_seq, alt_seq, translation_table, num_orfs=1): - return _get_orfs_vcf(ref_seq, alt_seq, translation_table, num_orfs) - -@staticmethod -def write_output(seq_id, desc, seqs, prots_fn, seqs_filter=None): - _write_output(seq_id, desc, seqs, prots_fn, seqs_filter) -``` - -**Step 6: Run ALL existing tests to verify no regressions** - -Run: `cd /Users/yperez/work/proteogenomics/pgatk && python -m pytest pgatk/tests/ -v` -Expected: All 66+ tests PASS (existing tests still call `EnsemblDataService.get_altseq(...)` etc., which now delegate to the shared module) - -**Step 7: Commit** - -```bash -git add pgatk/toolbox/vcf_utils.py pgatk/tests/test_vcf_utils.py pgatk/ensembl/ensembl.py -git commit -m "refactor: extract shared VCF utility functions to toolbox/vcf_utils.py - -Extract check_overlap, get_altseq, get_orfs_vcf, write_output from -EnsemblDataService to pgatk.toolbox.vcf_utils for reuse by ClinVar pipeline. -EnsemblDataService methods now delegate to shared functions." 
-``` - ---- - -## Task 2: Chromosome Mapper - -Build the chromosome name mapper that converts between NC_ accessions (RefSeq GTF), numeric names (ClinVar VCF), and UCSC chr-prefixed names. - -**Files:** -- Create: `pgatk/clinvar/__init__.py` -- Create: `pgatk/clinvar/chromosome_mapper.py` -- Create: `pgatk/testdata/clinvar/mini_assembly_report.txt` -- Test: `pgatk/tests/test_clinvar/__init__.py` -- Test: `pgatk/tests/test_clinvar/test_chromosome_mapper.py` - -**Step 1: Create test data** - -Create `pgatk/testdata/clinvar/mini_assembly_report.txt` — a subset of the real NCBI assembly report format. The file has comment lines starting with `#`, then a header line, then TSV data. The columns we care about are: col 0 = Sequence-Name (e.g. "1"), col 6 = RefSeq-Accn (e.g. "NC_000001.11"), col 9 = UCSC-style-name (e.g. "chr1"): - -``` -# Assembly name: GRCh38.p14 -# Description: Genome Reference Consortium Human Build 38 patch release 14 -# Organism name: Homo sapiens -# Sequence-Name Sequence-Role Assigned-Molecule Assigned-Molecule-loc/type GenBank-Accn Relationship RefSeq-Accn Assembly-Unit Sequence-Length UCSC-style-name -1 assembled-molecule 1 Chromosome CM000663.2 = NC_000001.11 Primary Assembly 248956422 chr1 -2 assembled-molecule 2 Chromosome CM000664.2 = NC_000002.12 Primary Assembly 242193529 chr2 -X assembled-molecule X Chromosome CM000685.2 = NC_000023.11 Primary Assembly 156040895 chrX -Y assembled-molecule Y Chromosome CM000686.2 = NC_000024.10 Primary Assembly 57227415 chrY -MT assembled-molecule MT Mitochondria J01415.2 = NC_012920.1 non-nuclear 16569 chrM -``` - -**Step 2: Write the failing test** - -Create `pgatk/tests/test_clinvar/__init__.py` (empty file). - -Create `pgatk/tests/test_clinvar/test_chromosome_mapper.py`: - -```python -"""Tests for pgatk.clinvar.chromosome_mapper.""" - -from pathlib import Path - -import pytest - -from pgatk.clinvar.chromosome_mapper import ChromosomeMapper - - -TESTDATA_DIR = Path(__file__).resolve().parent.parent.parent / "testdata" / "clinvar" -ASSEMBLY_REPORT = TESTDATA_DIR / "mini_assembly_report.txt" - - -class TestChromosomeMapper: - """Tests for ChromosomeMapper.""" - - @pytest.fixture - def mapper(self): - return ChromosomeMapper.from_assembly_report(str(ASSEMBLY_REPORT)) - - def test_numeric_to_refseq(self, mapper): - assert mapper.map_chrom("1", "refseq") == "NC_000001.11" - - def test_refseq_to_numeric(self, mapper): - assert mapper.map_chrom("NC_000001.11", "numeric") == "1" - - def test_numeric_to_ucsc(self, mapper): - assert mapper.map_chrom("1", "ucsc") == "chr1" - - def test_ucsc_to_numeric(self, mapper): - assert mapper.map_chrom("chr1", "numeric") == "1" - - def test_refseq_to_ucsc(self, mapper): - assert mapper.map_chrom("NC_000001.11", "ucsc") == "chr1" - - def test_x_chromosome(self, mapper): - assert mapper.map_chrom("X", "refseq") == "NC_000023.11" - assert mapper.map_chrom("NC_000023.11", "numeric") == "X" - - def test_mt_chromosome(self, mapper): - assert mapper.map_chrom("MT", "refseq") == "NC_012920.1" - assert mapper.map_chrom("NC_012920.1", "ucsc") == "chrM" - - def test_unknown_chromosome_returns_input(self, mapper): - assert mapper.map_chrom("UNKNOWN_CONTIG", "refseq") == "UNKNOWN_CONTIG" - - def test_invalid_convention_raises(self, mapper): - with pytest.raises(ValueError, match="convention"): - mapper.map_chrom("1", "invalid_convention") - - def test_chr_prefixed_input(self, mapper): - """Input with chr prefix should be recognized as UCSC convention.""" - assert mapper.map_chrom("chrX", "numeric") == "X" - assert 
mapper.map_chrom("chrX", "refseq") == "NC_000023.11" -``` - -**Step 3: Run test to verify it fails** - -Run: `cd /Users/yperez/work/proteogenomics/pgatk && python -m pytest pgatk/tests/test_clinvar/test_chromosome_mapper.py -v` -Expected: FAIL with `ModuleNotFoundError: No module named 'pgatk.clinvar'` - -**Step 4: Implement chromosome mapper** - -Create `pgatk/clinvar/__init__.py` (empty file). - -Create `pgatk/clinvar/chromosome_mapper.py`: - -```python -"""Bidirectional chromosome name mapping: RefSeq (NC_) <-> numeric <-> UCSC (chr). - -Parses an NCBI assembly report to build the mapping tables. -""" -from __future__ import annotations - -import logging -from pathlib import Path - -logger = logging.getLogger(__name__) - -# Valid target convention names -_VALID_CONVENTIONS = {"numeric", "refseq", "ucsc"} - - -class ChromosomeMapper: - """Maps chromosome names between naming conventions. - - Supports three conventions: - - 'numeric': e.g. '1', '2', 'X', 'Y', 'MT' - - 'refseq': e.g. 'NC_000001.11' - - 'ucsc': e.g. 'chr1', 'chrX', 'chrM' - """ - - def __init__( - self, - numeric_to_refseq: dict[str, str], - numeric_to_ucsc: dict[str, str], - ) -> None: - self._num_to_ref = numeric_to_refseq - self._num_to_ucsc = numeric_to_ucsc - - # Build reverse maps - self._ref_to_num = {v: k for k, v in numeric_to_refseq.items()} - self._ucsc_to_num = {v: k for k, v in numeric_to_ucsc.items()} - - @classmethod - def from_assembly_report(cls, report_path: str) -> ChromosomeMapper: - """Parse an NCBI assembly report file and build chromosome maps. - - The assembly report is a TSV with comment lines starting with '#'. - Relevant columns (0-indexed): - - 0: Sequence-Name (e.g. '1', 'X', 'MT') - - 6: RefSeq-Accn (e.g. 'NC_000001.11') - - 9: UCSC-style-name (e.g. 'chr1') - """ - numeric_to_refseq: dict[str, str] = {} - numeric_to_ucsc: dict[str, str] = {} - - with open(report_path, "r", encoding="utf-8") as fh: - for line in fh: - line = line.strip() - if line.startswith("#") or not line: - continue - cols = line.split("\t") - if len(cols) < 10: - continue - seq_name = cols[0] # numeric name - refseq_accn = cols[6] # NC_ accession - ucsc_name = cols[9] # chr-prefixed name - - if refseq_accn and refseq_accn != "na": - numeric_to_refseq[seq_name] = refseq_accn - if ucsc_name and ucsc_name != "na": - numeric_to_ucsc[seq_name] = ucsc_name - - logger.info( - "Loaded chromosome mapping: %d entries from %s", - len(numeric_to_refseq), - Path(report_path).name, - ) - return cls(numeric_to_refseq, numeric_to_ucsc) - - def _to_numeric(self, name: str) -> str | None: - """Convert any convention to numeric. Returns None if unknown.""" - # Already numeric? - if name in self._num_to_ref or name in self._num_to_ucsc: - return name - # RefSeq? - if name in self._ref_to_num: - return self._ref_to_num[name] - # UCSC? - if name in self._ucsc_to_num: - return self._ucsc_to_num[name] - return None - - def map_chrom(self, name: str, target_convention: str) -> str: - """Map a chromosome name to the target convention. - - :param name: Input chromosome name in any convention. - :param target_convention: One of 'numeric', 'refseq', 'ucsc'. - :return: Mapped name, or the original name if mapping is unknown. - :raises ValueError: If target_convention is not valid. - """ - if target_convention not in _VALID_CONVENTIONS: - raise ValueError( - f"Invalid convention '{target_convention}'. 
" - f"Must be one of: {', '.join(sorted(_VALID_CONVENTIONS))}" - ) - - numeric = self._to_numeric(name) - if numeric is None: - logger.warning("Unknown chromosome name: %s — returning as-is", name) - return name - - if target_convention == "numeric": - return numeric - elif target_convention == "refseq": - return self._num_to_ref.get(numeric, name) - elif target_convention == "ucsc": - return self._num_to_ucsc.get(numeric, name) - return name # unreachable but satisfies linters -``` - -**Step 5: Run test to verify it passes** - -Run: `cd /Users/yperez/work/proteogenomics/pgatk && python -m pytest pgatk/tests/test_clinvar/test_chromosome_mapper.py -v` -Expected: All PASS - -**Step 6: Commit** - -```bash -git add pgatk/clinvar/__init__.py pgatk/clinvar/chromosome_mapper.py \ - pgatk/testdata/clinvar/mini_assembly_report.txt \ - pgatk/tests/test_clinvar/__init__.py \ - pgatk/tests/test_clinvar/test_chromosome_mapper.py -git commit -m "feat: add ChromosomeMapper for NC_/numeric/UCSC chromosome name conversion - -Parses NCBI assembly report to build bidirectional chromosome name maps. -Supports RefSeq (NC_000001.11), numeric (1), and UCSC (chr1) conventions." -``` - ---- - -## Task 3: ClinVar Config and Config Registry Integration - -Add ClinVar-specific configuration defaults and register them in the config registry. - -**Files:** -- Create: `pgatk/config/clinvar_config.yaml` -- Modify: `pgatk/config/registry.py` - -**Step 1: Create `pgatk/config/clinvar_config.yaml`** - -```yaml -clinvar_translation: - translation_table: 1 - mito_translation_table: 2 - protein_prefix: "clinvar_" - proteindb_output_file: "clinvar-peptide-database.fa" - report_ref_seq: false - num_orfs: 1 - - # ClinVar-specific filtering - clinical_significance_exclude: - - "Benign" - - "Likely_benign" - - "Benign/Likely_benign" - - # NCBI RefSeq biotypes to include - include_biotypes: "protein_coding" - - # Molecular consequences to include (from ClinVar MC field) - include_consequences: - - "missense_variant" - - "nonsense" - - "frameshift_variant" - - "inframe_insertion" - - "inframe_deletion" - - "stop_gained" - - "stop_lost" - - "start_lost" - - "splice_donor_variant" - - "splice_acceptor_variant" - - # NCBI FTP URLs - ncbi_refseq_ftp: "https://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/annotation/GRCh38_latest/refseq_identifiers/" - clinvar_ftp: "https://ftp.ncbi.nlm.nih.gov/pub/clinvar/vcf_GRCh38/" - - logger: - formatters: - DEBUG: "%(asctime)s [%(levelname)7s][%(name)48s][%(module)32s, %(lineno)4s] %(message)s" - INFO: "%(asctime)s [%(levelname)7s][%(name)48s] %(message)s" - loglevel: DEBUG -``` - -**Step 2: Register in `pgatk/config/registry.py`** - -Add `"clinvar": "clinvar_config.yaml"` to the `COMMAND_CONFIGS` dict at line 16 of `pgatk/config/registry.py`: - -```python -COMMAND_CONFIGS = { - "ensembl_downloader": "ensembl_downloader_config.yaml", - "ensembl_config": "ensembl_config.yaml", - "cosmic": "cosmic_config.yaml", - "cbioportal": "cbioportal_config.yaml", - "protein_decoy": "protein_decoy.yaml", - "clinvar": "clinvar_config.yaml", -} -``` - -**Step 3: Verify config loads** - -Run: `cd /Users/yperez/work/proteogenomics/pgatk && python -c "from pgatk.config.registry import load_config; c = load_config('clinvar'); print(c['clinvar_translation']['clinical_significance_exclude'])"` -Expected: `['Benign', 'Likely_benign', 'Benign/Likely_benign']` - -**Step 4: Commit** - -```bash -git add pgatk/config/clinvar_config.yaml pgatk/config/registry.py -git commit -m "feat: add ClinVar config with CLNSIG filtering and NCBI 
FTP URLs" -``` - ---- - -## Task 4: ClinVar Service — Core Pipeline - -The main ClinVar VCF-to-protein pipeline. This is the largest task. - -**Files:** -- Create: `pgatk/clinvar/clinvar_service.py` -- Create: `pgatk/testdata/clinvar/mini_clinvar.vcf` -- Create: `pgatk/testdata/clinvar/mini_refseq.gtf` -- Create: `pgatk/testdata/clinvar/mini_refseq_protein.faa` -- Test: `pgatk/tests/test_clinvar/test_clinvar_service.py` - -**Step 1: Create test data files** - -These are minimal hand-crafted files that exercise the key code paths. - -Create `pgatk/testdata/clinvar/mini_clinvar.vcf`: -``` -##fileformat=VCFv4.1 -##INFO= -##INFO= -##INFO= -##INFO= -#CHROM POS ID REF ALT QUAL FILTER INFO -1 100015 rs001 C T . . CLNSIG=Pathogenic;GENEINFO=TESTGENE1:1234;MC=SO:0001583|missense_variant -1 100025 rs002 G A . . CLNSIG=Benign;GENEINFO=TESTGENE1:1234;MC=SO:0001583|missense_variant -2 200010 rs003 A G . . CLNSIG=Likely_pathogenic;GENEINFO=TESTGENE2:5678;MC=SO:0001587|stop_gained -``` - -Create `pgatk/testdata/clinvar/mini_refseq.gtf` — RefSeq-style GTF with NC_ chromosome accessions. Note: features must overlap the VCF variant positions after chromosome mapping. - -``` -NC_000001.11 BestRefSeq transcript 100001 100090 . + . gene_id "TESTGENE1"; transcript_id "NM_000001.1"; gene_biotype "protein_coding"; -NC_000001.11 BestRefSeq exon 100001 100030 . + . gene_id "TESTGENE1"; transcript_id "NM_000001.1"; -NC_000001.11 BestRefSeq CDS 100001 100030 . + 0 gene_id "TESTGENE1"; transcript_id "NM_000001.1"; -NC_000001.11 BestRefSeq exon 100051 100090 . + . gene_id "TESTGENE1"; transcript_id "NM_000001.1"; -NC_000001.11 BestRefSeq CDS 100051 100090 . + 2 gene_id "TESTGENE1"; transcript_id "NM_000001.1"; -NC_000002.12 BestRefSeq transcript 200001 200060 . + . gene_id "TESTGENE2"; transcript_id "NM_000002.1"; gene_biotype "protein_coding"; -NC_000002.12 BestRefSeq exon 200001 200060 . + . gene_id "TESTGENE2"; transcript_id "NM_000002.1"; -NC_000002.12 BestRefSeq CDS 200001 200060 . + 0 gene_id "TESTGENE2"; transcript_id "NM_000002.1"; -``` - -Create `pgatk/testdata/clinvar/mini_refseq_protein.faa` — transcript sequences for the transcripts referenced in the GTF. 
-Sequences must be nucleotide, matching the CDS lengths (70 nt and 60 nt):
-
-```
->NM_000001.1 CDS=1-70
-ATGCCCGGGAAATTTCCCGGGAAATTTCCCATGCCCGGGAAATTTCCCGGGAAATTTCCCGGGAAATTTC
->NM_000002.1 CDS=1-60
-ATGCCCGGGAAATTTCCCGGGAAATTTCCCGGGAAATTTCCCGGGAAATTTCCCGGGAAA
-```
-
-**Step 2: Write the failing test**
-
-Create `pgatk/tests/test_clinvar/test_clinvar_service.py`:
-
-```python
-"""Tests for ClinVar VCF-to-protein pipeline."""
-
-from pathlib import Path
-
-import pytest
-
-from pgatk.clinvar.clinvar_service import ClinVarService
-
-
-TESTDATA_DIR = Path(__file__).resolve().parent.parent.parent / "testdata" / "clinvar"
-
-
-class TestClinSigFiltering:
-    """Tests for CLNSIG filtering logic."""
-
-    def test_pathogenic_passes(self):
-        exclude = ["Benign", "Likely_benign", "Benign/Likely_benign"]
-        assert ClinVarService.passes_clnsig_filter("Pathogenic", exclude) is True
-
-    def test_benign_excluded(self):
-        exclude = ["Benign", "Likely_benign", "Benign/Likely_benign"]
-        assert ClinVarService.passes_clnsig_filter("Benign", exclude) is False
-
-    def test_likely_benign_excluded(self):
-        exclude = ["Benign", "Likely_benign", "Benign/Likely_benign"]
-        assert ClinVarService.passes_clnsig_filter("Likely_benign", exclude) is False
-
-    def test_uncertain_passes(self):
-        exclude = ["Benign", "Likely_benign", "Benign/Likely_benign"]
-        assert ClinVarService.passes_clnsig_filter("Uncertain_significance", exclude) is True
-
-    def test_empty_clnsig_passes(self):
-        exclude = ["Benign", "Likely_benign"]
-        assert ClinVarService.passes_clnsig_filter("", exclude) is True
-
-
-class TestMolecularConsequenceParser:
-    """Tests for MC field parsing."""
-
-    def test_parse_missense(self):
-        mc_field = "SO:0001583|missense_variant"
-        assert ClinVarService.parse_mc_consequence(mc_field) == "missense_variant"
-
-    def test_parse_multiple_consequences(self):
-        mc_field = "SO:0001583|missense_variant,SO:0001587|stop_gained"
-        result = ClinVarService.parse_mc_consequences(mc_field)
-        assert "missense_variant" in result
-        assert "stop_gained" in result
-
-    def test_parse_empty_mc(self):
-        assert ClinVarService.parse_mc_consequence("") == ""
-
-
-class TestGeneInfoParser:
-    """Tests for GENEINFO field parsing."""
-
-    def test_parse_single_gene(self):
-        gene_symbol, gene_id = ClinVarService.parse_geneinfo("BRCA1:672")
-        assert gene_symbol == "BRCA1"
-        assert gene_id == "672"
-
-    def test_parse_multi_gene(self):
-        gene_symbol, gene_id = ClinVarService.parse_geneinfo("BRCA1:672|TP53:7157")
-        assert gene_symbol == "BRCA1"  # first gene
-
-
-class TestClinVarPipeline:
-    """Integration test for ClinVar-to-proteindb pipeline."""
-
-    def test_pipeline_produces_output(self, tmp_path):
-        output_file = tmp_path / "output.fa"
-        service = ClinVarService(
-            vcf_file=str(TESTDATA_DIR / "mini_clinvar.vcf"),
-            gtf_file=str(TESTDATA_DIR / "mini_refseq.gtf"),
-            fasta_file=str(TESTDATA_DIR / "mini_refseq_protein.faa"),
-            assembly_report=str(TESTDATA_DIR / "mini_assembly_report.txt"),
-            output_file=str(output_file),
-        )
-        service.run()
-        assert output_file.exists()
-        content = output_file.read_text()
-        # Should contain at least one protein entry
-        assert ">" in content
-
-    def test_benign_variants_excluded(self, tmp_path):
-        output_file = tmp_path / "output.fa"
-        service = ClinVarService(
-            vcf_file=str(TESTDATA_DIR / "mini_clinvar.vcf"),
-            gtf_file=str(TESTDATA_DIR / "mini_refseq.gtf"),
-            fasta_file=str(TESTDATA_DIR / "mini_refseq_protein.faa"),
-            assembly_report=str(TESTDATA_DIR / "mini_assembly_report.txt"),
-            output_file=str(output_file),
-        )
-        service.run()
content = output_file.read_text() - # rs002 is Benign and should NOT appear - assert "rs002" not in content -``` - -**Step 3: Run test to verify it fails** - -Run: `cd /Users/yperez/work/proteogenomics/pgatk && python -m pytest pgatk/tests/test_clinvar/test_clinvar_service.py -v` -Expected: FAIL with `ModuleNotFoundError: No module named 'pgatk.clinvar.clinvar_service'` - -**Step 4: Implement `pgatk/clinvar/clinvar_service.py`** - -```python -"""ClinVar VCF-to-protein database pipeline. - -Processes ClinVar VCF records against NCBI RefSeq GTF annotations to produce -a FASTA file of variant protein sequences. Does NOT require VEP — uses -BedTools interval overlap to find affected transcripts internally. -""" -from __future__ import annotations - -import logging -import sqlite3 -from pathlib import Path -from typing import Optional - -import gffutils -import pandas as pd -from Bio import SeqIO -from Bio.Seq import Seq -from pybedtools import BedTool - -from pgatk.clinvar.chromosome_mapper import ChromosomeMapper -from pgatk.toolbox.vcf_utils import check_overlap, get_altseq, get_orfs_vcf, write_output - -logger = logging.getLogger(__name__) - -# Default CLNSIG values to exclude -_DEFAULT_CLNSIG_EXCLUDE = ["Benign", "Likely_benign", "Benign/Likely_benign"] - -# Default molecular consequences to include -_DEFAULT_CONSEQUENCES = [ - "missense_variant", "nonsense", "frameshift_variant", - "inframe_insertion", "inframe_deletion", "stop_gained", - "stop_lost", "start_lost", "splice_donor_variant", "splice_acceptor_variant", -] - - -class ClinVarService: - """Generate variant protein databases from ClinVar VCF + RefSeq GTF.""" - - def __init__( - self, - vcf_file: str, - gtf_file: str, - fasta_file: str, - assembly_report: str, - output_file: str, - clnsig_exclude: Optional[list[str]] = None, - consequences: Optional[list[str]] = None, - translation_table: int = 1, - mito_translation_table: int = 2, - protein_prefix: str = "clinvar_", - num_orfs: int = 1, - report_ref_seq: bool = False, - ) -> None: - self._vcf_file = vcf_file - self._gtf_file = gtf_file - self._fasta_file = fasta_file - self._assembly_report = assembly_report - self._output_file = output_file - self._clnsig_exclude = clnsig_exclude or _DEFAULT_CLNSIG_EXCLUDE - self._consequences = consequences or _DEFAULT_CONSEQUENCES - self._translation_table = translation_table - self._mito_translation_table = mito_translation_table - self._protein_prefix = protein_prefix - self._num_orfs = num_orfs - self._report_ref_seq = report_ref_seq - - # ------------------------------------------------------------------ - # Static helper methods for parsing ClinVar INFO fields - # ------------------------------------------------------------------ - - @staticmethod - def passes_clnsig_filter(clnsig: str, exclude_list: list[str]) -> bool: - """Return True if the clinical significance is NOT in the exclude list.""" - if not clnsig: - return True - return clnsig not in exclude_list - - @staticmethod - def parse_mc_consequence(mc_field: str) -> str: - """Parse first molecular consequence from the MC INFO field. 
- - MC format: 'SO:0001583|missense_variant' - """ - if not mc_field: - return "" - first_mc = mc_field.split(",")[0] - parts = first_mc.split("|") - return parts[1] if len(parts) > 1 else "" - - @staticmethod - def parse_mc_consequences(mc_field: str) -> list[str]: - """Parse all molecular consequences from the MC field.""" - if not mc_field: - return [] - consequences = [] - for entry in mc_field.split(","): - parts = entry.strip().split("|") - if len(parts) > 1: - consequences.append(parts[1]) - return consequences - - @staticmethod - def parse_geneinfo(geneinfo: str) -> tuple[str, str]: - """Parse GENEINFO field. Format: 'SYMBOL:GENEID[|SYMBOL2:GENEID2]'. - - Returns (gene_symbol, gene_id) for the first gene. - """ - if not geneinfo: - return "", "" - first_gene = geneinfo.split("|")[0] - parts = first_gene.split(":") - return (parts[0], parts[1]) if len(parts) == 2 else (geneinfo, "") - - @staticmethod - def _get_info_field(info_str: str, key: str) -> str: - """Extract a value from the VCF INFO column by key.""" - for field in info_str.split(";"): - if field.startswith(key + "="): - return field.split("=", 1)[1] - return "" - - # ------------------------------------------------------------------ - # GTF parsing (reuses gffutils, same pattern as EnsemblDataService) - # ------------------------------------------------------------------ - - @staticmethod - def _parse_gtf(gtf_file: str) -> gffutils.FeatureDB: - """Create or load a gffutils database from a GTF file.""" - db_file = str(Path(gtf_file).with_suffix(".db")) - try: - gffutils.create_db( - gtf_file, db_file, - merge_strategy="create_unique", - keep_order=True, - disable_infer_transcripts=True, - disable_infer_genes=True, - verbose=False, - force=False, - ) - except (ValueError, sqlite3.OperationalError): - logger.info("GTF database already exists: %s", db_file) - - return gffutils.FeatureDB(db_file) - - @staticmethod - def _get_features( - db: gffutils.FeatureDB, - feature_id: str, - feature_types: Optional[list[str]] = None, - ) -> tuple: - """Retrieve chromosome, strand, and coding features for a transcript.""" - if feature_types is None: - feature_types = ["exon"] - try: - feature = db[feature_id] - except gffutils.exceptions.FeatureNotFoundError: - try: - feature = db[feature_id.split(".")[0]] - except gffutils.exceptions.FeatureNotFoundError: - logger.warning("Feature %s not found in GTF database", feature_id) - return None, None, None - - coding_features = [] - for f in db.children(feature, featuretype=feature_types, order_by="end"): - coding_features.append([f.start, f.end, f.featuretype]) - return feature.chrom, feature.strand, coding_features - - # ------------------------------------------------------------------ - # VCF reading (same pandas approach as EnsemblDataService) - # ------------------------------------------------------------------ - - @staticmethod - def _read_vcf(vcf_file: str) -> tuple[list[str], pd.DataFrame]: - """Read a VCF file into metadata headers + DataFrame.""" - metadata = [] - records = [] - with open(vcf_file, "r", encoding="utf-8") as fh: - for line in fh: - line = line.strip() - if line.startswith("#"): - metadata.append(line) - continue - cols = line.split("\t") - if len(cols) < 8: - continue - records.append({ - "CHROM": cols[0], - "POS": cols[1], - "ID": cols[2], - "REF": cols[3], - "ALT": cols[4], - "QUAL": cols[5], - "FILTER": cols[6], - "INFO": cols[7], - }) - df = pd.DataFrame(records) if records else pd.DataFrame( - columns=["CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO"] 
- ) - return metadata, df - - # ------------------------------------------------------------------ - # Transcript overlap via BedTools - # ------------------------------------------------------------------ - - @staticmethod - def _annotate_vcf_with_transcripts( - vcf_file: str, - gtf_file: str, - chrom_mapper: ChromosomeMapper, - ) -> dict[str, list[str]]: - """Find overlapping transcripts for each VCF record using BedTools. - - Returns a dict mapping 'CHROM:POS:REF:ALT' -> [transcript_id, ...]. - Chromosome names are normalized to RefSeq (NC_) for GTF matching. - """ - overlaps: dict[str, list[str]] = {} - - # Read VCF records and convert to BED format in RefSeq chromosome names - vcf_bed_entries = [] - _, vcf_df = ClinVarService._read_vcf(vcf_file) - - for _, row in vcf_df.iterrows(): - chrom_refseq = chrom_mapper.map_chrom(str(row.CHROM), "refseq") - start = int(row.POS) - 1 # BED is 0-based - end = start + len(str(row.REF)) - key = f"{row.CHROM}:{row.POS}:{row.REF}:{row.ALT}" - vcf_bed_entries.append(f"{chrom_refseq}\t{start}\t{end}\t{key}") - - if not vcf_bed_entries: - return overlaps - - vcf_bed = BedTool("\n".join(vcf_bed_entries), from_string=True) - - # Intersect with GTF — extract CDS features only - try: - gtf_bed = BedTool(gtf_file) - intersection = gtf_bed.intersect(vcf_bed, wo=True) - except Exception as e: - logger.error("BedTools intersection failed: %s", e) - return overlaps - - for feature in intersection: - fields = str(feature).strip().split("\t") - # GTF feature type is at index 2 - if fields[2] != "CDS": - continue - # Parse transcript_id from GTF attributes (index 8) - attrs = fields[8] - transcript_id = "" - for attr in attrs.split(";"): - attr = attr.strip() - if attr.startswith("transcript_id"): - transcript_id = attr.split('"')[1] if '"' in attr else attr.split(" ")[1] - break - if not transcript_id: - continue - - # The key is in the last few columns (from the VCF BED name field) - # BedTool intersect with wo appends the second file fields - variant_key = fields[-2] # the name column from the VCF BED - overlaps.setdefault(variant_key, []) - if transcript_id not in overlaps[variant_key]: - overlaps[variant_key].append(transcript_id) - - return overlaps - - # ------------------------------------------------------------------ - # Main pipeline - # ------------------------------------------------------------------ - - def run(self) -> str: - """Execute the ClinVar-to-proteindb pipeline. Returns output path.""" - - # 1. Load chromosome mapper - chrom_mapper = ChromosomeMapper.from_assembly_report(self._assembly_report) - - # 2. Parse GTF and index transcripts - db = self._parse_gtf(self._gtf_file) - - # 3. Load transcript FASTA sequences - transcripts_dict = SeqIO.index(self._fasta_file, "fasta", - key_function=lambda h: h.split("|")[0].split(" ")[0]) - transcript_id_mapping = {k.split(".")[0]: k for k in transcripts_dict.keys()} - - # 4. Find overlapping transcripts for all VCF records - variant_transcripts = self._annotate_vcf_with_transcripts( - self._vcf_file, self._gtf_file, chrom_mapper, - ) - - # 5. 
Process VCF records - _, vcf_df = self._read_vcf(self._vcf_file) - - stats = { - "variants_processed": 0, - "variants_excluded_clnsig": 0, - "variants_no_overlap": 0, - "variants_translated": 0, - "transcript_not_found": 0, - } - - with open(self._output_file, "w", encoding="utf-8") as out_fh: - for _, record in vcf_df.iterrows(): - stats["variants_processed"] += 1 - - # Validate REF/ALT - ref = str(record.REF) - alts = [a for a in str(record.ALT).split(",") - if a and all(c in "ACGT" for c in a)] - if not all(c in "ACGT" for c in ref) or not alts: - continue - - # Filter by CLNSIG - clnsig = self._get_info_field(str(record.INFO), "CLNSIG") - if not self.passes_clnsig_filter(clnsig, self._clnsig_exclude): - stats["variants_excluded_clnsig"] += 1 - continue - - # Translation table - trans_table = self._translation_table - chrom_str = str(record.CHROM).lstrip("chr").upper() - if chrom_str in ("M", "MT"): - trans_table = self._mito_translation_table - - # Get overlapping transcripts - variant_key = f"{record.CHROM}:{record.POS}:{record.REF}:{record.ALT}" - transcript_ids = variant_transcripts.get(variant_key, []) - if not transcript_ids: - stats["variants_no_overlap"] += 1 - continue - - for transcript_id in transcript_ids: - # Resolve transcript version - try: - transcript_id_v = transcript_id_mapping.get( - transcript_id.split(".")[0], transcript_id - ) - except (AttributeError, KeyError): - transcript_id_v = transcript_id - - # Get sequence from FASTA - try: - row = transcripts_dict[transcript_id_v] - ref_seq = row.seq - desc = str(row.description) - except KeyError: - stats["transcript_not_found"] += 1 - continue - - # Determine feature types and CDS info - feature_types = ["exon"] - cds_info: list[int] = [] - num_orfs = 3 - if "CDS=" in desc: - try: - cds_info = [int(x) for x in desc.split(" ")[1].split("=")[1].split("-")] - feature_types = ["CDS", "stop_codon"] - num_orfs = 1 - except (ValueError, IndexError): - pass - - # Override with configured num_orfs - if self._num_orfs: - num_orfs = self._num_orfs - - # Get genomic features from GTF - chrom, strand, features_info = self._get_features( - db, transcript_id_v, feature_types, - ) - if chrom is None: - continue - - # Verify chromosome match (after mapping) - record_chrom_refseq = chrom_mapper.map_chrom(str(record.CHROM), "refseq") - if chrom != record_chrom_refseq: - continue - - for alt in alts: - # Check overlap - var_pos = int(record.POS) - if not check_overlap(var_pos, var_pos + len(ref) - 1, features_info): - continue - - # Apply variant - coding_ref, coding_alt = get_altseq( - ref_seq, Seq(ref), Seq(alt), var_pos, - strand, features_info, cds_info, - ) - - if coding_alt != "": - ref_orfs, alt_orfs = get_orfs_vcf( - coding_ref, coding_alt, trans_table, num_orfs, - ) - - # Build header with ClinVar metadata - gene_symbol, _ = self.parse_geneinfo( - self._get_info_field(str(record.INFO), "GENEINFO"), - ) - record_id = str(record.ID) if record.ID and record.ID != "." 
else "" - seq_id = "_".join(filter(None, [ - self._protein_prefix + record_id, - f"{record.CHROM}.{record.POS}.{ref}.{alt}", - transcript_id_v, - ])) - - desc_str = f"{clnsig}|{gene_symbol}" if gene_symbol else clnsig - - write_output( - seq_id=seq_id, - desc=desc_str, - seqs=alt_orfs, - prots_fn=out_fh, - seqs_filter=ref_orfs, - ) - stats["variants_translated"] += 1 - - if self._report_ref_seq: - write_output( - seq_id=transcript_id_v, - desc="", - seqs=ref_orfs, - prots_fn=out_fh, - ) - - logger.info("ClinVar pipeline summary: %s", stats) - return self._output_file -``` - -**Step 5: Run test to verify it passes** - -Run: `cd /Users/yperez/work/proteogenomics/pgatk && python -m pytest pgatk/tests/test_clinvar/test_clinvar_service.py -v` -Expected: All PASS - -Note: The integration tests (`TestClinVarPipeline`) may need test data adjustments. The test data sequences and positions must be consistent — CDS positions in the GTF must overlap VCF variant positions, and FASTA sequences must match CDS lengths. Debug and adjust the test data files if positions don't align. The important thing is: Pathogenic/Likely_pathogenic variants get translated, Benign variants are excluded. - -**Step 6: Run all tests** - -Run: `cd /Users/yperez/work/proteogenomics/pgatk && python -m pytest pgatk/tests/ -v` -Expected: All tests PASS - -**Step 7: Commit** - -```bash -git add pgatk/clinvar/clinvar_service.py \ - pgatk/testdata/clinvar/mini_clinvar.vcf \ - pgatk/testdata/clinvar/mini_refseq.gtf \ - pgatk/testdata/clinvar/mini_refseq_protein.faa \ - pgatk/tests/test_clinvar/test_clinvar_service.py -git commit -m "feat: add ClinVarService pipeline for VCF-to-protein generation - -Processes ClinVar VCF against RefSeq GTF using BedTools overlap detection. -Filters by CLNSIG, reuses shared get_altseq/get_orfs_vcf for variant -application and translation." -``` - ---- - -## Task 5: NCBI Data Downloader - -Download NCBI RefSeq and ClinVar files from FTP. 
- -**Files:** -- Create: `pgatk/clinvar/data_downloader.py` -- Test: `pgatk/tests/test_clinvar/test_data_downloader.py` - -**Step 1: Write the failing test** - -Create `pgatk/tests/test_clinvar/test_data_downloader.py`: - -```python -"""Tests for NCBI data downloader (no actual downloads).""" - -from pathlib import Path - -import pytest - -from pgatk.clinvar.data_downloader import NcbiDataDownloader - - -class TestNcbiDataDownloader: - """Tests for NcbiDataDownloader.""" - - def test_build_refseq_urls(self): - downloader = NcbiDataDownloader(output_dir="/tmp/test") - urls = downloader.get_refseq_urls() - assert any("GRCh38_latest_genomic.gtf.gz" in u for u in urls) - assert any("GRCh38_latest_protein.faa.gz" in u for u in urls) - assert any("assembly_report.txt" in u for u in urls) - - def test_build_clinvar_urls(self): - downloader = NcbiDataDownloader(output_dir="/tmp/test") - urls = downloader.get_clinvar_urls() - assert any("clinvar.vcf.gz" in u for u in urls) - - def test_expected_files_list(self): - downloader = NcbiDataDownloader(output_dir="/tmp/test") - files = downloader.expected_files() - assert len(files) >= 4 # gtf, protein fasta, assembly report, clinvar vcf - - def test_output_dir_created(self, tmp_path): - out_dir = tmp_path / "ncbi_data" - downloader = NcbiDataDownloader(output_dir=str(out_dir)) - downloader.ensure_output_dir() - assert out_dir.exists() -``` - -**Step 2: Run test to verify it fails** - -Run: `cd /Users/yperez/work/proteogenomics/pgatk && python -m pytest pgatk/tests/test_clinvar/test_data_downloader.py -v` -Expected: FAIL with `ModuleNotFoundError` - -**Step 3: Implement `pgatk/clinvar/data_downloader.py`** - -```python -"""Download NCBI RefSeq and ClinVar reference files. - -Uses urllib for FTP/HTTPS downloads. Skips existing files unless force=True. -""" -from __future__ import annotations - -import logging -import os -from pathlib import Path - -from pgatk.toolbox.general import download_file, check_create_folders - -logger = logging.getLogger(__name__) - -_DEFAULT_REFSEQ_BASE = ( - "https://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/annotation/" - "GRCh38_latest/refseq_identifiers/" -) -_DEFAULT_CLINVAR_BASE = ( - "https://ftp.ncbi.nlm.nih.gov/pub/clinvar/vcf_GRCh38/" -) - -_REFSEQ_FILES = [ - "GRCh38_latest_genomic.gtf.gz", - "GRCh38_latest_protein.faa.gz", - "GRCh38_latest_assembly_report.txt", -] -_CLINVAR_FILES = [ - "clinvar.vcf.gz", - "clinvar.vcf.gz.tbi", -] - - -class NcbiDataDownloader: - """Download NCBI RefSeq and ClinVar reference data.""" - - def __init__( - self, - output_dir: str, - refseq_base_url: str = _DEFAULT_REFSEQ_BASE, - clinvar_base_url: str = _DEFAULT_CLINVAR_BASE, - ) -> None: - self._output_dir = output_dir - self._refseq_base = refseq_base_url - self._clinvar_base = clinvar_base_url - - def ensure_output_dir(self) -> None: - """Create output directory if it doesn't exist.""" - check_create_folders([self._output_dir]) - - def get_refseq_urls(self) -> list[str]: - """Return list of RefSeq file URLs to download.""" - return [self._refseq_base + f for f in _REFSEQ_FILES] - - def get_clinvar_urls(self) -> list[str]: - """Return list of ClinVar file URLs to download.""" - return [self._clinvar_base + f for f in _CLINVAR_FILES] - - def expected_files(self) -> list[str]: - """Return list of expected local file paths after download.""" - all_files = _REFSEQ_FILES + _CLINVAR_FILES - return [os.path.join(self._output_dir, f) for f in all_files] - - def download_all(self, force: bool = False) -> list[str]: - """Download all required files. 
Returns list of downloaded file paths. - - :param force: If True, re-download even if files exist. - """ - self.ensure_output_dir() - downloaded = [] - - all_urls = self.get_refseq_urls() + self.get_clinvar_urls() - all_names = _REFSEQ_FILES + _CLINVAR_FILES - - for url, name in zip(all_urls, all_names): - local_path = os.path.join(self._output_dir, name) - - if os.path.exists(local_path) and not force: - logger.info("File already exists, skipping: %s", local_path) - downloaded.append(local_path) - continue - - logger.info("Downloading %s -> %s", url, local_path) - result = download_file(url, local_path, logger) - if result: - downloaded.append(result) - else: - logger.error("Failed to download: %s", url) - - return downloaded -``` - -**Step 4: Run test to verify it passes** - -Run: `cd /Users/yperez/work/proteogenomics/pgatk && python -m pytest pgatk/tests/test_clinvar/test_data_downloader.py -v` -Expected: All PASS - -**Step 5: Commit** - -```bash -git add pgatk/clinvar/data_downloader.py pgatk/tests/test_clinvar/test_data_downloader.py -git commit -m "feat: add NcbiDataDownloader for RefSeq/ClinVar file downloads - -Downloads GTF, protein FASTA, assembly report, and ClinVar VCF from NCBI FTP. -Skips existing files, supports --force re-download." -``` - ---- - -## Task 6: CLI Commands and CLI Registration - -Wire up the two new CLI commands and register them in `cli.py`. - -**Files:** -- Create: `pgatk/commands/clinvar_to_proteindb.py` -- Create: `pgatk/commands/ncbi_downloader.py` -- Modify: `pgatk/cli.py` - -**Step 1: Create `pgatk/commands/clinvar_to_proteindb.py`** - -```python -import click - -from pgatk.clinvar.clinvar_service import ClinVarService -from pgatk.config.registry import load_config - - -@click.command("clinvar-to-proteindb", short_help="Generate protein database from ClinVar VCF + RefSeq GTF") -@click.option("-c", "--config_file", help="Configuration YAML file (optional)") -@click.option("-v", "--vcf", required=True, help="ClinVar VCF file path") -@click.option("-g", "--gtf", required=True, help="NCBI RefSeq GTF file path") -@click.option("-f", "--fasta", required=True, help="RefSeq protein/transcript FASTA file path") -@click.option("-a", "--assembly-report", required=True, help="NCBI assembly report file path") -@click.option("-o", "--output", required=True, help="Output protein FASTA file path") -@click.option("--clnsig-exclude", default=None, - help="Comma-separated CLNSIG values to exclude (default: Benign,Likely_benign,Benign/Likely_benign)") -@click.option("--consequences", default=None, - help="Comma-separated molecular consequences to include") -@click.option("-t", "--translation-table", type=int, default=None, help="Translation table (default: 1)") -@click.option("-p", "--protein-prefix", default=None, help="Prefix for variant protein IDs (default: clinvar_)") -@click.option("--report-ref-seq", is_flag=True, default=False, - help="Also report reference protein sequences") -@click.pass_context -def clinvar_to_proteindb(ctx, config_file, vcf, gtf, fasta, assembly_report, output, - clnsig_exclude, consequences, translation_table, protein_prefix, - report_ref_seq): - """Generate a variant protein database from ClinVar VCF and NCBI RefSeq GTF. - - This command does NOT require VEP annotations. It uses BedTools interval - overlap to find transcripts affected by each ClinVar variant, then applies - the variant and translates to protein. 
- """ - config = load_config("clinvar", config_file) - clinvar_cfg = config.get("clinvar_translation", {}) - - # Build kwargs with config defaults, CLI overrides - kwargs = { - "vcf_file": vcf, - "gtf_file": gtf, - "fasta_file": fasta, - "assembly_report": assembly_report, - "output_file": output, - "translation_table": translation_table or clinvar_cfg.get("translation_table", 1), - "mito_translation_table": clinvar_cfg.get("mito_translation_table", 2), - "protein_prefix": protein_prefix or clinvar_cfg.get("protein_prefix", "clinvar_"), - "num_orfs": clinvar_cfg.get("num_orfs", 1), - "report_ref_seq": report_ref_seq or clinvar_cfg.get("report_ref_seq", False), - } - - if clnsig_exclude: - kwargs["clnsig_exclude"] = [x.strip() for x in clnsig_exclude.split(",")] - elif "clinical_significance_exclude" in clinvar_cfg: - kwargs["clnsig_exclude"] = clinvar_cfg["clinical_significance_exclude"] - - if consequences: - kwargs["consequences"] = [x.strip() for x in consequences.split(",")] - elif "include_consequences" in clinvar_cfg: - kwargs["consequences"] = clinvar_cfg["include_consequences"] - - service = ClinVarService(**kwargs) - service.run() -``` - -**Step 2: Create `pgatk/commands/ncbi_downloader.py`** - -```python -import click - -from pgatk.clinvar.data_downloader import NcbiDataDownloader -from pgatk.config.registry import load_config - - -@click.command("ncbi-downloader", short_help="Download NCBI RefSeq and ClinVar reference files") -@click.option("-c", "--config_file", help="Configuration YAML file (optional)") -@click.option("-o", "--output-dir", required=True, help="Output directory for downloaded files") -@click.option("--force", is_flag=True, default=False, help="Re-download files even if they exist") -@click.pass_context -def ncbi_downloader(ctx, config_file, output_dir, force): - """Download NCBI RefSeq GTF, protein FASTA, assembly report, and ClinVar VCF. - - Files are downloaded to the specified output directory. Existing files - are skipped unless --force is used. 
- """ - config = load_config("clinvar", config_file) - clinvar_cfg = config.get("clinvar_translation", {}) - - refseq_base = clinvar_cfg.get("ncbi_refseq_ftp", - "https://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/annotation/GRCh38_latest/refseq_identifiers/") - clinvar_base = clinvar_cfg.get("clinvar_ftp", - "https://ftp.ncbi.nlm.nih.gov/pub/clinvar/vcf_GRCh38/") - - downloader = NcbiDataDownloader( - output_dir=output_dir, - refseq_base_url=refseq_base, - clinvar_base_url=clinvar_base, - ) - downloaded = downloader.download_all(force=force) - click.echo(f"Downloaded {len(downloaded)} files to {output_dir}") -``` - -**Step 3: Register commands in `pgatk/cli.py`** - -Add at line 30 (after the existing imports): - -```python -from pgatk.commands import clinvar_to_proteindb as clinvar_to_proteindb_cmd -from pgatk.commands import ncbi_downloader as ncbi_downloader_cmd -``` - -Add at line 56 (after the last `cli.add_command`): - -```python -cli.add_command(clinvar_to_proteindb_cmd.clinvar_to_proteindb) -cli.add_command(ncbi_downloader_cmd.ncbi_downloader) -``` - -**Step 4: Verify CLI help works** - -Run: `cd /Users/yperez/work/proteogenomics/pgatk && python -m pgatk.cli --help` -Expected: Output should include `clinvar-to-proteindb` and `ncbi-downloader` commands - -Run: `cd /Users/yperez/work/proteogenomics/pgatk && python -m pgatk.cli clinvar-to-proteindb --help` -Expected: Shows help text with all options - -**Step 5: Run all tests** - -Run: `cd /Users/yperez/work/proteogenomics/pgatk && python -m pytest pgatk/tests/ -v` -Expected: All tests PASS - -**Step 6: Commit** - -```bash -git add pgatk/commands/clinvar_to_proteindb.py \ - pgatk/commands/ncbi_downloader.py \ - pgatk/cli.py -git commit -m "feat: add clinvar-to-proteindb and ncbi-downloader CLI commands - -Wire up ClinVar pipeline and NCBI downloader as Click commands. -Register both in the main CLI entry point." -``` - ---- - -## Task 7: Final Integration Test and Cleanup - -End-to-end test running the full CLI pipeline with mini test data. 
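-
-While debugging the end-to-end run it can help to inspect the produced FASTA directly. A quick ad-hoc check, assuming Biopython (already a pipeline dependency); the committed tests below use plain string asserts instead:
-
-```python
-from Bio import SeqIO
-
-records = list(SeqIO.parse("clinvar_output.fa", "fasta"))
-print(len(records), "variant protein entries")
-for rec in records[:3]:
-    print(rec.id, len(rec.seq))
-```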
- -**Files:** -- Create: `pgatk/tests/test_clinvar/test_clinvar_integration.py` - -**Step 1: Write integration test** - -Create `pgatk/tests/test_clinvar/test_clinvar_integration.py`: - -```python -"""End-to-end integration test for the ClinVar pipeline.""" - -from pathlib import Path - -from click.testing import CliRunner - -from pgatk.cli import cli - - -TESTDATA_DIR = Path(__file__).resolve().parent.parent.parent / "testdata" / "clinvar" - - -class TestClinVarCLIIntegration: - """Test the clinvar-to-proteindb CLI command end-to-end.""" - - def test_cli_produces_output(self, tmp_path): - output_file = tmp_path / "clinvar_output.fa" - runner = CliRunner() - result = runner.invoke(cli, [ - "clinvar-to-proteindb", - "--vcf", str(TESTDATA_DIR / "mini_clinvar.vcf"), - "--gtf", str(TESTDATA_DIR / "mini_refseq.gtf"), - "--fasta", str(TESTDATA_DIR / "mini_refseq_protein.faa"), - "--assembly-report", str(TESTDATA_DIR / "mini_assembly_report.txt"), - "--output", str(output_file), - ]) - assert result.exit_code == 0, f"CLI failed: {result.output}\n{result.exception}" - assert output_file.exists() - - def test_cli_help_shows_options(self): - runner = CliRunner() - result = runner.invoke(cli, ["clinvar-to-proteindb", "--help"]) - assert result.exit_code == 0 - assert "clinvar" in result.output.lower() - assert "--vcf" in result.output - assert "--assembly-report" in result.output - - def test_ncbi_downloader_help(self): - runner = CliRunner() - result = runner.invoke(cli, ["ncbi-downloader", "--help"]) - assert result.exit_code == 0 - assert "--output-dir" in result.output - assert "--force" in result.output -``` - -**Step 2: Run integration test** - -Run: `cd /Users/yperez/work/proteogenomics/pgatk && python -m pytest pgatk/tests/test_clinvar/test_clinvar_integration.py -v` -Expected: All PASS - -**Step 3: Run full test suite** - -Run: `cd /Users/yperez/work/proteogenomics/pgatk && python -m pytest pgatk/tests/ -v` -Expected: All tests PASS (existing + new) - -**Step 4: Commit** - -```bash -git add pgatk/tests/test_clinvar/test_clinvar_integration.py -git commit -m "test: add end-to-end CLI integration tests for ClinVar pipeline" -``` - -**Step 5: Clean up any gffutils .db files created during tests** - -If gffutils created `.db` files in testdata during test runs, add them to `.gitignore`: - -```bash -echo "*.db" >> pgatk/testdata/clinvar/.gitignore -git add pgatk/testdata/clinvar/.gitignore -git commit -m "chore: gitignore gffutils .db files in ClinVar testdata" -``` diff --git a/docs/plans/2026-03-02-clinvar-ncbi-support-design.md b/docs/plans/2026-03-02-clinvar-ncbi-support-design.md deleted file mode 100644 index f9c4c6b..0000000 --- a/docs/plans/2026-03-02-clinvar-ncbi-support-design.md +++ /dev/null @@ -1,208 +0,0 @@ -# ClinVar/NCBI RefSeq Protein Database Generation — Design - -**Goal:** Add support for generating variant protein databases from ClinVar VCF files using NCBI RefSeq GTF annotations, independent of VEP/Ensembl. - -**Architecture:** New `pgatk/clinvar/` module with ClinVar-specific pipeline, chromosome mapper, and NCBI data downloader. Shared algorithms (`get_altseq`, `get_orfs_vcf`) are reused from existing code. - -**Tech Stack:** Python, pysam (VCF), gffutils (GTF), pyBedTools (interval overlap), pyfaidx (FASTA), tqdm, PyYAML. - ---- - -## 1. 
Architecture Overview - -### New module: `pgatk/clinvar/` - -``` -pgatk/clinvar/ - __init__.py - clinvar_service.py # Main pipeline: ClinVar VCF + RefSeq GTF -> protein FASTA - chromosome_mapper.py # NC_000001.11 <-> 1 <-> chr1 mapping - data_downloader.py # Download NCBI RefSeq + ClinVar files -``` - -### New CLI commands - -``` -pgatk/commands/ - clinvar_to_proteindb.py # clinvar-to-proteindb command - ncbi_downloader.py # ncbi-downloader command -``` - -### Shared utilities - -Extract reusable VCF-to-protein functions into `pgatk/toolbox/vcf_utils.py`: -- `get_altseq()` — apply variant to transcript sequence -- `get_orfs_vcf()` — translate modified transcript to protein -- `three_frame_translation()` — 3-frame translation helper - -These are currently in `pgatk/ensembl/ensembl.py` and will be imported by both the Ensembl and ClinVar pipelines. - -### New configuration - -``` -pgatk/config/clinvar_config.yaml -``` - ---- - -## 2. Pipeline Data Flow - -### Stage 1: Chromosome Mapping - -Parse NCBI assembly report (`GRCh38_latest_assembly_report.txt`) to build bidirectional chromosome name maps: -- `NC_000001.11` (RefSeq accession in GTF) -- `1` (numeric, used in ClinVar VCF) -- `chr1` (UCSC-style) - -The `chromosome_mapper.py` module exposes `map_chrom(name, target_convention)` for converting between conventions. - -### Stage 2: GTF Indexing - -Use gffutils to build a database from the RefSeq GTF: -- Filter to `protein_coding` biotype transcripts (`NM_*` and optionally `XM_*`) -- Index CDS features per transcript -- Build transcript-to-gene mapping from `gene_id` attribute (gene symbol in RefSeq) -- Extract transcript CDS sequences from protein FASTA (or reconstruct from genome FASTA) - -### Stage 3: ClinVar VCF Processing - -For each VCF record: -1. **Filter by CLNSIG:** Exclude Benign, Likely_benign, Benign/Likely_benign -2. **Filter by MC (molecular consequence):** Keep missense, nonsense, frameshift, indels, splice variants -3. **Map chromosome:** Convert VCF chromosome (numeric) to GTF chromosome (NC_) using chromosome mapper -4. **Find overlapping transcripts:** Use BedTools/interval tree to find transcripts whose CDS region overlaps the variant position -5. **Apply variant:** For each overlapping transcript, use `get_altseq()` to modify the transcript sequence -6. **Translate:** Use `get_orfs_vcf()` to translate modified sequence to protein -7. **Collect unique variant proteins** - -### Stage 4: Output - -Write FASTA with ClinVar metadata headers: -``` ->clinvar|RCV000012345|NM_000123|BRCA1|Pathogenic|missense_variant VARIANT_DESCRIPTION -MPROTEINSEQUENCE... -``` - ---- - -## 3. 
NCBI Downloader - -### Files downloaded - -| File | Source | Purpose | -|------|--------|---------| -| `GRCh38_latest_genomic.gtf.gz` | RefSeq FTP | Gene annotations | -| `GRCh38_latest_protein.faa.gz` | RefSeq FTP | Protein sequences | -| `GRCh38_latest_assembly_report.txt` | RefSeq FTP | Chromosome name mapping | -| `clinvar.vcf.gz` + `.tbi` | ClinVar FTP | Variant calls | - -### Implementation (`pgatk/clinvar/data_downloader.py`) - -- Uses `urllib.request` for FTP/HTTPS downloads (no extra dependencies) -- Downloads to user-specified `--output-dir` -- Skips existing files unless `--force` is passed -- Parses assembly report to produce chromosome mapping JSON -- Progress reporting via `tqdm` - -### CLI - -``` -pgatk ncbi-downloader --output-dir ./ncbi_data [--genome-build GRCh38] [--force] -``` - -### Chromosome Mapper (`pgatk/clinvar/chromosome_mapper.py`) - -- Reads assembly report TSV (columns: Sequence-Name, Sequence-Role, Assigned-Molecule, GenBank-Accn, RefSeq-Accn, UCSC-style-name) -- Builds bidirectional maps: `NC_000001.11 <-> 1 <-> chr1` -- Exposes `map_chrom(name, target_convention)` for any format conversion -- Handles unknown chromosomes gracefully (returns input unchanged + warning) - ---- - -## 4. Configuration - -### `pgatk/config/clinvar_config.yaml` - -```yaml -# ClinVar-specific defaults -clinical_significance_exclude: - - "Benign" - - "Likely_benign" - - "Benign/Likely_benign" - -# NCBI RefSeq biotypes to include -biotypes: - - "protein_coding" - -# Molecular consequences to include (from ClinVar MC field) -consequences: - - "missense_variant" - - "nonsense" - - "frameshift_variant" - - "inframe_insertion" - - "inframe_deletion" - - "stop_gained" - - "stop_lost" - - "start_lost" - - "splice_donor_variant" - - "splice_acceptor_variant" - -# FTP URLs -ncbi_ftp_base: "https://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/annotation/GRCh38_latest/refseq_identifiers/" -clinvar_ftp_base: "https://ftp.ncbi.nlm.nih.gov/pub/clinvar/vcf_GRCh38/" -``` - -### CLI parameters for `clinvar-to-proteindb` - -``` -pgatk clinvar-to-proteindb \ - --vcf clinvar.vcf.gz \ - --gtf GRCh38_latest_genomic.gtf \ - --fasta GRCh38_latest_protein.faa \ - --assembly-report GRCh38_latest_assembly_report.txt \ - --output clinvar_proteins.fa \ - [--clnsig-exclude "Benign,Likely_benign"] \ - [--consequences "missense_variant,stop_gained,..."] \ - [--biotypes "protein_coding"] \ - [--af-field AF_EXAC] -``` - -All filtering parameters have sensible defaults from the config file and can be overridden via CLI. - ---- - -## 5. Testing Strategy - -### Test data (committed to repo) - -- `testdata/clinvar/mini_clinvar.vcf` — ~20 hand-picked ClinVar records: SNPs (missense, nonsense, stop_lost), indels, Benign + Pathogenic entries, multiple chromosomes -- `testdata/clinvar/mini_refseq.gtf` — GTF subset with ~5 transcripts matching the VCF variants -- `testdata/clinvar/mini_refseq_protein.faa` — Matching protein FASTA for the 5 transcripts -- `testdata/clinvar/mini_assembly_report.txt` — Assembly report subset for tested chromosomes -- `testdata/clinvar/expected_output.fa` — Expected output protein FASTA for validation - -### Unit tests (`pgatk/tests/test_clinvar/`) - -1. **`test_chromosome_mapper.py`** — NC_ <-> numeric <-> chr mapping, unknown chromosome handling, assembly report parsing -2. **`test_clinvar_service.py`** — CLNSIG filtering, transcript overlap detection, variant application (reusing `get_altseq`/`get_orfs_vcf`), end-to-end mini pipeline -3. 
**`test_data_downloader.py`** — Assembly report parsing (no actual FTP downloads), chromosome map JSON generation - -### Integration test - -- **`test_clinvar_integration.py`** — Full pipeline run with mini test data, compare output to expected FASTA - -### Reuse validation - -Shared functions (`get_altseq`, `get_orfs_vcf`) are already tested in `test_ensembl_core.py`. ClinVar tests focus on new code paths: chromosome mapping, CLNSIG filtering, transcript overlap, and CLI wiring. - ---- - -## Design Decisions - -1. **New module vs. extending Ensembl:** ClinVar/RefSeq is fundamentally different from the Ensembl+VEP pipeline (no CSQ annotations, different chromosome naming, different GTF structure). A separate module avoids coupling and keeps both pipelines clean. - -2. **Internal transcript mapping vs. VEP:** ClinVar VCF lacks transcript-level annotations. Rather than requiring users to run VEP first, we use BedTools interval overlap to find transcripts internally. This is simpler for users and keeps the pipeline self-contained. - -3. **Shared algorithm extraction:** `get_altseq` and `get_orfs_vcf` are generic enough to work with any VCF+GTF combination. Extracting them to `toolbox/vcf_utils.py` enables reuse without code duplication. - -4. **Assembly report for chromosome mapping:** NCBI's assembly report is the authoritative source for chromosome name mappings. Parsing it once is more reliable than hardcoding mappings. diff --git a/docs/plans/2026-03-03-clinvar-pipeline-hardening.md b/docs/plans/2026-03-03-clinvar-pipeline-hardening.md deleted file mode 100644 index abae8c9..0000000 --- a/docs/plans/2026-03-03-clinvar-pipeline-hardening.md +++ /dev/null @@ -1,618 +0,0 @@ -# ClinVar Pipeline Hardening Implementation Plan - -> **For Claude:** REQUIRED SUB-SKILL: Use superpowers:executing-plans to implement this plan task-by-task. - -**Goal:** Fix the three weaknesses in the ClinVar pipeline: activate MC consequence filtering, eliminate double VCF read, and add biotype filtering. - -**Architecture:** Refactor `ClinVarService.run()` to read the VCF once into a DataFrame, build the BedTools overlap map from the DataFrame (not a second file read), apply MC and biotype filters early to skip irrelevant variants, and add a duplicate variant-transcript guard. - -**Tech Stack:** Python, pandas, pybedtools, gffutils, pytest - ---- - -### Task 1: Activate MC Consequence Filtering - -The `include_consequences` config is defined in `clinvar_config.yaml` but never used. -The ClinVar VCF already has an `MC` field (e.g., `MC=SO:0001583|missense_variant`) that -tells us the variant consequence. We need to parse it and filter variants whose MC -does not match any entry in `include_consequences`. 
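-
-For orientation, the MC field packs one or more `SO_accession|term` pairs, comma-separated; a variant should pass when any term is in the include list. A standalone worked example of that decision (the include list here is illustrative):
-
-```python
-mc = "SO:0001819|synonymous_variant,SO:0001587|stop_gained"
-include = ["missense_variant", "stop_gained"]
-
-# Split the pairs on "," and keep the term after the "|".
-terms = [entry.split("|")[1] for entry in mc.split(",") if "|" in entry]
-print(terms)                             # ['synonymous_variant', 'stop_gained']
-print(any(t in include for t in terms))  # True -> the variant passes
-```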
- -**Files:** -- Modify: `pgatk/clinvar/clinvar_service.py:52-80` (add `_include_consequences` to `__init__`) -- Modify: `pgatk/clinvar/clinvar_service.py:340-532` (add MC filter in `run()`) -- Modify: `pgatk/config/clinvar_config.yaml:18-29` (update defaults — remove splice variants we can't model) -- Test: `pgatk/tests/test_clinvar/test_clinvar_service.py` -- Modify: `pgatk/testdata/clinvar/mini_clinvar.vcf` (add test records with different MC values) - -**Step 1: Write the failing tests** - -Add to `pgatk/tests/test_clinvar/test_clinvar_service.py`: - -```python -class TestMCFiltering: - """Tests for MC consequence filtering.""" - - def test_passes_mc_filter_matching_consequence(self): - """A variant with missense_variant should pass when it's in the include list.""" - include = ["missense_variant", "stop_gained"] - mc = "SO:0001583|missense_variant" - assert ClinVarService.passes_mc_filter(mc, include) is True - - def test_passes_mc_filter_no_match(self): - """A variant with synonymous_variant should NOT pass.""" - include = ["missense_variant", "stop_gained"] - mc = "SO:0001819|synonymous_variant" - assert ClinVarService.passes_mc_filter(mc, include) is False - - def test_passes_mc_filter_multi_consequence_any_match(self): - """If ANY consequence matches the include list, the variant passes.""" - include = ["missense_variant", "stop_gained"] - mc = "SO:0001819|synonymous_variant,SO:0001587|stop_gained" - assert ClinVarService.passes_mc_filter(mc, include) is True - - def test_passes_mc_filter_empty_mc_passes(self): - """Variants without MC field should pass (no info to filter on).""" - include = ["missense_variant"] - assert ClinVarService.passes_mc_filter("", include) is True - - def test_passes_mc_filter_all_keyword(self): - """When include list is ['all'], everything passes.""" - mc = "SO:0001819|synonymous_variant" - assert ClinVarService.passes_mc_filter(mc, ["all"]) is True -``` - -**Step 2: Run tests to verify they fail** - -Run: `cd /Users/yperez/work/proteogenomics/pgatk && python -m pytest pgatk/tests/test_clinvar/test_clinvar_service.py::TestMCFiltering -v` -Expected: FAIL with `AttributeError: type object 'ClinVarService' has no attribute 'passes_mc_filter'` - -**Step 3: Implement `passes_mc_filter` and wire into `__init__` and `run()`** - -In `pgatk/clinvar/clinvar_service.py`: - -1. Add `passes_mc_filter` static method (after `passes_clnsig_filter`): - -```python -@staticmethod -def passes_mc_filter(mc_field: str, include_list: list[str]) -> bool: - """Return True when *mc_field* contains at least one consequence in *include_list*. - - An empty MC field always passes (no information to filter on). - The special value ``'all'`` in include_list disables filtering. - """ - if not mc_field: - return True - if "all" in include_list: - return True - consequences = ClinVarService.parse_mc_consequences(mc_field) - return any(c in include_list for c in consequences) -``` - -2. In `__init__`, after `self._clnsig_exclude = ...` (line 80), add: - -```python -self._include_consequences = self._cfg.get( - "include_consequences", ["all"] -) -``` - -3. In `run()`, after the CLNSIG filter block (after line 401), add: - -```python -# --- MC consequence filter --- -mc_field = self._get_info_field(info, "MC") -if not self.passes_mc_filter(mc_field, self._include_consequences): - stats["variants_filtered_mc"] += 1 - continue -``` - -4. Add `"variants_filtered_mc": 0` to the `stats` dict (line 373). 
- -**Step 4: Update config defaults** - -In `pgatk/config/clinvar_config.yaml`, update the `include_consequences` list. -Remove splice variants (we can't model their protein consequence — that would -require exon-skipping prediction). Add a comment explaining why: - -```yaml - # Molecular consequences to include (from ClinVar MC field). - # Only consequences that produce modelable protein changes are included. - # Splice variants are excluded because the pipeline cannot predict - # exon-skipping outcomes — use VEP for those. - include_consequences: - - "missense_variant" - - "nonsense" - - "frameshift_variant" - - "inframe_insertion" - - "inframe_deletion" - - "stop_gained" - - "stop_lost" - - "start_lost" -``` - -**Step 5: Update test data** - -Add a synonymous variant to `pgatk/testdata/clinvar/mini_clinvar.vcf` (after line 9): - -``` -1 1054 rs00004 G A . . GENEINFO=GeneA:1234;CLNSIG=Pathogenic;MC=SO:0001819|synonymous_variant -``` - -This is Pathogenic but synonymous — it should pass CLNSIG but fail MC filter. - -**Step 6: Add integration test for MC filtering** - -Add to `TestClinVarPipeline` in `test_clinvar_service.py`: - -```python -def test_synonymous_variant_excluded(self, tmp_path): - """Pathogenic synonymous variant (rs00004) should not appear in output.""" - output_file = str(tmp_path / "output.fa") - service = ClinVarService( - vcf_file=MINI_VCF, - gtf_file=MINI_GTF, - fasta_file=MINI_FASTA, - assembly_report=ASSEMBLY_REPORT, - output_file=output_file, - ) - service.run() - with open(output_file, "r") as f: - content = f.read() - assert "rs00004" not in content -``` - -**Step 7: Run all tests to verify** - -Run: `cd /Users/yperez/work/proteogenomics/pgatk && python -m pytest pgatk/tests/test_clinvar/ -v` -Expected: ALL PASS - -**Step 8: Commit** - -```bash -git add pgatk/clinvar/clinvar_service.py pgatk/config/clinvar_config.yaml pgatk/tests/test_clinvar/test_clinvar_service.py pgatk/testdata/clinvar/mini_clinvar.vcf -git commit -m "feat(clinvar): activate MC consequence filtering from config" -``` - ---- - -### Task 2: Single-Pass VCF Reading - -Currently the VCF is read twice: once in `_annotate_vcf_with_transcripts()` (to build -the BED for BedTools) and once in `_read_vcf()` (to get the DataFrame for processing). -Refactor so the VCF is read once into a DataFrame, and the BED is built from the -DataFrame rows. 
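-
-The one subtle point when building the BED from DataFrame rows is the coordinate shift: VCF POS is 1-based, while BED intervals are 0-based and half-open. A minimal illustration (column names as produced by `_read_vcf`):
-
-```python
-import pandas as pd
-
-row = pd.Series({"CHROM": "1", "POS": "1006", "REF": "GA", "ALT": "G"})
-start = int(row.POS) - 1           # 1-based VCF -> 0-based BED start
-end = start + len(str(row.REF))    # half-open end covers every REF base
-print(f"{row.CHROM}\t{start}\t{end}\t{row.CHROM}:{row.POS}:{row.REF}:{row.ALT}")
-# 1	1005	1007	1:1006:GA:G
-```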
- -**Files:** -- Modify: `pgatk/clinvar/clinvar_service.py:246-334` (refactor `_annotate_vcf_with_transcripts`) -- Modify: `pgatk/clinvar/clinvar_service.py:340-380` (refactor `run()` to read once) -- Test: `pgatk/tests/test_clinvar/test_clinvar_service.py` - -**Step 1: Write the failing test** - -Add to `test_clinvar_service.py`: - -```python -class TestBuildOverlapMap: - """Tests for _build_overlap_map (DataFrame-based BedTools annotation).""" - - @pytest.fixture(autouse=True) - def _cleanup_db(self): - db_path = Path(MINI_GTF).with_suffix(".db") - if db_path.exists(): - db_path.unlink() - yield - if db_path.exists(): - db_path.unlink() - - def test_build_overlap_map_from_dataframe(self): - """_build_overlap_map should accept a DataFrame and return overlap dict.""" - from pgatk.clinvar.chromosome_mapper import ChromosomeMapper - chrom_mapper = ChromosomeMapper.from_assembly_report(ASSEMBLY_REPORT) - _meta, vcf_df = ClinVarService._read_vcf(MINI_VCF) - overlap_map = ClinVarService._build_overlap_map(vcf_df, MINI_GTF, chrom_mapper) - assert isinstance(overlap_map, dict) - # rs00001 at chr1:1006 should overlap NM_000001.1 CDS (1000-1299) - assert any("1:1006:" in k for k in overlap_map) - - def test_build_overlap_map_matches_old_method(self): - """_build_overlap_map should produce same results as _annotate_vcf_with_transcripts.""" - from pgatk.clinvar.chromosome_mapper import ChromosomeMapper - chrom_mapper = ChromosomeMapper.from_assembly_report(ASSEMBLY_REPORT) - _meta, vcf_df = ClinVarService._read_vcf(MINI_VCF) - new_result = ClinVarService._build_overlap_map(vcf_df, MINI_GTF, chrom_mapper) - old_result = ClinVarService._annotate_vcf_with_transcripts(MINI_VCF, MINI_GTF, chrom_mapper) - assert new_result == old_result -``` - -**Step 2: Run tests to verify they fail** - -Run: `cd /Users/yperez/work/proteogenomics/pgatk && python -m pytest pgatk/tests/test_clinvar/test_clinvar_service.py::TestBuildOverlapMap -v` -Expected: FAIL with `AttributeError: ... has no attribute '_build_overlap_map'` - -**Step 3: Implement `_build_overlap_map`** - -Add new method to `ClinVarService` (replace or supplement `_annotate_vcf_with_transcripts`): - -```python -@staticmethod -def _build_overlap_map( - vcf_df: pd.DataFrame, - gtf_file: str, - chrom_mapper: ChromosomeMapper, -) -> dict[str, list[str]]: - """Find transcripts overlapping each VCF variant via BedTools. - - Unlike ``_annotate_vcf_with_transcripts``, this method builds the BED - from an already-loaded DataFrame, avoiding a second file read. - - Returns a dict mapping ``"CHROM:POS:REF:ALT"`` variant keys to lists - of overlapping transcript IDs. 
- """ - bed_lines: list[str] = [] - for _, row in vcf_df.iterrows(): - ref = str(row.REF) - if any(c not in "ACGT" for c in ref): - continue - chrom_numeric = str(row.CHROM) - pos = int(row.POS) - alt_field = str(row.ALT) - chrom_refseq = chrom_mapper.map_chrom(chrom_numeric, "refseq") - start = pos - 1 # BED is 0-based half-open - end = start + len(ref) - for alt in alt_field.split(","): - alt = alt.strip() - if not alt or not all(c in "ACGT" for c in alt): - continue - variant_key = f"{chrom_numeric}:{pos}:{ref}:{alt}" - bed_lines.append(f"{chrom_refseq}\t{start}\t{end}\t{variant_key}\n") - - if not bed_lines: - return {} - - with tempfile.NamedTemporaryFile(mode="w", suffix=".bed", delete=False) as tmp: - tmp.writelines(bed_lines) - tmp_bed_path = tmp.name - - try: - vcf_bed = BedTool(tmp_bed_path) - gtf_bed = BedTool(gtf_file) - intersection = vcf_bed.intersect(gtf_bed, wo=True) - - result: dict[str, list[str]] = {} - for feature in intersection: - fields = str(feature).strip().split("\t") - variant_key = fields[3] - gtf_type_idx = 4 + 2 - if len(fields) <= gtf_type_idx: - continue - if fields[gtf_type_idx] != "CDS": - continue - gtf_attrs_idx = 4 + 8 - if len(fields) <= gtf_attrs_idx: - continue - transcript_id = _extract_transcript_id(fields[gtf_attrs_idx]) - if transcript_id: - result.setdefault(variant_key, []) - if transcript_id not in result[variant_key]: - result[variant_key].append(transcript_id) - return result - finally: - Path(tmp_bed_path).unlink(missing_ok=True) -``` - -**Step 4: Refactor `run()` to use single-pass reading** - -In `run()`, replace the two calls: - -```python -# OLD (two reads): -overlap_map = self._annotate_vcf_with_transcripts( - self._vcf_file, self._gtf_file, chrom_mapper -) -_metadata, vcf_df = self._read_vcf(self._vcf_file) - -# NEW (one read): -_metadata, vcf_df = self._read_vcf(self._vcf_file) -overlap_map = self._build_overlap_map(vcf_df, self._gtf_file, chrom_mapper) -``` - -**Step 5: Remove `_annotate_vcf_with_transcripts`** - -Delete the old method entirely (lines 246-334). It is no longer called. - -**Step 6: Run all tests** - -Run: `cd /Users/yperez/work/proteogenomics/pgatk && python -m pytest pgatk/tests/test_clinvar/ -v` -Expected: ALL PASS - -**Step 7: Commit** - -```bash -git add pgatk/clinvar/clinvar_service.py pgatk/tests/test_clinvar/test_clinvar_service.py -git commit -m "refactor(clinvar): single-pass VCF reading — build overlap map from DataFrame" -``` - ---- - -### Task 3: Biotype Filtering from GTF - -The config has `include_biotypes: "protein_coding"` but it's never applied. When -processing a transcript, check its `gene_biotype` attribute from the gffutils DB. -Skip transcripts that don't match the include list. - -**Files:** -- Modify: `pgatk/clinvar/clinvar_service.py:52-80` (add `_include_biotypes` to `__init__`) -- Modify: `pgatk/clinvar/clinvar_service.py` in `run()` (add biotype check in transcript loop) -- Test: `pgatk/tests/test_clinvar/test_clinvar_service.py` -- Modify: `pgatk/testdata/clinvar/mini_refseq.gtf` (add `gene_biotype` attribute to existing records) - -**Step 1: Update test data** - -Add `gene_biotype "protein_coding";` to all GTF lines in `mini_refseq.gtf`. -For example, the first transcript line becomes: - -``` -NC_000001.11 BestRefSeq transcript 1000 1299 . + . gene_id "GeneA"; transcript_id "NM_000001.1"; gene_biotype "protein_coding"; -``` - -Do this for ALL lines (transcript, exon, CDS) in the GTF. 
-
-**Step 2: Write the failing test**
-
-Add to `test_clinvar_service.py`:
-
-```python
-class TestBiotypeFiltering:
-    """Tests for biotype filtering from GTF."""
-
-    def test_get_biotype_from_db(self):
-        """_get_transcript_biotype should extract gene_biotype from gffutils DB."""
-        from pgatk.clinvar.clinvar_service import ClinVarService
-        db = ClinVarService._parse_gtf(MINI_GTF)
-        biotype = ClinVarService._get_transcript_biotype(db, "NM_000001.1")
-        assert biotype == "protein_coding"
-
-    def test_get_biotype_missing_returns_empty(self):
-        """Missing biotype returns empty string (passes filter)."""
-        from pgatk.clinvar.clinvar_service import ClinVarService
-        db = ClinVarService._parse_gtf(MINI_GTF)
-        biotype = ClinVarService._get_transcript_biotype(db, "NONEXISTENT")
-        assert biotype == ""
-```
-
-**Step 3: Run tests to verify they fail**
-
-Run: `cd /Users/yperez/work/proteogenomics/pgatk && python -m pytest pgatk/tests/test_clinvar/test_clinvar_service.py::TestBiotypeFiltering -v`
-Expected: FAIL with `AttributeError`
-
-**Step 4: Implement biotype filtering**
-
-1. Add `_get_transcript_biotype` static method:
-
-```python
-@staticmethod
-def _get_transcript_biotype(db: gffutils.FeatureDB, transcript_id: str) -> str:
-    """Extract gene_biotype from a gffutils transcript feature.
-
-    Returns empty string if the transcript or attribute is not found.
-    """
-    try:
-        feature = db[transcript_id]
-    except gffutils.exceptions.FeatureNotFoundError:
-        try:
-            feature = db[transcript_id.split(".")[0]]
-        except gffutils.exceptions.FeatureNotFoundError:
-            return ""
-    try:
-        return feature.attributes.get("gene_biotype", [""])[0]
-    except (IndexError, AttributeError):
-        return ""
-```
-
-2. In `__init__`, load the biotype config:
-
-```python
-biotypes_raw = self._cfg.get("include_biotypes", "all")
-if isinstance(biotypes_raw, str):
-    self._include_biotypes = [b.strip() for b in biotypes_raw.split(",")]
-else:
-    self._include_biotypes = list(biotypes_raw)
-```
-
-3. In `run()`, inside the transcript loop (after looking up the transcript in
-FASTA, before getting features), add the check below. Use the same loop
-variable name as the Task 4 guard (`transcript_id`):
-
-```python
-# --- Biotype filter ---
-if self._include_biotypes != ["all"]:
-    biotype = self._get_transcript_biotype(db, transcript_id)
-    if biotype and biotype not in self._include_biotypes:
-        stats["transcripts_filtered_biotype"] += 1
-        continue
-```
-
-4. Add `"transcripts_filtered_biotype": 0` to `stats`.
-
-**Step 5: Run all tests**
-
-Run: `cd /Users/yperez/work/proteogenomics/pgatk && python -m pytest pgatk/tests/test_clinvar/ -v`
-Expected: ALL PASS
-
-**Step 6: Commit**
-
-```bash
-git add pgatk/clinvar/clinvar_service.py pgatk/tests/test_clinvar/test_clinvar_service.py pgatk/testdata/clinvar/mini_refseq.gtf
-git commit -m "feat(clinvar): add biotype filtering from GTF gene_biotype attribute"
-```
-
----
-
-### Task 4: Duplicate Variant-Transcript Guard and Stats Cleanup
-
-The Ensembl pipeline tracks processed `(transcript_id, ref, alt)` combinations to
-avoid processing the same variant-transcript pair twice. The ClinVar pipeline lacks
-this. Also clean up the `_read_vcf` HEADERS dict (deferred I3 from code review).
-
-**Files:**
-- Modify: `pgatk/clinvar/clinvar_service.py` in `run()` (add duplicate guard)
-- Modify: `pgatk/clinvar/clinvar_service.py:213-240` (clean up `_read_vcf`)
-- Test: `pgatk/tests/test_clinvar/test_clinvar_service.py`
-
-**Step 1: Write the failing test**
-
-```python
-class TestDuplicateGuard:
-    """Pipeline should not process the same variant-transcript pair twice."""
-
-    @pytest.fixture(autouse=True)
-    def _cleanup_db(self):
-        db_path = Path(MINI_GTF).with_suffix(".db")
-        if db_path.exists():
-            db_path.unlink()
-        yield
-        if db_path.exists():
-            db_path.unlink()
-
-    def test_no_duplicate_fasta_entries(self, tmp_path):
-        """Each variant-transcript combo should appear at most once in output."""
-        output_file = str(tmp_path / "output.fa")
-        service = ClinVarService(
-            vcf_file=MINI_VCF,
-            gtf_file=MINI_GTF,
-            fasta_file=MINI_FASTA,
-            assembly_report=ASSEMBLY_REPORT,
-            output_file=output_file,
-        )
-        service.run()
-        with open(output_file, "r") as f:
-            headers = [line.strip() for line in f if line.startswith(">")]
-        # No duplicate headers
-        assert len(headers) == len(set(headers)), (
-            f"Duplicate FASTA entries found: {headers}"
-        )
-```
-
-**Step 2: Implement duplicate guard in `run()`**
-
-Before the main `for _, record in vcf_df.iterrows():` loop, add:
-
-```python
-processed_pairs: set[str] = set()
-```
-
-Inside the transcript loop, before the `# Resolve transcript in FASTA` block, add:
-
-```python
-pair_key = f"{variant_key}:{transcript_id}"
-if pair_key in processed_pairs:
-    continue
-processed_pairs.add(pair_key)
-```
-
-**Step 3: Clean up `_read_vcf` HEADERS**
-
-Replace the `HEADERS` dict with a simple column name list (remove the misleading
-type values that are never applied):
-
-```python
-@staticmethod
-def _read_vcf(vcf_file: str) -> tuple[list, pd.DataFrame]:
-    """Read a VCF file and return metadata lines and a DataFrame of records."""
-    COLUMNS = ["CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO"]
-
-    metadata: list[str] = []
-    data: list[list[str]] = []
-    with open(vcf_file, "r", encoding="utf-8") as fh:
-        for line in fh:
-            line = line.strip()
-            if not line:
-                continue
-            if line.startswith("#"):
-                metadata.append(line)
-            else:
-                fields = line.split("\t")
-                if len(fields) < 8:
-                    continue  # skip malformed records instead of crashing below
-                data.append(fields[:8])
-
-    vcf_df = pd.DataFrame(data, columns=COLUMNS)
-    return metadata, vcf_df
-```
-
-**Step 4: Run all tests**
-
-Run: `cd /Users/yperez/work/proteogenomics/pgatk && python -m pytest pgatk/tests/test_clinvar/ -v`
-Expected: ALL PASS
-
-**Step 5: Commit**
-
-```bash
-git add pgatk/clinvar/clinvar_service.py pgatk/tests/test_clinvar/test_clinvar_service.py
-git commit -m "fix(clinvar): add duplicate variant-transcript guard, clean up _read_vcf"
-```
-
----
-
-### Task 5: Comprehensive Integration Test Updates
-
-Verify the full pipeline with the new filters active. Ensure test data exercises
-all filter paths (CLNSIG, MC, biotype). Verify the single-pass refactor doesn't
-change output.
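-
-One cheap way to verify output stability across the Task 2 refactor is to hash
-the FASTA produced before and after it. A minimal sketch (the two paths are
-hypothetical; generate one output per checkout):
-
-```python
-import hashlib
-from pathlib import Path
-
-def fasta_digest(path: str) -> str:
-    """Return a stable digest of a FASTA file for before/after comparison."""
-    return hashlib.sha256(Path(path).read_bytes()).hexdigest()
-
-assert fasta_digest("output_before.fa") == fasta_digest("output_after.fa")
-```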
-
-**Files:**
-- Modify: `pgatk/tests/test_clinvar/test_clinvar_integration.py`
-- Modify: `pgatk/tests/test_clinvar/test_clinvar_service.py`
-
-**Step 1: Add integration tests**
-
-Add to `TestClinVarCLIIntegration` in `test_clinvar_integration.py`:
-
-```python
-def test_cli_filters_synonymous_variants(self, tmp_path, clinvar_testdata):
-    """Synonymous variants should be filtered by MC even if Pathogenic."""
-    output_file = tmp_path / "clinvar_output.fa"
-    runner = CliRunner()
-    result = runner.invoke(cli, [
-        "clinvar-to-proteindb",
-        "--vcf", str(clinvar_testdata / "mini_clinvar.vcf"),
-        "--gtf", str(clinvar_testdata / "mini_refseq.gtf"),
-        "--fasta", str(clinvar_testdata / "mini_refseq_protein.faa"),
-        "--assembly-report", str(clinvar_testdata / "mini_assembly_report.txt"),
-        "--output", str(output_file),
-    ])
-    assert result.exit_code == 0
-    content = output_file.read_text()
-    # rs00004 is Pathogenic but synonymous — should NOT appear
-    assert "rs00004" not in content
-    # rs00001 is Pathogenic missense — SHOULD appear
-    assert "rs00001" in content
-```
-
-Add a test that existing variants still produce correct output:
-
-```python
-def test_pipeline_output_contains_clnsig_in_header(self, tmp_path, clinvar_testdata):
-    """Output FASTA headers should contain the CLNSIG value."""
-    output_file = tmp_path / "clinvar_output.fa"
-    runner = CliRunner()
-    result = runner.invoke(cli, [
-        "clinvar-to-proteindb",
-        "--vcf", str(clinvar_testdata / "mini_clinvar.vcf"),
-        "--gtf", str(clinvar_testdata / "mini_refseq.gtf"),
-        "--fasta", str(clinvar_testdata / "mini_refseq_protein.faa"),
-        "--assembly-report", str(clinvar_testdata / "mini_assembly_report.txt"),
-        "--output", str(output_file),
-    ])
-    assert result.exit_code == 0
-    content = output_file.read_text()
-    assert "Pathogenic" in content
-```
-
-**Step 2: Run full test suite**
-
-Run: `cd /Users/yperez/work/proteogenomics/pgatk && python -m pytest pgatk/tests/ -v`
-Expected: ALL PASS (including existing Ensembl tests and VCF utils tests)
-
-**Step 3: Commit**
-
-```bash
-git add pgatk/tests/test_clinvar/
-git commit -m "test(clinvar): add integration tests for MC filtering and output validation"
-```
diff --git a/docs/use-cases.md b/docs/use-cases.md
index 5af6de5..bd3cf20 100644
--- a/docs/use-cases.md
+++ b/docs/use-cases.md
@@ -183,11 +183,11 @@ pgatk ensembl-downloader \
   -o ensembl_human \
   --skip_protein \
   --skip_ncrna \
-  --skip_cdna \
-  --skip_dna
+  --skip_cdna
 ```
 
-This downloads the gene annotation GTF and the VCF file with known variants.
+This downloads the gene annotation GTF, the VCF file with known variants, and
+the genome FASTA (needed by gffread to extract transcript sequences).
 
 ### Step 2 -- Generate transcript sequences
 
@@ -797,8 +797,7 @@ pgatk supports any species available in ENSEMBL.
 For example, rice
 ```bash
 pgatk ensembl-downloader \
   -t 39947 \
-  -o ensembl_rice \
-  --skip_dna
+  -o ensembl_rice
 ```
 ### Step 2 -- Generate transcript sequences and canonical proteome
@@ -860,7 +859,7 @@ For model organisms like mouse (*Mus musculus*, taxonomy 10090):
 
 ```bash
 # Download
-pgatk ensembl-downloader -t 10090 -o ensembl_mouse --skip_dna
+pgatk ensembl-downloader -t 10090 -o ensembl_mouse
 
 # Generate transcript sequences
 gffread -F -w ensembl_mouse/transcripts.fa \
diff --git a/pgatk/clinvar/clinvar_service.py b/pgatk/clinvar/clinvar_service.py
index b5760cb..ee9a2cc 100644
--- a/pgatk/clinvar/clinvar_service.py
+++ b/pgatk/clinvar/clinvar_service.py
@@ -122,9 +122,12 @@ def passes_clnsig_filter(clnsig: str, exclude_list: list[str]) -> bool:
             return False
         # Split compound values and check each component.
         parts = re.split(r"[/,]", clnsig)
-        return not all(p.strip().replace("_", " ").lower() in
+        non_empty = [p.strip() for p in parts if p.strip()]
+        if not non_empty:
+            return True  # delimiter-only or whitespace-only → treat as empty
+        return not all(p.replace("_", " ").lower() in
                        [e.replace("_", " ").lower() for e in exclude_list]
-                       for p in parts if p.strip())
+                       for p in non_empty)
 
     @staticmethod
     def passes_mc_filter(mc_field: str, include_list: list[str]) -> bool:
diff --git a/pgatk/tests/test_clinvar/test_clinvar_service.py b/pgatk/tests/test_clinvar/test_clinvar_service.py
index eb62ee0..5f5e84b 100644
--- a/pgatk/tests/test_clinvar/test_clinvar_service.py
+++ b/pgatk/tests/test_clinvar/test_clinvar_service.py
@@ -71,6 +71,13 @@ def test_compound_risk_factor_excluded(self):
         # '_risk_factor' is not in exclude list, so it's not an excluded component
         assert ClinVarService.passes_clnsig_filter("Benign,_risk_factor", exclude) is True
 
+    def test_delimiter_only_clnsig_passes(self):
+        """CLNSIG with only delimiters (e.g. '/') should pass like empty."""
+        exclude = ["Benign", "Likely_benign"]
+        assert ClinVarService.passes_clnsig_filter("/", exclude) is True
+        assert ClinVarService.passes_clnsig_filter(",", exclude) is True
+        assert ClinVarService.passes_clnsig_filter("/,/", exclude) is True
+
 
 # ---------------------------------------------------------------------------
 # TestMolecularConsequenceParser
diff --git a/pgatk/tests/test_vcf_utils.py b/pgatk/tests/test_vcf_utils.py
index 70b7250..92ca86c 100644
--- a/pgatk/tests/test_vcf_utils.py
+++ b/pgatk/tests/test_vcf_utils.py
@@ -110,6 +110,54 @@ def test_snp_minus_strand(self):
         assert str(coding_ref) == "ATGCATGCAT"
         assert str(coding_alt) == "ATGCATGCAG"
 
+    def test_multibase_ref_extending_past_exon_clips_alt(self):
+        """When REF extends past the exon boundary, ALT should also be clipped.
+
+        Exon: positions 10-14 (5 bases). Variant at pos 13 with REF=CAGT (4 bases,
+        extends to pos 16 -- 2 bases past exon end). Only 2 of 4 REF bases are
+        exonic. ALT=TT (2 bases) should be clipped by the same 2 intronic bases,
+        leaving 0 ALT bases (pure deletion of the 2 exonic REF bases).
+ """ + # positions: 10 11 12 13 14 + ref_seq = Seq("ATGCA") # 5 bases, exon is 10-14 + features_info = [[10, 14, 'exon']] + # REF=ACGT at pos 13: exonic portion is AC (pos 13-14), intronic is GT (pos 15-16) + # ALT=TT: clipped by 2 (intronic REF bases) → empty + coding_ref, coding_alt = get_altseq( + ref_seq, Seq("ACGT"), Seq("TT"), 13, '+', features_info + ) + # With ALT clipped to empty, result should be ref minus the 2 exonic bases + assert str(coding_ref) == "ATGCA" + assert str(coding_alt) == "ATG" # first 3 bases preserved, last 2 deleted + + def test_multibase_ref_extending_past_exon_partial_alt_clip(self): + """ALT longer than intronic portion retains some bases after clipping. + + Exon: positions 10-14. Variant at pos 13 with REF=ACG (extends 1 past exon). + Intronic REF = 1 base. ALT=TTT (3 bases) clipped by 1 → TT (2 bases). + """ + ref_seq = Seq("ATGCA") # exon 10-14 + features_info = [[10, 14, 'exon']] + # REF=ACG at pos 13: exonic = AC (2 bases), intronic = G (1 base) + # ALT=TTT clipped by 1 → TT + coding_ref, coding_alt = get_altseq( + ref_seq, Seq("ACG"), Seq("TTT"), 13, '+', features_info + ) + assert str(coding_ref) == "ATGCA" + # ref_seq[0:3] + "TT" + ref_seq[3+2:] = "ATG" + "TT" + "" = "ATGTT" + assert str(coding_alt) == "ATGTT" + + def test_ref_within_exon_no_clipping(self): + """When REF is entirely within the exon, no clipping occurs.""" + ref_seq = Seq("ATGCATGCAT") + features_info = [[10, 19, 'exon']] + # REF=GC at pos 12: entirely within exon 10-19 + coding_ref, coding_alt = get_altseq( + ref_seq, Seq("GC"), Seq("TT"), 12, '+', features_info + ) + assert str(coding_ref) == "ATGCATGCAT" + assert str(coding_alt) == "ATTTATGCAT" + # --------------------------------------------------------------------------- # get_orfs_vcf diff --git a/pgatk/toolbox/vcf_utils.py b/pgatk/toolbox/vcf_utils.py index d371e5d..61fe4db 100644 --- a/pgatk/toolbox/vcf_utils.py +++ b/pgatk/toolbox/vcf_utils.py @@ -116,11 +116,33 @@ def get_altseq( var_end_genomic = var_pos + len(ref_allele) - 1 exonic_ref_len = min(var_end_genomic, feature[1]) - var_pos + 1 c = max(exonic_ref_len, 0) + # Clip the ALT allele by the same number of trailing bases that + # were removed from REF. VCF variants are left-aligned, so the + # trailing bases correspond to the intronic portion. + intronic_ref_len = len(ref_allele) - c + if intronic_ref_len > 0: + exonic_alt_len = max(len(var_allele) - intronic_ref_len, 0) + var_allele = var_allele[:exonic_alt_len] + logger.debug( + "Variant at %d extends %d bases past exon boundary " + "(feature %d-%d); clipped REF to %d and ALT to %d bases", + var_pos, intronic_ref_len, feature[0], feature[1], + c, exonic_alt_len, + ) alt_seq = ref_seq[0:var_index_in_cds] + var_allele + ref_seq[var_index_in_cds + c::] if strand == '-': return ref_seq[::-1], alt_seq[::-1] else: return ref_seq, alt_seq + else: + # Log when variant starts outside this feature (intron-start) + var_end_genomic = var_pos + len(ref_allele) - 1 + if var_end_genomic >= feature[0] and var_pos < feature[0]: + logger.debug( + "Variant at %d-%d starts before feature %d-%d " + "(intron-start); skipping this feature", + var_pos, var_end_genomic, feature[0], feature[1], + ) nc_index += (feature[1] - feature[0] + 1)