From dd4ee5754407447c31f00def45c82da40c67ed91 Mon Sep 17 00:00:00 2001 From: Yasset Perez-Riverol Date: Tue, 3 Mar 2026 18:27:48 +0000 Subject: [PATCH 1/3] minor gubs fixing. --- pgatk/cgenomes/cgenomes_proteindb.py | 16 +++++-- pgatk/clinvar/clinvar_service.py | 4 +- pgatk/ensembl/ensembl.py | 50 ++++++++++++++------- pgatk/testdata/proteindb_from_gnomad_VCF.fa | 2 - pgatk/toolbox/vcf_utils.py | 8 +++- 5 files changed, 55 insertions(+), 25 deletions(-) diff --git a/pgatk/cgenomes/cgenomes_proteindb.py b/pgatk/cgenomes/cgenomes_proteindb.py index b0a97df2..f42d7b8d 100644 --- a/pgatk/cgenomes/cgenomes_proteindb.py +++ b/pgatk/cgenomes/cgenomes_proteindb.py @@ -181,8 +181,12 @@ def get_mut_pro_seq(snp: SNP, seq: Seq) -> Optional[str]: protein_seq = str(seq.translate(to_stop=False)) if "Missense" in snp.mutation_type: - mut_aa = snp.aa_mut[-1] - if not mut_aa.isalpha(): + # Extract the mutant residue from HGVS like p.V600E (1-letter) + # or p.Val600Glu (3-letter). For 1-letter the last char is the + # AA; for 3-letter we need to convert the last triplet. 
+ aa_suffix = re.sub(r'^p\..*\d+', '', snp.aa_mut) + mut_aa = _three_to_one(aa_suffix) if aa_suffix else '' + if not mut_aa or len(mut_aa) != 1 or not mut_aa.isalpha(): return '' index = int(positions[0]) - 1 mut_pro_seq = protein_seq[:index] + mut_aa + protein_seq[index + 1:] @@ -194,7 +198,8 @@ def get_mut_pro_seq(snp: SNP, seq: Seq) -> Optional[str]: index = snp.aa_mut.index("ins") except ValueError: return '' - insert_aa = snp.aa_mut[index + 3:] + insert_aa_raw = snp.aa_mut[index + 3:] + insert_aa = _three_to_one(insert_aa_raw) if insert_aa.isalpha(): ins_index1 = int(positions[0]) mut_pro_seq = protein_seq[:ins_index1] + insert_aa + protein_seq[ins_index1:] @@ -211,7 +216,10 @@ def get_mut_pro_seq(snp: SNP, seq: Seq) -> Optional[str]: index = snp.aa_mut.index(">") except ValueError: return '' - mut_aa = snp.aa_mut[index + 1:] + mut_aa_raw = snp.aa_mut[index + 1:] + mut_aa = _three_to_one(mut_aa_raw.replace('*', '')) + if '*' in mut_aa_raw: + mut_aa += '*' if not mut_aa.replace('*','').isalpha(): return '' if "deletion" in snp.mutation_type: diff --git a/pgatk/clinvar/clinvar_service.py b/pgatk/clinvar/clinvar_service.py index 26004777..b5760cbd 100644 --- a/pgatk/clinvar/clinvar_service.py +++ b/pgatk/clinvar/clinvar_service.py @@ -493,9 +493,9 @@ def run(self) -> str: if "CDS=" in desc: try: cds_str = [ - p + p.strip("[]") for p in desc.split() - if p.startswith("CDS=") + if p.strip("[]").startswith("CDS=") ][0] cds_info = [ int(x) diff --git a/pgatk/ensembl/ensembl.py b/pgatk/ensembl/ensembl.py index a9a6d7ef..e1d8b7ee 100644 --- a/pgatk/ensembl/ensembl.py +++ b/pgatk/ensembl/ensembl.py @@ -328,7 +328,9 @@ def annoate_vcf(vcf_file: str, gtf_file: str, BedTool(gtf_file).intersect(BedTool(vcf_file), wo=True).saveas(f"{vcf_stem}_all.bed") - muts_dict = {} + muts_dict: dict[str, list[str]] = {} + # VCF columns start after the GTF columns (gene_info_index + 1) + vcf_start_col = gene_info_index + 1 with open(f"{vcf_stem}_all.bed", 'r', encoding='utf-8') as an: for 
line in an.readlines(): sl = line.strip().split('\t') @@ -345,11 +347,17 @@ def annoate_vcf(vcf_file: str, gtf_file: str, if transcript_id == 'NO_OVERLAP': continue - # write the mutation line as a key and set the overlapping transcriptID as its value(s) + # Use canonical positional key (CHROM:POS:REF:ALT) for reliable + # matching between BED output and raw VCF lines. try: - muts_dict['\t'.join(sl[gene_info_index + 1:-1])].append(transcript_id) - except KeyError: - muts_dict['\t'.join(sl[gene_info_index + 1:-1])] = [transcript_id] + vcf_chrom = sl[vcf_start_col] + vcf_pos = sl[vcf_start_col + 1] + vcf_ref = sl[vcf_start_col + 3] + vcf_alt = sl[vcf_start_col + 4] + key = f"{vcf_chrom}\t{vcf_pos}\t{vcf_ref}\t{vcf_alt}" + except IndexError: + continue + muts_dict.setdefault(key, []).append(transcript_id) with open(f"{vcf_stem}_annotated.vcf", 'w', encoding='utf-8') as ann, open(vcf_file, 'r', encoding='utf-8') as v: # write vcf headers to the output file @@ -358,13 +366,17 @@ def annoate_vcf(vcf_file: str, gtf_file: str, ann.write(line) else: # write the mutations and their overlapping transcript to output file - try: - sl = line.strip().split('\t') + sl = line.strip().split('\t') + if len(sl) < 8: + ann.write(line) + continue + key = f"{sl[0]}\t{sl[1]}\t{sl[3]}\t{sl[4]}" + if key in muts_dict: sl[vcf_info_field_index] = '{};{}={}'.format( sl[vcf_info_field_index].strip(), annotation_str, - ','.join(set(muts_dict[line.strip()]))) + ','.join(set(muts_dict[key]))) ann.write('\t'.join(sl) + '\n') - except KeyError: + else: ann.write(line) return f"{vcf_stem}_annotated.vcf" @@ -503,7 +515,7 @@ def vcf_to_proteindb(self, vcf_file: str, input_fasta: str, gene_annotations_gtf if self._af_field: # get AF from the INFO field try: - af = float([x.split('=')[1] for x in record.INFO.split(';') if x.startswith(self._af_field)][0]) + af = float([x.split('=', 1)[1] for x in record.INFO.split(';') if x.split('=', 1)[0] == self._af_field][0]) except (ValueError, IndexError): 
invalid_records['# variants with invalid record'] += 1 continue @@ -521,8 +533,8 @@ def vcf_to_proteindb(self, vcf_file: str, input_fasta: str, gene_annotations_gtf transcript_records = [] try: transcript_records = \ - [x.split('=')[1] for x in record.INFO.split(';') if x.startswith(self._annotation_field_name)][ - 0] + [x.split('=', 1)[1] for x in record.INFO.split(';') + if x.split('=', 1)[0] == self._annotation_field_name][0] except IndexError: # no overlapping feature was found invalid_records['# variants with invalid record'] += 1 msg = "skipped record {}, no annotation feature was found".format(record) @@ -586,11 +598,17 @@ def vcf_to_proteindb(self, vcf_file: str, input_fasta: str, gene_annotations_gtf num_orfs = 3 if 'CDS=' in desc: try: - cds_info = [int(x) for x in desc.split(' ')[1].split('=')[1].split('-')] - feature_types = ['CDS', 'stop_codon'] - num_orfs = 1 + cds_token = next( + (t.strip('[]') for t in desc.split(' ') + if t.strip('[]').startswith('CDS=')), + None, + ) + if cds_token: + cds_info = [int(x) for x in cds_token.split('=')[1].split('-')] + feature_types = ['CDS', 'stop_codon'] + num_orfs = 1 except (ValueError, IndexError): - msg = "Could not extra cds position from fasta header for: {}".format(desc) + msg = "Could not extract cds position from fasta header for: {}".format(desc) self.get_logger().debug(msg) chrom, strand, features_info = self.get_features(db, diff --git a/pgatk/testdata/proteindb_from_gnomad_VCF.fa b/pgatk/testdata/proteindb_from_gnomad_VCF.fa index 3f008d7e..93974340 100644 --- a/pgatk/testdata/proteindb_from_gnomad_VCF.fa +++ b/pgatk/testdata/proteindb_from_gnomad_VCF.fa @@ -1,5 +1,3 @@ ->gnomvar_rs760117505_1.69337.A.C_ENST00000335137.3 
-MVTEFIFLGLSDSQELQTFLFMLFFVFYGGIVFGNLLIVITVVSDSHLHSPMYFLLANLSLIDLSLSSVTAPKMITDFFSQRQVISFKGCLVQIFLLHFFGGSEMVILIAMGFDRYIAICKPLHYTTIMCGNACVGIMAVTWGIGFLHSVSQLAFAVHLLFCGPNEVDSFYCDLPRVIKLACTDTYRLDIMVIANSGVLTVCSFVLLIISYTIILMTIQHRPLDKSSKALSTLTAHITVVLLFFGPCVFIYAWPFPIKSLDKFLAVFYSVITPLLNPIIYTLRNKDMKTAIRQLRKWDAHSSVKF* >gnomvar_rs2691305_1.69511.A.G_ENST00000335137.3 MVTEFIFLGLSDSQELQTFLFMLFFVFYGGIVFGNLLIVITVVSDSHLHSPMYFLLANLSLIDLSLSSVTAPKMITDFFSQRKVISFKGCLVQIFLLHFFGGSEMVILIAMGFDRYIAICKPLHYTTIMCGNACVGIMAVAWGIGFLHSVSQLAFAVHLLFCGPNEVDSFYCDLPRVIKLACTDTYRLDIMVIANSGVLTVCSFVLLIISYTIILMTIQHRPLDKSSKALSTLTAHITVVLLFFGPCVFIYAWPFPIKSLDKFLAVFYSVITPLLNPIIYTLRNKDMKTAIRQLRKWDAHSSVKF* >gnomvar_rs200505207_1.69761.A.T_ENST00000335137.3 diff --git a/pgatk/toolbox/vcf_utils.py b/pgatk/toolbox/vcf_utils.py index 248cc081..d371e5df 100644 --- a/pgatk/toolbox/vcf_utils.py +++ b/pgatk/toolbox/vcf_utils.py @@ -109,7 +109,13 @@ def get_altseq( for feature in features_info: if var_pos in range(feature[0], feature[1] + 1): var_index_in_cds = nc_index + (var_pos - feature[0]) - c = len(ref_allele) + # Clip the ref allele length to the portion that falls within + # this feature. When a multi-base variant extends from an exon + # into an intron, the intronic bases are absent from the + # transcript and must not be counted. 
+ var_end_genomic = var_pos + len(ref_allele) - 1 + exonic_ref_len = min(var_end_genomic, feature[1]) - var_pos + 1 + c = max(exonic_ref_len, 0) alt_seq = ref_seq[0:var_index_in_cds] + var_allele + ref_seq[var_index_in_cds + c::] if strand == '-': return ref_seq[::-1], alt_seq[::-1] From 6dcf9dcffd46c8775d05e52a909d5ac163700b09 Mon Sep 17 00:00:00 2001 From: Yasset Perez-Riverol Date: Tue, 3 Mar 2026 18:46:24 +0000 Subject: [PATCH 2/3] minor changes in help --- docs/index.md | 1 + docs/pgatk-cli.md | 163 ++++++++++++ docs/use-cases.md | 619 ++++++++++++++++++++++++++++++++++++++++++++++ mkdocs.yml | 1 + 4 files changed, 784 insertions(+) create mode 100644 docs/use-cases.md diff --git a/docs/index.md b/docs/index.md index d7104538..f6e58a2e 100644 --- a/docs/index.md +++ b/docs/index.md @@ -23,6 +23,7 @@ See the [Installation](installation.md) page for more options (Bioconda, Docker, | [Introduction](introduction.md) | Overview of the proteogenomics field | | [Installation](installation.md) | How to install pgatk (pip, Bioconda, Docker, source) | | [pgatk CLI](pgatk-cli.md) | Full command-line reference for all tools | +| [Use Cases](use-cases.md) | End-to-end workflows and recipes for common scenarios | | [File Formats](formats.md) | BED, GTF, GCT format specifications | | [Changelog](changelog.md) | Version history and release notes | | [Support](support.md) | Contributors and how to get help | diff --git a/docs/pgatk-cli.md b/docs/pgatk-cli.md index 2bda0be0..f36971ac 100644 --- a/docs/pgatk-cli.md +++ b/docs/pgatk-cli.md @@ -15,12 +15,16 @@ Options: Commands: cbioportal-downloader Command to download the the cbioportal studies cbioportal-to-proteindb Command to translate cbioportal mutation data into proteindb + clinvar-to-proteindb Generate protein database from ClinVar VCF + RefSeq GTF cosmic-downloader Command to download the cosmic mutation database cosmic-to-proteindb Command to translate Cosmic mutation data into proteindb + digest-mutant-protein Digest 
mutant proteins and filter against canonical proteome dnaseq-to-proteindb Generate peptides based on DNA sequences ensembl-check Command to check ensembl database for stop codons, gaps ensembl-downloader Command to download the ensembl information generate-decoy Create decoy protein sequences using multiple methods + map-peptide2genome Map peptides to genomic coordinates (GFF3 output) + ncbi-downloader Download NCBI RefSeq and ClinVar reference files threeframe-translation Command to perform 3'frame translation vcf-to-proteindb Generate peptides based on DNA variants VCF files ``` @@ -171,6 +175,44 @@ git lfs pull -I public --include "data_clinical_sample.txt" git lfs pull -I public --include "data_mutations_mskcc.txt" ``` +### Downloading NCBI / ClinVar Data + +Downloading NCBI RefSeq annotations and ClinVar variants for human (GRCh38) is performed using the command `ncbi-downloader`. The tool downloads four files: + +- RefSeq gene annotations (GTF) +- RefSeq transcript nucleotide sequences (FASTA) +- Assembly report (chromosome name mapping) +- ClinVar variant calls (VCF) + +#### Command Options + +```bash +$ pgatk ncbi-downloader -h +Usage: pgatk ncbi-downloader [OPTIONS] + + Required parameters: + -o, --output-dir TEXT Output directory for downloaded files + + Optional parameters: + -c, --config_file TEXT Configuration YAML file + --force Re-download files even if they exist + -h, --help Show this message and exit. +``` + +#### Examples + +- Download all NCBI/ClinVar reference files: + + ```bash + pgatk ncbi-downloader -o ncbi_data + ``` + +- Force re-download of all files: + + ```bash + pgatk ncbi-downloader -o ncbi_data --force + ``` + ## Generate Protein Databases The **pgatk** framework provides a set of tools to generate protein databases in `FASTA` format from DNA sequences, variants, and mutations. Multiple commands are available depending on the data type provided by the user and the public data providers (cBioPortal, COSMIC and ENSEMBL). 
@@ -355,6 +397,43 @@ The output of the tool is a protein fasta file written to the path specified by --output_proteindb var_peptides.fa ``` +### ClinVar Variants to Protein Sequences + +[ClinVar](https://www.ncbi.nlm.nih.gov/clinvar/) is NCBI's public archive of reports on the relationships among human variations and phenotypes. The command `clinvar-to-proteindb` generates a variant protein database from ClinVar VCF files using NCBI RefSeq gene models. Unlike `vcf-to-proteindb`, this command does **not** require VEP annotations. + +#### Command Options + +```bash +$ pgatk clinvar-to-proteindb -h +Usage: pgatk clinvar-to-proteindb [OPTIONS] + + Required parameters: + -v, --vcf TEXT ClinVar VCF file path + -g, --gtf TEXT NCBI RefSeq GTF file path + -f, --fasta TEXT RefSeq transcript nucleotide FASTA file path + -a, --assembly-report TEXT NCBI assembly report file path + -o, --output TEXT Output protein FASTA file path + + Optional parameters: + -c, --config_file TEXT Configuration YAML file + -h, --help Show this message and exit. +``` + +The input files can be downloaded using the [ncbi-downloader](#downloading-ncbi--clinvar-data) command. + +#### Examples + +- Generate a protein database from ClinVar variants: + + ```bash + pgatk clinvar-to-proteindb \ + --vcf ncbi_data/clinvar.vcf.gz \ + --gtf ncbi_data/GRCh38_latest_genomic.gtf.gz \ + --fasta ncbi_data/GRCh38_latest_rna.fna.gz \ + --assembly-report ncbi_data/GRCh38_latest_assembly_report.txt \ + --output clinvar_proteins.fa + ``` + ### Transcripts (DNA) to Protein Sequences DNA sequences given in a FASTA format can be translated using the `dnaseq-to-proteindb` tool. This tool allows for translation of all kinds of transcripts (coding and noncoding) by specifying the desired biotypes. 
@@ -489,3 +568,87 @@ Usage: pgatk generate-decoy [OPTIONS] ```bash pgatk generate-decoy -c config/protein_decoy.yaml --input proteindb.fa --output decoy_proteindb.fa ``` + +## Post-Processing Utilities + +### Digest and Filter Variant Peptides + +The `digest-mutant-protein` command performs *in silico* tryptic digestion of mutant proteins and filters out any peptides that also appear in a canonical reference proteome. The result is a compact FASTA of variant-specific peptides only. + +#### Command Options + +```bash +$ pgatk digest-mutant-protein -h +Usage: pgatk digest-mutant-protein [OPTIONS] + + Required parameters: + -i, --input TEXT Input mutant protein FASTA file(s), comma-separated + -f, --fasta TEXT Reference canonical protein FASTA + -o, --output TEXT Output file for unique variant peptides + + Optional parameters: + --prefix TEXT Header prefix for output entries (default: Mutation) + --min-len INTEGER Minimum peptide length (default: 7) + --max-len INTEGER Maximum peptide length (default: 40) + --missed-cleavages INTEGER Number of missed cleavages (default: 0) + -h, --help Show this message and exit. +``` + +#### Examples + +- Digest variant proteins and keep only variant-specific peptides: + + ```bash + pgatk digest-mutant-protein \ + --input variant_proteins.fa \ + --fasta canonical_proteins.fa \ + --output unique_variant_peptides.fa \ + --min-len 7 \ + --max-len 40 \ + --missed-cleavages 2 + ``` + +- Combine and filter multiple variant sources: + + ```bash + pgatk digest-mutant-protein \ + --input cosmic_proteins.fa,clinvar_proteins.fa,ensembl_variants.fa \ + --fasta canonical_proteins.fa \ + --output combined_variant_peptides.fa + ``` + +### Map Peptides to Genomic Coordinates + +The `map-peptide2genome` command maps proteomics-identified peptides to genomic coordinates and produces a GFF3 file suitable for visualization in genome browsers (UCSC, IGV, Ensembl). 
+ +#### Command Options + +```bash +$ pgatk map-peptide2genome -h +Usage: pgatk map-peptide2genome [OPTIONS] + + Required parameters: + -i, --input TEXT Input peptide identification TSV + -g, --gtf TEXT GTF gene annotation file + -f, --fasta TEXT Protein FASTA file + -m, --idmap TEXT Protein-to-transcript ID mapping file + -o, --output TEXT Output GFF3 file + + Optional parameters: + --pep-col INTEGER Peptide column index, 0-based (default: 0) + --prot-col INTEGER Protein column index, 0-based (default: 1) + -h, --help Show this message and exit. +``` + +#### Examples + +- Map identified peptides to genomic coordinates: + + ```bash + pgatk map-peptide2genome \ + --input peptide_identifications.tsv \ + --gtf genes.gtf \ + --fasta proteins.fa \ + --idmap protein_to_transcript.tsv \ + --output peptides.gff3 + ``` diff --git a/docs/use-cases.md b/docs/use-cases.md new file mode 100644 index 00000000..2622a975 --- /dev/null +++ b/docs/use-cases.md @@ -0,0 +1,619 @@ +# Use Cases and Recipes + +This page provides end-to-end workflows for common proteogenomics scenarios. +Each recipe shows the exact commands to run, from downloading data to producing +a search-ready protein database. + +--- + +## 1. Human Variant Protein Database from ENSEMBL + +Build a variant protein database using ENSEMBL population variants (common SNPs +and indels) for human proteogenomics searches. + +### Step 1 -- Download ENSEMBL data + +Download the GTF, CDS, and VCF files for *Homo sapiens* (taxonomy 9606): + +```bash +pgatk ensembl-downloader \ + -t 9606 \ + -o ensembl_human \ + --skip_protein \ + --skip_ncrna \ + --skip_cdna \ + --skip_dna +``` + +This downloads the gene annotation GTF and the VCF file with known variants. 
+ +### Step 2 -- Generate transcript sequences + +Use [gffread](http://ccb.jhu.edu/software/stringtie/gff.shtml#gffread) to +extract transcript sequences from the GTF and genome FASTA: + +```bash +gffread -F -w ensembl_human/transcripts.fa \ + -g ensembl_human/genome.fa \ + ensembl_human/Homo_sapiens.GRCh38.*.gtf.gz +``` + +### Step 3 -- Generate the variant protein database + +Translate all ENSEMBL variants that affect protein-coding transcripts: + +```bash +pgatk vcf-to-proteindb \ + --vcf ensembl_human/homo_sapiens_incl_consequences.vcf.gz \ + --input_fasta ensembl_human/transcripts.fa \ + --gene_annotations_gtf ensembl_human/Homo_sapiens.GRCh38.*.gtf.gz \ + --output_proteindb ensembl_human/variant_proteins.fa +``` + +### Step 4 -- Generate the canonical protein database + +Translate canonical protein-coding transcripts: + +```bash +pgatk dnaseq-to-proteindb \ + --input_fasta ensembl_human/transcripts.fa \ + --output_proteindb ensembl_human/canonical_proteins.fa +``` + +### Step 5 -- Combine and add decoy sequences + +Merge canonical and variant databases, then generate decoys: + +```bash +cat ensembl_human/canonical_proteins.fa \ + ensembl_human/variant_proteins.fa \ + > ensembl_human/target.fa + +pgatk generate-decoy \ + --input ensembl_human/target.fa \ + --output ensembl_human/target_decoy.fa \ + --method decoypyrat \ + --decoy_prefix DECOY_ +``` + +The file `target_decoy.fa` is ready for database searching. + +--- + +## 2. High-Frequency Variant Database (AF Filtering) + +Build a protein database that only includes common population variants above a +given allele frequency threshold. + +```bash +pgatk vcf-to-proteindb \ + --vcf homo_sapiens_incl_consequences.vcf.gz \ + --input_fasta transcripts.fa \ + --gene_annotations_gtf genes.gtf \ + --af_field MAF \ + --af_threshold 0.01 \ + --output_proteindb common_variant_proteins.fa +``` + +To restrict to specific consequence types (e.g. 
only missense variants): + +```bash +pgatk vcf-to-proteindb \ + --vcf homo_sapiens_incl_consequences.vcf.gz \ + --input_fasta transcripts.fa \ + --gene_annotations_gtf genes.gtf \ + --af_field MAF \ + --af_threshold 0.05 \ + --include_consequences missense_variant \ + --output_proteindb missense_common_proteins.fa +``` + +--- + +## 3. ClinVar Clinical Variant Database + +Build a protein database from ClinVar pathogenic and likely pathogenic variants +using the NCBI RefSeq gene models. + +### Step 1 -- Download NCBI / ClinVar files + +```bash +pgatk ncbi-downloader -o ncbi_data +``` + +This downloads four files to `ncbi_data/`: + +- `GRCh38_latest_genomic.gtf.gz` -- RefSeq gene annotations +- `GRCh38_latest_rna.fna.gz` -- RefSeq transcript nucleotide sequences +- `GRCh38_latest_assembly_report.txt` -- Chromosome name mapping +- `clinvar.vcf.gz` -- ClinVar variant calls + +### Step 2 -- Generate the ClinVar protein database + +```bash +pgatk clinvar-to-proteindb \ + --vcf ncbi_data/clinvar.vcf.gz \ + --gtf ncbi_data/GRCh38_latest_genomic.gtf.gz \ + --fasta ncbi_data/GRCh38_latest_rna.fna.gz \ + --assembly-report ncbi_data/GRCh38_latest_assembly_report.txt \ + --output clinvar_proteins.fa +``` + +### Step 3 -- Add decoy sequences + +```bash +pgatk generate-decoy \ + --input clinvar_proteins.fa \ + --output clinvar_target_decoy.fa \ + --method decoypyrat +``` + +!!! tip + The ClinVar pipeline does **not** require VEP annotations. It uses + BedTools interval overlap to find transcripts affected by each variant, + then applies the variant to the transcript sequence and translates it. + +--- + +## 4. Cancer-Specific Database from COSMIC + +Generate tissue-specific protein databases from COSMIC somatic mutations. + +### Step 1 -- Download COSMIC data + +```bash +pgatk cosmic-downloader \ + -u your_email@example.com \ + -p your_password \ + -o cosmic_data +``` + +!!! note + A COSMIC account is required. 
Register at + [https://cancer.sanger.ac.uk/cosmic/register](https://cancer.sanger.ac.uk/cosmic/register). + +### Step 2a -- Single database from all mutations + +```bash +pgatk cosmic-to-proteindb \ + --input_mutation cosmic_data/CosmicMutantExport.tsv.gz \ + --input_genes cosmic_data/All_COSMIC_Genes.fasta.gz \ + --output_db cosmic_all_proteins.fa +``` + +### Step 2b -- One database per cancer type + +Split mutations by primary tissue site, creating one FASTA per cancer type: + +```bash +pgatk cosmic-to-proteindb \ + --input_mutation cosmic_data/CosmicMutantExport.tsv.gz \ + --input_genes cosmic_data/All_COSMIC_Genes.fasta.gz \ + --output_db cosmic_proteins.fa \ + --split_by_filter_column +``` + +This produces files named `cosmic_proteins_.fa` for each tissue type +(e.g. `cosmic_proteins_lung.fa`, `cosmic_proteins_breast.fa`). + +### Step 2c -- Database for a specific cancer type + +Limit to a single tissue type: + +```bash +pgatk cosmic-to-proteindb \ + --input_mutation cosmic_data/CosmicMutantExport.tsv.gz \ + --input_genes cosmic_data/All_COSMIC_Genes.fasta.gz \ + --output_db cosmic_lung_proteins.fa \ + --accepted_values "lung" +``` + +### Step 2d -- Cell-line specific databases + +Use cell-line data split by sample name: + +```bash +pgatk cosmic-to-proteindb \ + --input_mutation cosmic_data/CosmicCLP_MutantExport.tsv.gz \ + --input_genes cosmic_data/All_CellLines_Genes.fasta.gz \ + --output_db cosmic_cellline_proteins.fa \ + --filter_column "Sample name" \ + --split_by_filter_column +``` + +--- + +## 5. Cancer Study from cBioPortal + +Generate a protein database from cancer mutation data hosted in cBioPortal. 
+ +### Step 1 -- List available studies + +```bash +pgatk cbioportal-downloader --list_studies +``` + +### Step 2 -- Download a specific study + +```bash +pgatk cbioportal-downloader \ + -d blca_mskcc_solit_2014 \ + -o cbioportal_data +``` + +### Step 3 -- Download ENSEMBL CDS (hg19 assembly) + +cBioPortal mutations are aligned to hg19 (GRCh37): + +```bash +pgatk ensembl-downloader \ + -t 9606 \ + --grch37 \ + -o ensembl_hg19 \ + --skip_vcf \ + --skip_gtf \ + --skip_protein \ + --skip_ncrna \ + --skip_cdna \ + --skip_dna +``` + +### Step 4 -- Translate mutations to protein sequences + +```bash +pgatk cbioportal-to-proteindb \ + --input_mutation cbioportal_data/data_mutations_mskcc.txt \ + --input_cds ensembl_hg19/Homo_sapiens.GRCh37.cds.all.fa.gz \ + --output_db bladder_cancer_proteins.fa +``` + +To generate tissue-type-specific databases using the clinical sample file: + +```bash +pgatk cbioportal-to-proteindb \ + --input_mutation cbioportal_data/data_mutations_mskcc.txt \ + --input_cds ensembl_hg19/Homo_sapiens.GRCh37.cds.all.fa.gz \ + --clinical_sample_file cbioportal_data/data_clinical_sample.txt \ + --output_db cbioportal_proteins.fa \ + --split_by_filter_column +``` + +--- + +## 6. Non-Coding RNA Protein Database + +Translate non-coding RNA transcripts to search for novel peptides from +supposedly non-coding regions. 
+ +### lincRNA database + +```bash +pgatk dnaseq-to-proteindb \ + --input_fasta transcripts.fa \ + --output_proteindb lincRNA_proteins.fa \ + --var_prefix lincRNA_ \ + --include_biotypes lincRNA \ + --num_orfs 3 \ + --skip_including_all_cds +``` + +### Pseudogene database + +```bash +pgatk dnaseq-to-proteindb \ + --input_fasta transcripts.fa \ + --output_proteindb pseudogene_proteins.fa \ + --var_prefix pseudogene_ \ + --include_biotypes processed_pseudogene,transcribed_processed_pseudogene,translated_processed_pseudogene \ + --num_orfs 3 \ + --skip_including_all_cds +``` + +### Alternative ORFs from protein-coding transcripts + +Translate non-canonical reading frames of protein-coding genes: + +```bash +pgatk dnaseq-to-proteindb \ + --input_fasta transcripts.fa \ + --output_proteindb altorf_proteins.fa \ + --var_prefix altorf_ \ + --include_biotypes altORFs \ + --skip_including_all_cds +``` + +--- + +## 7. Six-Frame Genome Translation + +Perform a full six-frame translation of a genome assembly. This is useful as +an unbiased search space but produces very large databases. + +```bash +pgatk dnaseq-to-proteindb \ + --input_fasta genome.fa \ + --output_proteindb genome_6frame.fa \ + --biotype_str '' \ + --num_orfs 3 \ + --num_orfs_complement 3 +``` + +!!! warning + Six-frame translation of a full mammalian genome produces a very large + database. Consider translating individual chromosomes or using it in + combination with a smaller targeted database search. + +--- + +## 8. gnomAD Population Variant Database + +Build a protein database from gnomAD variants using GENCODE annotations. 
+ +```bash +pgatk vcf-to-proteindb \ + --vcf gnomad_genome.vcf.gz \ + --input_fasta gencode_transcripts.fa \ + --gene_annotations_gtf gencode.v38.annotation.gtf.gz \ + --annotation_field_name vep \ + --af_field controls_AF \ + --af_threshold 0.01 \ + --include_consequences missense_variant,inframe_insertion,inframe_deletion,frameshift_variant \ + --biotype_str transcript_type \ + --output_proteindb gnomad_proteins.fa +``` + +!!! tip "gnomAD-specific parameters" + - `--annotation_field_name vep` -- gnomAD uses `vep` instead of `CSQ` + - `--af_field controls_AF` -- Use control allele frequency + - `--biotype_str transcript_type` -- GENCODE uses `transcript_type` instead + of ENSEMBL's `transcript_biotype` + +--- + +## 9. Sample-Specific Database from WGS/WES + +Translate variants from a whole-genome or whole-exome sequencing VCF of an +individual sample. These VCFs typically lack VEP annotations. + +```bash +pgatk vcf-to-proteindb \ + --vcf sample_variants.vcf \ + --input_fasta transcripts.fa \ + --gene_annotations_gtf genes.gtf \ + --annotation_field_name '' \ + --ignore_filters \ + --output_proteindb sample_proteins.fa +``` + +- `--annotation_field_name ''` tells the tool to skip parsing VEP/CSQ + annotations (not present in raw variant callers like GATK HaplotypeCaller) +- `--ignore_filters` includes all variants regardless of the FILTER column + +To include only variants that passed quality filtering: + +```bash +pgatk vcf-to-proteindb \ + --vcf sample_variants.vcf \ + --input_fasta transcripts.fa \ + --gene_annotations_gtf genes.gtf \ + --annotation_field_name '' \ + --accepted_filters PASS \ + --output_proteindb sample_proteins_pass.fa +``` + +--- + +## 10. Digest and Filter Variant Peptides + +After generating a variant protein database you may want to digest the proteins +*in silico* and keep only peptides that differ from the canonical proteome. +This produces a compact FASTA of variant-specific peptides. 
+ +```bash +pgatk digest-mutant-protein \ + --input variant_proteins.fa \ + --fasta canonical_proteins.fa \ + --output unique_variant_peptides.fa \ + --min-len 7 \ + --max-len 40 \ + --missed-cleavages 2 +``` + +To combine multiple variant sources (e.g. COSMIC + ClinVar): + +```bash +pgatk digest-mutant-protein \ + --input cosmic_proteins.fa,clinvar_proteins.fa \ + --fasta canonical_proteins.fa \ + --output combined_variant_peptides.fa +``` + +--- + +## 11. Map Identified Peptides to Genomic Coordinates + +After a proteomics search, map the identified peptides back to genomic +coordinates for visualization in genome browsers (e.g. UCSC, IGV). + +```bash +pgatk map-peptide2genome \ + --input peptide_identifications.tsv \ + --gtf genes.gtf \ + --fasta proteins.fa \ + --idmap protein_to_transcript.tsv \ + --output peptides.gff3 +``` + +The input TSV should contain at least peptide sequence and protein accession +columns (configurable with `--pep-col` and `--prot-col`). + +--- + +## 12. Complete Proteogenomics Workflow + +A full end-to-end workflow combining canonical proteins, population variants, +clinical variants, cancer mutations, and non-coding translations into a single +search-ready database. 
+ +### Step 1 -- Download all data sources + +```bash +# ENSEMBL (canonical + population variants) +pgatk ensembl-downloader -t 9606 -o ensembl_data + +# NCBI / ClinVar +pgatk ncbi-downloader -o ncbi_data + +# COSMIC (requires account) +pgatk cosmic-downloader -u user@example.com -p password -o cosmic_data +``` + +### Step 2 -- Prepare transcript sequences + +```bash +gffread -F -w ensembl_data/transcripts.fa \ + -g ensembl_data/genome.fa \ + ensembl_data/Homo_sapiens.GRCh38.*.gtf.gz +``` + +### Step 3 -- Generate all protein databases + +```bash +# Canonical proteins +pgatk dnaseq-to-proteindb \ + --input_fasta ensembl_data/transcripts.fa \ + --output_proteindb canonical.fa + +# ENSEMBL population variants +pgatk vcf-to-proteindb \ + --vcf ensembl_data/homo_sapiens_incl_consequences.vcf.gz \ + --input_fasta ensembl_data/transcripts.fa \ + --gene_annotations_gtf ensembl_data/Homo_sapiens.GRCh38.*.gtf.gz \ + --output_proteindb ensembl_variants.fa + +# ClinVar clinical variants +pgatk clinvar-to-proteindb \ + --vcf ncbi_data/clinvar.vcf.gz \ + --gtf ncbi_data/GRCh38_latest_genomic.gtf.gz \ + --fasta ncbi_data/GRCh38_latest_rna.fna.gz \ + --assembly-report ncbi_data/GRCh38_latest_assembly_report.txt \ + --output clinvar_variants.fa + +# COSMIC somatic mutations +pgatk cosmic-to-proteindb \ + --input_mutation cosmic_data/CosmicMutantExport.tsv.gz \ + --input_genes cosmic_data/All_COSMIC_Genes.fasta.gz \ + --output_db cosmic_variants.fa + +# Non-coding RNA (lincRNA) +pgatk dnaseq-to-proteindb \ + --input_fasta ensembl_data/transcripts.fa \ + --output_proteindb lincRNA.fa \ + --var_prefix lincRNA_ \ + --include_biotypes lincRNA \ + --num_orfs 3 \ + --skip_including_all_cds +``` + +### Step 4 -- Combine all databases + +```bash +cat canonical.fa \ + ensembl_variants.fa \ + clinvar_variants.fa \ + cosmic_variants.fa \ + lincRNA.fa \ + > combined_target.fa +``` + +### Step 5 -- Add decoy sequences + +```bash +pgatk generate-decoy \ + --input combined_target.fa \ + --output 
proteogenomics_target_decoy.fa \ + --method decoypyrat \ + --decoy_prefix DECOY_ +``` + +### Step 6 (optional) -- Extract unique variant peptides + +```bash +pgatk digest-mutant-protein \ + --input ensembl_variants.fa,clinvar_variants.fa,cosmic_variants.fa \ + --fasta canonical.fa \ + --output unique_variant_peptides.fa \ + --min-len 7 \ + --max-len 40 \ + --missed-cleavages 2 +``` + +The file `proteogenomics_target_decoy.fa` is a comprehensive search database, +and `unique_variant_peptides.fa` provides a focused list of variant-specific +peptides for validation. + +--- + +## 13. Non-Human Species + +pgatk supports any species available in ENSEMBL. Here is an example for mouse +(*Mus musculus*, taxonomy 10090): + +```bash +# Download +pgatk ensembl-downloader -t 10090 -o ensembl_mouse --skip_dna + +# Generate transcript sequences +gffread -F -w ensembl_mouse/transcripts.fa \ + -g ensembl_mouse/genome.fa \ + ensembl_mouse/Mus_musculus.GRCm39.*.gtf.gz + +# Canonical proteins +pgatk dnaseq-to-proteindb \ + --input_fasta ensembl_mouse/transcripts.fa \ + --output_proteindb mouse_canonical.fa + +# Variant proteins +pgatk vcf-to-proteindb \ + --vcf ensembl_mouse/mus_musculus_incl_consequences.vcf.gz \ + --input_fasta ensembl_mouse/transcripts.fa \ + --gene_annotations_gtf ensembl_mouse/Mus_musculus.GRCm39.*.gtf.gz \ + --output_proteindb mouse_variants.fa + +# Combine and generate decoy +cat mouse_canonical.fa mouse_variants.fa > mouse_target.fa +pgatk generate-decoy \ + --input mouse_target.fa \ + --output mouse_target_decoy.fa \ + --method decoypyrat +``` + +--- + +## 14. Three-Frame Translation + +Perform a simple three-frame translation of any nucleotide FASTA file: + +```bash +pgatk threeframe-translation \ + --input_fasta input_sequences.fa \ + --output translated_proteins.fa +``` + +--- + +## 15. 
Quality Check: Validate Protein Database + +Before using a protein database, check it for internal stop codons and short +sequences: + +```bash +pgatk ensembl-check \ + --input_fasta protein_database.fa \ + --output validated_database.fa \ + --num_aa 6 +``` + +This filters out sequences shorter than 6 amino acids. Use `--add_stop_codons` +to include proteins that contain internal stop codons (useful for +proteogenomics where frameshifts may introduce premature stops). diff --git a/mkdocs.yml b/mkdocs.yml index aa2dab95..13bee389 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -95,6 +95,7 @@ nav: - Introduction: introduction.md - Installation: installation.md - pgatk CLI: pgatk-cli.md + - Use Cases: use-cases.md - File Formats: formats.md - Changelog: changelog.md - Support: support.md From 78fecd885f4c747a42822943d2f08e1d47b28129 Mon Sep 17 00:00:00 2001 From: Yasset Perez-Riverol Date: Tue, 3 Mar 2026 19:24:55 +0000 Subject: [PATCH 3/3] improve docs --- docs/use-cases.md | 849 +++++++++++++++++++++++++++++++++++----------- 1 file changed, 654 insertions(+), 195 deletions(-) diff --git a/docs/use-cases.md b/docs/use-cases.md index 2622a975..5af6de50 100644 --- a/docs/use-cases.md +++ b/docs/use-cases.md @@ -6,10 +6,172 @@ a search-ready protein database. --- +## Cell-Type Specific Non-Canonical Peptide Discovery + +!!! abstract "Featured workflow" + This workflow reproduces the analysis from [Umer et al., *Bioinformatics* 2022](https://doi.org/10.1093/bioinformatics/btab838), + where cell-type specific databases were generated for 65 cell lines, + revealing non-canonical and cryptic peptides amounting to **>5% of total + peptide identifications**. + +The key insight is to build **cell-type specific** proteogenomics databases +that combine canonical proteins with non-canonical translations (pseudogenes, +lncRNAs, alternative ORFs) and variant sequences from multiple genomic sources. 
+Searching with a database tailored to the cell type of interest maximizes +discovery while keeping the search space focused. + +### Step 1 -- Download ENSEMBL data + +```bash +pgatk ensembl-downloader \ + -t 9606 \ + -o ensembl_human +``` + +### Step 2 -- Generate transcript sequences from GTF + +```bash +gffread -F -w ensembl_human/transcripts.fa \ + -g ensembl_human/genome.fa \ + ensembl_human/Homo_sapiens.GRCh38.*.gtf.gz +``` + +### Step 3 -- Canonical protein-coding sequences + +Translate all protein-coding transcripts using their annotated CDS: + +```bash +pgatk dnaseq-to-proteindb \ + --input_fasta ensembl_human/transcripts.fa \ + --output_proteindb canonical.fa +``` + +### Step 4 -- Non-canonical translations + +#### Pseudogenes (three-frame) + +Pseudogenes were traditionally considered non-functional, but some produce +detectable peptides. Translate them in three reading frames: + +```bash +pgatk dnaseq-to-proteindb \ + --input_fasta ensembl_human/transcripts.fa \ + --output_proteindb pseudogene.fa \ + --var_prefix pseudo_ \ + --include_biotypes processed_pseudogene,unprocessed_pseudogene,transcribed_processed_pseudogene,transcribed_unprocessed_pseudogene,translated_processed_pseudogene \ + --num_orfs 3 \ + --skip_including_all_cds +``` + +#### lncRNAs (three-frame) + +Long non-coding RNAs can harbor small open reading frames encoding +micropeptides: + +```bash +pgatk dnaseq-to-proteindb \ + --input_fasta ensembl_human/transcripts.fa \ + --output_proteindb lncrna.fa \ + --var_prefix lncrna_ \ + --include_biotypes lincRNA,antisense,sense_intronic,sense_overlapping \ + --num_orfs 3 \ + --skip_including_all_cds +``` + +#### Alternative ORFs (exonic out-of-frame translation) + +Non-canonical reading frames of protein-coding mRNAs can produce cryptic +peptides: + +```bash +pgatk dnaseq-to-proteindb \ + --input_fasta ensembl_human/transcripts.fa \ + --output_proteindb altorf.fa \ + --var_prefix altorf_ \ + --include_biotypes altORFs \ + --skip_including_all_cds 
+``` + +### Step 5 -- Population variant proteins + +Include common human variants from ENSEMBL: + +```bash +pgatk vcf-to-proteindb \ + --vcf ensembl_human/homo_sapiens_incl_consequences.vcf.gz \ + --input_fasta ensembl_human/transcripts.fa \ + --gene_annotations_gtf ensembl_human/Homo_sapiens.GRCh38.*.gtf.gz \ + --output_proteindb ensembl_variants.fa +``` + +### Step 6 -- COSMIC somatic mutations (optional, for cancer cell lines) + +For cancer cell-line studies, add cell-line-specific somatic mutations: + +```bash +pgatk cosmic-downloader \ + -u your_email@example.com \ + -p your_password \ + -o cosmic_data + +pgatk cosmic-to-proteindb \ + --input_mutation cosmic_data/CosmicCLP_MutantExport.tsv.gz \ + --input_genes cosmic_data/All_CellLines_Genes.fasta.gz \ + --output_db cosmic_cellline.fa \ + --filter_column "Sample name" \ + --split_by_filter_column +``` + +### Step 7 -- Combine and generate target-decoy database + +```bash +# Combine all components +# For a specific cell line, use the matching COSMIC file if available +cat canonical.fa \ + pseudogene.fa \ + lncrna.fa \ + altorf.fa \ + ensembl_variants.fa \ + > cell_type_target.fa + +# Generate decoy sequences +pgatk generate-decoy \ + --input cell_type_target.fa \ + --output cell_type_target_decoy.fa \ + --method decoypyrat \ + --decoy_prefix DECOY_ +``` + +### Step 8 -- Extract non-canonical unique peptides + +After database searching with your search engine, you can also pre-compute the +set of non-canonical peptides unique to these novel sources: + +```bash +pgatk digest-mutant-protein \ + --input pseudogene.fa,lncrna.fa,altorf.fa,ensembl_variants.fa \ + --fasta canonical.fa \ + --output non_canonical_peptides.fa \ + --min-len 7 \ + --max-len 40 \ + --missed-cleavages 2 +``` + +!!! tip "Expected results" + In the original study, this workflow applied to 65 cell lines across six + PRIDE datasets identified non-canonical peptides amounting to >5% of + total identifications. 
The most common non-canonical sources were + pseudogenes and lncRNAs, followed by alternative ORFs and variant + peptides. + +--- + ## 1. Human Variant Protein Database from ENSEMBL Build a variant protein database using ENSEMBL population variants (common SNPs -and indels) for human proteogenomics searches. +and indels) for human proteogenomics searches. This is the most common starting +point for proteogenomics experiments -- augmenting the canonical proteome with +known variant peptides that would otherwise be missed. ### Step 1 -- Download ENSEMBL data @@ -80,10 +242,16 @@ The file `target_decoy.fa` is ready for database searching. --- -## 2. High-Frequency Variant Database (AF Filtering) +## 2. Population-Specific Variant Database + +Population-level genetic variants cause amino acid changes that are invisible +to standard reference database searches. Studies have shown that incorporating +common missense variants can increase peptide identifications by 3--5% and is +especially important when analyzing samples from underrepresented ancestries. -Build a protein database that only includes common population variants above a -given allele frequency threshold. +### Common variants above a frequency threshold + +Include only variants present in at least 1% of the population: ```bash pgatk vcf-to-proteindb \ @@ -95,7 +263,10 @@ pgatk vcf-to-proteindb \ --output_proteindb common_variant_proteins.fa ``` -To restrict to specific consequence types (e.g. only missense variants): +### Missense-only variant database + +Restrict to single amino acid variants (SAAVs), the most common variant type +detectable by mass spectrometry: ```bash pgatk vcf-to-proteindb \ @@ -108,12 +279,42 @@ pgatk vcf-to-proteindb \ --output_proteindb missense_common_proteins.fa ``` +### gnomAD ancestry-stratified database + +gnomAD provides allele frequencies stratified by ancestry (African, East Asian, +South Asian, European, Latino, etc.). 
Build a database using variants common in +a specific population: + +```bash +pgatk vcf-to-proteindb \ + --vcf gnomad.exomes.v4.1.sites.vcf.bgz \ + --input_fasta gencode_transcripts.fa \ + --gene_annotations_gtf gencode.v44.annotation.gtf.gz \ + --annotation_field_name vep \ + --af_field AF_afr \ + --af_threshold 0.01 \ + --include_consequences missense_variant,inframe_insertion,inframe_deletion \ + --biotype_str transcript_type \ + --output_proteindb gnomad_afr_proteins.fa +``` + +!!! tip "gnomAD-specific parameters" + - `--annotation_field_name vep` -- gnomAD uses `vep` instead of `CSQ` + - `--af_field` -- Use population-specific AF fields: `AF_afr` (African), + `AF_eas` (East Asian), `AF_sas` (South Asian), `AF_nfe` (Non-Finnish European), + `AF_amr` (Latino), or `controls_AF` (all controls) + - `--biotype_str transcript_type` -- GENCODE uses `transcript_type` instead + of ENSEMBL's `transcript_biotype` + --- ## 3. ClinVar Clinical Variant Database -Build a protein database from ClinVar pathogenic and likely pathogenic variants -using the NCBI RefSeq gene models. +[ClinVar](https://www.ncbi.nlm.nih.gov/clinvar/) catalogs the relationship +between human variants and clinical phenotypes. Building a ClinVar-derived +protein database enables detection of clinically annotated variant peptides by +mass spectrometry -- useful for clinical proteomics, pharmacoproteomics, and +validating pathogenic variants at the protein level. ### Step 1 -- Download NCBI / ClinVar files @@ -155,37 +356,28 @@ pgatk generate-decoy \ --- -## 4. Cancer-Specific Database from COSMIC +## 4. Tumor-Specific Databases for Cancer Proteogenomics -Generate tissue-specific protein databases from COSMIC somatic mutations. +Cancer proteogenomics studies (such as those from CPTAC) build +tumor-specific protein databases to detect somatic mutant peptides, understand +therapy resistance, and prioritize neoantigen candidates. 
The databases combine +somatic mutations from cancer-specific sources (COSMIC, cBioPortal) and/or +patient-matched whole-exome sequencing. -### Step 1 -- Download COSMIC data +### 4a. COSMIC somatic mutations by cancer type + +Generate one protein database per primary tissue site. This is the standard +approach for large-scale cancer proteogenomics when patient-level WES is not +available: ```bash +# Download COSMIC data (requires account) pgatk cosmic-downloader \ -u your_email@example.com \ -p your_password \ -o cosmic_data -``` - -!!! note - A COSMIC account is required. Register at - [https://cancer.sanger.ac.uk/cosmic/register](https://cancer.sanger.ac.uk/cosmic/register). -### Step 2a -- Single database from all mutations - -```bash -pgatk cosmic-to-proteindb \ - --input_mutation cosmic_data/CosmicMutantExport.tsv.gz \ - --input_genes cosmic_data/All_COSMIC_Genes.fasta.gz \ - --output_db cosmic_all_proteins.fa -``` - -### Step 2b -- One database per cancer type - -Split mutations by primary tissue site, creating one FASTA per cancer type: - -```bash +# Generate per-tissue databases pgatk cosmic-to-proteindb \ --input_mutation cosmic_data/CosmicMutantExport.tsv.gz \ --input_genes cosmic_data/All_COSMIC_Genes.fasta.gz \ @@ -193,12 +385,12 @@ pgatk cosmic-to-proteindb \ --split_by_filter_column ``` -This produces files named `cosmic_proteins_.fa` for each tissue type -(e.g. `cosmic_proteins_lung.fa`, `cosmic_proteins_breast.fa`). +This produces files like `cosmic_proteins_lung.fa`, `cosmic_proteins_breast.fa`, +etc. Use the tissue-matched database for your cancer type of interest. -### Step 2c -- Database for a specific cancer type +### 4b. Single cancer type -Limit to a single tissue type: +Build a focused database for one cancer type (e.g. lung): ```bash pgatk cosmic-to-proteindb \ @@ -208,9 +400,10 @@ pgatk cosmic-to-proteindb \ --accepted_values "lung" ``` -### Step 2d -- Cell-line specific databases +### 4c. 
Cell-line proteogenomics -Use cell-line data split by sample name: +When analyzing cell-line proteomes, use cell-line-specific mutations. COSMIC +provides a dedicated cell-line export with mutations annotated per sample: ```bash pgatk cosmic-to-proteindb \ @@ -221,71 +414,133 @@ pgatk cosmic-to-proteindb \ --split_by_filter_column ``` ---- - -## 5. Cancer Study from cBioPortal - -Generate a protein database from cancer mutation data hosted in cBioPortal. +### 4d. cBioPortal study-specific database -### Step 1 -- List available studies +cBioPortal hosts mutation data from thousands of cancer genomics studies. +Generate a protein database from a specific study: ```bash +# List available studies pgatk cbioportal-downloader --list_studies + +# Download a study +pgatk cbioportal-downloader \ + -d brca_tcga_pan_can_atlas_2018 \ + -o cbioportal_data + +# Download ENSEMBL CDS (hg19 -- cBioPortal uses GRCh37) +pgatk ensembl-downloader \ + -t 9606 --grch37 -o ensembl_hg19 \ + --skip_vcf --skip_gtf --skip_protein \ + --skip_ncrna --skip_cdna --skip_dna + +# Translate mutations +pgatk cbioportal-to-proteindb \ + --input_mutation cbioportal_data/data_mutations_mskcc.txt \ + --input_cds ensembl_hg19/Homo_sapiens.GRCh37.cds.all.fa.gz \ + --output_db brca_tcga_proteins.fa ``` -### Step 2 -- Download a specific study +### 4e. Combined cancer database for immunopeptidomics + +For HLA immunopeptidomics / neoantigen discovery, a broad mutation database +maximizes the chance of detecting mutant HLA-presented peptides. Studies have +shown that COSMIC-derived databases can identify 5x more mutant immunopeptides +than patient WES alone. 
Combine COSMIC with ClinVar: ```bash -pgatk cbioportal-downloader \ - -d blca_mskcc_solit_2014 \ - -o cbioportal_data +# Generate COSMIC mutations +pgatk cosmic-to-proteindb \ + --input_mutation cosmic_data/CosmicMutantExport.tsv.gz \ + --input_genes cosmic_data/All_COSMIC_Genes.fasta.gz \ + --output_db cosmic_proteins.fa + +# Generate ClinVar mutations +pgatk clinvar-to-proteindb \ + --vcf ncbi_data/clinvar.vcf.gz \ + --gtf ncbi_data/GRCh38_latest_genomic.gtf.gz \ + --fasta ncbi_data/GRCh38_latest_rna.fna.gz \ + --assembly-report ncbi_data/GRCh38_latest_assembly_report.txt \ + --output clinvar_proteins.fa + +# Extract variant-unique peptides (undigested -- HLA peptides are not tryptic) +# For immunopeptidomics, use the full-length variant proteins directly: +cat cosmic_proteins.fa clinvar_proteins.fa > neoantigen_candidates.fa + +pgatk generate-decoy \ + --input neoantigen_candidates.fa \ + --output neoantigen_target_decoy.fa \ + --method decoypyrat ``` -### Step 3 -- Download ENSEMBL CDS (hg19 assembly) +--- + +## 5. Patient-Specific Database from WGS/WES + +When matched whole-genome or whole-exome sequencing data is available for +a sample, build a personalized protein database from the patient's own +variants. This is the gold standard for precision proteogenomics as used in +CPTAC studies. -cBioPortal mutations are aligned to hg19 (GRCh37): +### From a raw VCF (no VEP annotations) + +VCF files from variant callers (e.g. 
GATK HaplotypeCaller, Strelka, MuTect2) +typically lack VEP consequence annotations: ```bash -pgatk ensembl-downloader \ - -t 9606 \ - --grch37 \ - -o ensembl_hg19 \ - --skip_vcf \ - --skip_gtf \ - --skip_protein \ - --skip_ncrna \ - --skip_cdna \ - --skip_dna +pgatk vcf-to-proteindb \ + --vcf patient_somatic.vcf \ + --input_fasta transcripts.fa \ + --gene_annotations_gtf genes.gtf \ + --annotation_field_name '' \ + --ignore_filters \ + --output_proteindb patient_proteins.fa ``` -### Step 4 -- Translate mutations to protein sequences +- `--annotation_field_name ''` skips VEP/CSQ parsing +- `--ignore_filters` includes all called variants + +### From a VEP-annotated VCF + +If the VCF has been annotated with Ensembl VEP, you can filter by consequence +and transcript biotype: ```bash -pgatk cbioportal-to-proteindb \ - --input_mutation cbioportal_data/data_mutations_mskcc.txt \ - --input_cds ensembl_hg19/Homo_sapiens.GRCh37.cds.all.fa.gz \ - --output_db bladder_cancer_proteins.fa +pgatk vcf-to-proteindb \ + --vcf patient_somatic.vep.vcf \ + --input_fasta transcripts.fa \ + --gene_annotations_gtf genes.gtf \ + --include_consequences missense_variant,frameshift_variant,stop_gained,inframe_insertion,inframe_deletion \ + --output_proteindb patient_proteins.fa ``` -To generate tissue-type-specific databases using the clinical sample file: +### Only PASS-filtered variants + +Include only high-confidence variant calls: ```bash -pgatk cbioportal-to-proteindb \ - --input_mutation cbioportal_data/data_mutations_mskcc.txt \ - --input_cds ensembl_hg19/Homo_sapiens.GRCh37.cds.all.fa.gz \ - --clinical_sample_file cbioportal_data/data_clinical_sample.txt \ - --output_db cbioportal_proteins.fa \ - --split_by_filter_column +pgatk vcf-to-proteindb \ + --vcf patient_somatic.vcf \ + --input_fasta transcripts.fa \ + --gene_annotations_gtf genes.gtf \ + --annotation_field_name '' \ + --accepted_filters PASS \ + --output_proteindb patient_pass_proteins.fa ``` --- -## 6. 
Non-Coding RNA Protein Database +## 6. Novel ORF and Micropeptide Discovery + +Proteogenomics is a key approach for discovering novel coding regions: small +open reading frames (smORFs), micropeptides from lncRNAs, pseudogene-encoded +proteins, and alternative reading frames. Studies have found that non-canonical +peptides can account for over 5% of total identifications. -Translate non-coding RNA transcripts to search for novel peptides from -supposedly non-coding regions. +### lincRNA-derived proteins -### lincRNA database +Long intergenic non-coding RNAs (lincRNAs) can encode small proteins. Translate +them in three reading frames: ```bash pgatk dnaseq-to-proteindb \ @@ -297,7 +552,9 @@ pgatk dnaseq-to-proteindb \ --skip_including_all_cds ``` -### Pseudogene database +### Pseudogene-derived proteins + +Some pseudogenes are transcribed and may produce functional peptides: ```bash pgatk dnaseq-to-proteindb \ @@ -311,7 +568,8 @@ pgatk dnaseq-to-proteindb \ ### Alternative ORFs from protein-coding transcripts -Translate non-canonical reading frames of protein-coding genes: +Translate non-canonical reading frames of known protein-coding genes to +discover upstream ORFs (uORFs), overlapping ORFs, and downstream ORFs: ```bash pgatk dnaseq-to-proteindb \ @@ -322,12 +580,60 @@ pgatk dnaseq-to-proteindb \ --skip_including_all_cds ``` +### Antisense transcripts + +```bash +pgatk dnaseq-to-proteindb \ + --input_fasta transcripts.fa \ + --output_proteindb antisense_proteins.fa \ + --var_prefix antisense_ \ + --include_biotypes antisense,antisense_RNA \ + --num_orfs 3 \ + --skip_including_all_cds +``` + +### Combined non-canonical database + +Combine all non-canonical sources with the canonical proteome: + +```bash +cat canonical_proteins.fa \ + lincRNA_proteins.fa \ + pseudogene_proteins.fa \ + altorf_proteins.fa \ + antisense_proteins.fa \ + > novel_orf_target.fa + +pgatk generate-decoy \ + --input novel_orf_target.fa \ + --output novel_orf_target_decoy.fa \ + --method 
decoypyrat +``` + +!!! tip + After searching, use `digest-mutant-protein` to extract only the novel + peptides that do not appear in the canonical proteome: + + ```bash + pgatk digest-mutant-protein \ + --input lincRNA_proteins.fa,pseudogene_proteins.fa,altorf_proteins.fa \ + --fasta canonical_proteins.fa \ + --output novel_unique_peptides.fa + ``` + --- -## 7. Six-Frame Genome Translation +## 7. Genome Annotation Refinement + +Proteogenomics is used to validate and refine genome annotations by mapping +proteomics-identified peptides back to genomic coordinates. This has been +applied to improve annotations for organisms from wheat (validating 33,612 +gene models) to human (discovering novel splice junctions and exons). -Perform a full six-frame translation of a genome assembly. This is useful as -an unbiased search space but produces very large databases. +### Six-frame genome translation + +Translate the entire genome in all six reading frames to create an unbiased +search space: ```bash pgatk dnaseq-to-proteindb \ @@ -341,119 +647,284 @@ pgatk dnaseq-to-proteindb \ !!! warning Six-frame translation of a full mammalian genome produces a very large database. Consider translating individual chromosomes or using it in - combination with a smaller targeted database search. + combination with a smaller targeted database for a two-pass search strategy. ---- +### Three-frame transcriptome translation -## 8. gnomAD Population Variant Database +A more focused approach: translate all annotated transcripts in three reading +frames to capture alternative ORFs without the full genome search space: + +```bash +pgatk threeframe-translation \ + --input_fasta transcripts.fa \ + --output translated_transcripts.fa +``` -Build a protein database from gnomAD variants using GENCODE annotations. 
+### Map identified peptides back to the genome + +After database searching, map novel peptide identifications to genomic +coordinates for annotation and genome browser visualization: ```bash -pgatk vcf-to-proteindb \ - --vcf gnomad_genome.vcf.gz \ - --input_fasta gencode_transcripts.fa \ - --gene_annotations_gtf gencode.v38.annotation.gtf.gz \ - --annotation_field_name vep \ - --af_field controls_AF \ - --af_threshold 0.01 \ - --include_consequences missense_variant,inframe_insertion,inframe_deletion,frameshift_variant \ - --biotype_str transcript_type \ - --output_proteindb gnomad_proteins.fa +pgatk map-peptide2genome \ + --input novel_peptide_identifications.tsv \ + --gtf genes.gtf \ + --fasta proteins.fa \ + --idmap protein_to_transcript.tsv \ + --output novel_peptides.gff3 ``` -!!! tip "gnomAD-specific parameters" - - `--annotation_field_name vep` -- gnomAD uses `vep` instead of `CSQ` - - `--af_field controls_AF` -- Use control allele frequency - - `--biotype_str transcript_type` -- GENCODE uses `transcript_type` instead - of ENSEMBL's `transcript_biotype` +The GFF3 output can be loaded into UCSC Genome Browser, IGV, or Ensembl for +visual inspection alongside gene models, RNA-seq coverage, and conservation +tracks. --- -## 9. Sample-Specific Database from WGS/WES +## 8. Metaproteomics: Microbiome and Environmental Samples + +Metaproteomics characterizes the functional activity of microbial communities. +The central challenge is building the protein search database, since communities +can contain hundreds to thousands of species. When matched metagenomic +sequencing data is available, six-frame translation of assembled contigs +provides a sample-matched database. -Translate variants from a whole-genome or whole-exome sequencing VCF of an -individual sample. These VCFs typically lack VEP annotations. +### Six-frame translation of metagenome assembly + +After assembling metagenomic reads into contigs (e.g. 
with megahit or +metaSPAdes), translate them: + +```bash -pgatk vcf-to-proteindb \ - --vcf sample_variants.vcf \ - --input_fasta transcripts.fa \ - --gene_annotations_gtf genes.gtf \ - --annotation_field_name '' \ - --ignore_filters \ - --output_proteindb sample_proteins.fa +pgatk dnaseq-to-proteindb \ + --input_fasta metagenome_contigs.fa \ + --output_proteindb metagenome_6frame.fa \ + --biotype_str '' \ + --num_orfs 3 \ + --num_orfs_complement 3 \ + --protein_prefix meta_ ``` -- `--annotation_field_name ''` tells the tool to skip parsing VEP/CSQ - annotations (not present in raw variant callers like GATK HaplotypeCaller) -- `--ignore_filters` includes all variants regardless of the FILTER column +### Translation of predicted ORFs -To include only variants that passed quality filtering: +If gene prediction has been run on the metagenome assembly (e.g. Prodigal, +MetaGeneAnnotator), translate the predicted CDS regions: ```bash -pgatk vcf-to-proteindb \ - --vcf sample_variants.vcf \ - --input_fasta transcripts.fa \ - --gene_annotations_gtf genes.gtf \ - --annotation_field_name '' \ - --accepted_filters PASS \ - --output_proteindb sample_proteins_pass.fa +pgatk dnaseq-to-proteindb \ + --input_fasta predicted_genes.fna \ + --output_proteindb metagenome_proteins.fa \ + --biotype_str '' \ + --protein_prefix meta_ ``` +### Add decoy for metaproteomics search + +```bash +pgatk generate-decoy \ + --input metagenome_proteins.fa \ + --output metagenome_target_decoy.fa \ + --method decoypyrat +``` + +!!! note + For very large metaproteomics databases (millions of sequences), + consider using `--memory_save` with `generate-decoy` to reduce memory + usage, or splitting the database by contig/bin before decoy generation. + --- -## 10. Digest and Filter Variant Peptides +## 9. 
Long-Read Transcriptomics + Proteogenomics -After generating a variant protein database you may want to digest the proteins -*in silico* and keep only peptides that differ from the canonical proteome. -This produces a compact FASTA of variant-specific peptides. +Long-read RNA sequencing (PacBio Iso-Seq, Oxford Nanopore) captures full-length +transcript isoforms without assembly, enabling unambiguous protein isoform +prediction. Studies have shown that 25% of long-read-detected protein isoforms +may be absent from reference annotations. + +### Translate long-read transcript sequences + +After processing long reads through an isoform classification pipeline (e.g. +SQANTI3), translate the high-quality transcript sequences: + +```bash +# Translate all long-read transcripts, including novel isoforms +pgatk dnaseq-to-proteindb \ + --input_fasta long_read_transcripts.fa \ + --output_proteindb long_read_proteins.fa \ + --biotype_str '' \ + --protein_prefix lr_ +``` + +### Combine with canonical proteome + +```bash +cat canonical_proteins.fa long_read_proteins.fa > lr_target.fa + +pgatk generate-decoy \ + --input lr_target.fa \ + --output lr_target_decoy.fa \ + --method decoypyrat +``` + +### Extract novel isoform peptides + +After searching, identify peptides unique to long-read isoforms that are not +present in the reference: ```bash pgatk digest-mutant-protein \ - --input variant_proteins.fa \ + --input long_read_proteins.fa \ --fasta canonical_proteins.fa \ - --output unique_variant_peptides.fa \ + --output novel_isoform_peptides.fa \ --min-len 7 \ --max-len 40 \ --missed-cleavages 2 ``` -To combine multiple variant sources (e.g. COSMIC + ClinVar): +--- + +## 10. Plant and Non-Model Organism Proteogenomics + +For non-model organisms with draft or incomplete genome annotations, +proteogenomics improves gene model quality. 
This approach has been particularly +impactful for crop species -- for example, a wheat proteogenomics study +validated over 33,000 gene models and recommended promoting 3,700 +low-confidence genes. + +### Step 1 -- Download ENSEMBL data for the species + +pgatk supports any species available in ENSEMBL. For example, rice +(*Oryza sativa*, taxonomy 39947): ```bash -pgatk digest-mutant-protein \ - --input cosmic_proteins.fa,clinvar_proteins.fa \ - --fasta canonical_proteins.fa \ - --output combined_variant_peptides.fa +pgatk ensembl-downloader \ + -t 39947 \ + -o ensembl_rice \ + --skip_dna ``` ---- +### Step 2 -- Generate transcript sequences and canonical proteome + +```bash +gffread -F -w ensembl_rice/transcripts.fa \ + -g ensembl_rice/genome.fa \ + ensembl_rice/Oryza_sativa.IRGSP-1.0.*.gtf.gz + +pgatk dnaseq-to-proteindb \ + --input_fasta ensembl_rice/transcripts.fa \ + --output_proteindb rice_canonical.fa +``` + +### Step 3 -- Three-frame translation for novel gene discovery + +```bash +pgatk threeframe-translation \ + --input_fasta ensembl_rice/transcripts.fa \ + --output rice_3frame.fa +``` + +### Step 4 -- Six-frame genome translation (unbiased) -## 11. Map Identified Peptides to Genomic Coordinates +```bash +pgatk dnaseq-to-proteindb \ + --input_fasta ensembl_rice/genome.fa \ + --output_proteindb rice_genome_6frame.fa \ + --biotype_str '' \ + --num_orfs 3 \ + --num_orfs_complement 3 +``` + +### Step 5 -- Combine databases + +```bash +cat rice_canonical.fa rice_3frame.fa > rice_target.fa + +pgatk generate-decoy \ + --input rice_target.fa \ + --output rice_target_decoy.fa \ + --method decoypyrat +``` -After a proteomics search, map the identified peptides back to genomic -coordinates for visualization in genome browsers (e.g. UCSC, IGV). 
+### Step 6 -- Map identified novel peptides to the genome ```bash pgatk map-peptide2genome \ - --input peptide_identifications.tsv \ - --gtf genes.gtf \ - --fasta proteins.fa \ + --input rice_novel_peptides.tsv \ + --gtf ensembl_rice/Oryza_sativa.IRGSP-1.0.*.gtf.gz \ + --fasta rice_target.fa \ --idmap protein_to_transcript.tsv \ - --output peptides.gff3 + --output rice_novel_peptides.gff3 ``` -The input TSV should contain at least peptide sequence and protein accession -columns (configurable with `--pep-col` and `--prot-col`). +### Mouse example + +For model organisms like mouse (*Mus musculus*, taxonomy 10090): + +```bash +# Download +pgatk ensembl-downloader -t 10090 -o ensembl_mouse --skip_dna + +# Generate transcript sequences +gffread -F -w ensembl_mouse/transcripts.fa \ + -g ensembl_mouse/genome.fa \ + ensembl_mouse/Mus_musculus.GRCm39.*.gtf.gz + +# Canonical + variant proteins +pgatk dnaseq-to-proteindb \ + --input_fasta ensembl_mouse/transcripts.fa \ + --output_proteindb mouse_canonical.fa + +pgatk vcf-to-proteindb \ + --vcf ensembl_mouse/mus_musculus_incl_consequences.vcf.gz \ + --input_fasta ensembl_mouse/transcripts.fa \ + --gene_annotations_gtf ensembl_mouse/Mus_musculus.GRCm39.*.gtf.gz \ + --output_proteindb mouse_variants.fa + +# Combine and generate decoy +cat mouse_canonical.fa mouse_variants.fa > mouse_target.fa +pgatk generate-decoy \ + --input mouse_target.fa \ + --output mouse_target_decoy.fa \ + --method decoypyrat +``` + +--- + +## 11. Digest and Filter Variant Peptides + +After generating variant protein databases from any source, *in silico* +digestion and filtering against the canonical proteome extracts only the +peptides that are unique to the variant sequences. This compact peptide list +is useful for targeted validation, spectral library building, and +quantification of variant peptides. 
+ +### Single source + +```bash +pgatk digest-mutant-protein \ + --input variant_proteins.fa \ + --fasta canonical_proteins.fa \ + --output unique_variant_peptides.fa \ + --min-len 7 \ + --max-len 40 \ + --missed-cleavages 2 +``` + +### Combine multiple variant sources + +```bash +pgatk digest-mutant-protein \ + --input cosmic_proteins.fa,clinvar_proteins.fa,ensembl_variants.fa \ + --fasta canonical_proteins.fa \ + --output combined_variant_peptides.fa +``` --- -## 12. Complete Proteogenomics Workflow +## 12. Complete Multi-Source Proteogenomics Workflow A full end-to-end workflow combining canonical proteins, population variants, clinical variants, cancer mutations, and non-coding translations into a single -search-ready database. +search-ready database. This is the approach used in large-scale studies that aim +to maximize the discovery of non-canonical peptides. ### Step 1 -- Download all data sources @@ -513,6 +984,15 @@ pgatk dnaseq-to-proteindb \ --include_biotypes lincRNA \ --num_orfs 3 \ --skip_including_all_cds + +# Pseudogenes +pgatk dnaseq-to-proteindb \ + --input_fasta ensembl_data/transcripts.fa \ + --output_proteindb pseudogene.fa \ + --var_prefix pseudogene_ \ + --include_biotypes processed_pseudogene,transcribed_processed_pseudogene \ + --num_orfs 3 \ + --skip_including_all_cds ``` ### Step 4 -- Combine all databases @@ -523,14 +1003,22 @@ cat canonical.fa \ clinvar_variants.fa \ cosmic_variants.fa \ lincRNA.fa \ + pseudogene.fa \ > combined_target.fa ``` -### Step 5 -- Add decoy sequences +### Step 5 -- Quality check and decoy generation ```bash +# Filter short sequences and validate +pgatk ensembl-check \ + --input_fasta combined_target.fa \ + --output validated_target.fa \ + --num_aa 6 + +# Add decoy sequences pgatk generate-decoy \ - --input combined_target.fa \ + --input validated_target.fa \ --output proteogenomics_target_decoy.fa \ --method decoypyrat \ --decoy_prefix DECOY_ @@ -540,7 +1028,7 @@ pgatk generate-decoy \ ```bash pgatk 
digest-mutant-protein \ - --input ensembl_variants.fa,clinvar_variants.fa,cosmic_variants.fa \ + --input ensembl_variants.fa,clinvar_variants.fa,cosmic_variants.fa,lincRNA.fa,pseudogene.fa \ --fasta canonical.fa \ --output unique_variant_peptides.fa \ --min-len 7 \ @@ -549,63 +1037,21 @@ pgatk digest-mutant-protein \ ``` The file `proteogenomics_target_decoy.fa` is a comprehensive search database, -and `unique_variant_peptides.fa` provides a focused list of variant-specific +and `unique_variant_peptides.fa` provides a focused list of non-canonical peptides for validation. ---- - -## 13. Non-Human Species - -pgatk supports any species available in ENSEMBL. Here is an example for mouse -(*Mus musculus*, taxonomy 10090): - -```bash -# Download -pgatk ensembl-downloader -t 10090 -o ensembl_mouse --skip_dna - -# Generate transcript sequences -gffread -F -w ensembl_mouse/transcripts.fa \ - -g ensembl_mouse/genome.fa \ - ensembl_mouse/Mus_musculus.GRCm39.*.gtf.gz - -# Canonical proteins -pgatk dnaseq-to-proteindb \ - --input_fasta ensembl_mouse/transcripts.fa \ - --output_proteindb mouse_canonical.fa - -# Variant proteins -pgatk vcf-to-proteindb \ - --vcf ensembl_mouse/mus_musculus_incl_consequences.vcf.gz \ - --input_fasta ensembl_mouse/transcripts.fa \ - --gene_annotations_gtf ensembl_mouse/Mus_musculus.GRCm39.*.gtf.gz \ - --output_proteindb mouse_variants.fa - -# Combine and generate decoy -cat mouse_canonical.fa mouse_variants.fa > mouse_target.fa -pgatk generate-decoy \ - --input mouse_target.fa \ - --output mouse_target_decoy.fa \ - --method decoypyrat -``` +!!! tip "Database size considerations" + Larger proteogenomic databases improve coverage but increase the FDR + penalty. Consider a two-pass strategy: first search against a focused + database (canonical + variants), then a broader database (adding + non-coding ORFs and six-frame translations) for discovery. --- -## 14. Three-Frame Translation +## 13. 
Quality Check: Validate Protein Database -Perform a simple three-frame translation of any nucleotide FASTA file: - -```bash -pgatk threeframe-translation \ - --input_fasta input_sequences.fa \ - --output translated_proteins.fa -``` - ---- - -## 15. Quality Check: Validate Protein Database - -Before using a protein database, check it for internal stop codons and short -sequences: +Before using a protein database for searching, validate it for internal stop +codons and short sequences: ```bash pgatk ensembl-check \ @@ -617,3 +1063,16 @@ pgatk ensembl-check \ This filters out sequences shorter than 6 amino acids. Use `--add_stop_codons` to include proteins that contain internal stop codons (useful for proteogenomics where frameshifts may introduce premature stops). + +--- + +## References + +If you use pgatk in your research, please cite: + +> Husen M Umer, Enrique Audain, Yafeng Zhu, Julianus Pfeuffer, Timo +> Sachsenberg, Janne Lehtiƶ, Rui M Branca, Yasset Perez-Riverol. +> **Generation of ENSEMBL-based proteogenomics databases boosts the +> identification of non-canonical peptides.** +> *Bioinformatics*, Volume 38, Issue 5, 1 March 2022, Pages 1470--1472. +> [https://doi.org/10.1093/bioinformatics/btab838](https://doi.org/10.1093/bioinformatics/btab838)