From dd4ee5754407447c31f00def45c82da40c67ed91 Mon Sep 17 00:00:00 2001 From: Yasset Perez-Riverol Date: Tue, 3 Mar 2026 18:27:48 +0000 Subject: [PATCH 1/3] minor gubs fixing. --- pgatk/cgenomes/cgenomes_proteindb.py | 16 +++++-- pgatk/clinvar/clinvar_service.py | 4 +- pgatk/ensembl/ensembl.py | 50 ++++++++++++++------- pgatk/testdata/proteindb_from_gnomad_VCF.fa | 2 - pgatk/toolbox/vcf_utils.py | 8 +++- 5 files changed, 55 insertions(+), 25 deletions(-) diff --git a/pgatk/cgenomes/cgenomes_proteindb.py b/pgatk/cgenomes/cgenomes_proteindb.py index b0a97df2..f42d7b8d 100644 --- a/pgatk/cgenomes/cgenomes_proteindb.py +++ b/pgatk/cgenomes/cgenomes_proteindb.py @@ -181,8 +181,12 @@ def get_mut_pro_seq(snp: SNP, seq: Seq) -> Optional[str]: protein_seq = str(seq.translate(to_stop=False)) if "Missense" in snp.mutation_type: - mut_aa = snp.aa_mut[-1] - if not mut_aa.isalpha(): + # Extract the mutant residue from HGVS like p.V600E (1-letter) + # or p.Val600Glu (3-letter). For 1-letter the last char is the + # AA; for 3-letter we need to convert the last triplet. 
+ aa_suffix = re.sub(r'^p\..*\d+', '', snp.aa_mut) + mut_aa = _three_to_one(aa_suffix) if aa_suffix else '' + if not mut_aa or len(mut_aa) != 1 or not mut_aa.isalpha(): return '' index = int(positions[0]) - 1 mut_pro_seq = protein_seq[:index] + mut_aa + protein_seq[index + 1:] @@ -194,7 +198,8 @@ def get_mut_pro_seq(snp: SNP, seq: Seq) -> Optional[str]: index = snp.aa_mut.index("ins") except ValueError: return '' - insert_aa = snp.aa_mut[index + 3:] + insert_aa_raw = snp.aa_mut[index + 3:] + insert_aa = _three_to_one(insert_aa_raw) if insert_aa.isalpha(): ins_index1 = int(positions[0]) mut_pro_seq = protein_seq[:ins_index1] + insert_aa + protein_seq[ins_index1:] @@ -211,7 +216,10 @@ def get_mut_pro_seq(snp: SNP, seq: Seq) -> Optional[str]: index = snp.aa_mut.index(">") except ValueError: return '' - mut_aa = snp.aa_mut[index + 1:] + mut_aa_raw = snp.aa_mut[index + 1:] + mut_aa = _three_to_one(mut_aa_raw.replace('*', '')) + if '*' in mut_aa_raw: + mut_aa += '*' if not mut_aa.replace('*','').isalpha(): return '' if "deletion" in snp.mutation_type: diff --git a/pgatk/clinvar/clinvar_service.py b/pgatk/clinvar/clinvar_service.py index 26004777..b5760cbd 100644 --- a/pgatk/clinvar/clinvar_service.py +++ b/pgatk/clinvar/clinvar_service.py @@ -493,9 +493,9 @@ def run(self) -> str: if "CDS=" in desc: try: cds_str = [ - p + p.strip("[]") for p in desc.split() - if p.startswith("CDS=") + if p.strip("[]").startswith("CDS=") ][0] cds_info = [ int(x) diff --git a/pgatk/ensembl/ensembl.py b/pgatk/ensembl/ensembl.py index a9a6d7ef..e1d8b7ee 100644 --- a/pgatk/ensembl/ensembl.py +++ b/pgatk/ensembl/ensembl.py @@ -328,7 +328,9 @@ def annoate_vcf(vcf_file: str, gtf_file: str, BedTool(gtf_file).intersect(BedTool(vcf_file), wo=True).saveas(f"{vcf_stem}_all.bed") - muts_dict = {} + muts_dict: dict[str, list[str]] = {} + # VCF columns start after the GTF columns (gene_info_index + 1) + vcf_start_col = gene_info_index + 1 with open(f"{vcf_stem}_all.bed", 'r', encoding='utf-8') as an: for 
line in an.readlines(): sl = line.strip().split('\t') @@ -345,11 +347,17 @@ def annoate_vcf(vcf_file: str, gtf_file: str, if transcript_id == 'NO_OVERLAP': continue - # write the mutation line as a key and set the overlapping transcriptID as its value(s) + # Use canonical positional key (CHROM:POS:REF:ALT) for reliable + # matching between BED output and raw VCF lines. try: - muts_dict['\t'.join(sl[gene_info_index + 1:-1])].append(transcript_id) - except KeyError: - muts_dict['\t'.join(sl[gene_info_index + 1:-1])] = [transcript_id] + vcf_chrom = sl[vcf_start_col] + vcf_pos = sl[vcf_start_col + 1] + vcf_ref = sl[vcf_start_col + 3] + vcf_alt = sl[vcf_start_col + 4] + key = f"{vcf_chrom}\t{vcf_pos}\t{vcf_ref}\t{vcf_alt}" + except IndexError: + continue + muts_dict.setdefault(key, []).append(transcript_id) with open(f"{vcf_stem}_annotated.vcf", 'w', encoding='utf-8') as ann, open(vcf_file, 'r', encoding='utf-8') as v: # write vcf headers to the output file @@ -358,13 +366,17 @@ def annoate_vcf(vcf_file: str, gtf_file: str, ann.write(line) else: # write the mutations and their overlapping transcript to output file - try: - sl = line.strip().split('\t') + sl = line.strip().split('\t') + if len(sl) < 8: + ann.write(line) + continue + key = f"{sl[0]}\t{sl[1]}\t{sl[3]}\t{sl[4]}" + if key in muts_dict: sl[vcf_info_field_index] = '{};{}={}'.format( sl[vcf_info_field_index].strip(), annotation_str, - ','.join(set(muts_dict[line.strip()]))) + ','.join(set(muts_dict[key]))) ann.write('\t'.join(sl) + '\n') - except KeyError: + else: ann.write(line) return f"{vcf_stem}_annotated.vcf" @@ -503,7 +515,7 @@ def vcf_to_proteindb(self, vcf_file: str, input_fasta: str, gene_annotations_gtf if self._af_field: # get AF from the INFO field try: - af = float([x.split('=')[1] for x in record.INFO.split(';') if x.startswith(self._af_field)][0]) + af = float([x.split('=', 1)[1] for x in record.INFO.split(';') if x.split('=', 1)[0] == self._af_field][0]) except (ValueError, IndexError): 
invalid_records['# variants with invalid record'] += 1 continue @@ -521,8 +533,8 @@ def vcf_to_proteindb(self, vcf_file: str, input_fasta: str, gene_annotations_gtf transcript_records = [] try: transcript_records = \ - [x.split('=')[1] for x in record.INFO.split(';') if x.startswith(self._annotation_field_name)][ - 0] + [x.split('=', 1)[1] for x in record.INFO.split(';') + if x.split('=', 1)[0] == self._annotation_field_name][0] except IndexError: # no overlapping feature was found invalid_records['# variants with invalid record'] += 1 msg = "skipped record {}, no annotation feature was found".format(record) @@ -586,11 +598,17 @@ def vcf_to_proteindb(self, vcf_file: str, input_fasta: str, gene_annotations_gtf num_orfs = 3 if 'CDS=' in desc: try: - cds_info = [int(x) for x in desc.split(' ')[1].split('=')[1].split('-')] - feature_types = ['CDS', 'stop_codon'] - num_orfs = 1 + cds_token = next( + (t.strip('[]') for t in desc.split(' ') + if t.strip('[]').startswith('CDS=')), + None, + ) + if cds_token: + cds_info = [int(x) for x in cds_token.split('=')[1].split('-')] + feature_types = ['CDS', 'stop_codon'] + num_orfs = 1 except (ValueError, IndexError): - msg = "Could not extra cds position from fasta header for: {}".format(desc) + msg = "Could not extract cds position from fasta header for: {}".format(desc) self.get_logger().debug(msg) chrom, strand, features_info = self.get_features(db, diff --git a/pgatk/testdata/proteindb_from_gnomad_VCF.fa b/pgatk/testdata/proteindb_from_gnomad_VCF.fa index 3f008d7e..93974340 100644 --- a/pgatk/testdata/proteindb_from_gnomad_VCF.fa +++ b/pgatk/testdata/proteindb_from_gnomad_VCF.fa @@ -1,5 +1,3 @@ ->gnomvar_rs760117505_1.69337.A.C_ENST00000335137.3 
-MVTEFIFLGLSDSQELQTFLFMLFFVFYGGIVFGNLLIVITVVSDSHLHSPMYFLLANLSLIDLSLSSVTAPKMITDFFSQRQVISFKGCLVQIFLLHFFGGSEMVILIAMGFDRYIAICKPLHYTTIMCGNACVGIMAVTWGIGFLHSVSQLAFAVHLLFCGPNEVDSFYCDLPRVIKLACTDTYRLDIMVIANSGVLTVCSFVLLIISYTIILMTIQHRPLDKSSKALSTLTAHITVVLLFFGPCVFIYAWPFPIKSLDKFLAVFYSVITPLLNPIIYTLRNKDMKTAIRQLRKWDAHSSVKF* >gnomvar_rs2691305_1.69511.A.G_ENST00000335137.3 MVTEFIFLGLSDSQELQTFLFMLFFVFYGGIVFGNLLIVITVVSDSHLHSPMYFLLANLSLIDLSLSSVTAPKMITDFFSQRKVISFKGCLVQIFLLHFFGGSEMVILIAMGFDRYIAICKPLHYTTIMCGNACVGIMAVAWGIGFLHSVSQLAFAVHLLFCGPNEVDSFYCDLPRVIKLACTDTYRLDIMVIANSGVLTVCSFVLLIISYTIILMTIQHRPLDKSSKALSTLTAHITVVLLFFGPCVFIYAWPFPIKSLDKFLAVFYSVITPLLNPIIYTLRNKDMKTAIRQLRKWDAHSSVKF* >gnomvar_rs200505207_1.69761.A.T_ENST00000335137.3 diff --git a/pgatk/toolbox/vcf_utils.py b/pgatk/toolbox/vcf_utils.py index 248cc081..d371e5df 100644 --- a/pgatk/toolbox/vcf_utils.py +++ b/pgatk/toolbox/vcf_utils.py @@ -109,7 +109,13 @@ def get_altseq( for feature in features_info: if var_pos in range(feature[0], feature[1] + 1): var_index_in_cds = nc_index + (var_pos - feature[0]) - c = len(ref_allele) + # Clip the ref allele length to the portion that falls within + # this feature. When a multi-base variant extends from an exon + # into an intron, the intronic bases are absent from the + # transcript and must not be counted. 
+ var_end_genomic = var_pos + len(ref_allele) - 1 + exonic_ref_len = min(var_end_genomic, feature[1]) - var_pos + 1 + c = max(exonic_ref_len, 0) alt_seq = ref_seq[0:var_index_in_cds] + var_allele + ref_seq[var_index_in_cds + c::] if strand == '-': return ref_seq[::-1], alt_seq[::-1] From 6dcf9dcffd46c8775d05e52a909d5ac163700b09 Mon Sep 17 00:00:00 2001 From: Yasset Perez-Riverol Date: Tue, 3 Mar 2026 18:46:24 +0000 Subject: [PATCH 2/3] minor changes in help --- docs/index.md | 1 + docs/pgatk-cli.md | 163 ++++++++++++ docs/use-cases.md | 619 ++++++++++++++++++++++++++++++++++++++++++++++ mkdocs.yml | 1 + 4 files changed, 784 insertions(+) create mode 100644 docs/use-cases.md diff --git a/docs/index.md b/docs/index.md index d7104538..f6e58a2e 100644 --- a/docs/index.md +++ b/docs/index.md @@ -23,6 +23,7 @@ See the [Installation](installation.md) page for more options (Bioconda, Docker, | [Introduction](introduction.md) | Overview of the proteogenomics field | | [Installation](installation.md) | How to install pgatk (pip, Bioconda, Docker, source) | | [pgatk CLI](pgatk-cli.md) | Full command-line reference for all tools | +| [Use Cases](use-cases.md) | End-to-end workflows and recipes for common scenarios | | [File Formats](formats.md) | BED, GTF, GCT format specifications | | [Changelog](changelog.md) | Version history and release notes | | [Support](support.md) | Contributors and how to get help | diff --git a/docs/pgatk-cli.md b/docs/pgatk-cli.md index 2bda0be0..f36971ac 100644 --- a/docs/pgatk-cli.md +++ b/docs/pgatk-cli.md @@ -15,12 +15,16 @@ Options: Commands: cbioportal-downloader Command to download the the cbioportal studies cbioportal-to-proteindb Command to translate cbioportal mutation data into proteindb + clinvar-to-proteindb Generate protein database from ClinVar VCF + RefSeq GTF cosmic-downloader Command to download the cosmic mutation database cosmic-to-proteindb Command to translate Cosmic mutation data into proteindb + digest-mutant-protein Digest 
mutant proteins and filter against canonical proteome dnaseq-to-proteindb Generate peptides based on DNA sequences ensembl-check Command to check ensembl database for stop codons, gaps ensembl-downloader Command to download the ensembl information generate-decoy Create decoy protein sequences using multiple methods + map-peptide2genome Map peptides to genomic coordinates (GFF3 output) + ncbi-downloader Download NCBI RefSeq and ClinVar reference files threeframe-translation Command to perform 3'frame translation vcf-to-proteindb Generate peptides based on DNA variants VCF files ``` @@ -171,6 +175,44 @@ git lfs pull -I public --include "data_clinical_sample.txt" git lfs pull -I public --include "data_mutations_mskcc.txt" ``` +### Downloading NCBI / ClinVar Data + +Downloading NCBI RefSeq annotations and ClinVar variants for human (GRCh38) is performed using the command `ncbi-downloader`. The tool downloads four files: + +- RefSeq gene annotations (GTF) +- RefSeq transcript nucleotide sequences (FASTA) +- Assembly report (chromosome name mapping) +- ClinVar variant calls (VCF) + +#### Command Options + +```bash +$ pgatk ncbi-downloader -h +Usage: pgatk ncbi-downloader [OPTIONS] + + Required parameters: + -o, --output-dir TEXT Output directory for downloaded files + + Optional parameters: + -c, --config_file TEXT Configuration YAML file + --force Re-download files even if they exist + -h, --help Show this message and exit. +``` + +#### Examples + +- Download all NCBI/ClinVar reference files: + + ```bash + pgatk ncbi-downloader -o ncbi_data + ``` + +- Force re-download of all files: + + ```bash + pgatk ncbi-downloader -o ncbi_data --force + ``` + ## Generate Protein Databases The **pgatk** framework provides a set of tools to generate protein databases in `FASTA` format from DNA sequences, variants, and mutations. Multiple commands are available depending on the data type provided by the user and the public data providers (cBioPortal, COSMIC and ENSEMBL). 
@@ -355,6 +397,43 @@ The output of the tool is a protein fasta file written to the path specified by --output_proteindb var_peptides.fa ``` +### ClinVar Variants to Protein Sequences + +[ClinVar](https://www.ncbi.nlm.nih.gov/clinvar/) is NCBI's public archive of reports on the relationships among human variations and phenotypes. The command `clinvar-to-proteindb` generates a variant protein database from ClinVar VCF files using NCBI RefSeq gene models. Unlike `vcf-to-proteindb`, this command does **not** require VEP annotations. + +#### Command Options + +```bash +$ pgatk clinvar-to-proteindb -h +Usage: pgatk clinvar-to-proteindb [OPTIONS] + + Required parameters: + -v, --vcf TEXT ClinVar VCF file path + -g, --gtf TEXT NCBI RefSeq GTF file path + -f, --fasta TEXT RefSeq transcript nucleotide FASTA file path + -a, --assembly-report TEXT NCBI assembly report file path + -o, --output TEXT Output protein FASTA file path + + Optional parameters: + -c, --config_file TEXT Configuration YAML file + -h, --help Show this message and exit. +``` + +The input files can be downloaded using the [ncbi-downloader](#downloading-ncbi--clinvar-data) command. + +#### Examples + +- Generate a protein database from ClinVar variants: + + ```bash + pgatk clinvar-to-proteindb \ + --vcf ncbi_data/clinvar.vcf.gz \ + --gtf ncbi_data/GRCh38_latest_genomic.gtf.gz \ + --fasta ncbi_data/GRCh38_latest_rna.fna.gz \ + --assembly-report ncbi_data/GRCh38_latest_assembly_report.txt \ + --output clinvar_proteins.fa + ``` + ### Transcripts (DNA) to Protein Sequences DNA sequences given in a FASTA format can be translated using the `dnaseq-to-proteindb` tool. This tool allows for translation of all kinds of transcripts (coding and noncoding) by specifying the desired biotypes. 
@@ -489,3 +568,87 @@ Usage: pgatk generate-decoy [OPTIONS] ```bash pgatk generate-decoy -c config/protein_decoy.yaml --input proteindb.fa --output decoy_proteindb.fa ``` + +## Post-Processing Utilities + +### Digest and Filter Variant Peptides + +The `digest-mutant-protein` command performs *in silico* tryptic digestion of mutant proteins and filters out any peptides that also appear in a canonical reference proteome. The result is a compact FASTA of variant-specific peptides only. + +#### Command Options + +```bash +$ pgatk digest-mutant-protein -h +Usage: pgatk digest-mutant-protein [OPTIONS] + + Required parameters: + -i, --input TEXT Input mutant protein FASTA file(s), comma-separated + -f, --fasta TEXT Reference canonical protein FASTA + -o, --output TEXT Output file for unique variant peptides + + Optional parameters: + --prefix TEXT Header prefix for output entries (default: Mutation) + --min-len INTEGER Minimum peptide length (default: 7) + --max-len INTEGER Maximum peptide length (default: 40) + --missed-cleavages INTEGER Number of missed cleavages (default: 0) + -h, --help Show this message and exit. +``` + +#### Examples + +- Digest variant proteins and keep only variant-specific peptides: + + ```bash + pgatk digest-mutant-protein \ + --input variant_proteins.fa \ + --fasta canonical_proteins.fa \ + --output unique_variant_peptides.fa \ + --min-len 7 \ + --max-len 40 \ + --missed-cleavages 2 + ``` + +- Combine and filter multiple variant sources: + + ```bash + pgatk digest-mutant-protein \ + --input cosmic_proteins.fa,clinvar_proteins.fa,ensembl_variants.fa \ + --fasta canonical_proteins.fa \ + --output combined_variant_peptides.fa + ``` + +### Map Peptides to Genomic Coordinates + +The `map-peptide2genome` command maps proteomics-identified peptides to genomic coordinates and produces a GFF3 file suitable for visualization in genome browsers (UCSC, IGV, Ensembl). 
+ +#### Command Options + +```bash +$ pgatk map-peptide2genome -h +Usage: pgatk map-peptide2genome [OPTIONS] + + Required parameters: + -i, --input TEXT Input peptide identification TSV + -g, --gtf TEXT GTF gene annotation file + -f, --fasta TEXT Protein FASTA file + -m, --idmap TEXT Protein-to-transcript ID mapping file + -o, --output TEXT Output GFF3 file + + Optional parameters: + --pep-col INTEGER Peptide column index, 0-based (default: 0) + --prot-col INTEGER Protein column index, 0-based (default: 1) + -h, --help Show this message and exit. +``` + +#### Examples + +- Map identified peptides to genomic coordinates: + + ```bash + pgatk map-peptide2genome \ + --input peptide_identifications.tsv \ + --gtf genes.gtf \ + --fasta proteins.fa \ + --idmap protein_to_transcript.tsv \ + --output peptides.gff3 + ``` diff --git a/docs/use-cases.md b/docs/use-cases.md new file mode 100644 index 00000000..2622a975 --- /dev/null +++ b/docs/use-cases.md @@ -0,0 +1,619 @@ +# Use Cases and Recipes + +This page provides end-to-end workflows for common proteogenomics scenarios. +Each recipe shows the exact commands to run, from downloading data to producing +a search-ready protein database. + +--- + +## 1. Human Variant Protein Database from ENSEMBL + +Build a variant protein database using ENSEMBL population variants (common SNPs +and indels) for human proteogenomics searches. + +### Step 1 -- Download ENSEMBL data + +Download the GTF, CDS, and VCF files for *Homo sapiens* (taxonomy 9606): + +```bash +pgatk ensembl-downloader \ + -t 9606 \ + -o ensembl_human \ + --skip_protein \ + --skip_ncrna \ + --skip_cdna \ + --skip_dna +``` + +This downloads the gene annotation GTF and the VCF file with known variants. 
+ +### Step 2 -- Generate transcript sequences + +Use [gffread](http://ccb.jhu.edu/software/stringtie/gff.shtml#gffread) to +extract transcript sequences from the GTF and genome FASTA: + +```bash +gffread -F -w ensembl_human/transcripts.fa \ + -g ensembl_human/genome.fa \ + ensembl_human/Homo_sapiens.GRCh38.*.gtf.gz +``` + +### Step 3 -- Generate the variant protein database + +Translate all ENSEMBL variants that affect protein-coding transcripts: + +```bash +pgatk vcf-to-proteindb \ + --vcf ensembl_human/homo_sapiens_incl_consequences.vcf.gz \ + --input_fasta ensembl_human/transcripts.fa \ + --gene_annotations_gtf ensembl_human/Homo_sapiens.GRCh38.*.gtf.gz \ + --output_proteindb ensembl_human/variant_proteins.fa +``` + +### Step 4 -- Generate the canonical protein database + +Translate canonical protein-coding transcripts: + +```bash +pgatk dnaseq-to-proteindb \ + --input_fasta ensembl_human/transcripts.fa \ + --output_proteindb ensembl_human/canonical_proteins.fa +``` + +### Step 5 -- Combine and add decoy sequences + +Merge canonical and variant databases, then generate decoys: + +```bash +cat ensembl_human/canonical_proteins.fa \ + ensembl_human/variant_proteins.fa \ + > ensembl_human/target.fa + +pgatk generate-decoy \ + --input ensembl_human/target.fa \ + --output ensembl_human/target_decoy.fa \ + --method decoypyrat \ + --decoy_prefix DECOY_ +``` + +The file `target_decoy.fa` is ready for database searching. + +--- + +## 2. High-Frequency Variant Database (AF Filtering) + +Build a protein database that only includes common population variants above a +given allele frequency threshold. + +```bash +pgatk vcf-to-proteindb \ + --vcf homo_sapiens_incl_consequences.vcf.gz \ + --input_fasta transcripts.fa \ + --gene_annotations_gtf genes.gtf \ + --af_field MAF \ + --af_threshold 0.01 \ + --output_proteindb common_variant_proteins.fa +``` + +To restrict to specific consequence types (e.g. 
only missense variants): + +```bash +pgatk vcf-to-proteindb \ + --vcf homo_sapiens_incl_consequences.vcf.gz \ + --input_fasta transcripts.fa \ + --gene_annotations_gtf genes.gtf \ + --af_field MAF \ + --af_threshold 0.05 \ + --include_consequences missense_variant \ + --output_proteindb missense_common_proteins.fa +``` + +--- + +## 3. ClinVar Clinical Variant Database + +Build a protein database from ClinVar pathogenic and likely pathogenic variants +using the NCBI RefSeq gene models. + +### Step 1 -- Download NCBI / ClinVar files + +```bash +pgatk ncbi-downloader -o ncbi_data +``` + +This downloads four files to `ncbi_data/`: + +- `GRCh38_latest_genomic.gtf.gz` -- RefSeq gene annotations +- `GRCh38_latest_rna.fna.gz` -- RefSeq transcript nucleotide sequences +- `GRCh38_latest_assembly_report.txt` -- Chromosome name mapping +- `clinvar.vcf.gz` -- ClinVar variant calls + +### Step 2 -- Generate the ClinVar protein database + +```bash +pgatk clinvar-to-proteindb \ + --vcf ncbi_data/clinvar.vcf.gz \ + --gtf ncbi_data/GRCh38_latest_genomic.gtf.gz \ + --fasta ncbi_data/GRCh38_latest_rna.fna.gz \ + --assembly-report ncbi_data/GRCh38_latest_assembly_report.txt \ + --output clinvar_proteins.fa +``` + +### Step 3 -- Add decoy sequences + +```bash +pgatk generate-decoy \ + --input clinvar_proteins.fa \ + --output clinvar_target_decoy.fa \ + --method decoypyrat +``` + +!!! tip + The ClinVar pipeline does **not** require VEP annotations. It uses + BedTools interval overlap to find transcripts affected by each variant, + then applies the variant to the transcript sequence and translates it. + +--- + +## 4. Cancer-Specific Database from COSMIC + +Generate tissue-specific protein databases from COSMIC somatic mutations. + +### Step 1 -- Download COSMIC data + +```bash +pgatk cosmic-downloader \ + -u your_email@example.com \ + -p your_password \ + -o cosmic_data +``` + +!!! note + A COSMIC account is required. 
Register at + [https://cancer.sanger.ac.uk/cosmic/register](https://cancer.sanger.ac.uk/cosmic/register). + +### Step 2a -- Single database from all mutations + +```bash +pgatk cosmic-to-proteindb \ + --input_mutation cosmic_data/CosmicMutantExport.tsv.gz \ + --input_genes cosmic_data/All_COSMIC_Genes.fasta.gz \ + --output_db cosmic_all_proteins.fa +``` + +### Step 2b -- One database per cancer type + +Split mutations by primary tissue site, creating one FASTA per cancer type: + +```bash +pgatk cosmic-to-proteindb \ + --input_mutation cosmic_data/CosmicMutantExport.tsv.gz \ + --input_genes cosmic_data/All_COSMIC_Genes.fasta.gz \ + --output_db cosmic_proteins.fa \ + --split_by_filter_column +``` + +This produces files named `cosmic_proteins_.fa` for each tissue type +(e.g. `cosmic_proteins_lung.fa`, `cosmic_proteins_breast.fa`). + +### Step 2c -- Database for a specific cancer type + +Limit to a single tissue type: + +```bash +pgatk cosmic-to-proteindb \ + --input_mutation cosmic_data/CosmicMutantExport.tsv.gz \ + --input_genes cosmic_data/All_COSMIC_Genes.fasta.gz \ + --output_db cosmic_lung_proteins.fa \ + --accepted_values "lung" +``` + +### Step 2d -- Cell-line specific databases + +Use cell-line data split by sample name: + +```bash +pgatk cosmic-to-proteindb \ + --input_mutation cosmic_data/CosmicCLP_MutantExport.tsv.gz \ + --input_genes cosmic_data/All_CellLines_Genes.fasta.gz \ + --output_db cosmic_cellline_proteins.fa \ + --filter_column "Sample name" \ + --split_by_filter_column +``` + +--- + +## 5. Cancer Study from cBioPortal + +Generate a protein database from cancer mutation data hosted in cBioPortal. 
+ +### Step 1 -- List available studies + +```bash +pgatk cbioportal-downloader --list_studies +``` + +### Step 2 -- Download a specific study + +```bash +pgatk cbioportal-downloader \ + -d blca_mskcc_solit_2014 \ + -o cbioportal_data +``` + +### Step 3 -- Download ENSEMBL CDS (hg19 assembly) + +cBioPortal mutations are aligned to hg19 (GRCh37): + +```bash +pgatk ensembl-downloader \ + -t 9606 \ + --grch37 \ + -o ensembl_hg19 \ + --skip_vcf \ + --skip_gtf \ + --skip_protein \ + --skip_ncrna \ + --skip_cdna \ + --skip_dna +``` + +### Step 4 -- Translate mutations to protein sequences + +```bash +pgatk cbioportal-to-proteindb \ + --input_mutation cbioportal_data/data_mutations_mskcc.txt \ + --input_cds ensembl_hg19/Homo_sapiens.GRCh37.cds.all.fa.gz \ + --output_db bladder_cancer_proteins.fa +``` + +To generate tissue-type-specific databases using the clinical sample file: + +```bash +pgatk cbioportal-to-proteindb \ + --input_mutation cbioportal_data/data_mutations_mskcc.txt \ + --input_cds ensembl_hg19/Homo_sapiens.GRCh37.cds.all.fa.gz \ + --clinical_sample_file cbioportal_data/data_clinical_sample.txt \ + --output_db cbioportal_proteins.fa \ + --split_by_filter_column +``` + +--- + +## 6. Non-Coding RNA Protein Database + +Translate non-coding RNA transcripts to search for novel peptides from +supposedly non-coding regions. 
+ +### lincRNA database + +```bash +pgatk dnaseq-to-proteindb \ + --input_fasta transcripts.fa \ + --output_proteindb lincRNA_proteins.fa \ + --var_prefix lincRNA_ \ + --include_biotypes lincRNA \ + --num_orfs 3 \ + --skip_including_all_cds +``` + +### Pseudogene database + +```bash +pgatk dnaseq-to-proteindb \ + --input_fasta transcripts.fa \ + --output_proteindb pseudogene_proteins.fa \ + --var_prefix pseudogene_ \ + --include_biotypes processed_pseudogene,transcribed_processed_pseudogene,translated_processed_pseudogene \ + --num_orfs 3 \ + --skip_including_all_cds +``` + +### Alternative ORFs from protein-coding transcripts + +Translate non-canonical reading frames of protein-coding genes: + +```bash +pgatk dnaseq-to-proteindb \ + --input_fasta transcripts.fa \ + --output_proteindb altorf_proteins.fa \ + --var_prefix altorf_ \ + --include_biotypes altORFs \ + --skip_including_all_cds +``` + +--- + +## 7. Six-Frame Genome Translation + +Perform a full six-frame translation of a genome assembly. This is useful as +an unbiased search space but produces very large databases. + +```bash +pgatk dnaseq-to-proteindb \ + --input_fasta genome.fa \ + --output_proteindb genome_6frame.fa \ + --biotype_str '' \ + --num_orfs 3 \ + --num_orfs_complement 3 +``` + +!!! warning + Six-frame translation of a full mammalian genome produces a very large + database. Consider translating individual chromosomes or using it in + combination with a smaller targeted database search. + +--- + +## 8. gnomAD Population Variant Database + +Build a protein database from gnomAD variants using GENCODE annotations. 
+ +```bash +pgatk vcf-to-proteindb \ + --vcf gnomad_genome.vcf.gz \ + --input_fasta gencode_transcripts.fa \ + --gene_annotations_gtf gencode.v38.annotation.gtf.gz \ + --annotation_field_name vep \ + --af_field controls_AF \ + --af_threshold 0.01 \ + --include_consequences missense_variant,inframe_insertion,inframe_deletion,frameshift_variant \ + --biotype_str transcript_type \ + --output_proteindb gnomad_proteins.fa +``` + +!!! tip "gnomAD-specific parameters" + - `--annotation_field_name vep` -- gnomAD uses `vep` instead of `CSQ` + - `--af_field controls_AF` -- Use control allele frequency + - `--biotype_str transcript_type` -- GENCODE uses `transcript_type` instead + of ENSEMBL's `transcript_biotype` + +--- + +## 9. Sample-Specific Database from WGS/WES + +Translate variants from a whole-genome or whole-exome sequencing VCF of an +individual sample. These VCFs typically lack VEP annotations. + +```bash +pgatk vcf-to-proteindb \ + --vcf sample_variants.vcf \ + --input_fasta transcripts.fa \ + --gene_annotations_gtf genes.gtf \ + --annotation_field_name '' \ + --ignore_filters \ + --output_proteindb sample_proteins.fa +``` + +- `--annotation_field_name ''` tells the tool to skip parsing VEP/CSQ + annotations (not present in raw variant callers like GATK HaplotypeCaller) +- `--ignore_filters` includes all variants regardless of the FILTER column + +To include only variants that passed quality filtering: + +```bash +pgatk vcf-to-proteindb \ + --vcf sample_variants.vcf \ + --input_fasta transcripts.fa \ + --gene_annotations_gtf genes.gtf \ + --annotation_field_name '' \ + --accepted_filters PASS \ + --output_proteindb sample_proteins_pass.fa +``` + +--- + +## 10. Digest and Filter Variant Peptides + +After generating a variant protein database you may want to digest the proteins +*in silico* and keep only peptides that differ from the canonical proteome. +This produces a compact FASTA of variant-specific peptides. 
+ +```bash +pgatk digest-mutant-protein \ + --input variant_proteins.fa \ + --fasta canonical_proteins.fa \ + --output unique_variant_peptides.fa \ + --min-len 7 \ + --max-len 40 \ + --missed-cleavages 2 +``` + +To combine multiple variant sources (e.g. COSMIC + ClinVar): + +```bash +pgatk digest-mutant-protein \ + --input cosmic_proteins.fa,clinvar_proteins.fa \ + --fasta canonical_proteins.fa \ + --output combined_variant_peptides.fa +``` + +--- + +## 11. Map Identified Peptides to Genomic Coordinates + +After a proteomics search, map the identified peptides back to genomic +coordinates for visualization in genome browsers (e.g. UCSC, IGV). + +```bash +pgatk map-peptide2genome \ + --input peptide_identifications.tsv \ + --gtf genes.gtf \ + --fasta proteins.fa \ + --idmap protein_to_transcript.tsv \ + --output peptides.gff3 +``` + +The input TSV should contain at least peptide sequence and protein accession +columns (configurable with `--pep-col` and `--prot-col`). + +--- + +## 12. Complete Proteogenomics Workflow + +A full end-to-end workflow combining canonical proteins, population variants, +clinical variants, cancer mutations, and non-coding translations into a single +search-ready database. 
+ +### Step 1 -- Download all data sources + +```bash +# ENSEMBL (canonical + population variants) +pgatk ensembl-downloader -t 9606 -o ensembl_data + +# NCBI / ClinVar +pgatk ncbi-downloader -o ncbi_data + +# COSMIC (requires account) +pgatk cosmic-downloader -u user@example.com -p password -o cosmic_data +``` + +### Step 2 -- Prepare transcript sequences + +```bash +gffread -F -w ensembl_data/transcripts.fa \ + -g ensembl_data/genome.fa \ + ensembl_data/Homo_sapiens.GRCh38.*.gtf.gz +``` + +### Step 3 -- Generate all protein databases + +```bash +# Canonical proteins +pgatk dnaseq-to-proteindb \ + --input_fasta ensembl_data/transcripts.fa \ + --output_proteindb canonical.fa + +# ENSEMBL population variants +pgatk vcf-to-proteindb \ + --vcf ensembl_data/homo_sapiens_incl_consequences.vcf.gz \ + --input_fasta ensembl_data/transcripts.fa \ + --gene_annotations_gtf ensembl_data/Homo_sapiens.GRCh38.*.gtf.gz \ + --output_proteindb ensembl_variants.fa + +# ClinVar clinical variants +pgatk clinvar-to-proteindb \ + --vcf ncbi_data/clinvar.vcf.gz \ + --gtf ncbi_data/GRCh38_latest_genomic.gtf.gz \ + --fasta ncbi_data/GRCh38_latest_rna.fna.gz \ + --assembly-report ncbi_data/GRCh38_latest_assembly_report.txt \ + --output clinvar_variants.fa + +# COSMIC somatic mutations +pgatk cosmic-to-proteindb \ + --input_mutation cosmic_data/CosmicMutantExport.tsv.gz \ + --input_genes cosmic_data/All_COSMIC_Genes.fasta.gz \ + --output_db cosmic_variants.fa + +# Non-coding RNA (lincRNA) +pgatk dnaseq-to-proteindb \ + --input_fasta ensembl_data/transcripts.fa \ + --output_proteindb lincRNA.fa \ + --var_prefix lincRNA_ \ + --include_biotypes lincRNA \ + --num_orfs 3 \ + --skip_including_all_cds +``` + +### Step 4 -- Combine all databases + +```bash +cat canonical.fa \ + ensembl_variants.fa \ + clinvar_variants.fa \ + cosmic_variants.fa \ + lincRNA.fa \ + > combined_target.fa +``` + +### Step 5 -- Add decoy sequences + +```bash +pgatk generate-decoy \ + --input combined_target.fa \ + --output 
proteogenomics_target_decoy.fa \ + --method decoypyrat \ + --decoy_prefix DECOY_ +``` + +### Step 6 (optional) -- Extract unique variant peptides + +```bash +pgatk digest-mutant-protein \ + --input ensembl_variants.fa,clinvar_variants.fa,cosmic_variants.fa \ + --fasta canonical.fa \ + --output unique_variant_peptides.fa \ + --min-len 7 \ + --max-len 40 \ + --missed-cleavages 2 +``` + +The file `proteogenomics_target_decoy.fa` is a comprehensive search database, +and `unique_variant_peptides.fa` provides a focused list of variant-specific +peptides for validation. + +--- + +## 13. Non-Human Species + +pgatk supports any species available in ENSEMBL. Here is an example for mouse +(*Mus musculus*, taxonomy 10090): + +```bash +# Download +pgatk ensembl-downloader -t 10090 -o ensembl_mouse --skip_dna + +# Generate transcript sequences +gffread -F -w ensembl_mouse/transcripts.fa \ + -g ensembl_mouse/genome.fa \ + ensembl_mouse/Mus_musculus.GRCm39.*.gtf.gz + +# Canonical proteins +pgatk dnaseq-to-proteindb \ + --input_fasta ensembl_mouse/transcripts.fa \ + --output_proteindb mouse_canonical.fa + +# Variant proteins +pgatk vcf-to-proteindb \ + --vcf ensembl_mouse/mus_musculus_incl_consequences.vcf.gz \ + --input_fasta ensembl_mouse/transcripts.fa \ + --gene_annotations_gtf ensembl_mouse/Mus_musculus.GRCm39.*.gtf.gz \ + --output_proteindb mouse_variants.fa + +# Combine and generate decoy +cat mouse_canonical.fa mouse_variants.fa > mouse_target.fa +pgatk generate-decoy \ + --input mouse_target.fa \ + --output mouse_target_decoy.fa \ + --method decoypyrat +``` + +--- + +## 14. Three-Frame Translation + +Perform a simple three-frame translation of any nucleotide FASTA file: + +```bash +pgatk threeframe-translation \ + --input_fasta input_sequences.fa \ + --output translated_proteins.fa +``` + +--- + +## 15. 
Quality Check: Validate Protein Database + +Before using a protein database, check it for internal stop codons and short +sequences: + +```bash +pgatk ensembl-check \ + --input_fasta protein_database.fa \ + --output validated_database.fa \ + --num_aa 6 +``` + +This filters out sequences shorter than 6 amino acids. Use `--add_stop_codons` +to include proteins that contain internal stop codons (useful for +proteogenomics where frameshifts may introduce premature stops). diff --git a/mkdocs.yml b/mkdocs.yml index aa2dab95..13bee389 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -95,6 +95,7 @@ nav: - Introduction: introduction.md - Installation: installation.md - pgatk CLI: pgatk-cli.md + - Use Cases: use-cases.md - File Formats: formats.md - Changelog: changelog.md - Support: support.md From 78fecd885f4c747a42822943d2f08e1d47b28129 Mon Sep 17 00:00:00 2001 From: Yasset Perez-Riverol Date: Tue, 3 Mar 2026 19:24:55 +0000 Subject: [PATCH 3/3] improve docs --- docs/use-cases.md | 849 +++++++++++++++++++++++++++++++++++----------- 1 file changed, 654 insertions(+), 195 deletions(-) diff --git a/docs/use-cases.md b/docs/use-cases.md index 2622a975..5af6de50 100644 --- a/docs/use-cases.md +++ b/docs/use-cases.md @@ -6,10 +6,172 @@ a search-ready protein database. --- +## Cell-Type Specific Non-Canonical Peptide Discovery + +!!! abstract "Featured workflow" + This workflow reproduces the analysis from [Umer et al., *Bioinformatics* 2022](https://doi.org/10.1093/bioinformatics/btab838), + where cell-type specific databases were generated for 65 cell lines, + revealing non-canonical and cryptic peptides amounting to **>5% of total + peptide identifications**. + +The key insight is to build **cell-type specific** proteogenomics databases +that combine canonical proteins with non-canonical translations (pseudogenes, +lncRNAs, alternative ORFs) and variant sequences from multiple genomic sources. 
+Searching with a database tailored to the cell type of interest maximizes +discovery while keeping the search space focused. + +### Step 1 -- Download ENSEMBL data + +```bash +pgatk ensembl-downloader \ + -t 9606 \ + -o ensembl_human +``` + +### Step 2 -- Generate transcript sequences from GTF + +```bash +gffread -F -w ensembl_human/transcripts.fa \ + -g ensembl_human/genome.fa \ + ensembl_human/Homo_sapiens.GRCh38.*.gtf.gz +``` + +### Step 3 -- Canonical protein-coding sequences + +Translate all protein-coding transcripts using their annotated CDS: + +```bash +pgatk dnaseq-to-proteindb \ + --input_fasta ensembl_human/transcripts.fa \ + --output_proteindb canonical.fa +``` + +### Step 4 -- Non-canonical translations + +#### Pseudogenes (three-frame) + +Pseudogenes were traditionally considered non-functional, but some produce +detectable peptides. Translate them in three reading frames: + +```bash +pgatk dnaseq-to-proteindb \ + --input_fasta ensembl_human/transcripts.fa \ + --output_proteindb pseudogene.fa \ + --var_prefix pseudo_ \ + --include_biotypes processed_pseudogene,unprocessed_pseudogene,transcribed_processed_pseudogene,transcribed_unprocessed_pseudogene,translated_processed_pseudogene \ + --num_orfs 3 \ + --skip_including_all_cds +``` + +#### lncRNAs (three-frame) + +Long non-coding RNAs can harbor small open reading frames encoding +micropeptides: + +```bash +pgatk dnaseq-to-proteindb \ + --input_fasta ensembl_human/transcripts.fa \ + --output_proteindb lncrna.fa \ + --var_prefix lncrna_ \ + --include_biotypes lincRNA,antisense,sense_intronic,sense_overlapping \ + --num_orfs 3 \ + --skip_including_all_cds +``` + +#### Alternative ORFs (exonic out-of-frame translation) + +Non-canonical reading frames of protein-coding mRNAs can produce cryptic +peptides: + +```bash +pgatk dnaseq-to-proteindb \ + --input_fasta ensembl_human/transcripts.fa \ + --output_proteindb altorf.fa \ + --var_prefix altorf_ \ + --include_biotypes altORFs \ + --skip_including_all_cds 
+``` + +### Step 5 -- Population variant proteins + +Include common human variants from ENSEMBL: + +```bash +pgatk vcf-to-proteindb \ + --vcf ensembl_human/homo_sapiens_incl_consequences.vcf.gz \ + --input_fasta ensembl_human/transcripts.fa \ + --gene_annotations_gtf ensembl_human/Homo_sapiens.GRCh38.*.gtf.gz \ + --output_proteindb ensembl_variants.fa +``` + +### Step 6 -- COSMIC somatic mutations (optional, for cancer cell lines) + +For cancer cell-line studies, add cell-line-specific somatic mutations: + +```bash +pgatk cosmic-downloader \ + -u your_email@example.com \ + -p your_password \ + -o cosmic_data + +pgatk cosmic-to-proteindb \ + --input_mutation cosmic_data/CosmicCLP_MutantExport.tsv.gz \ + --input_genes cosmic_data/All_CellLines_Genes.fasta.gz \ + --output_db cosmic_cellline.fa \ + --filter_column "Sample name" \ + --split_by_filter_column +``` + +### Step 7 -- Combine and generate target-decoy database + +```bash +# Combine all components +# For a specific cell line, use the matching COSMIC file if available +cat canonical.fa \ + pseudogene.fa \ + lncrna.fa \ + altorf.fa \ + ensembl_variants.fa \ + > cell_type_target.fa + +# Generate decoy sequences +pgatk generate-decoy \ + --input cell_type_target.fa \ + --output cell_type_target_decoy.fa \ + --method decoypyrat \ + --decoy_prefix DECOY_ +``` + +### Step 8 -- Extract non-canonical unique peptides + +After database searching with your search engine, you can also pre-compute the +set of non-canonical peptides unique to these novel sources: + +```bash +pgatk digest-mutant-protein \ + --input pseudogene.fa,lncrna.fa,altorf.fa,ensembl_variants.fa \ + --fasta canonical.fa \ + --output non_canonical_peptides.fa \ + --min-len 7 \ + --max-len 40 \ + --missed-cleavages 2 +``` + +!!! tip "Expected results" + In the original study, this workflow applied to 65 cell lines across six + PRIDE datasets identified non-canonical peptides amounting to >5% of + total identifications. 
The most common non-canonical sources were + pseudogenes and lncRNAs, followed by alternative ORFs and variant + peptides. + +--- + ## 1. Human Variant Protein Database from ENSEMBL Build a variant protein database using ENSEMBL population variants (common SNPs -and indels) for human proteogenomics searches. +and indels) for human proteogenomics searches. This is the most common starting +point for proteogenomics experiments -- augmenting the canonical proteome with +known variant peptides that would otherwise be missed. ### Step 1 -- Download ENSEMBL data @@ -80,10 +242,16 @@ The file `target_decoy.fa` is ready for database searching. --- -## 2. High-Frequency Variant Database (AF Filtering) +## 2. Population-Specific Variant Database + +Population-level genetic variants cause amino acid changes that are invisible +to standard reference database searches. Studies have shown that incorporating +common missense variants can increase peptide identifications by 3--5% and is +especially important when analyzing samples from underrepresented ancestries. -Build a protein database that only includes common population variants above a -given allele frequency threshold. +### Common variants above a frequency threshold + +Include only variants present in at least 1% of the population: ```bash pgatk vcf-to-proteindb \ @@ -95,7 +263,10 @@ pgatk vcf-to-proteindb \ --output_proteindb common_variant_proteins.fa ``` -To restrict to specific consequence types (e.g. only missense variants): +### Missense-only variant database + +Restrict to single amino acid variants (SAAVs), the most common variant type +detectable by mass spectrometry: ```bash pgatk vcf-to-proteindb \ @@ -108,12 +279,42 @@ pgatk vcf-to-proteindb \ --output_proteindb missense_common_proteins.fa ``` +### gnomAD ancestry-stratified database + +gnomAD provides allele frequencies stratified by ancestry (African, East Asian, +South Asian, European, Latino, etc.). 
Build a database using variants common in +a specific population: + +```bash +pgatk vcf-to-proteindb \ + --vcf gnomad.exomes.v4.1.sites.vcf.bgz \ + --input_fasta gencode_transcripts.fa \ + --gene_annotations_gtf gencode.v44.annotation.gtf.gz \ + --annotation_field_name vep \ + --af_field AF_afr \ + --af_threshold 0.01 \ + --include_consequences missense_variant,inframe_insertion,inframe_deletion \ + --biotype_str transcript_type \ + --output_proteindb gnomad_afr_proteins.fa +``` + +!!! tip "gnomAD-specific parameters" + - `--annotation_field_name vep` -- gnomAD uses `vep` instead of `CSQ` + - `--af_field` -- Use population-specific AF fields: `AF_afr` (African), + `AF_eas` (East Asian), `AF_sas` (South Asian), `AF_nfe` (Non-Finnish European), + `AF_amr` (Latino), or `controls_AF` (all controls) + - `--biotype_str transcript_type` -- GENCODE uses `transcript_type` instead + of ENSEMBL's `transcript_biotype` + --- ## 3. ClinVar Clinical Variant Database -Build a protein database from ClinVar pathogenic and likely pathogenic variants -using the NCBI RefSeq gene models. +[ClinVar](https://www.ncbi.nlm.nih.gov/clinvar/) catalogs the relationship +between human variants and clinical phenotypes. Building a ClinVar-derived +protein database enables detection of clinically annotated variant peptides by +mass spectrometry -- useful for clinical proteomics, pharmacoproteomics, and +validating pathogenic variants at the protein level. ### Step 1 -- Download NCBI / ClinVar files @@ -155,37 +356,28 @@ pgatk generate-decoy \ --- -## 4. Cancer-Specific Database from COSMIC +## 4. Tumor-Specific Databases for Cancer Proteogenomics -Generate tissue-specific protein databases from COSMIC somatic mutations. +Cancer proteogenomics studies (such as those from CPTAC) build +tumor-specific protein databases to detect somatic mutant peptides, understand +therapy resistance, and prioritize neoantigen candidates. 
The databases combine +somatic mutations from cancer-specific sources (COSMIC, cBioPortal) and/or +patient-matched whole-exome sequencing. -### Step 1 -- Download COSMIC data +### 4a. COSMIC somatic mutations by cancer type + +Generate one protein database per primary tissue site. This is the standard +approach for large-scale cancer proteogenomics when patient-level WES is not +available: ```bash +# Download COSMIC data (requires account) pgatk cosmic-downloader \ -u your_email@example.com \ -p your_password \ -o cosmic_data -``` - -!!! note - A COSMIC account is required. Register at - [https://cancer.sanger.ac.uk/cosmic/register](https://cancer.sanger.ac.uk/cosmic/register). -### Step 2a -- Single database from all mutations - -```bash -pgatk cosmic-to-proteindb \ - --input_mutation cosmic_data/CosmicMutantExport.tsv.gz \ - --input_genes cosmic_data/All_COSMIC_Genes.fasta.gz \ - --output_db cosmic_all_proteins.fa -``` - -### Step 2b -- One database per cancer type - -Split mutations by primary tissue site, creating one FASTA per cancer type: - -```bash +# Generate per-tissue databases pgatk cosmic-to-proteindb \ --input_mutation cosmic_data/CosmicMutantExport.tsv.gz \ --input_genes cosmic_data/All_COSMIC_Genes.fasta.gz \ @@ -193,12 +385,12 @@ pgatk cosmic-to-proteindb \ --split_by_filter_column ``` -This produces files named `cosmic_proteins_.fa` for each tissue type -(e.g. `cosmic_proteins_lung.fa`, `cosmic_proteins_breast.fa`). +This produces files like `cosmic_proteins_lung.fa`, `cosmic_proteins_breast.fa`, +etc. Use the tissue-matched database for your cancer type of interest. -### Step 2c -- Database for a specific cancer type +### 4b. Single cancer type -Limit to a single tissue type: +Build a focused database for one cancer type (e.g. lung): ```bash pgatk cosmic-to-proteindb \ @@ -208,9 +400,10 @@ pgatk cosmic-to-proteindb \ --accepted_values "lung" ``` -### Step 2d -- Cell-line specific databases +### 4c. 
Cell-line proteogenomics -Use cell-line data split by sample name: +When analyzing cell-line proteomes, use cell-line-specific mutations. COSMIC +provides a dedicated cell-line export with mutations annotated per sample: ```bash pgatk cosmic-to-proteindb \ @@ -221,71 +414,133 @@ pgatk cosmic-to-proteindb \ --split_by_filter_column ``` ---- - -## 5. Cancer Study from cBioPortal - -Generate a protein database from cancer mutation data hosted in cBioPortal. +### 4d. cBioPortal study-specific database -### Step 1 -- List available studies +cBioPortal hosts mutation data from thousands of cancer genomics studies. +Generate a protein database from a specific study: ```bash +# List available studies pgatk cbioportal-downloader --list_studies + +# Download a study +pgatk cbioportal-downloader \ + -d brca_tcga_pan_can_atlas_2018 \ + -o cbioportal_data + +# Download ENSEMBL CDS (hg19 -- cBioPortal uses GRCh37) +pgatk ensembl-downloader \ + -t 9606 --grch37 -o ensembl_hg19 \ + --skip_vcf --skip_gtf --skip_protein \ + --skip_ncrna --skip_cdna --skip_dna + +# Translate mutations +pgatk cbioportal-to-proteindb \ + --input_mutation cbioportal_data/data_mutations_mskcc.txt \ + --input_cds ensembl_hg19/Homo_sapiens.GRCh37.cds.all.fa.gz \ + --output_db brca_tcga_proteins.fa ``` -### Step 2 -- Download a specific study +### 4e. Combined cancer database for immunopeptidomics + +For HLA immunopeptidomics / neoantigen discovery, a broad mutation database +maximizes the chance of detecting mutant HLA-presented peptides. Studies have +shown that COSMIC-derived databases can identify 5x more mutant immunopeptides +than patient WES alone. 
Combine COSMIC with ClinVar: ```bash -pgatk cbioportal-downloader \ - -d blca_mskcc_solit_2014 \ - -o cbioportal_data +# Generate COSMIC mutations +pgatk cosmic-to-proteindb \ + --input_mutation cosmic_data/CosmicMutantExport.tsv.gz \ + --input_genes cosmic_data/All_COSMIC_Genes.fasta.gz \ + --output_db cosmic_proteins.fa + +# Generate ClinVar mutations +pgatk clinvar-to-proteindb \ + --vcf ncbi_data/clinvar.vcf.gz \ + --gtf ncbi_data/GRCh38_latest_genomic.gtf.gz \ + --fasta ncbi_data/GRCh38_latest_rna.fna.gz \ + --assembly-report ncbi_data/GRCh38_latest_assembly_report.txt \ + --output clinvar_proteins.fa + +# Extract variant-unique peptides (undigested -- HLA peptides are not tryptic) +# For immunopeptidomics, use the full-length variant proteins directly: +cat cosmic_proteins.fa clinvar_proteins.fa > neoantigen_candidates.fa + +pgatk generate-decoy \ + --input neoantigen_candidates.fa \ + --output neoantigen_target_decoy.fa \ + --method decoypyrat ``` -### Step 3 -- Download ENSEMBL CDS (hg19 assembly) +--- + +## 5. Patient-Specific Database from WGS/WES + +When matched whole-genome or whole-exome sequencing data is available for +a sample, build a personalized protein database from the patient's own +variants. This is the gold standard for precision proteogenomics as used in +CPTAC studies. -cBioPortal mutations are aligned to hg19 (GRCh37): +### From a raw VCF (no VEP annotations) + +VCF files from variant callers (e.g. 
GATK HaplotypeCaller, Strelka, MuTect2) +typically lack VEP consequence annotations: ```bash -pgatk ensembl-downloader \ - -t 9606 \ - --grch37 \ - -o ensembl_hg19 \ - --skip_vcf \ - --skip_gtf \ - --skip_protein \ - --skip_ncrna \ - --skip_cdna \ - --skip_dna +pgatk vcf-to-proteindb \ + --vcf patient_somatic.vcf \ + --input_fasta transcripts.fa \ + --gene_annotations_gtf genes.gtf \ + --annotation_field_name '' \ + --ignore_filters \ + --output_proteindb patient_proteins.fa ``` -### Step 4 -- Translate mutations to protein sequences +- `--annotation_field_name ''` skips VEP/CSQ parsing +- `--ignore_filters` includes all called variants + +### From a VEP-annotated VCF + +If the VCF has been annotated with Ensembl VEP, you can filter by consequence +and transcript biotype: ```bash -pgatk cbioportal-to-proteindb \ - --input_mutation cbioportal_data/data_mutations_mskcc.txt \ - --input_cds ensembl_hg19/Homo_sapiens.GRCh37.cds.all.fa.gz \ - --output_db bladder_cancer_proteins.fa +pgatk vcf-to-proteindb \ + --vcf patient_somatic.vep.vcf \ + --input_fasta transcripts.fa \ + --gene_annotations_gtf genes.gtf \ + --include_consequences missense_variant,frameshift_variant,stop_gained,inframe_insertion,inframe_deletion \ + --output_proteindb patient_proteins.fa ``` -To generate tissue-type-specific databases using the clinical sample file: +### Only PASS-filtered variants + +Include only high-confidence variant calls: ```bash -pgatk cbioportal-to-proteindb \ - --input_mutation cbioportal_data/data_mutations_mskcc.txt \ - --input_cds ensembl_hg19/Homo_sapiens.GRCh37.cds.all.fa.gz \ - --clinical_sample_file cbioportal_data/data_clinical_sample.txt \ - --output_db cbioportal_proteins.fa \ - --split_by_filter_column +pgatk vcf-to-proteindb \ + --vcf patient_somatic.vcf \ + --input_fasta transcripts.fa \ + --gene_annotations_gtf genes.gtf \ + --annotation_field_name '' \ + --accepted_filters PASS \ + --output_proteindb patient_pass_proteins.fa ``` --- -## 6. 
Non-Coding RNA Protein Database +## 6. Novel ORF and Micropeptide Discovery + +Proteogenomics is a key approach for discovering novel coding regions: small +open reading frames (smORFs), micropeptides from lncRNAs, pseudogene-encoded +proteins, and alternative reading frames. Studies have found that non-canonical +peptides can account for over 5% of total identifications. -Translate non-coding RNA transcripts to search for novel peptides from -supposedly non-coding regions. +### lincRNA-derived proteins -### lincRNA database +Long intergenic non-coding RNAs (lincRNAs) can encode small proteins. Translate +them in three reading frames: ```bash pgatk dnaseq-to-proteindb \ @@ -297,7 +552,9 @@ pgatk dnaseq-to-proteindb \ --skip_including_all_cds ``` -### Pseudogene database +### Pseudogene-derived proteins + +Some pseudogenes are transcribed and may produce functional peptides: ```bash pgatk dnaseq-to-proteindb \ @@ -311,7 +568,8 @@ pgatk dnaseq-to-proteindb \ ### Alternative ORFs from protein-coding transcripts -Translate non-canonical reading frames of protein-coding genes: +Translate non-canonical reading frames of known protein-coding genes to +discover upstream ORFs (uORFs), overlapping ORFs, and downstream ORFs: ```bash pgatk dnaseq-to-proteindb \ @@ -322,12 +580,60 @@ pgatk dnaseq-to-proteindb \ --skip_including_all_cds ``` +### Antisense transcripts + +```bash +pgatk dnaseq-to-proteindb \ + --input_fasta transcripts.fa \ + --output_proteindb antisense_proteins.fa \ + --var_prefix antisense_ \ + --include_biotypes antisense,antisense_RNA \ + --num_orfs 3 \ + --skip_including_all_cds +``` + +### Combined non-canonical database + +Combine all non-canonical sources with the canonical proteome: + +```bash +cat canonical_proteins.fa \ + lincRNA_proteins.fa \ + pseudogene_proteins.fa \ + altorf_proteins.fa \ + antisense_proteins.fa \ + > novel_orf_target.fa + +pgatk generate-decoy \ + --input novel_orf_target.fa \ + --output novel_orf_target_decoy.fa \ + --method 
decoypyrat +``` + +!!! tip + After searching, use `digest-mutant-protein` to extract only the novel + peptides that do not appear in the canonical proteome: + + ```bash + pgatk digest-mutant-protein \ + --input lincRNA_proteins.fa,pseudogene_proteins.fa,altorf_proteins.fa \ + --fasta canonical_proteins.fa \ + --output novel_unique_peptides.fa + ``` + --- -## 7. Six-Frame Genome Translation +## 7. Genome Annotation Refinement + +Proteogenomics is used to validate and refine genome annotations by mapping +proteomics-identified peptides back to genomic coordinates. This has been +applied to improve annotations for organisms from wheat (validating 33,612 +gene models) to human (discovering novel splice junctions and exons). -Perform a full six-frame translation of a genome assembly. This is useful as -an unbiased search space but produces very large databases. +### Six-frame genome translation + +Translate the entire genome in all six reading frames to create an unbiased +search space: ```bash pgatk dnaseq-to-proteindb \ @@ -341,119 +647,284 @@ pgatk dnaseq-to-proteindb \ !!! warning Six-frame translation of a full mammalian genome produces a very large database. Consider translating individual chromosomes or using it in - combination with a smaller targeted database search. + combination with a smaller targeted database for a two-pass search strategy. ---- +### Three-frame transcriptome translation -## 8. gnomAD Population Variant Database +A more focused approach: translate all annotated transcripts in three reading +frames to capture alternative ORFs without the full genome search space: + +```bash +pgatk threeframe-translation \ + --input_fasta transcripts.fa \ + --output translated_transcripts.fa +``` -Build a protein database from gnomAD variants using GENCODE annotations. 
+### Map identified peptides back to the genome + +After database searching, map novel peptide identifications to genomic +coordinates for annotation and genome browser visualization: ```bash -pgatk vcf-to-proteindb \ - --vcf gnomad_genome.vcf.gz \ - --input_fasta gencode_transcripts.fa \ - --gene_annotations_gtf gencode.v38.annotation.gtf.gz \ - --annotation_field_name vep \ - --af_field controls_AF \ - --af_threshold 0.01 \ - --include_consequences missense_variant,inframe_insertion,inframe_deletion,frameshift_variant \ - --biotype_str transcript_type \ - --output_proteindb gnomad_proteins.fa +pgatk map-peptide2genome \ + --input novel_peptide_identifications.tsv \ + --gtf genes.gtf \ + --fasta proteins.fa \ + --idmap protein_to_transcript.tsv \ + --output novel_peptides.gff3 ``` -!!! tip "gnomAD-specific parameters" - - `--annotation_field_name vep` -- gnomAD uses `vep` instead of `CSQ` - - `--af_field controls_AF` -- Use control allele frequency - - `--biotype_str transcript_type` -- GENCODE uses `transcript_type` instead - of ENSEMBL's `transcript_biotype` +The GFF3 output can be loaded into UCSC Genome Browser, IGV, or Ensembl for +visual inspection alongside gene models, RNA-seq coverage, and conservation +tracks. --- -## 9. Sample-Specific Database from WGS/WES +## 8. Metaproteomics: Microbiome and Environmental Samples + +Metaproteomics characterizes the functional activity of microbial communities. +The central challenge is building the protein search database, since communities +can contain hundreds to thousands of species. When matched metagenomic +sequencing data is available, six-frame translation of assembled contigs +provides a sample-matched database. -Translate variants from a whole-genome or whole-exome sequencing VCF of an -individual sample. These VCFs typically lack VEP annotations. +### Six-frame translation of metagenome assembly + +After assembling metagenomic reads into contigs (e.g. 
with megahit or +metaSPAdes), translate them: + +```bash -pgatk vcf-to-proteindb \ - --vcf sample_variants.vcf \ - --input_fasta transcripts.fa \ - --gene_annotations_gtf genes.gtf \ - --annotation_field_name '' \ - --ignore_filters \ - --output_proteindb sample_proteins.fa +pgatk dnaseq-to-proteindb \ + --input_fasta metagenome_contigs.fa \ + --output_proteindb metagenome_6frame.fa \ + --biotype_str '' \ + --num_orfs 3 \ + --num_orfs_complement 3 \ + --protein_prefix meta_ ``` -- `--annotation_field_name ''` tells the tool to skip parsing VEP/CSQ - annotations (not present in raw variant callers like GATK HaplotypeCaller) -- `--ignore_filters` includes all variants regardless of the FILTER column +### Translation of predicted ORFs -To include only variants that passed quality filtering: +If gene prediction has been run on the metagenome assembly (e.g. Prodigal, +MetaGeneAnnotator), translate the predicted CDS regions: ```bash -pgatk vcf-to-proteindb \ - --vcf sample_variants.vcf \ - --input_fasta transcripts.fa \ - --gene_annotations_gtf genes.gtf \ - --annotation_field_name '' \ - --accepted_filters PASS \ - --output_proteindb sample_proteins_pass.fa +pgatk dnaseq-to-proteindb \ + --input_fasta predicted_genes.fna \ + --output_proteindb metagenome_proteins.fa \ + --biotype_str '' \ + --protein_prefix meta_ ``` +### Add decoy for metaproteomics search + +```bash +pgatk generate-decoy \ + --input metagenome_proteins.fa \ + --output metagenome_target_decoy.fa \ + --method decoypyrat +``` + +!!! note + For very large metaproteomics databases (millions of sequences), + consider using `--memory_save` with `generate-decoy` to reduce memory + usage, or splitting the database by contig/bin before decoy generation. + --- -## 10. Digest and Filter Variant Peptides +## 9. 
Long-Read Transcriptomics + Proteogenomics -After generating a variant protein database you may want to digest the proteins -*in silico* and keep only peptides that differ from the canonical proteome. -This produces a compact FASTA of variant-specific peptides. +Long-read RNA sequencing (PacBio Iso-Seq, Oxford Nanopore) captures full-length +transcript isoforms without assembly, enabling unambiguous protein isoform +prediction. Studies have shown that 25% of long-read-detected protein isoforms +may be absent from reference annotations. + +### Translate long-read transcript sequences + +After processing long reads through an isoform classification pipeline (e.g. +SQANTI3), translate the high-quality transcript sequences: + +```bash +# Translate all long-read transcripts, including novel isoforms +pgatk dnaseq-to-proteindb \ + --input_fasta long_read_transcripts.fa \ + --output_proteindb long_read_proteins.fa \ + --biotype_str '' \ + --protein_prefix lr_ +``` + +### Combine with canonical proteome + +```bash +cat canonical_proteins.fa long_read_proteins.fa > lr_target.fa + +pgatk generate-decoy \ + --input lr_target.fa \ + --output lr_target_decoy.fa \ + --method decoypyrat +``` + +### Extract novel isoform peptides + +After searching, identify peptides unique to long-read isoforms that are not +present in the reference: ```bash pgatk digest-mutant-protein \ - --input variant_proteins.fa \ + --input long_read_proteins.fa \ --fasta canonical_proteins.fa \ - --output unique_variant_peptides.fa \ + --output novel_isoform_peptides.fa \ --min-len 7 \ --max-len 40 \ --missed-cleavages 2 ``` -To combine multiple variant sources (e.g. COSMIC + ClinVar): +--- + +## 10. Plant and Non-Model Organism Proteogenomics + +For non-model organisms with draft or incomplete genome annotations, +proteogenomics improves gene model quality. 
This approach has been particularly +impactful for crop species -- for example, a wheat proteogenomics study +validated over 33,000 gene models and recommended promoting 3,700 +low-confidence genes. + +### Step 1 -- Download ENSEMBL data for the species + +pgatk supports any species available in ENSEMBL. For example, rice +(*Oryza sativa*, taxonomy 39947): ```bash -pgatk digest-mutant-protein \ - --input cosmic_proteins.fa,clinvar_proteins.fa \ - --fasta canonical_proteins.fa \ - --output combined_variant_peptides.fa +pgatk ensembl-downloader \ + -t 39947 \ + -o ensembl_rice \ + --skip_dna ``` ---- +### Step 2 -- Generate transcript sequences and canonical proteome + +```bash +gffread -F -w ensembl_rice/transcripts.fa \ + -g ensembl_rice/genome.fa \ + ensembl_rice/Oryza_sativa.IRGSP-1.0.*.gtf.gz + +pgatk dnaseq-to-proteindb \ + --input_fasta ensembl_rice/transcripts.fa \ + --output_proteindb rice_canonical.fa +``` + +### Step 3 -- Three-frame translation for novel gene discovery + +```bash +pgatk threeframe-translation \ + --input_fasta ensembl_rice/transcripts.fa \ + --output rice_3frame.fa +``` + +### Step 4 -- Six-frame genome translation (unbiased) -## 11. Map Identified Peptides to Genomic Coordinates +```bash +pgatk dnaseq-to-proteindb \ + --input_fasta ensembl_rice/genome.fa \ + --output_proteindb rice_genome_6frame.fa \ + --biotype_str '' \ + --num_orfs 3 \ + --num_orfs_complement 3 +``` + +### Step 5 -- Combine databases + +```bash +cat rice_canonical.fa rice_3frame.fa > rice_target.fa + +pgatk generate-decoy \ + --input rice_target.fa \ + --output rice_target_decoy.fa \ + --method decoypyrat +``` -After a proteomics search, map the identified peptides back to genomic -coordinates for visualization in genome browsers (e.g. UCSC, IGV). 
+### Step 6 -- Map identified novel peptides to the genome ```bash pgatk map-peptide2genome \ - --input peptide_identifications.tsv \ - --gtf genes.gtf \ - --fasta proteins.fa \ + --input rice_novel_peptides.tsv \ + --gtf ensembl_rice/Oryza_sativa.IRGSP-1.0.*.gtf.gz \ + --fasta rice_target.fa \ --idmap protein_to_transcript.tsv \ - --output peptides.gff3 + --output rice_novel_peptides.gff3 ``` -The input TSV should contain at least peptide sequence and protein accession -columns (configurable with `--pep-col` and `--prot-col`). +### Mouse example + +For model organisms like mouse (*Mus musculus*, taxonomy 10090): + +```bash +# Download +pgatk ensembl-downloader -t 10090 -o ensembl_mouse --skip_dna + +# Generate transcript sequences +gffread -F -w ensembl_mouse/transcripts.fa \ + -g ensembl_mouse/genome.fa \ + ensembl_mouse/Mus_musculus.GRCm39.*.gtf.gz + +# Canonical + variant proteins +pgatk dnaseq-to-proteindb \ + --input_fasta ensembl_mouse/transcripts.fa \ + --output_proteindb mouse_canonical.fa + +pgatk vcf-to-proteindb \ + --vcf ensembl_mouse/mus_musculus_incl_consequences.vcf.gz \ + --input_fasta ensembl_mouse/transcripts.fa \ + --gene_annotations_gtf ensembl_mouse/Mus_musculus.GRCm39.*.gtf.gz \ + --output_proteindb mouse_variants.fa + +# Combine and generate decoy +cat mouse_canonical.fa mouse_variants.fa > mouse_target.fa +pgatk generate-decoy \ + --input mouse_target.fa \ + --output mouse_target_decoy.fa \ + --method decoypyrat +``` + +--- + +## 11. Digest and Filter Variant Peptides + +After generating variant protein databases from any source, *in silico* +digestion and filtering against the canonical proteome extracts only the +peptides that are unique to the variant sequences. This compact peptide list +is useful for targeted validation, spectral library building, and +quantification of variant peptides. 
+ +### Single source + +```bash +pgatk digest-mutant-protein \ + --input variant_proteins.fa \ + --fasta canonical_proteins.fa \ + --output unique_variant_peptides.fa \ + --min-len 7 \ + --max-len 40 \ + --missed-cleavages 2 +``` + +### Combine multiple variant sources + +```bash +pgatk digest-mutant-protein \ + --input cosmic_proteins.fa,clinvar_proteins.fa,ensembl_variants.fa \ + --fasta canonical_proteins.fa \ + --output combined_variant_peptides.fa +``` --- -## 12. Complete Proteogenomics Workflow +## 12. Complete Multi-Source Proteogenomics Workflow A full end-to-end workflow combining canonical proteins, population variants, clinical variants, cancer mutations, and non-coding translations into a single -search-ready database. +search-ready database. This is the approach used in large-scale studies that aim +to maximize the discovery of non-canonical peptides. ### Step 1 -- Download all data sources @@ -513,6 +984,15 @@ pgatk dnaseq-to-proteindb \ --include_biotypes lincRNA \ --num_orfs 3 \ --skip_including_all_cds + +# Pseudogenes +pgatk dnaseq-to-proteindb \ + --input_fasta ensembl_data/transcripts.fa \ + --output_proteindb pseudogene.fa \ + --var_prefix pseudogene_ \ + --include_biotypes processed_pseudogene,transcribed_processed_pseudogene \ + --num_orfs 3 \ + --skip_including_all_cds ``` ### Step 4 -- Combine all databases @@ -523,14 +1003,22 @@ cat canonical.fa \ clinvar_variants.fa \ cosmic_variants.fa \ lincRNA.fa \ + pseudogene.fa \ > combined_target.fa ``` -### Step 5 -- Add decoy sequences +### Step 5 -- Quality check and decoy generation ```bash +# Filter short sequences and validate +pgatk ensembl-check \ + --input_fasta combined_target.fa \ + --output validated_target.fa \ + --num_aa 6 + +# Add decoy sequences pgatk generate-decoy \ - --input combined_target.fa \ + --input validated_target.fa \ --output proteogenomics_target_decoy.fa \ --method decoypyrat \ --decoy_prefix DECOY_ @@ -540,7 +1028,7 @@ pgatk generate-decoy \ ```bash pgatk 
digest-mutant-protein \ - --input ensembl_variants.fa,clinvar_variants.fa,cosmic_variants.fa \ + --input ensembl_variants.fa,clinvar_variants.fa,cosmic_variants.fa,lincRNA.fa,pseudogene.fa \ --fasta canonical.fa \ --output unique_variant_peptides.fa \ --min-len 7 \ @@ -549,63 +1037,21 @@ pgatk digest-mutant-protein \ ``` The file `proteogenomics_target_decoy.fa` is a comprehensive search database, -and `unique_variant_peptides.fa` provides a focused list of variant-specific +and `unique_variant_peptides.fa` provides a focused list of non-canonical peptides for validation. ---- - -## 13. Non-Human Species - -pgatk supports any species available in ENSEMBL. Here is an example for mouse -(*Mus musculus*, taxonomy 10090): - -```bash -# Download -pgatk ensembl-downloader -t 10090 -o ensembl_mouse --skip_dna - -# Generate transcript sequences -gffread -F -w ensembl_mouse/transcripts.fa \ - -g ensembl_mouse/genome.fa \ - ensembl_mouse/Mus_musculus.GRCm39.*.gtf.gz - -# Canonical proteins -pgatk dnaseq-to-proteindb \ - --input_fasta ensembl_mouse/transcripts.fa \ - --output_proteindb mouse_canonical.fa - -# Variant proteins -pgatk vcf-to-proteindb \ - --vcf ensembl_mouse/mus_musculus_incl_consequences.vcf.gz \ - --input_fasta ensembl_mouse/transcripts.fa \ - --gene_annotations_gtf ensembl_mouse/Mus_musculus.GRCm39.*.gtf.gz \ - --output_proteindb mouse_variants.fa - -# Combine and generate decoy -cat mouse_canonical.fa mouse_variants.fa > mouse_target.fa -pgatk generate-decoy \ - --input mouse_target.fa \ - --output mouse_target_decoy.fa \ - --method decoypyrat -``` +!!! tip "Database size considerations" + Larger proteogenomic databases improve coverage but increase the FDR + penalty. Consider a two-pass strategy: first search against a focused + database (canonical + variants), then a broader database (adding + non-coding ORFs and six-frame translations) for discovery. --- -## 14. Three-Frame Translation +## 13. 
Quality Check: Validate Protein Database -Perform a simple three-frame translation of any nucleotide FASTA file: - -```bash -pgatk threeframe-translation \ - --input_fasta input_sequences.fa \ - --output translated_proteins.fa -``` - ---- - -## 15. Quality Check: Validate Protein Database - -Before using a protein database, check it for internal stop codons and short -sequences: +Before using a protein database for searching, validate it for internal stop +codons and short sequences: ```bash pgatk ensembl-check \ @@ -617,3 +1063,16 @@ pgatk ensembl-check \ This filters out sequences shorter than 6 amino acids. Use `--add_stop_codons` to include proteins that contain internal stop codons (useful for proteogenomics where frameshifts may introduce premature stops). + +--- + +## References + +If you use pgatk in your research, please cite: + +> Husen M Umer, Enrique Audain, Yafeng Zhu, Julianus Pfeuffer, Timo +> Sachsenberg, Janne Lehtiƶ, Rui M Branca, Yasset Perez-Riverol. +> **Generation of ENSEMBL-based proteogenomics databases boosts the +> identification of non-canonical peptides.** +> *Bioinformatics*, Volume 38, Issue 5, 1 March 2022, Pages 1470--1472. +> [https://doi.org/10.1093/bioinformatics/btab838](https://doi.org/10.1093/bioinformatics/btab838)