bigbio · ypriverol · May 11, 2026 · May 11, 2026 · May 11, 2026 · May 11, 2026
diff --git a/.codacy.yaml b/.codacy.yaml
@@ -0,0 +1,15 @@
+---
+# Codacy configuration. https://docs.codacy.com/repositories-configure/codacy-configuration-file/
+#
+# pydocstyle has two pairs of mutually-exclusive rules:
+#   D203 vs D211 (blank line before class docstring; codebase follows D203)
+#   D212 vs D213 (multi-line summary line; codebase follows D212)
+# Both rules in each pair are enabled in Codacy's default profile, which produces
+# unavoidable noise, so we keep one rule from each pair and ignore the other. The
+# same ignores are mirrored in `.pydocstyle` so local pydocstyle runs match Codacy.
+
+engines:
+  pydocstyle:
+    enabled: true
+    settings:
+      add_ignore: ["D211", "D213"]
diff --git a/.gitignore b/.gitignore
@@ -135,3 +135,11 @@ pgatk/testdata/Meleagris_gallopavo*
 .DS_Store
 .codacy/
 .cursor/
+
+# Internal working docs (implementation plans, scratch notes)
+docs/plans/
+
+# Biopython SeqIO.index_db SQLite indexes — built lazily on first use,
+# rebuilt automatically when the source FASTA changes (mtime check).
+*.fa.idx
+*.fasta.idx
diff --git a/.pydocstyle b/.pydocstyle
@@ -0,0 +1,6 @@
+[pydocstyle]
+# D211 conflicts with D203 (codebase follows D203: one blank line before class docstring).
+# D213 conflicts with D212 (codebase follows D212: multi-line summary on the first line).
+# Disabling the rules we don't follow stops the mutually-exclusive-pair noise from
+# static analysers (pydocstyle, Codacy).
+add-ignore = D211,D213
diff --git a/docs/index.md b/docs/index.md
@@ -23,6 +23,7 @@ See the [Installation](installation.md) page for more options (Bioconda, Docker,
 | [Introduction](introduction.md) | Overview of the proteogenomics field |
 | [Installation](installation.md) | How to install pgatk (pip, Bioconda, Docker, source) |
 | [pgatk CLI](pgatk-cli.md) | Full command-line reference for all tools |
+| [Validations](validations.md) | Tests and validations that ensure the correctness of the modules |
 | [Use Cases](use-cases.md) | End-to-end workflows and recipes for common scenarios |
 | [File Formats](formats.md) | BED, GTF, GCT format specifications |
 | [Changelog](changelog.md) | Version history and release notes |

diff --git a/docs/pgatk-cli.md b/docs/pgatk-cli.md
@@ -318,7 +318,7 @@ Usage: pgatk vcf-to-proteindb [OPTIONS]
   Options:
     --translation_table INTEGER        Translation table (Default 1)
     --mito_translation_table INTEGER   Mito_trans_table (default 2)
-    --var_prefix TEXT                   String to add as prefix for the variant peptides
+    --protein_prefix TEXT               String to add as prefix for the variant peptides
     --report_ref_seq                   Also report the reference peptide from overlapping transcripts
     --annotation_field_name TEXT       Annotation field name in INFO column (default: CSQ)
     --af_field TEXT                    Field name for variant allele frequency (default: none)
@@ -467,7 +467,7 @@ Usage: pgatk dnaseq-to-proteindb [OPTIONS]
     --biotype_str TEXT                 String used to identify gene/transcript biotype (default: transcript_biotype)
     --expression_str TEXT              String for extracting expression value (default: None)
     --expression_thresh FLOAT          Threshold for expression value filtering (default: 5)
-    --var_prefix TEXT                  Prefix to be added to fasta headers (default: none)
+    --protein_prefix TEXT              Prefix to be added to fasta headers (default: none)
     -h, --help                         Show this message and exit.
 ```
 
@@ -489,7 +489,7 @@ Usage: pgatk dnaseq-to-proteindb [OPTIONS]
         --config_file config/ensembl_config.yaml \
         --input_fasta transcript_sequences.fa \
         --output_proteindb proteindb_from_lincRNA_canonical_sequences.fa \
-        --var_prefix lincRNA_ \
+        --protein_prefix lincRNA_ \
         --include_biotypes lincRNA
     ```
 
@@ -500,7 +500,7 @@ Usage: pgatk dnaseq-to-proteindb [OPTIONS]
         --config_file config/ensembl_config.yaml \
         --input_fasta transcript_sequences.fa \
         --output_proteindb proteindb_from_processed_pseudogene.fa \
-        --var_prefix pseudogene_ \
+        --protein_prefix pseudogene_ \
         --include_biotypes processed_pseudogene,transcribed_processed_pseudogene,translated_processed_pseudogene \
         --skip_including_all_cds
     ```
@@ -512,7 +512,7 @@ Usage: pgatk dnaseq-to-proteindb [OPTIONS]
         --config_file config/ensembl_config.yaml \
         --input_fasta transcript_sequences.fa \
         --output_proteindb proteindb_from_altORFs.fa \
-        --var_prefix altorf_ \
+        --protein_prefix altorf_ \
         --include_biotypes altORFs \
         --skip_including_all_cds
     ```

diff --git a/docs/plans/2026-03-01-pgatk-graph-engine-design.md b/docs/plans/2026-03-01-pgatk-graph-engine-design.md