From 2dab3d72f8a2ed2945b178808c2c83e6ed52c297 Mon Sep 17 00:00:00 2001
From: Victor Lin <13424970+victorlin@users.noreply.github.com>
Date: Thu, 16 Oct 2025 13:44:33 -0700
Subject: [PATCH 1/5] ingest: Update snakefile descriptions
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Sync these up with the source code.

In fetch_from_ncbi, the reference to 'config.sources' seems to be outdated –
the code is hardcoded to fetch from GenBank.
---
 ingest/rules/curate.smk          | 14 +++++++-------
 ingest/rules/fetch_from_ncbi.smk | 14 ++++++--------
 ingest/rules/nextclade.smk       |  7 +++----
 3 files changed, 16 insertions(+), 19 deletions(-)

diff --git a/ingest/rules/curate.smk b/ingest/rules/curate.smk
index a3f6ba3..f3b4e3a 100644
--- a/ingest/rules/curate.smk
+++ b/ingest/rules/curate.smk
@@ -1,15 +1,15 @@
 """
-This part of the workflow handles transforming the data into standardized
-formats and expects input file
+This part of the workflow handles the curation of data from NCBI
 
-    sequences_ndjson = "data/sequences_{serotype}.ndjson"
+REQUIRED INPUTS:
 
-This will produce output files as
+    sequences_ndjson = data/genbank.ndjson
 
-    metadata = "results/metadata_{serotype}.tsv"
-    sequences = "results/sequences_{serotype}.fasta"
+OUTPUTS:
+
+    metadata         = data/subset_metadata.tsv
+    sequences        = results/sequences.fasta
 
-Parameters are expected to be defined in `config.curate`.
 """
 
 
diff --git a/ingest/rules/fetch_from_ncbi.smk b/ingest/rules/fetch_from_ncbi.smk
index d1d4b90..86f0947 100644
--- a/ingest/rules/fetch_from_ncbi.smk
+++ b/ingest/rules/fetch_from_ncbi.smk
@@ -1,15 +1,13 @@
 """
-This part of the workflow handles fetching sequences from various sources.
-Uses `config.sources` to determine which sequences to include in final output.
+This part of the workflow handles fetching sequences and metadata from GenBank.
 
-Currently only fetches sequences from GenBank, but other sources can be
-defined in the config. If adding other sources, add a new rule upstream
-of rule `fetch_all_sequences` to create the file `data/{source}.ndjson` or the
-file must exist as a static file in the repo.
+REQUIRED INPUTS:
 
-Produces final output as
+    None
 
-    sequences_ndjson = "data/sequences.ndjson"
+OUTPUTS:
+
+    ndjson = data/genbank.ndjson
 
 """
 workflow.global_resources.setdefault("concurrent_deploys", 2)
diff --git a/ingest/rules/nextclade.smk b/ingest/rules/nextclade.smk
index 982e32c..c2ebfa1 100644
--- a/ingest/rules/nextclade.smk
+++ b/ingest/rules/nextclade.smk
@@ -3,11 +3,10 @@ This part of the workflow handles running Nextclade on the curated metadata
 and sequences.
 REQUIRED INPUTS:
     metadata    = data/subset_metadata.tsv
-    sequences   = data/sequences_all.fasta
-    nextclade_datasets = ../nextclade/dataset
+    sequences   = results/sequences.fasta
+    dataset     = (from config)
 OUTPUTS:
-    metadata        = data/metadata_all.tsv
-    nextclade       = data/nextclade_clades.tsv
+    metadata    = results/metadata.tsv
 See Nextclade docs for more details on usage, inputs, and outputs if you would
 like to customize the rules:
 https://docs.nextstrain.org/projects/nextclade/page/user/nextclade-cli.html

From 248ea153549b94ca623d27ed9d9605a474909b3d Mon Sep 17 00:00:00 2001
From: Victor Lin <13424970+victorlin@users.noreply.github.com>
Date: Thu, 16 Oct 2025 13:29:32 -0700
Subject: [PATCH 2/5] ingest: Move 'fetch_from_ncbi' to 'fetch'

Generalize the name before adding rules for other data sources.
---
 ingest/Snakefile                                | 4 ++--
 ingest/rules/{fetch_from_ncbi.smk => fetch.smk} | 0
 2 files changed, 2 insertions(+), 2 deletions(-)
 rename ingest/rules/{fetch_from_ncbi.smk => fetch.smk} (100%)

diff --git a/ingest/Snakefile b/ingest/Snakefile
index 090f793..424ffb0 100644
--- a/ingest/Snakefile
+++ b/ingest/Snakefile
@@ -18,7 +18,7 @@ include: "../shared/vendored/snakemake/config.smk"
 # If there are build-specific customizations, they should be added with the
 # custom_rules imported below to ensure that the core workflow is not complicated
 # by build-specific rules.
-include: "rules/fetch_from_ncbi.smk"
+include: "rules/fetch.smk"
 include: "rules/curate.smk"
 include: "rules/nextclade.smk"
 
@@ -35,4 +35,4 @@ include: "rules/nextclade.smk"
 if "custom_rules" in config:
     for rule_file in config["custom_rules"]:
 
-        include: rule_file
\ No newline at end of file
+        include: rule_file
diff --git a/ingest/rules/fetch_from_ncbi.smk b/ingest/rules/fetch.smk
similarity index 100%
rename from ingest/rules/fetch_from_ncbi.smk
rename to ingest/rules/fetch.smk

From 1898ef37b409c6865199fd5d12bd4c80eb11ed04 Mon Sep 17 00:00:00 2001
From: Victor Lin <13424970+victorlin@users.noreply.github.com>
Date: Thu, 16 Oct 2025 14:27:24 -0700
Subject: [PATCH 3/5] ingest: Replace NCBI Datasets with Pathoplexus
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This is largely inspired by the ebola ingest workflow¹, which recently
switched to Pathoplexus data. Many parts were copied directly, with
adjustments to conform to this repo's current structure and syntax.

¹ https://github.com/nextstrain/ebola/commit/979b2dcfecd809931041fe96232c2783f9dd7d2a
---
 ingest/README.md            |  16 +---
 ingest/Snakefile            |   2 +
 ingest/defaults/config.yaml | 166 ++++++++++++++++++++++--------------
 ingest/rules/curate.smk     |  63 ++++++++++----
 ingest/rules/fetch.smk      |  94 +++++++-------------
 ingest/rules/nextclade.smk  |   4 +-
 6 files changed, 182 insertions(+), 163 deletions(-)

diff --git a/ingest/README.md b/ingest/README.md
index 883d299..ae60a59 100644
--- a/ingest/README.md
+++ b/ingest/README.md
@@ -1,7 +1,7 @@
 # Ingest
 
-This workflow ingests public data from NCBI and outputs curated metadata and
-sequences that can be used as input for the phylogenetic workflow.
+This workflow ingests public data from Pathoplexus and outputs curated metadata
+and sequences that can be used as input for the phylogenetic workflow.
 
 If you have another data source or private data that needs to be formatted for
 the phylogenetic workflow, then you can use a similar workflow to curate your
@@ -25,18 +25,6 @@ This produces the default outputs of the ingest workflow:
 - metadata      = results/metadata_all.tsv
 - sequences     = results/sequences_all.fasta
 
-### Dumping the full raw metadata from NCBI Datasets
-
-The workflow has a target for dumping the full raw metadata from NCBI Datasets.
-
-```
-nextstrain build ingest dump_ncbi_dataset_report
-```
-
-This will produce the file `ingest/data/ncbi_dataset_report_raw.tsv`,
-which you can inspect to determine what fields and data to use if you want to
-configure the workflow for your pathogen.
-
 ## Defaults
 
 The defaults directory contains all of the default configurations for the ingest workflow.
diff --git a/ingest/Snakefile b/ingest/Snakefile
index 424ffb0..95216fa 100644
--- a/ingest/Snakefile
+++ b/ingest/Snakefile
@@ -10,6 +10,8 @@ rule all:
     input:
         sequences="results/sequences.fasta",
         metadata="results/metadata.tsv",
+        sequences_open="results/sequences_open.fasta",
+        metadata_open="results/metadata_open.tsv",
 
 # Shared Snakemake files with generic functions are shared across pathogens
 include: "../shared/vendored/snakemake/config.smk"
diff --git a/ingest/defaults/config.yaml b/ingest/defaults/config.yaml
index 486503b..158b96a 100644
--- a/ingest/defaults/config.yaml
+++ b/ingest/defaults/config.yaml
@@ -4,70 +4,38 @@
 # Define optional config parameters with their default values here so that users
 # do not have to dig through the workflows to figure out the default values
 
-# Required to fetch from NCBI Datasets
-ncbi_taxon_id: "11082"
-
-# The list of NCBI Datasets fields to include from NCBI Datasets output
-# These need to be the "mnemonics" of the NCBI Datasets fields, see docs for full list of fields
-# https://www.ncbi.nlm.nih.gov/datasets/docs/v2/reference-docs/command-line/dataformat/tsv/dataformat_tsv_virus-genome/#fields
-# Note: the "accession" field MUST be provided to match with the sequences
-ncbi_datasets_fields:
-  - accession
-  - sourcedb
-  - isolate-lineage
-  - geo-region
-  - geo-location
-  - isolate-collection-date
-  - release-date
-  - update-date
-  - length
-  - host-name
-  - is-lab-host
-  - isolate-lineage-source
-  - bioprojects
-  - biosample-acc
-  - sra-accs
-  - submitter-names
-  - submitter-affiliation
+ppx_fetch:
+  seqs: https://lapis.pathoplexus.org/west-nile/sample/unalignedNucleotideSequences?versionStatus=LATEST_VERSION
+  meta: https://lapis.pathoplexus.org/west-nile/sample/details?dataFormat=csv&versionStatus=LATEST_VERSION
 
 # Config parameters related to the curate pipeline
 curate:
   # The path to the local geolocation rules within the pathogen repo
   # The path should be relative to the ingest directory.
   local_geolocation_rules: "defaults/geolocation-rules.tsv"
-  # The original field names should match the ncbi_datasets_fields provided above.
   # This is the first step in the pipeline, so any references to field names in the configs below should use the new field names
   field_map:
-    accession: accession
-    accession_version: accession_version
-    sourcedb: database
-    isolate-lineage: strain
-    geo-region: region
-    geo-location: location
-    isolate-collection-date: date
-    release-date: date_released
-    update-date: date_updated
-    length: length
-    host-name: host
-    is-lab-host: is_lab_host
-    isolate-lineage-source: sample_type
-    biosample-acc: biosample_accessions
-    sra-accs: sra_accessions
-    submitter-names: full_authors
-    submitter-affiliation: institution
-  # Standardized strain name regex
-  # Currently accepts any characters because we do not have a clear standard for strain names across pathogens
-  strain_regex: "^.+$"
-  # Back up strain name field to use if "strain" doesn"t match regex above
-  strain_backup_fields: ["accession"]
+    accessionVersion: PPX_accession
+    insdcAccessionFull: INSDC_accession
+    insdcRawReadsAccession: sra_accession
+    displayName: strain
+    geoLocCountry: country
+    geoLocAdmin1: division
+    geoLocAdmin2: location
+    sampleCollectionDate: date
+    earliestReleaseDate: date_submitted
+    hostNameCommon: host
+    isLabHost: is_lab_host
+    dataUseTermsRestrictedUntil: restrictedUntil
+    dataUseTermsUrl: dataUseTerms__url
+    authors: full_authors
+    authorAffiliations: institution
   # List of date fields to standardize to ISO format YYYY-MM-DD
-  date_fields: ["date", "date_released", "date_updated"]
+  date_fields: ["date", "date_submitted"]
   # List of expected date formats that are present in the date fields provided above
   # These date formats should use directives expected by datetime
   # See https://docs.python.org/3.9/library/datetime.html#strftime-and-strptime-format-codes
   expected_date_formats: ["%Y", "%Y-%m", "%Y-%m-%d", "%Y-%m-%dT%H:%M:%SZ"]
-  # The expected field that contains the GenBank geo_loc_name
-  genbank_location_field: location
   titlecase:
     # List of string fields to titlecase
     fields: ["region", "country", "division", "location"]
@@ -93,16 +61,19 @@ curate:
   output_id_field: "accession"
   # The field in the NDJSON record that contains the actual genomic sequence
   output_sequence_field: "sequence"
-  # The field in the NDJSON record that contains the actual GenBank accession
-  genbank_accession: 'accession'
+  # The field in the NDJSON record that contains the actual Pathoplexus accession
+  pathoplexus_accession: 'PPX_accession'
+  # The field in the NDJSON record that contains the actual INSDC accession
+  insdc_accession: 'INSDC_accession'
 
   # The list of metadata columns to keep in the final output of the curation pipeline.
   metadata_columns: [
     'accession',
-    #'genbank_accession_rev',
+    'PPX_accession',
+    'PPX_accession__url',
+    'INSDC_accession',
+    'INSDC_accession__url',
     #'strain',
-    #'strain_s',
-    #'viruslineage_ids',
     'date',
     #'updated',
     'region',
@@ -116,15 +87,11 @@ curate:
     'is_lab_host',
     #'date_submitted',
     #'sra_accession',
-    #'full_authors',
-    #'reverse',
     'authors',
-    #'institution',
-    #'title',
-    #'journal',
-    #'publications',
-    #'paper_url',
-    'url',
+    'institution',
+    'dataUseTerms',
+    'dataUseTerms__url',
+    'restrictedUntil',
     'length',
   ]
 
@@ -135,5 +102,72 @@ nextclade:
 
 pathoplexus:
   URL: 'https://lapis.pathoplexus.org/west-nile/sample/details'
-  fields: 'insdcAccessionBase,lineage'
-  accession_field: 'insdcAccessionBase'
+  fields: 'accession,lineage'
+  accession_field: 'accession'
+
+ppx_metadata_fields:
+ - "accessionVersion"
+ - "accession"
+ - "version"
+ - "submitter"
+ - "groupName"
+ - "submittedDate"
+ - "releasedDate"
+ - "dataUseTerms"
+ - "dataUseTermsRestrictedUntil"
+ - "dataUseTermsUrl"
+ - "assemblyReferenceGenomeAccession"
+ - "authorAffiliations"
+ - "authors"
+ - "bioprojectAccession"
+ - "biosampleAccession"
+ - "completeness"
+ - "displayName"
+ - "earliestReleaseDate"
+ - "frameShifts"
+ - "geoLocAdmin1"
+ - "geoLocAdmin2"
+ - "geoLocCity"
+ - "geoLocCountry"
+ - "geoLocLatitude"
+ - "geoLocLongitude"
+ - "geoLocSite"
+ - "hostAge"
+ - "hostAgeBin"
+ - "hostDisease"
+ - "hostGender"
+ - "hostHealthOutcome"
+ - "hostHealthState"
+ - "hostNameCommon"
+ - "hostOriginCountry"
+ - "hostVaccinationStatus"
+ - "insdcAccessionBase"
+ - "insdcAccessionFull"
+ - "insdcRawReadsAccession"
+ - "insdcVersion"
+ - "isLabHost"
+ - "length"
+ - "ncbiReleaseDate"
+ - "ncbiSourceDb"
+ - "ncbiSubmitterCountry"
+ - "ncbiUpdateDate"
+ - "ncbiVirusName"
+ - "ncbiVirusTaxId"
+ - "purposeOfSampling"
+ - "purposeOfSequencing"
+ - "qualityControlDetails"
+ - "qualityControlDetermination"
+ - "qualityControlIssues"
+ - "qualityControlMethodName"
+ - "qualityControlMethodVersion"
+ - "sampleCollectionDate"
+ - "sampleCollectionDateRangeLower"
+ - "sampleCollectionDateRangeUpper"
+ - "sampleType"
+ - "totalAmbiguousNucs"
+ - "totalDeletedNucs"
+ - "totalFrameShifts"
+ - "totalInsertedNucs"
+ - "totalSnps"
+ - "totalUnknownNucs"
+ - "travelHistory"
diff --git a/ingest/rules/curate.smk b/ingest/rules/curate.smk
index f3b4e3a..361771b 100644
--- a/ingest/rules/curate.smk
+++ b/ingest/rules/curate.smk
@@ -1,9 +1,9 @@
 """
-This part of the workflow handles the curation of data from NCBI
+This part of the workflow handles the curation of data from Pathoplexus
 
 REQUIRED INPUTS:
 
-    sequences_ndjson = data/genbank.ndjson
+    sequences_ndjson = data/sequences.ndjson
 
 OUTPUTS:
 
@@ -21,7 +21,7 @@ def format_field_map(field_map: dict[str, str]) -> str:
 
 rule curate:
     input:
-        sequences_ndjson="data/genbank.ndjson",
+        sequences_ndjson="data/sequences.ndjson",
         geolocation_rules=config["curate"]["local_geolocation_rules"],
         annotations=config["curate"]["annotations"],
         manual_mapping="defaults/host_hostgenus_hosttype_map.tsv",
@@ -34,11 +34,8 @@ rule curate:
         "benchmarks/curate.txt",
     params:
         field_map=format_field_map(config["curate"]["field_map"]),
-        strain_regex=config["curate"]["strain_regex"],
-        strain_backup_fields=config["curate"]["strain_backup_fields"],
         date_fields=config["curate"]["date_fields"],
         expected_date_formats=config["curate"]["expected_date_formats"],
-        genbank_location_field=config["curate"]["genbank_location_field"],
         articles=config["curate"]["titlecase"]["articles"],
         abbreviations=config["curate"]["titlecase"]["abbreviations"],
         titlecase_fields=config["curate"]["titlecase"]["fields"],
@@ -54,14 +51,9 @@ rule curate:
             | augur curate rename \
                 --field-map {params.field_map} \
             | augur curate normalize-strings \
-            | augur curate transform-strain-name \
-                --strain-regex {params.strain_regex} \
-                --backup-fields {params.strain_backup_fields} \
             | augur curate format-dates \
                 --date-fields {params.date_fields} \
                 --expected-date-formats {params.expected_date_formats} \
-            | augur curate parse-genbank-location \
-                --location-field {params.genbank_location_field} \
             | augur curate titlecase \
                 --titlecase-fields {params.titlecase_fields} \
                 --articles {params.articles} \
@@ -88,23 +80,34 @@ rule curate:
                 --output-id-field {params.id_field} \
                 --output-seq-field {params.sequence_field} ) 2>> {log}
         """
-rule add_metadata_columns:
+rule add_accession_urls:
     """Add columns to metadata
     Notable columns:
-    - [NEW] url: URL linking to the NCBI GenBank record ('https://www.ncbi.nlm.nih.gov/nuccore/*').
+    - PPX_accession__url: URL linking to the Pathoplexus record.
+    - INSDC_accession__url: URL linking to the NCBI GenBank record.
+    - url: URL linking to the NCBI GenBank record (kept for backwards compatibility).
     """
     input:
         metadata = "data/all_metadata.tsv"
     output:
         metadata = temp("data/all_metadata_added.tsv")
     params:
-        accession=config['curate']['genbank_accession']
+        pathoplexus_accession=config['curate']['pathoplexus_accession'],
+        pathoplexus_accession_url=config['curate']['pathoplexus_accession'] + "__url",
+        insdc_accession=config['curate']['insdc_accession'],
+        insdc_accession_url=config['curate']['insdc_accession'] + "__url",
     shell:
         """
-        csvtk mutate2 -t \
-          -n url \
-          -e '"https://www.ncbi.nlm.nih.gov/nuccore/" + ${params.accession}' \
-          {input.metadata} \
+        cat {input.metadata} \
+            | csvtk mutate2 -t \
+                -n {params.pathoplexus_accession_url} \
+                -e '"https://pathoplexus.org/seq/" + ${params.pathoplexus_accession}' \
+            | csvtk mutate2 -t \
+                -n {params.insdc_accession_url} \
+                -e '"https://www.ncbi.nlm.nih.gov/nuccore/" + ${params.insdc_accession}' \
+            | csvtk mutate2 -t \
+                -n url \
+                -e '"https://www.ncbi.nlm.nih.gov/nuccore/" + ${params.insdc_accession}' \
         > {output.metadata}
         """
 
@@ -121,6 +124,30 @@ rule subset_metadata:
             {input.metadata} > {output.metadata}
         """
 
+rule extract_open_data:
+    input:
+        metadata = "results/metadata.tsv",
+        sequences = "results/sequences.fasta"
+    output:
+        metadata = "results/metadata_open.tsv",
+        sequences = "results/sequences_open.fasta"
+    benchmark:
+        "benchmarks/extract_open_data.txt"
+    log:
+        "logs/extract_open_data.txt"
+    shell:
+        r"""
+        exec &> >(tee {log:q})
+
+        augur filter \
+            --metadata {input.metadata:q} \
+            --sequences {input.sequences:q} \
+            --metadata-id-columns accession \
+            --exclude-where "dataUseTerms=RESTRICTED" \
+            --output-metadata {output.metadata:q} \
+            --output-sequences {output.sequences:q}
+        """
+
 rule compress:
     input:
         file="{a_file}",
diff --git a/ingest/rules/fetch.smk b/ingest/rules/fetch.smk
index 86f0947..5d9ce5a 100644
--- a/ingest/rules/fetch.smk
+++ b/ingest/rules/fetch.smk
@@ -1,5 +1,5 @@
 """
-This part of the workflow handles fetching sequences and metadata from GenBank.
+This part of the workflow handles fetching sequences and metadata from Pathoplexus.
 
 REQUIRED INPUTS:
 
@@ -7,94 +7,60 @@ REQUIRED INPUTS:
 
 OUTPUTS:
 
-    ndjson = data/genbank.ndjson
+    ndjson = data/sequences.ndjson
 
 """
 workflow.global_resources.setdefault("concurrent_deploys", 2)
 
-rule fetch_ncbi_dataset_package:
+rule download_ppx_seqs:
     output:
-        dataset_package = temp("data/ncbi_dataset.zip")
-    retries: 5 # Requires snakemake 7.7.0 or later
-    log:
-        "logs/fetch_ncbi_dataset_package.txt"
-    benchmark:
-        "benchmarks/fetch_ncbi_dataset_package.txt"
+        sequences= "data/ppx_sequences.fasta",
     params:
-        ncbi_taxon_id = config["ncbi_taxon_id"]
-    shell:
-        """
-        datasets download virus genome taxon {params.ncbi_taxon_id} \
-            --no-progressbar \
-            --filename {output.dataset_package} 2>&1 | tee {log}
-        """
-
-# Note: This rule is not part of the default workflow!
-# It is intended to be used as a specific target for users to be able
-# to inspect and explore the full raw metadata from NCBI Datasets.
-rule dump_ncbi_dataset_report:
-    input:
-        dataset_package="data/ncbi_dataset.zip",
-    output:
-        ncbi_dataset_tsv="data/ncbi_dataset_report_raw.tsv",
-    shell:
-        """
-        dataformat tsv virus-genome \
-            --package {input.dataset_package} > {output.ncbi_dataset_tsv}
-        """
-
-rule extract_ncbi_dataset_sequences:
-    input:
-        dataset_package = "data/ncbi_dataset.zip"
-    output:
-        ncbi_dataset_sequences = temp("data/ncbi_dataset_sequences.fasta")
+        sequences_url=config["ppx_fetch"]["seqs"],
+    # Allow retries in case of network errors
+    retries: 5
     benchmark:
-        "benchmarks/extract_ncbi_dataset_sequences.txt"
+        "benchmarks/download_ppx_seqs.txt"
+    log:
+        "logs/download_ppx_seqs.txt"
     shell:
         """
-        unzip -jp {input.dataset_package} \
-            ncbi_dataset/data/genomic.fna > {output.ncbi_dataset_sequences}
+        curl {params.sequences_url} -o {output.sequences}
         """
 
-rule format_ncbi_dataset_report:
-    input:
-        dataset_package = "data/ncbi_dataset.zip",
+rule download_ppx_meta:
     output:
-        ncbi_dataset_tsv = temp("data/ncbi_dataset_report.tsv")
+        metadata= "data/ppx_metadata.csv"
     params:
-        ncbi_dataset_fields = ",".join(config["ncbi_datasets_fields"]),
+        metadata_url=config["ppx_fetch"]["meta"],
+        fields = ",".join(config["ppx_metadata_fields"])
+    # Allow retries in case of network errors
+    retries: 5
     benchmark:
-        "benchmarks/format_ncbi_dataset_report.txt"
+        "benchmarks/download_ppx_meta.txt"
+    log:
+        "logs/download_ppx_meta.txt"
     shell:
         """
-        dataformat tsv virus-genome \
-            --package {input.dataset_package} \
-            --fields {params.ncbi_dataset_fields:q} \
-            --elide-header \
-            | csvtk fix-quotes -Ht \
-            | csvtk add-header -t -n {params.ncbi_dataset_fields} \
-            | csvtk rename -t -f accession -n accession_version \
-            | csvtk -t mutate -f accession_version -n accession -p "^(.+?)\." --at 1 \
-            > {output.ncbi_dataset_tsv}
+        curl '{params.metadata_url}&fields={params.fields}' -o {output.metadata}
         """
 
-
-rule format_ncbi_datasets_ndjson:
+rule format_ppx_ndjson:
     input:
-        ncbi_dataset_sequences = "data/ncbi_dataset_sequences.fasta",
-        ncbi_dataset_tsv = "data/ncbi_dataset_report.tsv",
+        sequences = "data/ppx_sequences.fasta",
+        metadata = "data/ppx_metadata.csv",
     output:
-        ndjson = "data/genbank.ndjson",
+        ndjson = "data/sequences.ndjson",
     log:
-        "logs/format_ncbi_datasets_ndjson.txt"
+        "logs/format_ppx_ndjson.txt"
     benchmark:
-        "benchmarks/format_ncbi_datasets_ndjson.txt"
+        "benchmarks/format_ppx_ndjson.txt"
     shell:
         """
         augur curate passthru \
-            --metadata {input.ncbi_dataset_tsv} \
-            --fasta {input.ncbi_dataset_sequences} \
-            --seq-id-column accession_version \
+            --metadata {input.metadata} \
+            --fasta {input.sequences} \
+            --seq-id-column accessionVersion \
             --seq-field sequence \
             --unmatched-reporting warn \
             --duplicate-reporting warn \
diff --git a/ingest/rules/nextclade.smk b/ingest/rules/nextclade.smk
index c2ebfa1..7f6a308 100644
--- a/ingest/rules/nextclade.smk
+++ b/ingest/rules/nextclade.smk
@@ -12,6 +12,8 @@ like to customize the rules:
 https://docs.nextstrain.org/projects/nextclade/page/user/nextclade-cli.html
 """
 
+# TODO: This separate fetch should not be necessary - 'lineage' can be added
+# to data/subset_metadata.tsv.
 rule pathoplexus_classify:
     """
     Pulls global lineage calls from Pathoplexus API
@@ -25,7 +27,7 @@ rule pathoplexus_classify:
         id_field=config["curate"]["output_id_field"],
     shell:
         r"""
-        curl "{params.URL}?dataFormat=TSV&downloadAsFile=false&fields={params.fields}" \
+        curl "{params.URL}?versionStatus=LATEST_VERSION&dataFormat=TSV&downloadAsFile=false&fields={params.fields}" \
         | tsv-filter -H --not-empty {params.accession_field} \
         | uniq \
         | csvtk -t rename -f {params.accession_field} -n {params.id_field} \

From 3fe7663b13ecfbe6e46961c5b5b6f014d5411d99 Mon Sep 17 00:00:00 2001
From: Victor Lin <13424970+victorlin@users.noreply.github.com>
Date: Thu, 16 Oct 2025 15:37:32 -0700
Subject: [PATCH 4/5] phylogenetic: Update for Pathoplexus data

Accessions in **/include.txt were updated with the following command
(written for BSD/macOS sed; with GNU sed, use `sed -i` without the empty
'' argument):

    for FILE in phylogenetic/defaults/{all-lineages,lineage-1A,lineage-2}/include.txt; do
      tail -n +2 ingest/results/metadata.tsv | awk -F'\t' '{print $1"\t"$4}' | while IFS=$'\t' read -r new old; do
        sed -i '' "s/^${old%.*} /${new} /" "$FILE"
      done
    done
---
 .../defaults/all-lineages/auspice_config.json |  10 +-
 .../defaults/all-lineages/include.txt         | 182 +++++++++---------
 phylogenetic/defaults/config.yaml             |   2 +-
 .../defaults/lineage-1A/auspice_config.json   |  10 +-
 phylogenetic/defaults/lineage-1A/include.txt  | 132 ++++++-------
 .../defaults/lineage-2/auspice_config.json    |  10 +-
 phylogenetic/defaults/lineage-2/include.txt   |  14 +-
 7 files changed, 189 insertions(+), 171 deletions(-)

diff --git a/phylogenetic/defaults/all-lineages/auspice_config.json b/phylogenetic/defaults/all-lineages/auspice_config.json
index d406200..269c6e4 100644
--- a/phylogenetic/defaults/all-lineages/auspice_config.json
+++ b/phylogenetic/defaults/all-lineages/auspice_config.json
@@ -1,6 +1,10 @@
 {
   "title": "Genomic epidemiology of West Nile Virus",
   "data_provenance": [
+    {
+      "name": "Pathoplexus",
+      "url": "https://pathoplexus.org"
+    },
     {
       "name": "GenBank",
       "url": "https://www.ncbi.nlm.nih.gov/genbank/"
@@ -16,6 +20,7 @@
     {"key": "lineage", "title": "Lineage", "type": "categorical"},
     {"key": "clade_membership", "title": "Clade", "type": "categorical"},
     {"key": "author", "title": "Authors", "type": "categorical"},
+    {"key": "dataUseTerms", "title": "Data use terms", "type": "categorical"},
     {"key": "host", "title": "Host Species", "type": "categorical"},
     {"key": "host_genus", "title": "Host Genus", "type": "categorical"},
     {"key": "host_type", "title": "Host Type", "type": "categorical"}
@@ -52,9 +57,10 @@
     "geo_resolution": "country"
   },
   "metadata_columns": [
-    "accession",
+    "PPX_accession",
+    "INSDC_accession",
     "division",
-    "url"
+    "restrictedUntil"
   ],
   "extensions": {
     "nextclade": {
diff --git a/phylogenetic/defaults/all-lineages/include.txt b/phylogenetic/defaults/all-lineages/include.txt
index 1e92593..6471a7f 100644
--- a/phylogenetic/defaults/all-lineages/include.txt
+++ b/phylogenetic/defaults/all-lineages/include.txt
@@ -1,91 +1,91 @@
-AF260968 # Egypt 1951 all-lineages reference
-NC_001563 # Lineage 2 reference
-NC_009942 # Lineage 1 reference
-HM051416 # Isreal 1953
-GQ851607 # Nigeria 1965
-GQ851606 # Senegal 1979
-AF481864 # pre-NY
-MH166901 # NY99
-MH166903 # NY99
-MH166904 # NY99
-KX547395 # NY99
-KX547519 # NY99
-KX547602 # NY99
-HM488130 # NY99
-HM488132 # NY99
-HQ671707 # NY99
-AF202541 # NY99
-AF206518 # NY99
-HM488127 # NY99
-HM488126 # NY99
-KX547410 # WN02
-KJ501434 # WN02
-KX547456 # WN02
-KY216155 # WN02
-KX547460 # WN02
-MF175829 # WN02
-KX547482 # WN02
-MF175827 # WN02
-MF175839 # WN02
-KT020853 # WN02
-KX547548 # WN02
-MF175863 # WN02
-KX547286 # WN02
-MF175873 # WN02
-MF175865 # WN02
-MF175831 # WN02
-MF175858 # WN02
-KJ501117 # SW03
-KJ501120 # SW03
-MF175815 # SW03
-MG004533 # SW03
-KF704147 # SW03
-KF704153 # SW03
-KR348940 # SW03
-KR348937 # SW03
-KX547361 # SW03
-JX015523 # SW03
-KR348944 # SW03
-KJ501124 # SW03
-KX547552 # SW03
-KJ145829 # SW03
-KR348981 # SW03
-KJ501118 # SW03
-KR348938 # SW03
-KR348976 # SW03
-KJ501170 # SW03
-KR348993 # SW03
-JQ700438 # SW03
-KR348977 # SW03
-KR348942 # SW03
-KR348941 # SW03
-KJ501121 # SW03
-KJ501122 # SW03
-KX547375 # SW03
-KM012172 # SW03
-KC333375 # SW03
-KJ501222 # SW03
-MG004537 # SW03
-MF175866 # SW03
-MG004540 # SW03
-MW383507 # Lineage 2
-HM147822 # Lineage 2
-GQ903680 # Lineage 2
-DQ176636 # Lineage 2
-KU978767 # Lineage 2
-HM147823 # Lineage 2
-PP445212 # Lineage 3
-AY765264 # Lineage 3
-AY277251 # Lineage 4
-FJ159131 # Lineage 4
-FJ159129 # Lineage 4
-FJ159130 # Lineage 4
-KJ831223 # Lineage 4
-KU978770 # Lineage 5
-DQ256376 # Lineage 5
-JX041632 # Lineage 5
-GQ851604 # Lineage 5
-GQ851605 # Lineage 5
-KY703855 # Lineage 7
-OP846972 # Lineage 7
-KY703856 # Lineage 8
+PP_000HJBT # Egypt 1951 all-lineages reference
+PP_0003ASZ # Lineage 2 reference
+PP_0003ATX # Lineage 1 reference
+PP_0008AWF # Israel 1953
+PP_000K976 # Nigeria 1965
+PP_000K968 # Senegal 1979
+PP_000HP18 # pre-NY
+PP_0002EDQ # NY99
+PP_0002EFL # NY99
+PP_0002EGJ # NY99
+PP_0001RJ4 # NY99
+PP_0001V6R # NY99
+PP_0001XMS # NY99
+PP_0008D7R # NY99
+PP_0008D9M # NY99
+PP_0008M3R # NY99
+PP_000HHM7 # NY99
+PP_000HHXM # NY99
+PP_0008D4X # NY99
+PP_0008D3Z # NY99
+PP_0001RZ8 # WN02
+PP_00012ZX # WN02
+PP_0001TBH # WN02
+PP_0001Z6M # WN02
+PP_0001TF9 # WN02
+PP_0002AES # WN02
+PP_0001U3Y # WN02
+PP_0002ACW # WN02
+PP_0002AQ5 # WN02
+PP_0001F1F # WN02
+PP_0001W10 # WN02
+PP_0002BER # WN02
+PP_0001NBN # WN02
+PP_0002BQ4 # WN02
+PP_0002BGM # WN02
+PP_0002AGN # WN02
+PP_0002B91 # WN02
+PP_0000T23 # SW03
+PP_0000T6V # SW03
+PP_0002A0L # SW03
+PP_0002DFM # SW03
+PP_0000Q26 # SW03
+PP_0000Q8U # SW03
+PP_0001C3E # SW03
+PP_0001BZN # SW03
+PP_0001QJ5 # SW03
+PP_0000FKE # SW03
+PP_0001C76 # SW03
+PP_0000TAM # SW03
+PP_0001W5S # SW03
+PP_0000RRS # SW03
+PP_0001DFP # SW03
+PP_0000T31 # SW03
+PP_0001C0L # SW03
+PP_0001DAZ # SW03
+PP_0000UWD # SW03
+PP_0001DVU # SW03
+PP_0000DT0 # SW03
+PP_0001DBX # SW03
+PP_0001C5A # SW03
+PP_0001C4C # SW03
+PP_0000T7T # SW03
+PP_0000T8R # SW03
+PP_0001QYB # SW03
+PP_00017WY # SW03
+PP_0000HXN # SW03
+PP_0000WPR # SW03
+PP_0002DLA # SW03
+PP_0002BHJ # SW03
+PP_0002DP4 # SW03
+PP_000370M # Lineage 2
+PP_0008CDE # Lineage 2
+PP_000K9BY # Lineage 2
+PP_000JB76 # Lineage 2
+PP_0001H9X # Lineage 2
+PP_0008CEC # Lineage 2
+PP_000RH4S # Lineage 3
+PP_000HY01 # Lineage 3
+PP_000HRWF # Lineage 4
+PP_000JWG3 # Lineage 4
+PP_000JWE7 # Lineage 4
+PP_000JWF5 # Lineage 4
+PP_00017EX # Lineage 4
+PP_0001HCR # Lineage 5
+PP_000JBA0 # Lineage 5
+PP_0000FR2 # Lineage 5
+PP_000K94C # Lineage 5
+PP_000K95A # Lineage 5
+PP_0001ZMQ # Lineage 7
+PP_0003L7U # Lineage 7
+PP_0001ZNN # Lineage 8
diff --git a/phylogenetic/defaults/config.yaml b/phylogenetic/defaults/config.yaml
index b50e603..0d08cd4 100644
--- a/phylogenetic/defaults/config.yaml
+++ b/phylogenetic/defaults/config.yaml
@@ -56,7 +56,7 @@ build_params:
 
   lineage-1A:
     reference: "defaults/lineage-1A/reference.gb"
-    root: "KX394399"
+    root: "PP_0001JCQ"
 
     subsample:
       samples:
diff --git a/phylogenetic/defaults/lineage-1A/auspice_config.json b/phylogenetic/defaults/lineage-1A/auspice_config.json
index 945cdff..a0f3d12 100644
--- a/phylogenetic/defaults/lineage-1A/auspice_config.json
+++ b/phylogenetic/defaults/lineage-1A/auspice_config.json
@@ -1,6 +1,10 @@
 {
   "title": "Genomic epidemiology of West Nile Virus lineage 1A",
   "data_provenance": [
+    {
+      "name": "Pathoplexus",
+      "url": "https://pathoplexus.org"
+    },
     {
       "name": "GenBank",
       "url": "https://www.ncbi.nlm.nih.gov/genbank/"
@@ -16,6 +20,7 @@
     {"key": "lineage", "title": "Lineage", "type": "categorical"},
     {"key": "clade_membership", "title": "Clade", "type": "categorical"},
     {"key": "author", "title": "Authors", "type": "categorical"},
+    {"key": "dataUseTerms", "title": "Data use terms", "type": "categorical"},
     {"key": "host", "title": "Host Species", "type": "categorical"},
     {"key": "host_genus", "title": "Host Genus", "type": "categorical"},
     {"key": "host_type", "title": "Host Type", "type": "categorical"}
@@ -53,9 +58,10 @@
     "distance_measure": "div"
   },
   "metadata_columns": [
-    "accession",
+    "PPX_accession",
+    "INSDC_accession",
     "division",
-    "url"
+    "restrictedUntil"
   ],
   "extensions": {
     "nextclade": {
diff --git a/phylogenetic/defaults/lineage-1A/include.txt b/phylogenetic/defaults/lineage-1A/include.txt
index 2bc4e0b..f4634a9 100644
--- a/phylogenetic/defaults/lineage-1A/include.txt
+++ b/phylogenetic/defaults/lineage-1A/include.txt
@@ -1,66 +1,66 @@
-KX394399 # Lineage 1B outgroup
-NC_009942 # Lineage 1 reference
-AF481864 # pre-NY
-MH166901 # NY99
-MH166903 # NY99
-MH166904 # NY99
-KX547395 # NY99
-KX547519 # NY99
-KX547602 # NY99
-HM488130 # NY99
-HM488132 # NY99
-HQ671707 # NY99
-AF202541 # NY99
-AF206518 # NY99
-HM488127 # NY99
-HM488126 # NY99
-KX547410 # WN02
-KJ501434 # WN02
-KX547456 # WN02
-KY216155 # WN02
-KX547460 # WN02
-MF175829 # WN02
-KX547482 # WN02
-MF175827 # WN02
-MF175839 # WN02
-KT020853 # WN02
-KX547548 # WN02
-MF175863 # WN02
-KX547286 # WN02
-MF175873 # WN02
-MF175865 # WN02
-MF175831 # WN02
-MF175858 # WN02
-KJ501117 # SW03
-KJ501120 # SW03
-MF175815 # SW03
-MG004533 # SW03
-KF704147 # SW03
-KF704153 # SW03
-KR348940 # SW03
-KR348937 # SW03
-KX547361 # SW03
-JX015523 # SW03
-KR348944 # SW03
-KJ501124 # SW03
-KX547552 # SW03
-KJ145829 # SW03
-KR348981 # SW03
-KJ501118 # SW03
-KR348938 # SW03
-KR348976 # SW03
-KJ501170 # SW03
-KR348993 # SW03
-JQ700438 # SW03
-KR348977 # SW03
-KR348942 # SW03
-KR348941 # SW03
-KJ501121 # SW03
-KJ501122 # SW03
-KX547375 # SW03
-KM012172 # SW03
-KC333375 # SW03
-KJ501222 # SW03
-MG004537 # SW03
-MF175866 # SW03
-MG004540 # SW03
+PP_0001JCQ # Lineage 1B outgroup
+PP_0003ATX # Lineage 1 reference
+PP_000HP18 # pre-NY
+PP_0002EDQ # NY99
+PP_0002EFL # NY99
+PP_0002EGJ # NY99
+PP_0001RJ4 # NY99
+PP_0001V6R # NY99
+PP_0001XMS # NY99
+PP_0008D7R # NY99
+PP_0008D9M # NY99
+PP_0008M3R # NY99
+PP_000HHM7 # NY99
+PP_000HHXM # NY99
+PP_0008D4X # NY99
+PP_0008D3Z # NY99
+PP_0001RZ8 # WN02
+PP_00012ZX # WN02
+PP_0001TBH # WN02
+PP_0001Z6M # WN02
+PP_0001TF9 # WN02
+PP_0002AES # WN02
+PP_0001U3Y # WN02
+PP_0002ACW # WN02
+PP_0002AQ5 # WN02
+PP_0001F1F # WN02
+PP_0001W10 # WN02
+PP_0002BER # WN02
+PP_0001NBN # WN02
+PP_0002BQ4 # WN02
+PP_0002BGM # WN02
+PP_0002AGN # WN02
+PP_0002B91 # WN02
+PP_0000T23 # SW03
+PP_0000T6V # SW03
+PP_0002A0L # SW03
+PP_0002DFM # SW03
+PP_0000Q26 # SW03
+PP_0000Q8U # SW03
+PP_0001C3E # SW03
+PP_0001BZN # SW03
+PP_0001QJ5 # SW03
+PP_0000FKE # SW03
+PP_0001C76 # SW03
+PP_0000TAM # SW03
+PP_0001W5S # SW03
+PP_0000RRS # SW03
+PP_0001DFP # SW03
+PP_0000T31 # SW03
+PP_0001C0L # SW03
+PP_0001DAZ # SW03
+PP_0000UWD # SW03
+PP_0001DVU # SW03
+PP_0000DT0 # SW03
+PP_0001DBX # SW03
+PP_0001C5A # SW03
+PP_0001C4C # SW03
+PP_0000T7T # SW03
+PP_0000T8R # SW03
+PP_0001QYB # SW03
+PP_00017WY # SW03
+PP_0000HXN # SW03
+PP_0000WPR # SW03
+PP_0002DLA # SW03
+PP_0002BHJ # SW03
+PP_0002DP4 # SW03
diff --git a/phylogenetic/defaults/lineage-2/auspice_config.json b/phylogenetic/defaults/lineage-2/auspice_config.json
index 3453014..4d2e362 100644
--- a/phylogenetic/defaults/lineage-2/auspice_config.json
+++ b/phylogenetic/defaults/lineage-2/auspice_config.json
@@ -1,6 +1,10 @@
 {
   "title": "Genomic epidemiology of West Nile Virus lineage 2",
   "data_provenance": [
+    {
+      "name": "Pathoplexus",
+      "url": "https://pathoplexus.org"
+    },
     {
       "name": "GenBank",
       "url": "https://www.ncbi.nlm.nih.gov/genbank/"
@@ -16,6 +20,7 @@
     {"key": "lineage", "title": "Lineage", "type": "categorical"},
     {"key": "clade_membership", "title": "Clade", "type": "categorical"},
     {"key": "author", "title": "Authors", "type": "categorical"},
+    {"key": "dataUseTerms", "title": "Data use terms", "type": "categorical"},
     {"key": "host", "title": "Host Species", "type": "categorical"},
     {"key": "host_genus", "title": "Host Genus", "type": "categorical"},
     {"key": "host_type", "title": "Host Type", "type": "categorical"}
@@ -53,9 +58,10 @@
     "distance_measure": "div"
   },
   "metadata_columns": [
-    "accession",
+    "PPX_accession",
+    "INSDC_accession",
     "division",
-    "url"
+    "restrictedUntil"
   ],
   "extensions": {
     "nextclade": {
diff --git a/phylogenetic/defaults/lineage-2/include.txt b/phylogenetic/defaults/lineage-2/include.txt
index e33db5b..23924ce 100644
--- a/phylogenetic/defaults/lineage-2/include.txt
+++ b/phylogenetic/defaults/lineage-2/include.txt
@@ -1,7 +1,7 @@
-NC_001563 # Lineage 2 reference
-MW383507 # Lineage 2
-HM147822 # Lineage 2
-GQ903680 # Lineage 2
-DQ176636 # Lineage 2
-KU978767 # Lineage 2
-HM147823 # Lineage 2
+PP_0003ASZ # Lineage 2 reference
+PP_000370M # Lineage 2
+PP_0008CDE # Lineage 2
+PP_000K9BY # Lineage 2
+PP_000JB76 # Lineage 2
+PP_0001H9X # Lineage 2
+PP_0008CEC # Lineage 2

From 34ea83efef9dbd4d3278a5339af8a1990fc60f80 Mon Sep 17 00:00:00 2001
From: Victor Lin <13424970+victorlin@users.noreply.github.com>
Date: Fri, 17 Oct 2025 14:59:12 -0700
Subject: [PATCH 5/5] ingest: Update accessions in annotations file

Command:

    tail -n +2 ingest/results/metadata.tsv | awk -F'\t' '{print $1"\t"$4}' | while IFS=$'\t' read -r new old; do
      sed -i '' "s/^${old%.*} /${new} /" "ingest/defaults/annotations.tsv"
    done
---
 ingest/defaults/annotations.tsv | 44 ++++++++++++++++-----------------
 1 file changed, 22 insertions(+), 22 deletions(-)

diff --git a/ingest/defaults/annotations.tsv b/ingest/defaults/annotations.tsv
index 45ab88a..d454abf 100644
--- a/ingest/defaults/annotations.tsv
+++ b/ingest/defaults/annotations.tsv
@@ -272,25 +272,25 @@ ON694341	institution	Centre for Biological Threats, Highly Pathogenic Viruses, R
 ON694342	institution	Centre for Biological Threats, Highly Pathogenic Viruses, Robert Koch Institute, Germany
 ON720848	institution	Microbial Genomics, Hospital General Universitario Gregorio Marañón, Madrid, Spain
 ON720849	institution	Microbial Genomics, Hospital General Universitario Gregorio Marañón, Madrid, Spain
-KT163243	date	1968-XX-XX
-AF260968	date	1951-XX-XX
-AF260968	region	Africa
-AF260968	country	Egypt
-AF260968	host	Homo sapians
-AF196835	host	Phoenicopterus chilensis
-AF196835	date	1999-XX-XX
-AY765264	date	1997-XX-XX
-AY765264	country	Czech Republic
-AY765264	region	Europe
-DQ318020	date	1972-XX-XX
-DQ318020	host	Culex tigripes
-D00246	country	Australia
-D00246	date	1960-XX-XX
-EF631122	date	XXXX-XX-XX
-EF631123	date	XXXX-XX-XX
-DQ116961	date	2004-XX-XX
-AY603654	date	1976-XX-XX
-AM404308	date	1971-XX-XX
-AF260968	date	1951-XX-XX
-AY660002	date	2003-XX-XX
-AY268132	date	2000-XX-XX
+PP_0001F2D	date	1968-XX-XX
+PP_000HJBT	date	1951-XX-XX
+PP_000HJBT	region	Africa
+PP_000HJBT	country	Egypt
+PP_000HJBT	host	Homo sapiens
+PP_000HHL9	host	Phoenicopterus chilensis
+PP_000HHL9	date	1999-XX-XX
+PP_000HY01	date	1997-XX-XX
+PP_000HY01	country	Czech Republic
+PP_000HY01	region	Europe
+PP_000JBDU	date	1972-XX-XX
+PP_000JBDU	host	Culex tigripes
+PP_000HZ4S	country	Australia
+PP_000HZ4S	date	1960-XX-XX
+PP_000JSDD	date	XXXX-XX-XX
+PP_000JSEB	date	XXXX-XX-XX
+PP_000J96A	date	2004-XX-XX
+PP_000HXJZ	date	1976-XX-XX
+PP_000HQ6X	date	1971-XX-XX
+PP_000HJBT	date	1951-XX-XX
+PP_000HXRK	date	2003-XX-XX
+PP_000HRSP	date	2000-XX-XX