From 2dab3d72f8a2ed2945b178808c2c83e6ed52c297 Mon Sep 17 00:00:00 2001 From: Victor Lin <13424970+victorlin@users.noreply.github.com> Date: Thu, 16 Oct 2025 13:44:33 -0700 Subject: [PATCH 1/5] ingest: Update snakefile descriptions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Sync these up with the source code. In fetch_from_ncbi, the reference to 'config.sources' seems to be outdated – the code is hardcoded to fetch from GenBank. --- ingest/rules/curate.smk | 14 +++++++------- ingest/rules/fetch_from_ncbi.smk | 14 ++++++-------- ingest/rules/nextclade.smk | 7 +++---- 3 files changed, 16 insertions(+), 19 deletions(-) diff --git a/ingest/rules/curate.smk b/ingest/rules/curate.smk index a3f6ba3..f3b4e3a 100644 --- a/ingest/rules/curate.smk +++ b/ingest/rules/curate.smk @@ -1,15 +1,15 @@ """ -This part of the workflow handles transforming the data into standardized -formats and expects input file +This part of the workflow handles the curation of data from NCBI - sequences_ndjson = "data/sequences_{serotype}.ndjson" +REQUIRED INPUTS: -This will produce output files as + sequences_ndjson = data/genbank.ndjson - metadata = "results/metadata_{serotype}.tsv" - sequences = "results/sequences_{serotype}.fasta" +OUTPUTS: + + metadata = data/subset_metadata.tsv + sequences = results/sequences.fasta -Parameters are expected to be defined in `config.curate`. """ diff --git a/ingest/rules/fetch_from_ncbi.smk b/ingest/rules/fetch_from_ncbi.smk index d1d4b90..86f0947 100644 --- a/ingest/rules/fetch_from_ncbi.smk +++ b/ingest/rules/fetch_from_ncbi.smk @@ -1,15 +1,13 @@ """ -This part of the workflow handles fetching sequences from various sources. -Uses `config.sources` to determine which sequences to include in final output. +This part of the workflow handles fetching sequences and metadata from GenBank. -Currently only fetches sequences from GenBank, but other sources can be -defined in the config. 
If adding other sources, add a new rule upstream -of rule `fetch_all_sequences` to create the file `data/{source}.ndjson` or the -file must exist as a static file in the repo. +REQUIRED INPUTS: -Produces final output as + None - sequences_ndjson = "data/sequences.ndjson" +OUTPUTS: + + ndjson = data/genbank.ndjson """ workflow.global_resources.setdefault("concurrent_deploys", 2) diff --git a/ingest/rules/nextclade.smk b/ingest/rules/nextclade.smk index 982e32c..c2ebfa1 100644 --- a/ingest/rules/nextclade.smk +++ b/ingest/rules/nextclade.smk @@ -3,11 +3,10 @@ This part of the workflow handles running Nextclade on the curated metadata and sequences. REQUIRED INPUTS: metadata = data/subset_metadata.tsv - sequences = data/sequences_all.fasta - nextclade_datasets = ../nextclade/dataset + sequences = results/sequences.fasta + dataset = (from config) OUTPUTS: - metadata = data/metadata_all.tsv - nextclade = data/nextclade_clades.tsv + metadata = results/metadata.tsv See Nextclade docs for more details on usage, inputs, and outputs if you would like to customize the rules: https://docs.nextstrain.org/projects/nextclade/page/user/nextclade-cli.html From 248ea153549b94ca623d27ed9d9605a474909b3d Mon Sep 17 00:00:00 2001 From: Victor Lin <13424970+victorlin@users.noreply.github.com> Date: Thu, 16 Oct 2025 13:29:32 -0700 Subject: [PATCH 2/5] ingest: Move 'fetch_from_ncbi' to 'fetch' Generalize the name before adding rules for other data sources. 
--- ingest/Snakefile | 4 ++-- ingest/rules/{fetch_from_ncbi.smk => fetch.smk} | 0 2 files changed, 2 insertions(+), 2 deletions(-) rename ingest/rules/{fetch_from_ncbi.smk => fetch.smk} (100%) diff --git a/ingest/Snakefile b/ingest/Snakefile index 090f793..424ffb0 100644 --- a/ingest/Snakefile +++ b/ingest/Snakefile @@ -18,7 +18,7 @@ include: "../shared/vendored/snakemake/config.smk" # If there are build-specific customizations, they should be added with the # custom_rules imported below to ensure that the core workflow is not complicated # by build-specific rules. -include: "rules/fetch_from_ncbi.smk" +include: "rules/fetch.smk" include: "rules/curate.smk" include: "rules/nextclade.smk" @@ -35,4 +35,4 @@ include: "rules/nextclade.smk" if "custom_rules" in config: for rule_file in config["custom_rules"]: - include: rule_file \ No newline at end of file + include: rule_file diff --git a/ingest/rules/fetch_from_ncbi.smk b/ingest/rules/fetch.smk similarity index 100% rename from ingest/rules/fetch_from_ncbi.smk rename to ingest/rules/fetch.smk From 1898ef37b409c6865199fd5d12bd4c80eb11ed04 Mon Sep 17 00:00:00 2001 From: Victor Lin <13424970+victorlin@users.noreply.github.com> Date: Thu, 16 Oct 2025 14:27:24 -0700 Subject: [PATCH 3/5] ingest: Replace NCBI Datasets with Pathoplexus MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This is largely inspired by ebola ingest¹ which recently switched to Pathoplexus data. Many parts were copied directly with adjustments to conform to the repo's current structure and syntax. 
¹ https://github.com/nextstrain/ebola/commit/979b2dcfecd809931041fe96232c2783f9dd7d2a --- ingest/README.md | 16 +--- ingest/Snakefile | 2 + ingest/defaults/config.yaml | 166 ++++++++++++++++++++++-------------- ingest/rules/curate.smk | 63 ++++++++++---- ingest/rules/fetch.smk | 94 +++++++------------- ingest/rules/nextclade.smk | 4 +- 6 files changed, 182 insertions(+), 163 deletions(-) diff --git a/ingest/README.md b/ingest/README.md index 883d299..ae60a59 100644 --- a/ingest/README.md +++ b/ingest/README.md @@ -1,7 +1,7 @@ # Ingest -This workflow ingests public data from NCBI and outputs curated metadata and -sequences that can be used as input for the phylogenetic workflow. +This workflow ingests public data from Pathoplexus and outputs curated metadata +and sequences that can be used as input for the phylogenetic workflow. If you have another data source or private data that needs to be formatted for the phylogenetic workflow, then you can use a similar workflow to curate your @@ -25,18 +25,6 @@ This produces the default outputs of the ingest workflow: - metadata = results/metadata_all.tsv - sequences = results/sequences_all.fasta -### Dumping the full raw metadata from NCBI Datasets - -The workflow has a target for dumping the full raw metadata from NCBI Datasets. - -``` -nextstrain build ingest dump_ncbi_dataset_report -``` - -This will produce the file `ingest/data/ncbi_dataset_report_raw.tsv`, -which you can inspect to determine what fields and data to use if you want to -configure the workflow for your pathogen. - ## Defaults The defaults directory contains all of the default configurations for the ingest workflow. 
diff --git a/ingest/Snakefile b/ingest/Snakefile index 424ffb0..95216fa 100644 --- a/ingest/Snakefile +++ b/ingest/Snakefile @@ -10,6 +10,8 @@ rule all: input: sequences="results/sequences.fasta", metadata="results/metadata.tsv", + sequences_open="results/sequences_open.fasta", + metadata_open="results/metadata_open.tsv", # Shared Snakemake files with generic functions are shared across pathogens include: "../shared/vendored/snakemake/config.smk" diff --git a/ingest/defaults/config.yaml b/ingest/defaults/config.yaml index 486503b..158b96a 100644 --- a/ingest/defaults/config.yaml +++ b/ingest/defaults/config.yaml @@ -4,70 +4,38 @@ # Define optional config parameters with their default values here so that users # do not have to dig through the workflows to figure out the default values -# Required to fetch from NCBI Datasets -ncbi_taxon_id: "11082" - -# The list of NCBI Datasets fields to include from NCBI Datasets output -# These need to be the "mnemonics" of the NCBI Datasets fields, see docs for full list of fields -# https://www.ncbi.nlm.nih.gov/datasets/docs/v2/reference-docs/command-line/dataformat/tsv/dataformat_tsv_virus-genome/#fields -# Note: the "accession" field MUST be provided to match with the sequences -ncbi_datasets_fields: - - accession - - sourcedb - - isolate-lineage - - geo-region - - geo-location - - isolate-collection-date - - release-date - - update-date - - length - - host-name - - is-lab-host - - isolate-lineage-source - - bioprojects - - biosample-acc - - sra-accs - - submitter-names - - submitter-affiliation +ppx_fetch: + seqs: https://lapis.pathoplexus.org/west-nile/sample/unalignedNucleotideSequences?versionStatus=LATEST_VERSION + meta: https://lapis.pathoplexus.org/west-nile/sample/details?dataFormat=csv&versionStatus=LATEST_VERSION # Config parameters related to the curate pipeline curate: # The path to the local geolocation rules within the pathogen repo # The path should be relative to the ingest directory. 
local_geolocation_rules: "defaults/geolocation-rules.tsv" - # The original field names should match the ncbi_datasets_fields provided above. # This is the first step in the pipeline, so any references to field names in the configs below should use the new field names field_map: - accession: accession - accession_version: accession_version - sourcedb: database - isolate-lineage: strain - geo-region: region - geo-location: location - isolate-collection-date: date - release-date: date_released - update-date: date_updated - length: length - host-name: host - is-lab-host: is_lab_host - isolate-lineage-source: sample_type - biosample-acc: biosample_accessions - sra-accs: sra_accessions - submitter-names: full_authors - submitter-affiliation: institution - # Standardized strain name regex - # Currently accepts any characters because we do not have a clear standard for strain names across pathogens - strain_regex: "^.+$" - # Back up strain name field to use if "strain" doesn"t match regex above - strain_backup_fields: ["accession"] + accessionVersion: PPX_accession + insdcAccessionFull: INSDC_accession + insdcRawReadsAccession: sra_accession + displayName: strain + geoLocCountry: country + geoLocAdmin1: division + geoLocAdmin2: location + sampleCollectionDate: date + earliestReleaseDate: date_submitted + hostNameCommon: host + isLabHost: is_lab_host + dataUseTermsRestrictedUntil: restrictedUntil + dataUseTermsUrl: dataUseTerms__url + authors: full_authors + authorAffiliations: institution # List of date fields to standardize to ISO format YYYY-MM-DD - date_fields: ["date", "date_released", "date_updated"] + date_fields: ["date", "date_submitted"] # List of expected date formats that are present in the date fields provided above # These date formats should use directives expected by datetime # See https://docs.python.org/3.9/library/datetime.html#strftime-and-strptime-format-codes expected_date_formats: ["%Y", "%Y-%m", "%Y-%m-%d", "%Y-%m-%dT%H:%M:%SZ"] - # The expected 
field that contains the GenBank geo_loc_name - genbank_location_field: location titlecase: # List of string fields to titlecase fields: ["region", "country", "division", "location"] @@ -93,16 +61,19 @@ curate: output_id_field: "accession" # The field in the NDJSON record that contains the actual genomic sequence output_sequence_field: "sequence" - # The field in the NDJSON record that contains the actual GenBank accession - genbank_accession: 'accession' + # The field in the NDJSON record that contains the actual Pathoplexus accession + pathoplexus_accession: 'PPX_accession' + # The field in the NDJSON record that contains the actual INSDC accession + insdc_accession: 'INSDC_accession' # The list of metadata columns to keep in the final output of the curation pipeline. metadata_columns: [ 'accession', - #'genbank_accession_rev', + 'PPX_accession', + 'PPX_accession__url', + 'INSDC_accession', + 'INSDC_accession__url', #'strain', - #'strain_s', - #'viruslineage_ids', 'date', #'updated', 'region', @@ -116,15 +87,11 @@ curate: 'is_lab_host', #'date_submitted', #'sra_accession', - #'full_authors', - #'reverse', 'authors', - #'institution', - #'title', - #'journal', - #'publications', - #'paper_url', - 'url', + 'institution', + 'dataUseTerms', + 'dataUseTerms__url', + 'restrictedUntil', 'length', ] @@ -135,5 +102,72 @@ nextclade: pathoplexus: URL: 'https://lapis.pathoplexus.org/west-nile/sample/details' - fields: 'insdcAccessionBase,lineage' - accession_field: 'insdcAccessionBase' + fields: 'accession,lineage' + accession_field: 'accession' + +ppx_metadata_fields: + - "accessionVersion" + - "accession" + - "version" + - "submitter" + - "groupName" + - "submittedDate" + - "releasedDate" + - "dataUseTerms" + - "dataUseTermsRestrictedUntil" + - "dataUseTermsUrl" + - "assemblyReferenceGenomeAccession" + - "authorAffiliations" + - "authors" + - "bioprojectAccession" + - "biosampleAccession" + - "completeness" + - "displayName" + - "earliestReleaseDate" + - "frameShifts" + - 
"geoLocAdmin1" + - "geoLocAdmin2" + - "geoLocCity" + - "geoLocCountry" + - "geoLocLatitude" + - "geoLocLongitude" + - "geoLocSite" + - "hostAge" + - "hostAgeBin" + - "hostDisease" + - "hostGender" + - "hostHealthOutcome" + - "hostHealthState" + - "hostNameCommon" + - "hostOriginCountry" + - "hostVaccinationStatus" + - "insdcAccessionBase" + - "insdcAccessionFull" + - "insdcRawReadsAccession" + - "insdcVersion" + - "isLabHost" + - "length" + - "ncbiReleaseDate" + - "ncbiSourceDb" + - "ncbiSubmitterCountry" + - "ncbiUpdateDate" + - "ncbiVirusName" + - "ncbiVirusTaxId" + - "purposeOfSampling" + - "purposeOfSequencing" + - "qualityControlDetails" + - "qualityControlDetermination" + - "qualityControlIssues" + - "qualityControlMethodName" + - "qualityControlMethodVersion" + - "sampleCollectionDate" + - "sampleCollectionDateRangeLower" + - "sampleCollectionDateRangeUpper" + - "sampleType" + - "totalAmbiguousNucs" + - "totalDeletedNucs" + - "totalFrameShifts" + - "totalInsertedNucs" + - "totalSnps" + - "totalUnknownNucs" + - "travelHistory" diff --git a/ingest/rules/curate.smk b/ingest/rules/curate.smk index f3b4e3a..361771b 100644 --- a/ingest/rules/curate.smk +++ b/ingest/rules/curate.smk @@ -1,9 +1,9 @@ """ -This part of the workflow handles the curation of data from NCBI +This part of the workflow handles the curation of data from Pathoplexus REQUIRED INPUTS: - sequences_ndjson = data/genbank.ndjson + sequences_ndjson = data/sequences.ndjson OUTPUTS: @@ -21,7 +21,7 @@ def format_field_map(field_map: dict[str, str]) -> str: rule curate: input: - sequences_ndjson="data/genbank.ndjson", + sequences_ndjson="data/sequences.ndjson", geolocation_rules=config["curate"]["local_geolocation_rules"], annotations=config["curate"]["annotations"], manual_mapping="defaults/host_hostgenus_hosttype_map.tsv", @@ -34,11 +34,8 @@ rule curate: "benchmarks/curate.txt", params: field_map=format_field_map(config["curate"]["field_map"]), - strain_regex=config["curate"]["strain_regex"], - 
strain_backup_fields=config["curate"]["strain_backup_fields"], date_fields=config["curate"]["date_fields"], expected_date_formats=config["curate"]["expected_date_formats"], - genbank_location_field=config["curate"]["genbank_location_field"], articles=config["curate"]["titlecase"]["articles"], abbreviations=config["curate"]["titlecase"]["abbreviations"], titlecase_fields=config["curate"]["titlecase"]["fields"], @@ -54,14 +51,9 @@ rule curate: | augur curate rename \ --field-map {params.field_map} \ | augur curate normalize-strings \ - | augur curate transform-strain-name \ - --strain-regex {params.strain_regex} \ - --backup-fields {params.strain_backup_fields} \ | augur curate format-dates \ --date-fields {params.date_fields} \ --expected-date-formats {params.expected_date_formats} \ - | augur curate parse-genbank-location \ - --location-field {params.genbank_location_field} \ | augur curate titlecase \ --titlecase-fields {params.titlecase_fields} \ --articles {params.articles} \ @@ -88,23 +80,34 @@ rule curate: --output-id-field {params.id_field} \ --output-seq-field {params.sequence_field} ) 2>> {log} """ -rule add_metadata_columns: +rule add_accession_urls: """Add columns to metadata Notable columns: - - [NEW] url: URL linking to the NCBI GenBank record ('https://www.ncbi.nlm.nih.gov/nuccore/*'). + - PPX_accession__url: URL linking to the Pathoplexus record. + - INSDC_accession__url: URL linking to the NCBI GenBank record. + - url: URL linking to the NCBI GenBank record (kept for backwards compatibility). 
""" input: metadata = "data/all_metadata.tsv" output: metadata = temp("data/all_metadata_added.tsv") params: - accession=config['curate']['genbank_accession'] + pathoplexus_accession=config['curate']['pathoplexus_accession'], + pathoplexus_accession_url=config['curate']['pathoplexus_accession'] + "__url", + insdc_accession=config['curate']['insdc_accession'], + insdc_accession_url=config['curate']['insdc_accession'] + "__url", shell: """ - csvtk mutate2 -t \ - -n url \ - -e '"https://www.ncbi.nlm.nih.gov/nuccore/" + ${params.accession}' \ - {input.metadata} \ + cat {input.metadata} \ + | csvtk mutate2 -t \ + -n {params.pathoplexus_accession_url} \ + -e '"https://pathoplexus.org/seq/" + ${params.pathoplexus_accession}' \ + | csvtk mutate2 -t \ + -n {params.insdc_accession_url} \ + -e '"https://www.ncbi.nlm.nih.gov/nuccore/" + ${params.insdc_accession}' \ + | csvtk mutate2 -t \ + -n url \ + -e '"https://www.ncbi.nlm.nih.gov/nuccore/" + ${params.insdc_accession}' \ > {output.metadata} """ @@ -121,6 +124,30 @@ rule subset_metadata: {input.metadata} > {output.metadata} """ +rule extract_open_data: + input: + metadata = "results/metadata.tsv", + sequences = "results/sequences.fasta" + output: + metadata = "results/metadata_open.tsv", + sequences = "results/sequences_open.fasta" + benchmark: + "benchmarks/extract_open_data.txt" + log: + "logs/extract_open_data.txt" + shell: + r""" + exec &> >(tee {log:q}) + + augur filter \ + --metadata {input.metadata:q} \ + --sequences {input.sequences:q} \ + --metadata-id-columns accession \ + --exclude-where "dataUseTerms=RESTRICTED" \ + --output-metadata {output.metadata:q} \ + --output-sequences {output.sequences:q} + """ + rule compress: input: file="{a_file}", diff --git a/ingest/rules/fetch.smk b/ingest/rules/fetch.smk index 86f0947..5d9ce5a 100644 --- a/ingest/rules/fetch.smk +++ b/ingest/rules/fetch.smk @@ -1,5 +1,5 @@ """ -This part of the workflow handles fetching sequences and metadata from GenBank. 
+This part of the workflow handles fetching sequences and metadata from Pathoplexus. REQUIRED INPUTS: @@ -7,94 +7,60 @@ REQUIRED INPUTS: OUTPUTS: - ndjson = data/genbank.ndjson + ndjson = data/sequences.ndjson """ workflow.global_resources.setdefault("concurrent_deploys", 2) -rule fetch_ncbi_dataset_package: +rule download_ppx_seqs: output: - dataset_package = temp("data/ncbi_dataset.zip") - retries: 5 # Requires snakemake 7.7.0 or later - log: - "logs/fetch_ncbi_dataset_package.txt" - benchmark: - "benchmarks/fetch_ncbi_dataset_package.txt" + sequences= "data/ppx_sequences.fasta", params: - ncbi_taxon_id = config["ncbi_taxon_id"] - shell: - """ - datasets download virus genome taxon {params.ncbi_taxon_id} \ - --no-progressbar \ - --filename {output.dataset_package} 2>&1 | tee {log} - """ - -# Note: This rule is not part of the default workflow! -# It is intended to be used as a specific target for users to be able -# to inspect and explore the full raw metadata from NCBI Datasets. -rule dump_ncbi_dataset_report: - input: - dataset_package="data/ncbi_dataset.zip", - output: - ncbi_dataset_tsv="data/ncbi_dataset_report_raw.tsv", - shell: - """ - dataformat tsv virus-genome \ - --package {input.dataset_package} > {output.ncbi_dataset_tsv} - """ - -rule extract_ncbi_dataset_sequences: - input: - dataset_package = "data/ncbi_dataset.zip" - output: - ncbi_dataset_sequences = temp("data/ncbi_dataset_sequences.fasta") + sequences_url=config["ppx_fetch"]["seqs"], + # Allow retries in case of network errors; curl --fail below turns HTTP errors into a non-zero exit so they are retried too + retries: 5 benchmark: - "benchmarks/extract_ncbi_dataset_sequences.txt" + "benchmarks/download_ppx_seqs.txt" + log: + "logs/download_ppx_seqs.txt" shell: """ - unzip -jp {input.dataset_package} \ - ncbi_dataset/data/genomic.fna > {output.ncbi_dataset_sequences} + curl --fail {params.sequences_url:q} -o {output.sequences} """ -rule format_ncbi_dataset_report: - input: - dataset_package = "data/ncbi_dataset.zip", +rule download_ppx_meta: output: - ncbi_dataset_tsv = 
temp("data/ncbi_dataset_report.tsv") + metadata= "data/ppx_metadata.csv" params: - ncbi_dataset_fields = ",".join(config["ncbi_datasets_fields"]), + metadata_url=config["ppx_fetch"]["meta"], + fields = ",".join(config["ppx_metadata_fields"]) + # Allow retries in case of network errors; curl --fail below turns HTTP errors into a non-zero exit so they are retried too + retries: 5 benchmark: - "benchmarks/format_ncbi_dataset_report.txt" + "benchmarks/download_ppx_meta.txt" + log: + "logs/download_ppx_meta.txt" shell: """ - dataformat tsv virus-genome \ - --package {input.dataset_package} \ - --fields {params.ncbi_dataset_fields:q} \ - --elide-header \ - | csvtk fix-quotes -Ht \ - | csvtk add-header -t -n {params.ncbi_dataset_fields} \ - | csvtk rename -t -f accession -n accession_version \ - | csvtk -t mutate -f accession_version -n accession -p "^(.+?)\." --at 1 \ - > {output.ncbi_dataset_tsv} + curl --fail '{params.metadata_url}&fields={params.fields}' -o {output.metadata} """ - -rule format_ncbi_datasets_ndjson: +rule format_ppx_ndjson: input: - ncbi_dataset_sequences = "data/ncbi_dataset_sequences.fasta", - ncbi_dataset_tsv = "data/ncbi_dataset_report.tsv", + sequences = "data/ppx_sequences.fasta", + metadata = "data/ppx_metadata.csv", output: - ndjson = "data/genbank.ndjson", + ndjson = "data/sequences.ndjson", log: - "logs/format_ncbi_datasets_ndjson.txt" + "logs/format_ppx_ndjson.txt" benchmark: - "benchmarks/format_ncbi_datasets_ndjson.txt" + "benchmarks/format_ppx_ndjson.txt" shell: """ augur curate passthru \ - --metadata {input.ncbi_dataset_tsv} \ - --fasta {input.ncbi_dataset_sequences} \ - --seq-id-column accession_version \ + --metadata {input.metadata} \ + --fasta {input.sequences} \ + --seq-id-column accessionVersion \ --seq-field sequence \ --unmatched-reporting warn \ --duplicate-reporting warn \ diff --git a/ingest/rules/nextclade.smk b/ingest/rules/nextclade.smk index c2ebfa1..7f6a308 100644 --- a/ingest/rules/nextclade.smk +++ b/ingest/rules/nextclade.smk @@ -12,6 +12,8 @@ like to customize the rules: 
https://docs.nextstrain.org/projects/nextclade/page/user/nextclade-cli.html """ +# TODO: This separate fetch should not be necessary - 'lineage' can be added +# to data/subset_metadata.tsv. rule pathoplexus_classify: """ Pulls global lineage calls from Pathoplexus API @@ -25,7 +27,7 @@ rule pathoplexus_classify: id_field=config["curate"]["output_id_field"], shell: r""" - curl "{params.URL}?dataFormat=TSV&downloadAsFile=false&fields={params.fields}" \ + curl "{params.URL}?versionStatus=LATEST_VERSION&dataFormat=TSV&downloadAsFile=false&fields={params.fields}" \ | tsv-filter -H --not-empty {params.accession_field} \ | uniq \ | csvtk -t rename -f {params.accession_field} -n {params.id_field} \ From 3fe7663b13ecfbe6e46961c5b5b6f014d5411d99 Mon Sep 17 00:00:00 2001 From: Victor Lin <13424970+victorlin@users.noreply.github.com> Date: Thu, 16 Oct 2025 15:37:32 -0700 Subject: [PATCH 4/5] phylogenetic: Update for Pathoplexus data Accessions in **/include.txt updated with the following command: for FILE in phylogenetic/defaults/{all-lineages,lineage-1A,lineage-2}/include.txt; do tail -n +2 ingest/results/metadata.tsv | awk -F'\t' '{print $1"\t"$4}' | while IFS=$'\t' read -r new old; do sed -i '' "s/^${old%.*} /${new} /" "$FILE" done done --- .../defaults/all-lineages/auspice_config.json | 10 +- .../defaults/all-lineages/include.txt | 182 +++++++++--------- phylogenetic/defaults/config.yaml | 2 +- .../defaults/lineage-1A/auspice_config.json | 10 +- phylogenetic/defaults/lineage-1A/include.txt | 132 ++++++------- .../defaults/lineage-2/auspice_config.json | 10 +- phylogenetic/defaults/lineage-2/include.txt | 14 +- 7 files changed, 189 insertions(+), 171 deletions(-) diff --git a/phylogenetic/defaults/all-lineages/auspice_config.json b/phylogenetic/defaults/all-lineages/auspice_config.json index d406200..269c6e4 100644 --- a/phylogenetic/defaults/all-lineages/auspice_config.json +++ b/phylogenetic/defaults/all-lineages/auspice_config.json @@ -1,6 +1,10 @@ { "title": "Genomic 
epidemiology of West Nile Virus", "data_provenance": [ + { + "name": "Pathoplexus", + "url": "https://pathoplexus.org" + }, { "name": "GenBank", "url": "https://www.ncbi.nlm.nih.gov/genbank/" @@ -16,6 +20,7 @@ {"key": "lineage", "title": "Lineage", "type": "categorical"}, {"key": "clade_membership", "title": "Clade", "type": "categorical"}, {"key": "author", "title": "Authors", "type": "categorical"}, + {"key": "dataUseTerms", "title": "Data use terms", "type": "categorical"}, {"key": "host", "title": "Host Species", "type": "categorical"}, {"key": "host_genus", "title": "Host Genus", "type": "categorical"}, {"key": "host_type", "title": "Host Type", "type": "categorical"} @@ -52,9 +57,10 @@ "geo_resolution": "country" }, "metadata_columns": [ - "accession", + "PPX_accession", + "INSDC_accession", "division", - "url" + "restrictedUntil" ], "extensions": { "nextclade": { diff --git a/phylogenetic/defaults/all-lineages/include.txt b/phylogenetic/defaults/all-lineages/include.txt index 1e92593..6471a7f 100644 --- a/phylogenetic/defaults/all-lineages/include.txt +++ b/phylogenetic/defaults/all-lineages/include.txt @@ -1,91 +1,91 @@ -AF260968 # Egypt 1951 all-lineages reference -NC_001563 # Lineage 2 reference -NC_009942 # Lineage 1 reference -HM051416 # Isreal 1953 -GQ851607 # Nigeria 1965 -GQ851606 # Senegal 1979 -AF481864 # pre-NY -MH166901 # NY99 -MH166903 # NY99 -MH166904 # NY99 -KX547395 # NY99 -KX547519 # NY99 -KX547602 # NY99 -HM488130 # NY99 -HM488132 # NY99 -HQ671707 # NY99 -AF202541 # NY99 -AF206518 # NY99 -HM488127 # NY99 -HM488126 # NY99 -KX547410 # WN02 -KJ501434 # WN02 -KX547456 # WN02 -KY216155 # WN02 -KX547460 # WN02 -MF175829 # WN02 -KX547482 # WN02 -MF175827 # WN02 -MF175839 # WN02 -KT020853 # WN02 -KX547548 # WN02 -MF175863 # WN02 -KX547286 # WN02 -MF175873 # WN02 -MF175865 # WN02 -MF175831 # WN02 -MF175858 # WN02 -KJ501117 # SW03 -KJ501120 # SW03 -MF175815 # SW03 -MG004533 # SW03 -KF704147 # SW03 -KF704153 # SW03 -KR348940 # SW03 -KR348937 # SW03 
-KX547361 # SW03 -JX015523 # SW03 -KR348944 # SW03 -KJ501124 # SW03 -KX547552 # SW03 -KJ145829 # SW03 -KR348981 # SW03 -KJ501118 # SW03 -KR348938 # SW03 -KR348976 # SW03 -KJ501170 # SW03 -KR348993 # SW03 -JQ700438 # SW03 -KR348977 # SW03 -KR348942 # SW03 -KR348941 # SW03 -KJ501121 # SW03 -KJ501122 # SW03 -KX547375 # SW03 -KM012172 # SW03 -KC333375 # SW03 -KJ501222 # SW03 -MG004537 # SW03 -MF175866 # SW03 -MG004540 # SW03 -MW383507 # Lineage 2 -HM147822 # Lineage 2 -GQ903680 # Lineage 2 -DQ176636 # Lineage 2 -KU978767 # Lineage 2 -HM147823 # Lineage 2 -PP445212 # Lineage 3 -AY765264 # Lineage 3 -AY277251 # Lineage 4 -FJ159131 # Lineage 4 -FJ159129 # Lineage 4 -FJ159130 # Lineage 4 -KJ831223 # Lineage 4 -KU978770 # Lineage 5 -DQ256376 # Lineage 5 -JX041632 # Lineage 5 -GQ851604 # Lineage 5 -GQ851605 # Lineage 5 -KY703855 # Lineage 7 -OP846972 # Lineage 7 -KY703856 # Lineage 8 +PP_000HJBT # Egypt 1951 all-lineages reference +PP_0003ASZ # Lineage 2 reference +PP_0003ATX # Lineage 1 reference +PP_0008AWF # Israel 1953 +PP_000K976 # Nigeria 1965 +PP_000K968 # Senegal 1979 +PP_000HP18 # pre-NY +PP_0002EDQ # NY99 +PP_0002EFL # NY99 +PP_0002EGJ # NY99 +PP_0001RJ4 # NY99 +PP_0001V6R # NY99 +PP_0001XMS # NY99 +PP_0008D7R # NY99 +PP_0008D9M # NY99 +PP_0008M3R # NY99 +PP_000HHM7 # NY99 +PP_000HHXM # NY99 +PP_0008D4X # NY99 +PP_0008D3Z # NY99 +PP_0001RZ8 # WN02 +PP_00012ZX # WN02 +PP_0001TBH # WN02 +PP_0001Z6M # WN02 +PP_0001TF9 # WN02 +PP_0002AES # WN02 +PP_0001U3Y # WN02 +PP_0002ACW # WN02 +PP_0002AQ5 # WN02 +PP_0001F1F # WN02 +PP_0001W10 # WN02 +PP_0002BER # WN02 +PP_0001NBN # WN02 +PP_0002BQ4 # WN02 +PP_0002BGM # WN02 +PP_0002AGN # WN02 +PP_0002B91 # WN02 +PP_0000T23 # SW03 +PP_0000T6V # SW03 +PP_0002A0L # SW03 +PP_0002DFM # SW03 +PP_0000Q26 # SW03 +PP_0000Q8U # SW03 +PP_0001C3E # SW03 +PP_0001BZN # SW03 +PP_0001QJ5 # SW03 +PP_0000FKE # SW03 +PP_0001C76 # SW03 +PP_0000TAM # SW03 +PP_0001W5S # SW03 +PP_0000RRS # SW03 +PP_0001DFP # SW03 +PP_0000T31 # SW03 +PP_0001C0L # SW03 
+PP_0001DAZ # SW03 +PP_0000UWD # SW03 +PP_0001DVU # SW03 +PP_0000DT0 # SW03 +PP_0001DBX # SW03 +PP_0001C5A # SW03 +PP_0001C4C # SW03 +PP_0000T7T # SW03 +PP_0000T8R # SW03 +PP_0001QYB # SW03 +PP_00017WY # SW03 +PP_0000HXN # SW03 +PP_0000WPR # SW03 +PP_0002DLA # SW03 +PP_0002BHJ # SW03 +PP_0002DP4 # SW03 +PP_000370M # Lineage 2 +PP_0008CDE # Lineage 2 +PP_000K9BY # Lineage 2 +PP_000JB76 # Lineage 2 +PP_0001H9X # Lineage 2 +PP_0008CEC # Lineage 2 +PP_000RH4S # Lineage 3 +PP_000HY01 # Lineage 3 +PP_000HRWF # Lineage 4 +PP_000JWG3 # Lineage 4 +PP_000JWE7 # Lineage 4 +PP_000JWF5 # Lineage 4 +PP_00017EX # Lineage 4 +PP_0001HCR # Lineage 5 +PP_000JBA0 # Lineage 5 +PP_0000FR2 # Lineage 5 +PP_000K94C # Lineage 5 +PP_000K95A # Lineage 5 +PP_0001ZMQ # Lineage 7 +PP_0003L7U # Lineage 7 +PP_0001ZNN # Lineage 8 diff --git a/phylogenetic/defaults/config.yaml b/phylogenetic/defaults/config.yaml index b50e603..0d08cd4 100644 --- a/phylogenetic/defaults/config.yaml +++ b/phylogenetic/defaults/config.yaml @@ -56,7 +56,7 @@ build_params: lineage-1A: reference: "defaults/lineage-1A/reference.gb" - root: "KX394399" + root: "PP_0001JCQ" subsample: samples: diff --git a/phylogenetic/defaults/lineage-1A/auspice_config.json b/phylogenetic/defaults/lineage-1A/auspice_config.json index 945cdff..a0f3d12 100644 --- a/phylogenetic/defaults/lineage-1A/auspice_config.json +++ b/phylogenetic/defaults/lineage-1A/auspice_config.json @@ -1,6 +1,10 @@ { "title": "Genomic epidemiology of West Nile Virus lineage 1A", "data_provenance": [ + { + "name": "Pathoplexus", + "url": "https://pathoplexus.org" + }, { "name": "GenBank", "url": "https://www.ncbi.nlm.nih.gov/genbank/" @@ -16,6 +20,7 @@ {"key": "lineage", "title": "Lineage", "type": "categorical"}, {"key": "clade_membership", "title": "Clade", "type": "categorical"}, {"key": "author", "title": "Authors", "type": "categorical"}, + {"key": "dataUseTerms", "title": "Data use terms", "type": "categorical"}, {"key": "host", "title": "Host Species", "type": 
"categorical"}, {"key": "host_genus", "title": "Host Genus", "type": "categorical"}, {"key": "host_type", "title": "Host Type", "type": "categorical"} @@ -53,9 +58,10 @@ "distance_measure": "div" }, "metadata_columns": [ - "accession", + "PPX_accession", + "INSDC_accession", "division", - "url" + "restrictedUntil" ], "extensions": { "nextclade": { diff --git a/phylogenetic/defaults/lineage-1A/include.txt b/phylogenetic/defaults/lineage-1A/include.txt index 2bc4e0b..f4634a9 100644 --- a/phylogenetic/defaults/lineage-1A/include.txt +++ b/phylogenetic/defaults/lineage-1A/include.txt @@ -1,66 +1,66 @@ -KX394399 # Lineage 1B outgroup -NC_009942 # Lineage 1 reference -AF481864 # pre-NY -MH166901 # NY99 -MH166903 # NY99 -MH166904 # NY99 -KX547395 # NY99 -KX547519 # NY99 -KX547602 # NY99 -HM488130 # NY99 -HM488132 # NY99 -HQ671707 # NY99 -AF202541 # NY99 -AF206518 # NY99 -HM488127 # NY99 -HM488126 # NY99 -KX547410 # WN02 -KJ501434 # WN02 -KX547456 # WN02 -KY216155 # WN02 -KX547460 # WN02 -MF175829 # WN02 -KX547482 # WN02 -MF175827 # WN02 -MF175839 # WN02 -KT020853 # WN02 -KX547548 # WN02 -MF175863 # WN02 -KX547286 # WN02 -MF175873 # WN02 -MF175865 # WN02 -MF175831 # WN02 -MF175858 # WN02 -KJ501117 # SW03 -KJ501120 # SW03 -MF175815 # SW03 -MG004533 # SW03 -KF704147 # SW03 -KF704153 # SW03 -KR348940 # SW03 -KR348937 # SW03 -KX547361 # SW03 -JX015523 # SW03 -KR348944 # SW03 -KJ501124 # SW03 -KX547552 # SW03 -KJ145829 # SW03 -KR348981 # SW03 -KJ501118 # SW03 -KR348938 # SW03 -KR348976 # SW03 -KJ501170 # SW03 -KR348993 # SW03 -JQ700438 # SW03 -KR348977 # SW03 -KR348942 # SW03 -KR348941 # SW03 -KJ501121 # SW03 -KJ501122 # SW03 -KX547375 # SW03 -KM012172 # SW03 -KC333375 # SW03 -KJ501222 # SW03 -MG004537 # SW03 -MF175866 # SW03 -MG004540 # SW03 +PP_0001JCQ # Lineage 1B outgroup +PP_0003ATX # Lineage 1 reference +PP_000HP18 # pre-NY +PP_0002EDQ # NY99 +PP_0002EFL # NY99 +PP_0002EGJ # NY99 +PP_0001RJ4 # NY99 +PP_0001V6R # NY99 +PP_0001XMS # NY99 +PP_0008D7R # NY99 +PP_0008D9M # 
NY99 +PP_0008M3R # NY99 +PP_000HHM7 # NY99 +PP_000HHXM # NY99 +PP_0008D4X # NY99 +PP_0008D3Z # NY99 +PP_0001RZ8 # WN02 +PP_00012ZX # WN02 +PP_0001TBH # WN02 +PP_0001Z6M # WN02 +PP_0001TF9 # WN02 +PP_0002AES # WN02 +PP_0001U3Y # WN02 +PP_0002ACW # WN02 +PP_0002AQ5 # WN02 +PP_0001F1F # WN02 +PP_0001W10 # WN02 +PP_0002BER # WN02 +PP_0001NBN # WN02 +PP_0002BQ4 # WN02 +PP_0002BGM # WN02 +PP_0002AGN # WN02 +PP_0002B91 # WN02 +PP_0000T23 # SW03 +PP_0000T6V # SW03 +PP_0002A0L # SW03 +PP_0002DFM # SW03 +PP_0000Q26 # SW03 +PP_0000Q8U # SW03 +PP_0001C3E # SW03 +PP_0001BZN # SW03 +PP_0001QJ5 # SW03 +PP_0000FKE # SW03 +PP_0001C76 # SW03 +PP_0000TAM # SW03 +PP_0001W5S # SW03 +PP_0000RRS # SW03 +PP_0001DFP # SW03 +PP_0000T31 # SW03 +PP_0001C0L # SW03 +PP_0001DAZ # SW03 +PP_0000UWD # SW03 +PP_0001DVU # SW03 +PP_0000DT0 # SW03 +PP_0001DBX # SW03 +PP_0001C5A # SW03 +PP_0001C4C # SW03 +PP_0000T7T # SW03 +PP_0000T8R # SW03 +PP_0001QYB # SW03 +PP_00017WY # SW03 +PP_0000HXN # SW03 +PP_0000WPR # SW03 +PP_0002DLA # SW03 +PP_0002BHJ # SW03 +PP_0002DP4 # SW03 diff --git a/phylogenetic/defaults/lineage-2/auspice_config.json b/phylogenetic/defaults/lineage-2/auspice_config.json index 3453014..4d2e362 100644 --- a/phylogenetic/defaults/lineage-2/auspice_config.json +++ b/phylogenetic/defaults/lineage-2/auspice_config.json @@ -1,6 +1,10 @@ { "title": "Genomic epidemiology of West Nile Virus lineage 2", "data_provenance": [ + { + "name": "Pathoplexus", + "url": "https://pathoplexus.org" + }, { "name": "GenBank", "url": "https://www.ncbi.nlm.nih.gov/genbank/" @@ -16,6 +20,7 @@ {"key": "lineage", "title": "Lineage", "type": "categorical"}, {"key": "clade_membership", "title": "Clade", "type": "categorical"}, {"key": "author", "title": "Authors", "type": "categorical"}, + {"key": "dataUseTerms", "title": "Data use terms", "type": "categorical"}, {"key": "host", "title": "Host Species", "type": "categorical"}, {"key": "host_genus", "title": "Host Genus", "type": "categorical"}, {"key": "host_type", 
"title": "Host Type", "type": "categorical"} @@ -53,9 +58,10 @@ "distance_measure": "div" }, "metadata_columns": [ - "accession", + "PPX_accession", + "INSDC_accession", "division", - "url" + "restrictedUntil" ], "extensions": { "nextclade": { diff --git a/phylogenetic/defaults/lineage-2/include.txt b/phylogenetic/defaults/lineage-2/include.txt index e33db5b..23924ce 100644 --- a/phylogenetic/defaults/lineage-2/include.txt +++ b/phylogenetic/defaults/lineage-2/include.txt @@ -1,7 +1,7 @@ -NC_001563 # Lineage 2 reference -MW383507 # Lineage 2 -HM147822 # Lineage 2 -GQ903680 # Lineage 2 -DQ176636 # Lineage 2 -KU978767 # Lineage 2 -HM147823 # Lineage 2 +PP_0003ASZ # Lineage 2 reference +PP_000370M # Lineage 2 +PP_0008CDE # Lineage 2 +PP_000K9BY # Lineage 2 +PP_000JB76 # Lineage 2 +PP_0001H9X # Lineage 2 +PP_0008CEC # Lineage 2 From 34ea83efef9dbd4d3278a5339af8a1990fc60f80 Mon Sep 17 00:00:00 2001 From: Victor Lin <13424970+victorlin@users.noreply.github.com> Date: Fri, 17 Oct 2025 14:59:12 -0700 Subject: [PATCH 5/5] ingest: Update accessions in annotations file Command: tail -n +2 ingest/results/metadata.tsv | awk -F'\t' '{print $1"\t"$4}' | while IFS=$'\t' read -r new old; do sed -i '' "s/^${old%.*} /${new} /" "ingest/defaults/annotations.tsv" done --- ingest/defaults/annotations.tsv | 44 ++++++++++++++++----------------- 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/ingest/defaults/annotations.tsv b/ingest/defaults/annotations.tsv index 45ab88a..d454abf 100644 --- a/ingest/defaults/annotations.tsv +++ b/ingest/defaults/annotations.tsv @@ -272,25 +272,25 @@ ON694341 institution Centre for Biological Threats, Highly Pathogenic Viruses, R ON694342 institution Centre for Biological Threats, Highly Pathogenic Viruses, Robert Koch Institute, Germany ON720848 institution Microbial Genomics, Hospital General Universitario Gregorio Marañón, Madrid, Spain ON720849 institution Microbial Genomics, Hospital General Universitario Gregorio Marañón, Madrid, Spain 
-KT163243 date 1968-XX-XX -AF260968 date 1951-XX-XX -AF260968 region Africa -AF260968 country Egypt -AF260968 host Homo sapians -AF196835 host Phoenicopterus chilensis -AF196835 date 1999-XX-XX -AY765264 date 1997-XX-XX -AY765264 country Czech Republic -AY765264 region Europe -DQ318020 date 1972-XX-XX -DQ318020 host Culex tigripes -D00246 country Australia -D00246 date 1960-XX-XX -EF631122 date XXXX-XX-XX -EF631123 date XXXX-XX-XX -DQ116961 date 2004-XX-XX -AY603654 date 1976-XX-XX -AM404308 date 1971-XX-XX -AF260968 date 1951-XX-XX -AY660002 date 2003-XX-XX -AY268132 date 2000-XX-XX +PP_0001F2D date 1968-XX-XX +PP_000HJBT date 1951-XX-XX +PP_000HJBT region Africa +PP_000HJBT country Egypt +PP_000HJBT host Homo sapians +PP_000HHL9 host Phoenicopterus chilensis +PP_000HHL9 date 1999-XX-XX +PP_000HY01 date 1997-XX-XX +PP_000HY01 country Czech Republic +PP_000HY01 region Europe +PP_000JBDU date 1972-XX-XX +PP_000JBDU host Culex tigripes +PP_000HZ4S country Australia +PP_000HZ4S date 1960-XX-XX +PP_000JSDD date XXXX-XX-XX +PP_000JSEB date XXXX-XX-XX +PP_000J96A date 2004-XX-XX +PP_000HXJZ date 1976-XX-XX +PP_000HQ6X date 1971-XX-XX +PP_000HJBT date 1951-XX-XX +PP_000HXRK date 2003-XX-XX +PP_000HRSP date 2000-XX-XX