From 25b97320a5898f9712e220fed26f9f0cccbaa3d3 Mon Sep 17 00:00:00 2001 From: jbloom Date: Thu, 12 Feb 2026 19:43:35 -0800 Subject: [PATCH 01/19] add citation to preprint for deep mutational scanning antibody escape data The tree shows some antibody escape scores from deep mutational scanning. This adds a link to the preprint describing that data source ([here](https://www.biorxiv.org/content/10.64898/2026.02.12.705519)) to README and description where before there was a dummy link. @rneher or @jameshadfield, can you review and merge? --- README.md | 2 +- config/auspice_config.json | 2 +- config/description.md | 4 ++-- dms-data/README.md | 2 +- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index 2c85115..fe171cc 100644 --- a/README.md +++ b/README.md @@ -129,7 +129,7 @@ See [shared/vendored/README.md](./shared/vendored/README.md#vendoring) for instr ## Deep mutational scanning data -The trees are annotated with escape from key monoclonal antibodies as assessed by the effects of mutations measured in deep mutational scanning by [Simonich et al]() **ADD CITATION**. +The trees are annotated with escape from key monoclonal antibodies as assessed by the effects of mutations measured in deep mutational scanning by [Simonich et al (2026)](https://www.biorxiv.org/content/10.64898/2026.02.12.705519). ## Update example data diff --git a/config/auspice_config.json b/config/auspice_config.json index b41aa67..b641b53 100644 --- a/config/auspice_config.json +++ b/config/auspice_config.json @@ -3,7 +3,7 @@ "maintainers": [ {"name": "Nextstrain team", "url": "http://nextstrain.org"}, {"name": "Richard Neher", "url": "http://nextstrain.org"}, - {"name": "Bloom lab (antibody escape annotations)", "url": "https://jbloomlab.org/"} + {"name": "Bloom lab (antibody escape annotations)", "url": "https://www.biorxiv.org/content/10.64898/2026.02.12.705519"} ], "data_provenance": [ { diff --git a/config/description.md b/config/description.md index 00f7cf6..a0a00f0 100644 --- a/config/description.md +++ b/config/description.md @@ -8,7 +8,7 @@ The second is ['rsv/a/G'](https://nextstrain.org/rsv/a/G) and ['rsv/b/G'](https: The third is ['rsv/a/F'](https://nextstrain.org/rsv/a/F) and ['rsv/b/F'](https://nextstrain.org/rsv/b/F), which show evolution of only the F gene. -The fourth is ['rsv/a/F-antibody-escape'](https://nextstrain.org/rsv/a/F-antibody-escape) and ['rsv/b/F-antibody-escape'](https://nextstrain.org/rsv/b/F-antibody-escape), which show evolution of only the F gene but with the available sequences subsampled so as to enrich for sequences that have escape mutations to key monoclonal antibodies as assessed by the deep mutational scanning reported in [Simonich et al]() **ADD CITATION**. Note that these trees as well as the other ones can be colored by the escape from key monoclonal antibodies as computed under an additive model of the mutation effects measured in the deep mutational scanning, with annotations for either *Total Escape* (sum of effects of all mutations) and *Max Escape* (the maximum escape caused by any mutation in the strain). +The fourth is ['rsv/a/F-antibody-escape'](https://nextstrain.org/rsv/a/F-antibody-escape) and ['rsv/b/F-antibody-escape'](https://nextstrain.org/rsv/b/F-antibody-escape), which show evolution of only the F gene but with the available sequences subsampled so as to enrich for sequences that have escape mutations to key monoclonal antibodies as assessed by the deep mutational scanning reported in [Simonich et al (2026)](https://www.biorxiv.org/content/10.64898/2026.02.12.705519). Note that these trees as well as the other ones can be colored by the escape from key monoclonal antibodies as computed under an additive model of the mutation effects measured in the deep mutational scanning, with annotations for either *Total Escape* (sum of effects of all mutations) and *Max Escape* (the maximum escape caused by any mutation in the strain). #### Analysis @@ -22,7 +22,7 @@ Our bioinformatic processing workflow can be found at [github.com/nextstrain/rsv [RSV-A](https://raw.githubusercontent.com/rsv-lineages/lineage-designation-A/main/.auto-generated/lineage.tsv) [RSV-B](https://raw.githubusercontent.com/rsv-lineages/lineage-designation-A/main/.auto-generated/lineage.tsv) These clade definitions are based on the [nomenclature proposal by the RSV Genotyping Consensus Consortium](https://wwwnc.cdc.gov/eid/article/30/8/24-0209_article). -- annotation of antibody escape is done based on an additive model of mutational effects measured in the deep mutational scanning of [Simonich et al]() **ADD CITATION**, and additionally the *F-antibody-escape* builds have the sequences subsampled to enrich for those with escape mutations. +- annotation of antibody escape is done based on an additive model of mutational effects measured in the deep mutational scanning of [Simonich et al](https://www.biorxiv.org/content/10.64898/2026.02.12.705519), and additionally the *F-antibody-escape* builds have the sequences subsampled to enrich for those with escape mutations. #### Underlying sequence data diff --git a/dms-data/README.md b/dms-data/README.md index e9c1af7..f7f0462 100644 --- a/dms-data/README.md +++ b/dms-data/README.md @@ -1,5 +1,5 @@ # Deep mutational scanning data -Data from [Bloom lab](https://jbloomlab.org/) pseudovirus deep mutational scanning of RSV F. +Data from [Bloom lab](https://jbloomlab.org/) pseudovirus deep mutational scanning of RSV F, see [Simonich et al (2026)](https://www.biorxiv.org/content/10.64898/2026.02.12.705519). [all_antibodyes.csv](all_antibodies.csv) taken from [https://github.com/dms-vep/RSV_Long_F_DMS/blob/main/results/summaries/all_antibodies.csv](https://github.com/dms-vep/RSV_Long_F_DMS/blob/main/results/summaries/all_antibodies.csv) and then antibody columns renamed like *Nirsevimab-IgG escape* -> *Nirsevimab-IgG*. From a81c57477edb738639729d7d85d822786f864448 Mon Sep 17 00:00:00 2001 From: jbloom Date: Sat, 28 Feb 2026 19:32:21 -0800 Subject: [PATCH 02/19] fix exclusion of preduplication sequence sequences from background In [this pull request](https://github.com/nextstrain/rsv/pull/110), @rneher limited background sequences from recent builds to not include extinct preduplication lineages in order not to have the background sequences go back too far time. @victorlin noticed that a key part of this change was reverted in #114 (see [here](https://github.com/nextstrain/rsv/pull/114/changes#r2835307384)); that reversion was incorrect. This pull request fixes that. --- workflow/snakemake_rules/core.smk | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflow/snakemake_rules/core.smk b/workflow/snakemake_rules/core.smk index 7f936cf..154e851 100644 --- a/workflow/snakemake_rules/core.smk +++ b/workflow/snakemake_rules/core.smk @@ -173,7 +173,7 @@ rule combine_samples: ( [ rules.filter_recent.output.sequences, - rules.filter_background.output.sequences, + rules.exclude_preduplication.output.sequences, ] if "background_min_date" in config["filter"]["resolutions"][w.resolution] else [rules.filter_recent.output.sequences] From f8f4820a4b3756951651e2d5cc099b09914c8bee Mon Sep 17 00:00:00 2001 From: Victor Lin <13424970+victorlin@users.noreply.github.com> Date: Tue, 3 Mar 2026 10:16:00 -0800 Subject: [PATCH 03/19] git subrepo pull (merge) shared/vendored subrepo: subdir: "shared/vendored" merged: "37cf39c" upstream: origin: "https://github.com/nextstrain/shared" branch: "main" commit: "37cf39c" git-subrepo: version: "0.4.9" origin: "https://github.com/ingydotnet/git-subrepo" commit: "4f60dd7" --- shared/vendored/.github/workflows/ci.yaml | 2 +- .../.github/workflows/pre-commit.yaml | 2 +- shared/vendored/.gitrepo | 6 +- shared/vendored/snakemake/config.smk | 72 +++++++++++++++++-- 4 files changed, 73 insertions(+), 9 deletions(-) diff --git a/shared/vendored/.github/workflows/ci.yaml b/shared/vendored/.github/workflows/ci.yaml index 94d3054..6dba82b 100644 --- a/shared/vendored/.github/workflows/ci.yaml +++ b/shared/vendored/.github/workflows/ci.yaml @@ -11,5 +11,5 @@ jobs: shellcheck: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v5 + - uses: actions/checkout@v6 - uses: nextstrain/.github/actions/shellcheck@master diff --git a/shared/vendored/.github/workflows/pre-commit.yaml b/shared/vendored/.github/workflows/pre-commit.yaml index bea15f6..c63b890 100644 --- a/shared/vendored/.github/workflows/pre-commit.yaml +++ b/shared/vendored/.github/workflows/pre-commit.yaml @@ -7,7 +7,7 @@ jobs: pre-commit: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v5 + - uses: actions/checkout@v6 - uses: actions/setup-python@v6 with: python-version: "3.12" diff --git a/shared/vendored/.gitrepo b/shared/vendored/.gitrepo index c561adb..3723d3d 100644 --- a/shared/vendored/.gitrepo +++ b/shared/vendored/.gitrepo @@ -6,7 +6,7 @@ [subrepo] remote = https://github.com/nextstrain/shared branch = main - commit = bfbbb6875b084e22920712d89704ccb259f8950e - parent = 401279d49d2c703f5f79a584c4530dec736fbaad + commit = 37cf39c2a4f4c42474046e70c856acb6a2031e2d + parent = f1f17cec6fe39fa2dd06429a44bcd925a441e47f method = merge - cmdver = 0.4.6 + cmdver = 0.4.9 diff --git a/shared/vendored/snakemake/config.smk b/shared/vendored/snakemake/config.smk index 15727b0..2f0df30 100644 --- a/shared/vendored/snakemake/config.smk +++ b/shared/vendored/snakemake/config.smk @@ -11,6 +11,45 @@ from typing import Optional from textwrap import dedent, indent +# Set search paths for Augur +if "AUGUR_SEARCH_PATHS" in os.environ: + print(dedent(f"""\ + Using existing search paths in AUGUR_SEARCH_PATHS: + + {os.environ["AUGUR_SEARCH_PATHS"]!r} + """), file=sys.stderr) +else: + # Note that this differs from the search paths used in + # resolve_config_path(). + # This is the preferred default moving forwards, and the plan is to + # eventually update resolve_config_path() to use AUGUR_SEARCH_PATHS. + search_paths = [ + # User analysis directory + Path.cwd(), + + # Workflow defaults folder + Path(workflow.basedir) / "defaults", + + # Workflow root (contains Snakefile) + Path(workflow.basedir), + ] + + # This should work for majority of workflows, but we could consider doing a + # more thorough search for the nextstrain-pathogen.yaml. This would likely + # replicate how CLI searches for the root.¹ + # ¹ + repo_root = Path(workflow.basedir) / ".." + if (repo_root / "nextstrain-pathogen.yaml").is_file(): + search_paths.extend([ + # Pathogen repo root + repo_root, + ]) + + search_paths = [path.resolve() for path in search_paths if path.is_dir()] + + os.environ["AUGUR_SEARCH_PATHS"] = ":".join(map(str, search_paths)) + + class InvalidConfigError(Exception): pass @@ -147,15 +186,40 @@ def resolve_config_path(path: str, defaults_dir: Optional[str] = None) -> Callab return _resolve_config_path -def write_config(path): +def write_config(path, section=None): """ - Write Snakemake's 'config' variable to a file. + Write Snakemake's 'config' variable, or a section of it, to a file. + + *section* is an optional list of keys to navigate to a specific section of + config. If provided, only that section will be written. """ global config os.makedirs(os.path.dirname(path), exist_ok=True) + data = config + section_str = "config" + + if section: + # Navigate to the specified section + for key in section: + # Error if key doesn't exist + if key not in data: + raise Exception(f"ERROR: Key {key!r} not found in {section_str!r}.") + + data = data[key] + section_str += f".{key}" + + # Error if value is not a mapping + if not isinstance(data, dict): + raise Exception(f"ERROR: {section_str!r} is not a mapping of key/value pairs.") + with open(path, 'w') as f: - yaml.dump(config, f, sort_keys=False) + yaml.dump(data, f, sort_keys=False, Dumper=NoAliasDumper) + + print(f"Saved {section_str!r} to {path!r}.", file=sys.stderr) + - print(f"Saved current run config to {path!r}.", file=sys.stderr) +class NoAliasDumper(yaml.SafeDumper): + def ignore_aliases(self, data): + return True From 9243fdfe73aff5c3176ca50787905e18dfb0b308 Mon Sep 17 00:00:00 2001 From: Victor Lin <13424970+victorlin@users.noreply.github.com> Date: Mon, 22 Sep 2025 13:31:39 -0700 Subject: [PATCH 04/19] Write config to a file Useful for debugging. --- Snakefile | 2 ++ workflow/snakemake_rules/config.smk | 8 ++++++++ 2 files changed, 10 insertions(+) create mode 100644 workflow/snakemake_rules/config.smk diff --git a/Snakefile b/Snakefile index 1ac3c23..d2ffb97 100644 --- a/Snakefile +++ b/Snakefile @@ -9,6 +9,8 @@ min_version("8.0.0") configfile: "config/configfile.yaml" +include: "shared/vendored/snakemake/config.smk" +include: "workflow/snakemake_rules/config.smk" wildcard_constraints: a_or_b=r"a|b", diff --git a/workflow/snakemake_rules/config.smk b/workflow/snakemake_rules/config.smk new file mode 100644 index 0000000..199b4a9 --- /dev/null +++ b/workflow/snakemake_rules/config.smk @@ -0,0 +1,8 @@ +""" +This part of the workflow deals with configuration. + +OUTPUTS: + + results/run_config.yaml +""" +write_config("results/run_config.yaml") From be77edd4d6a26b900fec8d6d72ba60e00b1dfecb Mon Sep 17 00:00:00 2001 From: Victor Lin <13424970+victorlin@users.noreply.github.com> Date: Mon, 22 Sep 2025 12:29:16 -0700 Subject: [PATCH 05/19] Remove unused input augur filter has never taken a reference file as input, so my guess is this was originally copied by mistake. --- workflow/snakemake_rules/core.smk | 2 -- 1 file changed, 2 deletions(-) diff --git a/workflow/snakemake_rules/core.smk b/workflow/snakemake_rules/core.smk index 154e851..8ee57ea 100644 --- a/workflow/snakemake_rules/core.smk +++ b/workflow/snakemake_rules/core.smk @@ -55,7 +55,6 @@ rule filter_recent: """ input: sequences="results/{a_or_b}/sequences.fasta", - reference="config/{a_or_b}reference.gbk", metadata="results/{a_or_b}/metadata.tsv", sequence_index=rules.index_sequences.output, exclude=config["exclude"], @@ -96,7 +95,6 @@ rule filter_background: """ input: sequences="results/{a_or_b}/sequences.fasta", - reference="config/{a_or_b}reference.gbk", metadata="results/{a_or_b}/metadata.tsv", sequence_index=rules.index_sequences.output, include="config/include_{a_or_b}.txt", From a8e39f03c9421f92b200daa600d652fc51c3bd0f Mon Sep 17 00:00:00 2001 From: Victor Lin <13424970+victorlin@users.noreply.github.com> Date: Mon, 22 Sep 2025 13:32:49 -0700 Subject: [PATCH 06/19] Decouple example data grouping columns from config Preparing to make changes to the config that would have broken this. It seems fine to hardcode along with the already hardcoded sample size. --- workflow/snakemake_rules/chores.smk | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/workflow/snakemake_rules/chores.smk b/workflow/snakemake_rules/chores.smk index eae7d72..234f8a6 100644 --- a/workflow/snakemake_rules/chores.smk +++ b/workflow/snakemake_rules/chores.smk @@ -6,7 +6,7 @@ rule update_example_data_wildcards: The subset of data is generated by an augur filter call which: - sets the subsampling size to 50 - - applies the grouping from the config + - groups by year and country """ message: "Update example data" @@ -18,14 +18,13 @@ rule update_example_data_wildcards: metadata = "example_data/{a_or_b}/metadata.tsv", params: strain_id=config["strain_id_field"], - group_by=config["filter"]["group_by"], shell: """ augur filter \ --metadata {input.metadata} \ --metadata-id-columns {params.strain_id} \ --sequences {input.sequences} \ - --group-by {params.group_by} \ + --group-by year country \ --subsample-max-sequences 50 \ --subsample-seed 0 \ --output-metadata {output.metadata} \ From a87eebdee51e58f7ca5df9654591307fb480883d Mon Sep 17 00:00:00 2001 From: Victor Lin <13424970+victorlin@users.noreply.github.com> Date: Mon, 22 Sep 2025 13:51:38 -0700 Subject: [PATCH 07/19] Move all filter parameters to config This will make it easier to compare changes in the switch to augur subsample which will define all parameters in config. --- config/configfile.yaml | 6 ++++++ workflow/snakemake_rules/core.smk | 12 ++++++++---- 2 files changed, 14 insertions(+), 4 deletions(-) diff --git a/config/configfile.yaml b/config/configfile.yaml index 87271ce..8288f4a 100644 --- a/config/configfile.yaml +++ b/config/configfile.yaml @@ -61,6 +61,12 @@ filter: F: 3000 F-antibody-escape: 2000 + exclude_where: + recent: ["qc.overallStatus=bad"] + background: ["qc.overallStatus=bad", "qc.overallStatus=mediocre"] + + missing_data_threshold: 1000 + files: auspice_config: "config/auspice_config.json" auspice_config_additional_colorings: "config/auspice_config_additional_colorings.json" diff --git a/workflow/snakemake_rules/core.smk b/workflow/snakemake_rules/core.smk index 8ee57ea..fa801fb 100644 --- a/workflow/snakemake_rules/core.smk +++ b/workflow/snakemake_rules/core.smk @@ -70,6 +70,8 @@ rule filter_recent: ][w.build_name], strain_id=config["strain_id_field"], min_date=lambda w: config["filter"]["resolutions"][w.resolution]["min_date"], + exclude_where=config["filter"]["exclude_where"]["recent"], + missing_data_threshold=config["filter"]["missing_data_threshold"], shell: """ augur filter \ @@ -78,13 +80,13 @@ rule filter_recent: --metadata {input.metadata} \ --metadata-id-columns {params.strain_id} \ --exclude {input.exclude} \ - --exclude-where 'qc.overallStatus=bad' \ + --exclude-where {params.exclude_where:q} \ --min-date {params.min_date} \ --min-length {params.min_length} \ --output {output.sequences} \ --group-by {params.group_by} \ --subsample-max-sequences {params.subsample_max_sequences} \ - --query '({params.min_coverage}) & missing_data<1000' + --query '({params.min_coverage}) & missing_data<{params.missing_data_threshold}' """ @@ -117,6 +119,8 @@ rule filter_background: min_date=lambda w: config["filter"]["resolutions"][w.resolution][ "background_min_date" ], + exclude_where=config["filter"]["exclude_where"]["background"], + missing_data_threshold=config["filter"]["missing_data_threshold"], shell: """ augur filter \ @@ -126,7 +130,7 @@ rule filter_background: --metadata-id-columns {params.strain_id} \ --include {input.include} \ --exclude {input.exclude} \ - --exclude-where 'qc.overallStatus=bad' 'qc.overallStatus=mediocre' \ + --exclude-where {params.exclude_where:q} \ --min-date {params.min_date} \ --max-date {params.max_date} \ --min-length {params.min_length} \ @@ -134,7 +138,7 @@ rule filter_background: --output-metadata {output.metadata} \ --group-by {params.group_by} \ --subsample-max-sequences {params.subsample_max_sequences} \ - --query '({params.min_coverage}) & missing_data<1000' + --query '({params.min_coverage}) & missing_data<{params.missing_data_threshold}' """ rule exclude_preduplication: From 0b2218513525469da029b5f78c131f9069aac724 Mon Sep 17 00:00:00 2001 From: Victor Lin <13424970+victorlin@users.noreply.github.com> Date: Mon, 22 Sep 2025 15:22:47 -0700 Subject: [PATCH 08/19] Add separate frequencies config This shouldn't rely on config from another rule. --- config/configfile.yaml | 9 +++++++++ workflow/snakemake_rules/core.smk | 2 +- 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/config/configfile.yaml b/config/configfile.yaml index 8288f4a..3bd9a4c 100644 --- a/config/configfile.yaml +++ b/config/configfile.yaml @@ -90,6 +90,15 @@ cds: traits: columns: "country region" +frequencies: + resolutions: + all-time: + min_date: 1975-01-01 + 6y: + min_date: 6Y + 3y: + min_date: 3Y + nextclade_attributes: a: name: "RSV-A NextClade using real-time tree" diff --git a/workflow/snakemake_rules/core.smk b/workflow/snakemake_rules/core.smk index fa801fb..88ef71c 100644 --- a/workflow/snakemake_rules/core.smk +++ b/workflow/snakemake_rules/core.smk @@ -683,7 +683,7 @@ rule frequencies: output: frequencies = build_dir + "/{a_or_b}/{build_name}/{resolution}/frequencies.json" params: - min_date_arg = lambda w: f"--min-date {config['filter']['resolutions'][w.resolution]['min_date']}", + min_date_arg = lambda w: f"--min-date {config['frequencies']['resolutions'][w.resolution]['min_date']}", shell: """ augur frequencies \ From f8a68c0cd17fc60d4dd6ff62b13d5b27ed96a176 Mon Sep 17 00:00:00 2001 From: Victor Lin Date: Fri, 6 Mar 2026 14:23:40 -0800 Subject: [PATCH 09/19] Standard logs and benchmarks Previously, logs were only captured for a few rules without a clear mapping between log file and rule name. This commit adds log and benchmark files to all rules with a consistent name format of `{rule}_.txt` and usage of `exec` to capture output. --- workflow/snakemake_rules/clades.smk | 30 ++++- workflow/snakemake_rules/core.smk | 150 +++++++++++++++++++-- workflow/snakemake_rules/export.smk | 30 +++++ workflow/snakemake_rules/glycosylation.smk | 6 + workflow/snakemake_rules/merge_inputs.smk | 16 +-- 5 files changed, 207 insertions(+), 25 deletions(-) diff --git a/workflow/snakemake_rules/clades.smk b/workflow/snakemake_rules/clades.smk index 51935df..fde307e 100644 --- a/workflow/snakemake_rules/clades.smk +++ b/workflow/snakemake_rules/clades.smk @@ -9,15 +9,19 @@ rule clades_genome: output: node_data = build_dir + "/{a_or_b}/{build_name}/{resolution}/clades_genome.json" log: - "logs/{a_or_b}/clades_genome_{build_name}_{resolution}.txt" + "logs/clades_genome_{a_or_b}_{build_name}_{resolution}.txt" + benchmark: + "benchmarks/clades_genome_{a_or_b}_{build_name}_{resolution}.txt" shell: """ + exec &> >(tee {log:q}) + augur clades --tree {input.tree} \ --mutations {input.nuc_muts} {input.aa_muts} \ --clades {input.clades} \ --membership-name genome_clade \ --label-name genome_clade \ - --output-node-data {output.node_data} 2>&1 | tee {log} + --output-node-data {output.node_data} """ @@ -31,15 +35,19 @@ rule clades_Goya: output: node_data = build_dir + "/{a_or_b}/{build_name}/{resolution}/clades_G.json" log: - "logs/{a_or_b}/clades_{build_name}_{resolution}.txt" + "logs/clades_Goya_{a_or_b}_{build_name}_{resolution}.txt" + benchmark: + "benchmarks/clades_Goya_{a_or_b}_{build_name}_{resolution}.txt" shell: """ + exec &> >(tee {log:q}) + augur clades --tree {input.tree} \ --mutations {input.nuc_muts} {input.aa_muts} \ --clades {input.clades} \ --membership-name G_clade \ --label-name G_clade \ - --output-node-data {output.node_data} 2>&1 | tee {log} + --output-node-data {output.node_data} """ rule clades_consortium: @@ -52,21 +60,31 @@ rule clades_consortium: output: node_data = build_dir + "/{a_or_b}/{build_name}/{resolution}/clades_consortium.json" log: - "logs/{a_or_b}/clades_{build_name}_{resolution}.txt" + "logs/clades_consortium_{a_or_b}_{build_name}_{resolution}.txt" + benchmark: + "benchmarks/clades_consortium_{a_or_b}_{build_name}_{resolution}.txt" shell: """ + exec &> >(tee {log:q}) + augur clades --tree {input.tree} \ --mutations {input.nuc_muts} {input.aa_muts} \ --clades {input.clades} \ - --output-node-data {output.node_data} 2>&1 | tee {log} + --output-node-data {output.node_data} """ rule download_clades: output: clades = "results/clades_consortium_{a_or_b}.tsv" + log: + "logs/download_clades_{a_or_b}.txt" + benchmark: + "benchmarks/download_clades_{a_or_b}.txt" params: url = lambda w: f"https://raw.githubusercontent.com/rsv-lineages/lineage-designation-{w.a_or_b.upper()}/main/.auto-generated/lineages.tsv" shell: """ + exec &> >(tee {log:q}) + curl {params.url} --output {output.clades} """ diff --git a/workflow/snakemake_rules/core.smk b/workflow/snakemake_rules/core.smk index 88ef71c..8f70a72 100644 --- a/workflow/snakemake_rules/core.smk +++ b/workflow/snakemake_rules/core.smk @@ -16,8 +16,14 @@ rule index_sequences: output: sequence_index=build_dir + "/{a_or_b}/sequence_index.tsv", + log: + "logs/index_sequences_{a_or_b}.txt" + benchmark: + "benchmarks/index_sequences_{a_or_b}.txt" shell: """ + exec &> >(tee {log:q}) + augur index \ --sequences {input.sequences} \ --output {output.sequence_index} @@ -36,10 +42,16 @@ rule newreference: + "/{a_or_b}/{gene}_reference.gbk", newreferencefasta=build_dir + "/{a_or_b}/{gene}_reference.fasta", + log: + "logs/newreference_{a_or_b}_{gene}.txt" + benchmark: + "benchmarks/newreference_{a_or_b}_{gene}.txt" params: gene=lambda w: w.gene.split("-")[0], shell: """ + exec &> >(tee {log:q}) + python scripts/newreference.py \ --reference {input.oldreference} \ --output-genbank {output.newreferencegbk} \ @@ -61,6 +73,10 @@ rule filter_recent: output: sequences=build_dir + "/{a_or_b}/{build_name}/{resolution}/filtered_recent.fasta", + log: + "logs/filter_recent_{a_or_b}_{build_name}_{resolution}.txt" + benchmark: + "benchmarks/filter_recent_{a_or_b}_{build_name}_{resolution}.txt" params: group_by=config["filter"]["group_by"], min_coverage=lambda w: f'{w.build_name.split("-")[0]}_coverage>{config["filter"]["min_coverage"][w.build_name]}', @@ -74,6 +90,8 @@ rule filter_recent: missing_data_threshold=config["filter"]["missing_data_threshold"], shell: """ + exec &> >(tee {log:q}) + augur filter \ --sequences {input.sequences} \ --sequence-index {input.sequence_index} \ @@ -106,6 +124,10 @@ rule filter_background: + "/{a_or_b}/{build_name}/{resolution}/filtered_background_pre.fasta", metadata=build_dir + "/{a_or_b}/{build_name}/{resolution}/filtered_background_metadata.tsv", + log: + "logs/filter_background_{a_or_b}_{build_name}_{resolution}.txt" + benchmark: + "benchmarks/filter_background_{a_or_b}_{build_name}_{resolution}.txt" params: group_by=config["filter"]["group_by"], min_coverage=lambda w: f'{w.build_name.split("-")[0]}_coverage>{config["filter"]["min_coverage"][w.build_name]}', @@ -123,6 +145,8 @@ rule filter_background: missing_data_threshold=config["filter"]["missing_data_threshold"], shell: """ + exec &> >(tee {log:q}) + augur filter \ --sequences {input.sequences} \ --sequence-index {input.sequence_index} \ @@ -152,6 +176,8 @@ rule exclude_preduplication: output: sequences=build_dir + "/{a_or_b}/{build_name}/{resolution}/filtered_background.fasta", + benchmark: + "benchmarks/exclude_preduplication_{a_or_b}_{build_name}_{resolution}.txt" run: import pandas as pd import Bio.SeqIO as SeqIO @@ -193,8 +219,14 @@ rule combine_samples: ), output: sequences=build_dir + "/{a_or_b}/{build_name}/{resolution}/filtered.fasta", + log: + "logs/combine_samples_{a_or_b}_{build_name}_{resolution}.txt" + benchmark: + "benchmarks/combine_samples_{a_or_b}_{build_name}_{resolution}.txt" shell: """ + exec &> >(tee {log:q}) + cat {input.subsamples} | seqkit rmdup > {output} """ @@ -206,6 +238,10 @@ rule get_nextclade_dataset: """ output: dataset="results/nextclade_rsv-{a_or_b}.zip", + log: + "logs/get_nextclade_dataset_{a_or_b}.txt" + benchmark: + "benchmarks/get_nextclade_dataset_{a_or_b}.txt" params: ds_name=lambda w: ( "nextstrain/rsv/a/EPI_ISL_412866" @@ -214,6 +250,8 @@ rule get_nextclade_dataset: ), shell: """ + exec &> >(tee {log:q}) + nextclade3 dataset get -n {params.ds_name} --output-zip {output.dataset} """ @@ -229,6 +267,10 @@ rule filter_for_pre_subsample_alignment: exclude=config["exclude"], output: sequences=build_dir + "/{a_or_b}/{build_name}/{resolution}/pre_subsample/filtered_for_alignment.fasta", + log: + "logs/filter_for_pre_subsample_alignment_{a_or_b}_{build_name}_{resolution}.txt" + benchmark: + "benchmarks/filter_for_pre_subsample_alignment_{a_or_b}_{build_name}_{resolution}.txt" params: min_coverage=lambda w: f'{w.build_name.split("-")[0]}_coverage>{config["filter"]["min_coverage"][w.build_name]}', min_length=lambda w: config["filter"]["min_length"][w.build_name], @@ -236,6 +278,8 @@ rule filter_for_pre_subsample_alignment: min_date=lambda w: config["filter"]["resolutions"][w.resolution]["min_date"], shell: """ + exec &> >(tee {log:q}) + augur filter \ --sequences {input.sequences} \ --metadata {input.metadata} \ @@ -265,15 +309,19 @@ rule align_pre_subsample_sequences: genes=lambda w: config["cds"][w.build_name], threads: 8 log: - "logs/align_all_{a_or_b}_{build_name}_{resolution}.txt", + "logs/align_pre_subsample_sequences_{a_or_b}_{build_name}_{resolution}.txt" + benchmark: + "benchmarks/align_pre_subsample_sequences_{a_or_b}_{build_name}_{resolution}.txt" shell: """ + exec &> >(tee {log:q}) + nextclade3 run -j {threads}\ {input.sequences} \ -D {input.dataset} \ --output-fasta {output.alignment} \ --cds-selection {params.genes} \ - --output-translations "{output.translations}/{{cds}}.fasta" 2>&1 | tee {log} && touch {output.translations_done} + --output-translations "{output.translations}/{{cds}}.fasta" && touch {output.translations_done} """ @@ -285,12 +333,18 @@ rule score_pre_subsample_f_proteins: dms_scores=config["f_dms_data"], output: scores=build_dir + "/{a_or_b}/{build_name}/{resolution}/pre_subsample/f_protein_scores.tsv", + log: + "logs/score_pre_subsample_f_proteins_{a_or_b}_{build_name}_{resolution}.txt" + benchmark: + "benchmarks/score_pre_subsample_f_proteins_{a_or_b}_{build_name}_{resolution}.txt" params: f_sequences=lambda w: build_dir + f"/{w.a_or_b}/{w.build_name}/{w.resolution}/pre_subsample/translations/F.fasta", dms_antibodies=lambda w: " ".join(shlex.quote(ab) for ab in config["f_dms_antibodies"]), only_positive_escape=config["dms_only_positive_escape"], shell: """ + exec &> >(tee {log:q}) + python scripts/score_f_sequences.py fasta \ --sequences {params.f_sequences} \ --dms-scores {input.dms_scores} \ @@ -308,10 +362,16 @@ rule add_f_scores_to_pre_subsample_metadata: f_scores=rules.score_pre_subsample_f_proteins.output.scores, output: enhanced_metadata=build_dir + "/{a_or_b}/{build_name}/{resolution}/pre_subsample/metadata_with_scores.tsv", + log: + "logs/add_f_scores_to_pre_subsample_metadata_{a_or_b}_{build_name}_{resolution}.txt" + benchmark: + "benchmarks/add_f_scores_to_pre_subsample_metadata_{a_or_b}_{build_name}_{resolution}.txt" params: strain_id=config["strain_id_field"], shell: """ + exec &> >(tee {log:q}) + python scripts/merge_f_scores.py \ --metadata {input.original_metadata} \ --scores {input.f_scores} \ @@ -331,6 +391,10 @@ rule enrich_antibody_escape: metadata=rules.add_f_scores_to_pre_subsample_metadata.output.enhanced_metadata, output: sequences=build_dir + "/{a_or_b}/{build_name}/{resolution}/filtered_{antibody}_{scoretype}.fasta" + log: + "logs/enrich_antibody_escape_{a_or_b}_{build_name}_{resolution}_{antibody}_{scoretype}.txt" + benchmark: + "benchmarks/enrich_antibody_escape_{a_or_b}_{build_name}_{resolution}_{antibody}_{scoretype}.txt" params: strain_id=config["strain_id_field"], escape_col=lambda w: f"{w.antibody}_{w.scoretype}", @@ -340,6 +404,8 @@ rule enrich_antibody_escape: max_identical_max_escape_mut=lambda w: config["enrich_antibody_escape"][w.build_name]["max_identical_max_escape_mut"], shell: """ + exec &> >(tee {log:q}) + python scripts/enrich_antibody_escape.py \ --input-sequences {input.sequences} \ --metadata {input.metadata} \ @@ -368,15 +434,19 @@ rule genome_align: genes=lambda w: config["cds"][w.build_name], threads: 4 log: - "logs/align_{a_or_b}_{build_name}_{resolution}.txt", + "logs/genome_align_{a_or_b}_{build_name}_{resolution}.txt" + benchmark: + "benchmarks/genome_align_{a_or_b}_{build_name}_{resolution}.txt" shell: """ + exec &> >(tee {log:q}) + nextclade3 run -j {threads}\ {input.sequences} \ -D {input.dataset} \ --output-fasta {output.alignment} \ --cds-selection {params.genes} \ - --output-translations "{output.translations}/{{cds}}.fasta" 2>&1 | tee {log} \ + --output-translations "{output.translations}/{{cds}}.fasta" """ @@ -388,8 +458,14 @@ rule cut: output: slicedalignment=build_dir + "/{a_or_b}/{build_name}/{resolution}/{gene}_slicedalignment.fasta", + log: + "logs/cut_{a_or_b}_{build_name}_{resolution}_{gene}.txt" + benchmark: + "benchmarks/cut_{a_or_b}_{build_name}_{resolution}_{gene}.txt" shell: """ + exec &> >(tee {log:q}) + python scripts/cut.py \ --oldalignment {input.oldalignment} \ --slicedalignment {output.slicedalignment} \ @@ -406,9 +482,15 @@ rule realign: + "/{a_or_b}/{gene}_reference.fasta", output: realigned=build_dir + "/{a_or_b}/{build_name}/{resolution}/{gene}_aligned.fasta", + log: + "logs/realign_{a_or_b}_{build_name}_{resolution}_{gene}.txt" + benchmark: + "benchmarks/realign_{a_or_b}_{build_name}_{resolution}_{gene}.txt" threads: 4 shell: """ + exec &> >(tee {log:q}) + augur align --nthreads {threads} \ --sequences {input.slicedalignment} \ --reference-sequence {input.reference} \ @@ -424,8 +506,14 @@ rule hybrid_align: output: hybrid_alignment=build_dir + "/{a_or_b}/{build_name}/{resolution}/hybrid_alignment.fasta", + log: + "logs/hybrid_align_{a_or_b}_{build_name}_{resolution}.txt" + benchmark: + "benchmarks/hybrid_align_{a_or_b}_{build_name}_{resolution}.txt" shell: """ + exec &> >(tee {log:q}) + python scripts/align_for_tree.py \ --realign {input.G_alignment} \ --original {input.original} \ @@ -453,9 +541,15 @@ rule tree: alignment=get_alignment, output: tree=build_dir + "/{a_or_b}/{build_name}/{resolution}/tree_raw.nwk", + log: + "logs/tree_{a_or_b}_{build_name}_{resolution}.txt" + benchmark: + "benchmarks/tree_{a_or_b}_{build_name}_{resolution}.txt" threads: 4 shell: """ + exec &> >(tee {log:q}) + augur tree \ --alignment {input.alignment} \ --output {output.tree} \ @@ -479,6 +573,10 @@ rule refine: output: tree=build_dir + "/{a_or_b}/{build_name}/{resolution}/tree.nwk", node_data=build_dir + "/{a_or_b}/{build_name}/{resolution}/branch_lengths.json", + log: + "logs/refine_{a_or_b}_{build_name}_{resolution}.txt" + benchmark: + "benchmarks/refine_{a_or_b}_{build_name}_{resolution}.txt" params: coalescent=config["refine"]["coalescent"], clock_filter_iqd=config["refine"]["clock_filter_iqd"], @@ -486,6 +584,8 @@ rule refine: strain_id=config["strain_id_field"], shell: """ + exec &> >(tee {log:q}) + augur refine \ --tree {input.tree} \ --alignment {input.alignment} \ @@ -549,12 +649,16 @@ rule distances: comparisons=_get_distance_comparisons_by_lineage_and_segment, attribute_names=_get_distance_attributes_by_lineage_and_segment, log: - "logs/distances_{a_or_b}_{build_name}_{resolution}.txt", + "logs/distances_{a_or_b}_{build_name}_{resolution}.txt" + benchmark: + "benchmarks/distances_{a_or_b}_{build_name}_{resolution}.txt" resources: mem_mb=8000, time="00:30:00", shell: """ + exec &> >(tee {log:q}) + augur distance \ --alignment {params.alignments} \ --tree {input.tree} \ @@ -562,7 +666,7 @@ rule distances: --compare-to {params.comparisons} \ --attribute-name {params.attribute_names} \ --map {input.distance_maps} \ - --output {output.distances} 2>&1 | tee {log} + --output {output.distances} """ @@ -586,9 +690,13 @@ rule ancestral: output_translations=lambda w: build_dir + f"/{w.a_or_b}/{w.build_name}/{w.resolution}/translations/%GENE_withInternalNodes.fasta", input_translations=lambda w: build_dir + f"/{w.a_or_b}/{w.build_name}/{w.resolution}/translations/%GENE.fasta", log: - "logs/ancestral_{a_or_b}_{build_name}_{resolution}.txt", + "logs/ancestral_{a_or_b}_{build_name}_{resolution}.txt" + benchmark: + "benchmarks/ancestral_{a_or_b}_{build_name}_{resolution}.txt" shell: """ + exec &> >(tee {log:q}) + augur ancestral \ --tree {input.tree} \ --alignment {input.alignment} \ @@ -598,7 +706,7 @@ rule ancestral: --genes {params.genes} \ --translations "{params.input_translations}" \ --output-translations "{params.output_translations}" \ - --inference {params.inference} 2>&1 | tee {log} && touch {output.translations_done} + --inference {params.inference} && touch {output.translations_done} """ @@ -611,10 +719,16 @@ rule translate: reference=build_dir + "/{a_or_b}/{build_name}_reference.gbk", output: node_data=build_dir + "/{a_or_b}/{build_name}/{resolution}/aa_muts.json", + log: + "logs/translate_{a_or_b}_{build_name}_{resolution}.txt" + benchmark: + "benchmarks/translate_{a_or_b}_{build_name}_{resolution}.txt" params: alignment_file_mask=build_dir + "/{a_or_b}/{build_name}/{resolution}/aligned_%GENE.fasta", shell: """ + exec &> >(tee {log:q}) + augur translate \ --tree {input.tree} \ --ancestral-sequences {input.node_data} \ @@ -640,9 +754,13 @@ rule compute_f_scores_node_data: f_antibodies=lambda w: " ".join(shlex.quote(ab) for ab in config["f_dms_antibodies"]), only_positive_escape=config["dms_only_positive_escape"], log: - "logs/compute_f_scores_node_data_{a_or_b}_{build_name}_{resolution}.txt", + "logs/compute_f_scores_node_data_{a_or_b}_{build_name}_{resolution}.txt" + benchmark: + "benchmarks/compute_f_scores_node_data_{a_or_b}_{build_name}_{resolution}.txt" shell: """ + exec &> >(tee {log:q}) + python scripts/score_f_sequences.py tree \ --tree-newick {input.tree_newick} \ --aa-muts {input.aa_muts} \ @@ -650,7 +768,7 @@ rule compute_f_scores_node_data: --dms-scores {input.f_scores} \ --dms-antibodies {params.f_antibodies} \ --only-positive-escape {params.only_positive_escape} \ - --output {output.f_scores_node_data} &> {log} + --output {output.f_scores_node_data} """ @@ -661,12 +779,16 @@ rule traits: output: node_data=build_dir + "/{a_or_b}/{build_name}/{resolution}/traits.json", log: - "logs/{a_or_b}/traits_{build_name}_{resolution}_rsv.txt", + "logs/traits_{a_or_b}_{build_name}_{resolution}.txt" + benchmark: + "benchmarks/traits_{a_or_b}_{build_name}_{resolution}.txt" params: columns=config["traits"]["columns"], strain_id=config["strain_id_field"], shell: """ + exec &> >(tee {log:q}) + augur traits \ --tree {input.tree} \ --metadata {input.metadata} \ @@ -682,10 +804,16 @@ rule frequencies: metadata = "results/{a_or_b}/metadata.tsv" output: frequencies = build_dir + "/{a_or_b}/{build_name}/{resolution}/frequencies.json" + log: + "logs/frequencies_{a_or_b}_{build_name}_{resolution}.txt" + benchmark: + "benchmarks/frequencies_{a_or_b}_{build_name}_{resolution}.txt" params: min_date_arg = lambda w: f"--min-date {config['frequencies']['resolutions'][w.resolution]['min_date']}", shell: """ + exec &> >(tee {log:q}) + augur frequencies \ --tree {input.tree} \ --method kde \ diff --git a/workflow/snakemake_rules/export.smk b/workflow/snakemake_rules/export.smk index 660e980..f1a202d 100644 --- a/workflow/snakemake_rules/export.smk +++ b/workflow/snakemake_rules/export.smk @@ -19,6 +19,10 @@ rule generate_f_dms_antibody_auspice_config: f_scores_node_data = rules.compute_f_scores_node_data.output.f_scores_node_data output: auspice_config = "results/{a_or_b}/{build_name}/{resolution}/auspice_config_f_dms_antibodies.json" + log: + "logs/generate_f_dms_antibody_auspice_config_{a_or_b}_{build_name}_{resolution}.txt" + benchmark: + "benchmarks/generate_f_dms_antibody_auspice_config_{a_or_b}_{build_name}_{resolution}.txt" params: antibodies = config["f_dms_antibodies"], continuous_scale = " ".join( @@ -28,6 +32,8 @@ rule generate_f_dms_antibody_auspice_config: ) shell: """ + exec &> >(tee {log:q}) + python scripts/generate_f_dms_antibody_auspice_config.py \ --antibodies {params.antibodies} \ --continuous-scale {params.continuous_scale} \ @@ -42,8 +48,14 @@ rule colors: metadata = "results/{a_or_b}/metadata.tsv", output: colors = "results/{a_or_b}/colors.tsv" + log: + "logs/colors_{a_or_b}.txt" + benchmark: + "benchmarks/colors_{a_or_b}.txt" shell: """ + exec &> >(tee {log:q}) + python scripts/assign-colors.py \ --color-schemes {input.color_schemes} \ --ordering {input.color_orderings} \ @@ -82,11 +94,17 @@ rule export: description = config["description"] output: auspice_json = build_dir + "/{a_or_b}/{build_name}/{resolution}/tree.json" + log: + "logs/export_{a_or_b}_{build_name}_{resolution}.txt" + benchmark: + "benchmarks/export_{a_or_b}_{build_name}_{resolution}.txt" params: title = lambda w: f"RSV-{w.a_or_b.upper()} phylogeny", strain_id=config["strain_id_field"], shell: """ + exec &> >(tee {log:q}) + augur export v2 \ --tree {input.tree} \ --metadata {input.metadata} \ @@ -110,11 +128,17 @@ rule final_strain_name: output: auspice_json=build_dir + "/{a_or_b}/{build_name}/{resolution}/tree_renamed.json", freq_json= "auspice/rsv_{a_or_b}_{build_name}_{resolution}_tip-frequencies.json" + log: + "logs/final_strain_name_{a_or_b}_{build_name}_{resolution}.txt" + benchmark: + "benchmarks/final_strain_name_{a_or_b}_{build_name}_{resolution}.txt" params: strain_id=config["strain_id_field"], display_strain_field=config["display_strain_field"], shell: """ + exec &> >(tee {log:q}) + python3 scripts/set_final_strain_name.py --metadata {input.metadata} \ --metadata-id-columns {params.strain_id} \ --input-auspice-json {input.auspice_json} \ @@ -131,12 +155,18 @@ rule rename_and_ready_for_nextclade: pathogen_json= "nextclade/config/pathogen.json", output: auspice_json= "auspice/rsv_{a_or_b}_{build_name}_{resolution}.json", + log: + "logs/rename_and_ready_for_nextclade_{a_or_b}_{build_name}_{resolution}.txt" + benchmark: + "benchmarks/rename_and_ready_for_nextclade_{a_or_b}_{build_name}_{resolution}.txt" params: accession= lambda w: config["nextclade_attributes"][w.a_or_b]["accession"], name= lambda w: config["nextclade_attributes"][w.a_or_b]["name"], ref_name= lambda w: config["nextclade_attributes"][w.a_or_b]["reference_name"] shell: """ + exec &> >(tee {log:q}) + python3 scripts/rename_and_nextclade.py \ --input-auspice-json {input.auspice_json} \ --pathogen-json {input.pathogen_json} \ diff --git a/workflow/snakemake_rules/glycosylation.smk b/workflow/snakemake_rules/glycosylation.smk index 599e20a..2f7a6fc 100644 --- a/workflow/snakemake_rules/glycosylation.smk +++ b/workflow/snakemake_rules/glycosylation.smk @@ -4,10 +4,16 @@ rule glycosylation: translations = rules.translate.output.node_data output: glycosylations = build_dir + "/{a_or_b}/{build_name}/{resolution}/glyc.json" + log: + "logs/glycosylation_{a_or_b}_{build_name}_{resolution}.txt" + benchmark: + "benchmarks/glycosylation_{a_or_b}_{build_name}_{resolution}.txt" params: aa_data = build_dir + "/{a_or_b}/{build_name}/{resolution}/aligned_{build_name}.fasta" shell: """ + exec &> >(tee {log:q}) + python scripts/glycosylation.py \ --alignment {params.aa_data} \ --tree {input.tree} \ diff --git a/workflow/snakemake_rules/merge_inputs.smk b/workflow/snakemake_rules/merge_inputs.smk index fb335af..7d66df0 100644 --- a/workflow/snakemake_rules/merge_inputs.smk +++ b/workflow/snakemake_rules/merge_inputs.smk @@ -70,9 +70,9 @@ if len(_input_metadata) == 1: output: metadata = "results/{a_or_b}/metadata.tsv", log: - "logs/{a_or_b}/decompress_metadata.txt", + "logs/decompress_metadata_{a_or_b}.txt" benchmark: - "benchmarks/{a_or_b}/decompress_metadata.txt", + "benchmarks/decompress_metadata_{a_or_b}.txt" shell: r""" exec &> >(tee {log:q}) @@ -95,9 +95,9 @@ else: output: metadata = "results/{a_or_b}/metadata.tsv" log: - "logs/{a_or_b}/merge_metadata.txt", + "logs/merge_metadata_{a_or_b}.txt" benchmark: - "benchmarks/{a_or_b}/merge_metadata.txt" + "benchmarks/merge_metadata_{a_or_b}.txt" shell: r""" exec &> >(tee {log:q}) @@ -122,9 +122,9 @@ if len(_input_sequences) == 1: output: sequences = "results/{a_or_b}/sequences.fasta", log: - "logs/{a_or_b}/decompress_sequences.txt", + "logs/decompress_sequences_{a_or_b}.txt" benchmark: - "benchmarks/{a_or_b}/decompress_sequences.txt", + "benchmarks/decompress_sequences_{a_or_b}.txt" shell: r""" exec &> >(tee {log:q}) @@ -144,9 +144,9 @@ else: output: sequences = "results/{a_or_b}/sequences.fasta", log: - "logs/{a_or_b}/merge_sequences.txt", + "logs/merge_sequences_{a_or_b}.txt" benchmark: - "benchmarks/{a_or_b}/merge_sequences.txt" + "benchmarks/merge_sequences_{a_or_b}.txt" shell: r""" exec &> >(tee {log:q}) From 16e121f6ba8d9d2bfc94fc8efe3f0f88776d073a Mon Sep 17 00:00:00 2001 From: Victor Lin Date: Fri, 6 Mar 2026 14:24:20 -0800 Subject: [PATCH 10/19] Use raw, triple-quoted shell blocks Improves readability of Snakemake console output. --- workflow/snakemake_rules/clades.smk | 8 ++-- workflow/snakemake_rules/core.smk | 46 +++++++++++----------- workflow/snakemake_rules/export.smk | 10 ++--- workflow/snakemake_rules/glycosylation.smk | 2 +- 4 files changed, 33 insertions(+), 33 deletions(-) diff --git a/workflow/snakemake_rules/clades.smk b/workflow/snakemake_rules/clades.smk index fde307e..66993f4 100644 --- a/workflow/snakemake_rules/clades.smk +++ b/workflow/snakemake_rules/clades.smk @@ -13,7 +13,7 @@ rule clades_genome: benchmark: "benchmarks/clades_genome_{a_or_b}_{build_name}_{resolution}.txt" shell: - """ + r""" exec &> >(tee {log:q}) augur clades --tree {input.tree} \ @@ -39,7 +39,7 @@ rule clades_Goya: benchmark: "benchmarks/clades_Goya_{a_or_b}_{build_name}_{resolution}.txt" shell: - """ + r""" exec &> >(tee {log:q}) augur clades --tree {input.tree} \ @@ -64,7 +64,7 @@ rule clades_consortium: benchmark: "benchmarks/clades_consortium_{a_or_b}_{build_name}_{resolution}.txt" shell: - """ + r""" exec &> >(tee {log:q}) augur clades --tree {input.tree} \ @@ -83,7 +83,7 @@ rule download_clades: params: url = lambda w: f"https://raw.githubusercontent.com/rsv-lineages/lineage-designation-{w.a_or_b.upper()}/main/.auto-generated/lineages.tsv" shell: - """ + r""" exec &> >(tee {log:q}) curl {params.url} --output {output.clades} diff --git a/workflow/snakemake_rules/core.smk b/workflow/snakemake_rules/core.smk index 8f70a72..b9e42a4 100644 --- a/workflow/snakemake_rules/core.smk +++ b/workflow/snakemake_rules/core.smk @@ -21,7 +21,7 @@ rule index_sequences: benchmark: "benchmarks/index_sequences_{a_or_b}.txt" shell: - """ + r""" exec &> >(tee {log:q}) augur index \ @@ -49,7 +49,7 @@ rule newreference: params: gene=lambda w: w.gene.split("-")[0], shell: - """ + r""" exec &> >(tee {log:q}) python scripts/newreference.py \ @@ -89,7 +89,7 @@ rule filter_recent: exclude_where=config["filter"]["exclude_where"]["recent"], missing_data_threshold=config["filter"]["missing_data_threshold"], shell: - """ + r""" exec &> >(tee {log:q}) augur filter \ @@ -144,7 +144,7 @@ rule filter_background: exclude_where=config["filter"]["exclude_where"]["background"], missing_data_threshold=config["filter"]["missing_data_threshold"], shell: - """ + r""" exec &> >(tee {log:q}) augur filter \ @@ -224,7 +224,7 @@ rule combine_samples: benchmark: "benchmarks/combine_samples_{a_or_b}_{build_name}_{resolution}.txt" shell: - """ + r""" exec &> >(tee {log:q}) cat {input.subsamples} | seqkit rmdup > {output} @@ -249,7 +249,7 @@ rule get_nextclade_dataset: else "nextstrain/rsv/b/EPI_ISL_1653999" ), shell: - """ + r""" exec &> >(tee {log:q}) nextclade3 dataset get -n {params.ds_name} --output-zip {output.dataset} @@ -277,7 +277,7 @@ rule filter_for_pre_subsample_alignment: strain_id=config["strain_id_field"], min_date=lambda w: config["filter"]["resolutions"][w.resolution]["min_date"], shell: - """ + r""" exec &> >(tee {log:q}) augur filter \ @@ -313,7 +313,7 @@ rule align_pre_subsample_sequences: benchmark: "benchmarks/align_pre_subsample_sequences_{a_or_b}_{build_name}_{resolution}.txt" shell: - """ + r""" exec &> >(tee {log:q}) nextclade3 run -j {threads}\ @@ -342,7 +342,7 @@ rule score_pre_subsample_f_proteins: dms_antibodies=lambda w: " ".join(shlex.quote(ab) for ab in config["f_dms_antibodies"]), only_positive_escape=config["dms_only_positive_escape"], shell: - """ + r""" exec &> >(tee {log:q}) python scripts/score_f_sequences.py fasta \ @@ -369,7 +369,7 @@ rule add_f_scores_to_pre_subsample_metadata: params: strain_id=config["strain_id_field"], shell: - """ + r""" exec &> >(tee {log:q}) python scripts/merge_f_scores.py \ @@ -403,7 +403,7 @@ rule enrich_antibody_escape: max_identical_f_prot_muts=lambda w: config["enrich_antibody_escape"][w.build_name]["max_identical_f_prot_muts"], max_identical_max_escape_mut=lambda w: config["enrich_antibody_escape"][w.build_name]["max_identical_max_escape_mut"], shell: - """ + r""" exec &> >(tee {log:q}) python scripts/enrich_antibody_escape.py \ @@ -438,7 +438,7 @@ rule genome_align: benchmark: "benchmarks/genome_align_{a_or_b}_{build_name}_{resolution}.txt" shell: - """ + r""" exec &> >(tee {log:q}) nextclade3 run -j {threads}\ @@ -463,7 +463,7 @@ rule cut: benchmark: "benchmarks/cut_{a_or_b}_{build_name}_{resolution}_{gene}.txt" shell: - """ + r""" exec &> >(tee {log:q}) python scripts/cut.py \ @@ -488,7 +488,7 @@ rule realign: "benchmarks/realign_{a_or_b}_{build_name}_{resolution}_{gene}.txt" threads: 4 shell: - """ + r""" exec &> >(tee {log:q}) augur align --nthreads {threads} \ @@ -511,7 +511,7 @@ rule hybrid_align: benchmark: "benchmarks/hybrid_align_{a_or_b}_{build_name}_{resolution}.txt" shell: - """ + r""" exec &> >(tee {log:q}) python scripts/align_for_tree.py \ @@ -547,7 +547,7 @@ rule tree: "benchmarks/tree_{a_or_b}_{build_name}_{resolution}.txt" threads: 4 shell: - """ + r""" exec &> >(tee {log:q}) augur tree \ @@ -583,7 +583,7 @@ rule refine: date_inference=config["refine"]["date_inference"], strain_id=config["strain_id_field"], shell: - """ + r""" exec &> >(tee {log:q}) augur refine \ @@ -656,7 +656,7 @@ rule distances: mem_mb=8000, time="00:30:00", shell: - """ + r""" exec &> >(tee {log:q}) augur distance \ @@ -694,7 +694,7 @@ rule ancestral: benchmark: "benchmarks/ancestral_{a_or_b}_{build_name}_{resolution}.txt" shell: - """ + r""" exec &> >(tee {log:q}) augur ancestral \ @@ -726,7 +726,7 @@ rule translate: params: alignment_file_mask=build_dir + "/{a_or_b}/{build_name}/{resolution}/aligned_%GENE.fasta", shell: - """ + r""" exec &> >(tee {log:q}) augur translate \ @@ -758,7 +758,7 @@ rule compute_f_scores_node_data: benchmark: "benchmarks/compute_f_scores_node_data_{a_or_b}_{build_name}_{resolution}.txt" shell: - """ + r""" exec &> >(tee {log:q}) python scripts/score_f_sequences.py tree \ @@ -786,7 +786,7 @@ rule traits: columns=config["traits"]["columns"], strain_id=config["strain_id_field"], shell: - """ + r""" exec &> >(tee {log:q}) augur traits \ @@ -811,7 +811,7 @@ rule frequencies: params: min_date_arg = lambda w: f"--min-date {config['frequencies']['resolutions'][w.resolution]['min_date']}", shell: - """ + r""" exec &> >(tee {log:q}) augur frequencies \ diff --git a/workflow/snakemake_rules/export.smk b/workflow/snakemake_rules/export.smk index f1a202d..0cfc01f 100644 --- a/workflow/snakemake_rules/export.smk +++ b/workflow/snakemake_rules/export.smk @@ -31,7 +31,7 @@ rule generate_f_dms_antibody_auspice_config: ["#440154", "#472d7b", "#3b528b", "#2c728e", "#21918c", "#28ae80", "#5ec962", "#addc30", "#fde725"] ) shell: - """ + r""" exec &> >(tee {log:q}) python scripts/generate_f_dms_antibody_auspice_config.py \ @@ -53,7 +53,7 @@ rule colors: benchmark: "benchmarks/colors_{a_or_b}.txt" shell: - """ + r""" exec &> >(tee {log:q}) python scripts/assign-colors.py \ @@ -102,7 +102,7 @@ rule export: title = lambda w: f"RSV-{w.a_or_b.upper()} phylogeny", strain_id=config["strain_id_field"], shell: - """ + r""" exec &> >(tee {log:q}) augur export v2 \ @@ -136,7 +136,7 @@ rule final_strain_name: strain_id=config["strain_id_field"], display_strain_field=config["display_strain_field"], shell: - """ + r""" exec &> >(tee {log:q}) python3 scripts/set_final_strain_name.py --metadata {input.metadata} \ @@ -164,7 +164,7 @@ rule rename_and_ready_for_nextclade: name= lambda w: config["nextclade_attributes"][w.a_or_b]["name"], ref_name= lambda w: config["nextclade_attributes"][w.a_or_b]["reference_name"] shell: - """ + r""" exec &> >(tee {log:q}) python3 scripts/rename_and_nextclade.py \ diff --git a/workflow/snakemake_rules/glycosylation.smk b/workflow/snakemake_rules/glycosylation.smk index 2f7a6fc..f7138a2 100644 --- a/workflow/snakemake_rules/glycosylation.smk +++ b/workflow/snakemake_rules/glycosylation.smk @@ -11,7 +11,7 @@ rule glycosylation: params: aa_data = build_dir + "/{a_or_b}/{build_name}/{resolution}/aligned_{build_name}.fasta" shell: - """ + r""" exec &> >(tee {log:q}) python scripts/glycosylation.py \ From cc32e8283f39fbd7f0a3a8b75b9d44696f29fdfd Mon Sep 17 00:00:00 2001 From: Victor Lin Date: Fri, 6 Mar 2026 14:43:44 -0800 Subject: [PATCH 11/19] Avoid the `message` rule attribute This allows Snakemake to show other critical details such as job id, rule name, input, output. --- workflow/snakemake_rules/chores.smk | 2 - workflow/snakemake_rules/clades.smk | 13 +++- workflow/snakemake_rules/core.smk | 117 +++++++++++++--------------- workflow/snakemake_rules/export.smk | 8 +- 4 files changed, 70 insertions(+), 70 deletions(-) diff --git a/workflow/snakemake_rules/chores.smk b/workflow/snakemake_rules/chores.smk index 234f8a6..27578ea 100644 --- a/workflow/snakemake_rules/chores.smk +++ b/workflow/snakemake_rules/chores.smk @@ -8,8 +8,6 @@ rule update_example_data_wildcards: - sets the subsampling size to 50 - groups by year and country """ - message: - "Update example data" input: sequences = "results/{a_or_b}/sequences.fasta", metadata = "results/{a_or_b}/metadata.tsv", diff --git a/workflow/snakemake_rules/clades.smk b/workflow/snakemake_rules/clades.smk index 66993f4..53aee03 100644 --- a/workflow/snakemake_rules/clades.smk +++ b/workflow/snakemake_rules/clades.smk @@ -1,6 +1,7 @@ rule clades_genome: - message: - "adding clades based on the entire genome" + """ + adding clades based on the entire genome + """ input: tree = rules.refine.output.tree, aa_muts = rules.translate.output.node_data, @@ -26,7 +27,9 @@ rule clades_genome: rule clades_Goya: - message: "Adding internal clade labels" + """ + Adding internal clade labels + """ input: tree = rules.refine.output.tree, aa_muts = rules.translate.output.node_data, @@ -51,7 +54,9 @@ rule clades_Goya: """ rule clades_consortium: - message: "Adding internal clade labels" + """ + Adding internal clade labels + """ input: tree = rules.refine.output.tree, aa_muts = rules.translate.output.node_data, diff --git a/workflow/snakemake_rules/core.smk b/workflow/snakemake_rules/core.smk index b9e42a4..d4b9212 100644 --- a/workflow/snakemake_rules/core.smk +++ b/workflow/snakemake_rules/core.smk @@ -7,10 +7,9 @@ This part of the workflow expects input files rule index_sequences: - message: - """ - Creating an index of sequence composition for filtering. - """ + """ + Creating an index of sequence composition for filtering. + """ input: sequences="results/{a_or_b}/sequences.fasta", output: @@ -31,10 +30,9 @@ rule index_sequences: rule newreference: - message: - """ - Making new reference - """ + """ + Making new reference + """ input: oldreference="config/{a_or_b}reference.gbk", output: @@ -61,10 +59,9 @@ rule newreference: rule filter_recent: - message: - """ - filtering sequences - """ + """ + filtering sequences + """ input: sequences="results/{a_or_b}/sequences.fasta", metadata="results/{a_or_b}/metadata.tsv", @@ -109,10 +106,9 @@ rule filter_recent: rule filter_background: - message: - """ - filtering sequences - """ + """ + filtering sequences + """ input: sequences="results/{a_or_b}/sequences.fasta", metadata="results/{a_or_b}/metadata.tsv", @@ -166,10 +162,9 @@ rule filter_background: """ rule exclude_preduplication: - message: - """ - excluding sequences predate the duplication starting with A.D or B.D as lineage names - """ + """ + excluding sequences predate the duplication starting with A.D or B.D as lineage names + """ input: sequences=rules.filter_background.output.sequences, metadata=rules.filter_background.output.metadata, @@ -232,10 +227,9 @@ rule combine_samples: rule get_nextclade_dataset: - message: - """ - fetching nextclade dataset - """ + """ + fetching nextclade dataset + """ output: dataset="results/nextclade_rsv-{a_or_b}.zip", log: @@ -257,10 +251,9 @@ rule get_nextclade_dataset: rule filter_for_pre_subsample_alignment: - message: - """ - Do the quality filtering applied to each sequence set before subsampling - """ + """ + Do the quality filtering applied to each sequence set before subsampling + """ input: sequences="results/{a_or_b}/sequences.fasta", metadata="results/{a_or_b}/metadata.tsv", @@ -294,10 +287,9 @@ rule filter_for_pre_subsample_alignment: rule align_pre_subsample_sequences: - message: - """ - Aligning all pre-subsampled quality-filtered sequences - """ + """ + Aligning all pre-subsampled quality-filtered sequences + """ input: sequences=rules.filter_for_pre_subsample_alignment.output.sequences, dataset=rules.get_nextclade_dataset.output.dataset, @@ -326,8 +318,9 @@ rule align_pre_subsample_sequences: rule score_pre_subsample_f_proteins: - message: - "Computing F protein DMS scores for pre-subsampled sequences" + """ + Computing F protein DMS scores for pre-subsampled sequences + """ input: translations_done=rules.align_pre_subsample_sequences.output.translations_done, dms_scores=config["f_dms_data"], @@ -355,8 +348,9 @@ rule score_pre_subsample_f_proteins: rule add_f_scores_to_pre_subsample_metadata: - message: - "Adding F protein scores to pre-subsampled metadata" + """ + Adding F protein scores to pre-subsampled metadata + """ input: original_metadata="results/{a_or_b}/metadata.tsv", f_scores=rules.score_pre_subsample_f_proteins.output.scores, @@ -381,8 +375,9 @@ rule add_f_scores_to_pre_subsample_metadata: rule enrich_antibody_escape: - message: - "Get sequences with high antibody escape to add to tree via custom filtering rule." + """ + Get sequences with high antibody escape to add to tree via custom filtering rule. + """ wildcard_constraints: antibody="|".join(re.escape(antibody) for antibody in config["f_dms_antibodies"]), scoretype="total_escape|max_escape", @@ -420,10 +415,9 @@ rule enrich_antibody_escape: rule genome_align: - message: - """ - Aligning sequences to the reference - """ + """ + Aligning sequences to the reference + """ input: sequences=rules.combine_samples.output.sequences, dataset=rules.get_nextclade_dataset.output.dataset, @@ -535,8 +529,9 @@ def get_alignment(w): rule tree: - message: - "Building tree" + """ + Building tree + """ input: alignment=get_alignment, output: @@ -559,13 +554,12 @@ rule tree: rule refine: - message: - """ - Refining tree - - estimate timetree - - use {params.coalescent} coalescent timescale - - estimate {params.date_inference} node dates - """ + """ + Refining tree + - estimate timetree + - use {params.coalescent} coalescent timescale + - estimate {params.date_inference} node dates + """ input: tree=rules.tree.output.tree, alignment=get_alignment, @@ -671,11 +665,10 @@ rule distances: rule ancestral: - message: - """ - Reconstructing ancestral sequences and mutations - - inferring ambiguous mutations - """ + """ + Reconstructing ancestral sequences and mutations + - inferring ambiguous mutations + """ input: tree=rules.refine.output.tree, alignment=get_alignment, @@ -711,8 +704,9 @@ rule ancestral: rule translate: - message: - "Translating amino acid sequences" + """ + Translating amino acid sequences + """ input: tree=rules.refine.output.tree, node_data=rules.ancestral.output.node_data, @@ -739,10 +733,9 @@ rule translate: rule compute_f_scores_node_data: - message: - """ - Computing F protein antibody escape scores for all tree nodes - """ + """ + Computing F protein antibody escape scores for all tree nodes + """ input: tree_newick=rules.refine.output.tree, aa_muts=rules.translate.output.node_data, diff --git a/workflow/snakemake_rules/export.smk b/workflow/snakemake_rules/export.smk index 0cfc01f..38d3b8b 100644 --- a/workflow/snakemake_rules/export.smk +++ b/workflow/snakemake_rules/export.smk @@ -14,7 +14,9 @@ def get_node_data(w): return node_data rule generate_f_dms_antibody_auspice_config: - message: "Generating auspice config for F DMS antibody colorings" + """ + Generating auspice config for F DMS antibody colorings + """ input: f_scores_node_data = rules.compute_f_scores_node_data.output.f_scores_node_data output: @@ -84,7 +86,9 @@ def auspice_configs(wildcards): return configs rule export: - message: "Exporting data files for auspice" + """ + Exporting data files for auspice + """ input: tree = rules.refine.output.tree, metadata = "results/{a_or_b}/metadata.tsv", From b5be8998e48daba57017c0d3f68f9f1a4c916b83 Mon Sep 17 00:00:00 2001 From: Victor Lin Date: Wed, 4 Mar 2026 14:54:13 -0800 Subject: [PATCH 12/19] Remove unused import --- scripts/newreference.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/newreference.py b/scripts/newreference.py index a89c819..200e766 100644 --- a/scripts/newreference.py +++ b/scripts/newreference.py @@ -1,6 +1,6 @@ from Bio import SeqIO from Bio.SeqRecord import SeqRecord -from Bio.SeqFeature import SeqFeature, FeatureLocation, Seq +from Bio.SeqFeature import SeqFeature, FeatureLocation import shutil import argparse import sys From c366c93e24621912b6b2b3302bd65038c95e428d Mon Sep 17 00:00:00 2001 From: Victor Lin Date: Wed, 4 Mar 2026 14:54:23 -0800 Subject: [PATCH 13/19] Fix typo --- scripts/newreference.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/newreference.py b/scripts/newreference.py index 200e766..e5e4575 100644 --- a/scripts/newreference.py +++ b/scripts/newreference.py @@ -40,7 +40,7 @@ def new_reference(referencefile, outgenbank, outfasta, gene): formatter_class=argparse.ArgumentDefaultsHelpFormatter ) parser.add_argument("--reference", required=True, help="GenBank file with reference sequences") - parser.add_argument("--output-fasta", required=True, help="GenBank new reference file") + parser.add_argument("--output-fasta", required=True, help="FASTA new reference file") parser.add_argument("--output-genbank", required=True, help="GenBank new reference file") parser.add_argument("--gene", help="gene name or genome for entire genome") args = parser.parse_args() From 9d18c686633c1f1e13be30d468414cf5c2e05d09 Mon Sep 17 00:00:00 2001 From: Victor Lin Date: Wed, 4 Mar 2026 14:39:07 -0800 Subject: [PATCH 14/19] Move conditional input logic to function This removes an unnecessary run of rule newreference with gene=genome. That had made an unnecessary file copy and conversion from GenBank to FASTA format. This is also more consistent with the existing get_alignment() function. --- scripts/newreference.py | 11 +++-------- workflow/snakemake_rules/core.smk | 11 +++++++++-- 2 files changed, 12 insertions(+), 10 deletions(-) diff --git a/scripts/newreference.py b/scripts/newreference.py index e5e4575..8d411b3 100644 --- a/scripts/newreference.py +++ b/scripts/newreference.py @@ -1,7 +1,6 @@ from Bio import SeqIO from Bio.SeqRecord import SeqRecord from Bio.SeqFeature import SeqFeature, FeatureLocation -import shutil import argparse import sys @@ -36,18 +35,14 @@ def new_reference(referencefile, outgenbank, outfasta, gene): if __name__ == '__main__': parser = argparse.ArgumentParser( - description="make new reference depending on whether the entire genome or only part is to be used for the tree", + description="make new reference", formatter_class=argparse.ArgumentDefaultsHelpFormatter ) parser.add_argument("--reference", required=True, help="GenBank file with reference sequences") parser.add_argument("--output-fasta", required=True, help="FASTA new reference file") parser.add_argument("--output-genbank", required=True, help="GenBank new reference file") - parser.add_argument("--gene", help="gene name or genome for entire genome") + parser.add_argument("--gene", help="gene name") args = parser.parse_args() - if args.gene=='genome': - shutil.copy(args.reference, args.output_genbank) - SeqIO.write(SeqIO.read(args.reference, 'genbank'), args.output_fasta, 'fasta') - else: - new_reference(args.reference, args.output_genbank, args.output_fasta, args.gene) + new_reference(args.reference, args.output_genbank, args.output_fasta, args.gene) diff --git a/workflow/snakemake_rules/core.smk b/workflow/snakemake_rules/core.smk index d4b9212..a7f7c13 100644 --- a/workflow/snakemake_rules/core.smk +++ b/workflow/snakemake_rules/core.smk @@ -528,6 +528,13 @@ def get_alignment(w): ) +def get_reference(w): + if w.build_name == "genome": + return f"config/{w.a_or_b}reference.gbk" + else: + return build_dir + f"/{w.a_or_b}/{w.build_name}_reference.gbk" + + rule tree: """ Building tree @@ -673,7 +680,7 @@ rule ancestral: tree=rules.refine.output.tree, alignment=get_alignment, translations=rules.genome_align.output.translations, - root_sequence=build_dir + "/{a_or_b}/{build_name}_reference.gbk", + root_sequence=get_reference, output: node_data=build_dir + "/{a_or_b}/{build_name}/{resolution}/nt_muts.json", translations_done=build_dir + "/{a_or_b}/{build_name}/{resolution}/translations.done", @@ -710,7 +717,7 @@ rule translate: input: tree=rules.refine.output.tree, node_data=rules.ancestral.output.node_data, - reference=build_dir + "/{a_or_b}/{build_name}_reference.gbk", + reference=get_reference, output: node_data=build_dir + "/{a_or_b}/{build_name}/{resolution}/aa_muts.json", log: From abd3bd5475c8375798f6e50278ec4224994d9bec Mon Sep 17 00:00:00 2001 From: Victor Lin Date: Wed, 4 Mar 2026 14:39:07 -0800 Subject: [PATCH 15/19] Restrict gene to G|F This removes an unnecessary run of rule newreference with gene=F-antibody-escape. build_name=F-antibody-escape now reuses the output files from gene=F. --- Snakefile | 1 + workflow/snakemake_rules/core.smk | 7 +++---- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/Snakefile b/Snakefile index d2ffb97..33e77ff 100644 --- a/Snakefile +++ b/Snakefile @@ -16,6 +16,7 @@ wildcard_constraints: a_or_b=r"a|b", build_name="|".join(config.get("builds_to_run", ["genome"])), resolution="|".join(config.get("resolutions_to_run", ["all-time"])), + gene="G|F", build_dir = "results" diff --git a/workflow/snakemake_rules/core.smk b/workflow/snakemake_rules/core.smk index a7f7c13..6e6a2d5 100644 --- a/workflow/snakemake_rules/core.smk +++ b/workflow/snakemake_rules/core.smk @@ -44,8 +44,6 @@ rule newreference: "logs/newreference_{a_or_b}_{gene}.txt" benchmark: "benchmarks/newreference_{a_or_b}_{gene}.txt" - params: - gene=lambda w: w.gene.split("-")[0], shell: r""" exec &> >(tee {log:q}) @@ -54,7 +52,7 @@ rule newreference: --reference {input.oldreference} \ --output-genbank {output.newreferencegbk} \ --output-fasta {output.newreferencefasta} \ - --gene {params.gene} + --gene {wildcards.gene} """ @@ -532,7 +530,8 @@ def get_reference(w): if w.build_name == "genome": return f"config/{w.a_or_b}reference.gbk" else: - return build_dir + f"/{w.a_or_b}/{w.build_name}_reference.gbk" + gene = config["cds"][w.build_name] + return build_dir + f"/{w.a_or_b}/{gene}_reference.gbk" rule tree: From 323387650c08e878d27b63e32739c199dcf4f345 Mon Sep 17 00:00:00 2001 From: Victor Lin Date: Wed, 4 Mar 2026 15:01:29 -0800 Subject: [PATCH 16/19] Make --gene required All usage in the workflow specifies a gene. --- scripts/newreference.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/scripts/newreference.py b/scripts/newreference.py index 8d411b3..00355da 100644 --- a/scripts/newreference.py +++ b/scripts/newreference.py @@ -19,7 +19,7 @@ def new_reference(referencefile, outgenbank, outfasta, gene): # If user provides a --gene 'some name' that is not found, error out as this may indicate that # the gene name is misspelled or the user may be using the wrong GenBank file. - if(gene is not None and startofgene is None and endofgene is None): + if(startofgene is None and endofgene is None): print(f"ERROR: No '{gene}' was found under 'gene' or 'CDS' features in the GenBank file.", file=sys.stderr) sys.exit(1) @@ -35,13 +35,13 @@ def new_reference(referencefile, outgenbank, outfasta, gene): if __name__ == '__main__': parser = argparse.ArgumentParser( - description="make new reference", + description="make new reference based on a gene", formatter_class=argparse.ArgumentDefaultsHelpFormatter ) parser.add_argument("--reference", required=True, help="GenBank file with reference sequences") parser.add_argument("--output-fasta", required=True, help="FASTA new reference file") parser.add_argument("--output-genbank", required=True, help="GenBank new reference file") - parser.add_argument("--gene", help="gene name") + parser.add_argument("--gene", required=True, help="gene name") args = parser.parse_args() new_reference(args.reference, args.output_genbank, args.output_fasta, args.gene) From e560953e49e7a4b53d556fcd2e8d2638a52758b1 Mon Sep 17 00:00:00 2001 From: Victor Lin Date: Mon, 16 Mar 2026 16:45:42 -0700 Subject: [PATCH 17/19] Remove log file This was added before .gitignore. --- logs/traits_rsv_rsv.txt | 0 1 file changed, 0 insertions(+), 0 deletions(-) delete mode 100644 logs/traits_rsv_rsv.txt diff --git a/logs/traits_rsv_rsv.txt b/logs/traits_rsv_rsv.txt deleted file mode 100644 index e69de29..0000000 From 4f0af48429f6aeed864e5427424443c5653b12fc Mon Sep 17 00:00:00 2001 From: Victor Lin Date: Fri, 6 Mar 2026 15:38:27 -0800 Subject: [PATCH 18/19] Replace exclude_preduplication with query The previous behavior increased the chance of under-sampling by allowing sequences that don't start with "{subtype}.D" to pass subsampling, only to be later excluded. The new query approach turns it into a proper pre-subsampling filter step, and also avoids an extra step in the workflow. --- workflow/snakemake_rules/core.smk | 36 ++++--------------------------- 1 file changed, 4 insertions(+), 32 deletions(-) diff --git a/workflow/snakemake_rules/core.smk b/workflow/snakemake_rules/core.smk index 6e6a2d5..260afd8 100644 --- a/workflow/snakemake_rules/core.smk +++ b/workflow/snakemake_rules/core.smk @@ -115,7 +115,7 @@ rule filter_background: exclude=config["exclude"], output: sequences=build_dir - + "/{a_or_b}/{build_name}/{resolution}/filtered_background_pre.fasta", + + "/{a_or_b}/{build_name}/{resolution}/filtered_background.fasta", metadata=build_dir + "/{a_or_b}/{build_name}/{resolution}/filtered_background_metadata.tsv", log: @@ -137,6 +137,7 @@ rule filter_background: ], exclude_where=config["filter"]["exclude_where"]["background"], missing_data_threshold=config["filter"]["missing_data_threshold"], + clade_prefix=lambda w: f"{w.a_or_b.upper()}.D", shell: r""" exec &> >(tee {log:q}) @@ -156,45 +157,16 @@ rule filter_background: --output-metadata {output.metadata} \ --group-by {params.group_by} \ --subsample-max-sequences {params.subsample_max_sequences} \ - --query '({params.min_coverage}) & missing_data<{params.missing_data_threshold}' + --query '({params.min_coverage}) & missing_data<{params.missing_data_threshold} & clade.str.startswith("{params.clade_prefix}", na=False)' """ -rule exclude_preduplication: - """ - excluding sequences predate the duplication starting with A.D or B.D as lineage names - """ - input: - sequences=rules.filter_background.output.sequences, - metadata=rules.filter_background.output.metadata, - output: - sequences=build_dir - + "/{a_or_b}/{build_name}/{resolution}/filtered_background.fasta", - benchmark: - "benchmarks/exclude_preduplication_{a_or_b}_{build_name}_{resolution}.txt" - run: - import pandas as pd - import Bio.SeqIO as SeqIO - - desired_prefix = wildcards.a_or_b.upper() + ".D" - - metadata_df = pd.read_csv(input.metadata, sep="\t") - ids_to_keep = set(metadata_df[ - metadata_df["clade"].str.startswith(desired_prefix, na=False) - ]["accession"].tolist()) - - with open(output.sequences, "w") as seq_out: - for seq in SeqIO.parse(input.sequences, "fasta"): - if seq.id in ids_to_keep: - SeqIO.write(seq, seq_out, "fasta") - - rule combine_samples: input: subsamples=lambda w: ( ( [ rules.filter_recent.output.sequences, - rules.exclude_preduplication.output.sequences, + rules.filter_background.output.sequences, ] if "background_min_date" in config["filter"]["resolutions"][w.resolution] else [rules.filter_recent.output.sequences] From 839fe02e15743908938a1ab50d5c897cc460f294 Mon Sep 17 00:00:00 2001 From: Victor Lin Date: Tue, 7 Apr 2026 17:29:00 -0700 Subject: [PATCH 19/19] Remove unused config "phylo: limit time-scoped builds to ancestor with G duplication" (af8a9b78) added a "1y" entry to resolutions under filter config, but not resolutions_to_run. I assume this was added by mistake. --- config/configfile.yaml | 3 --- 1 file changed, 3 deletions(-) diff --git a/config/configfile.yaml b/config/configfile.yaml index 3bd9a4c..8d463b0 100644 --- a/config/configfile.yaml +++ b/config/configfile.yaml @@ -51,9 +51,6 @@ filter: 3y: min_date: 3Y background_min_date: 12Y - 1y: - min_date: 1Y - background_min_date: 12Y subsample_max_sequences: genome: 3000