From 03e8f9b86e3484c45f407a3236353daaa1eb5f19 Mon Sep 17 00:00:00 2001 From: Sarah Griffiths Date: Mon, 28 Oct 2024 15:52:49 +0000 Subject: [PATCH 1/8] add seqkit module --- assets/multiqc_config.yml | 4 ++-- docs/output.md | 13 +++++++++++++ modules.json | 5 +++++ nextflow.config | 2 +- workflows/seqinspector.nf | 10 ++++++++++ 5 files changed, 31 insertions(+), 3 deletions(-) diff --git a/assets/multiqc_config.yml b/assets/multiqc_config.yml index e960d07e..ca215202 100644 --- a/assets/multiqc_config.yml +++ b/assets/multiqc_config.yml @@ -1,7 +1,7 @@ report_comment: > - This report has been generated by the nf-core/seqinspector + This report has been generated by the nf-core/seqinspector analysis pipeline. For information about how to interpret these results, please see the - documentation. + documentation. report_section_order: "nf-core-seqinspector-methods-description": order: -1000 diff --git a/docs/output.md b/docs/output.md index 15c29ce2..6b42a222 100644 --- a/docs/output.md +++ b/docs/output.md @@ -12,6 +12,7 @@ The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes d - [FastQC](#fastqc) - Raw read QC - [MultiQC](#multiqc) - Aggregate report describing results and QC from the whole pipeline +- [SeqkitStats](#seqkitstats) - Per sample TSV file with summary statistics - [Pipeline information](#pipeline-information) - Report metrics generated during the workflow execution ### FastQC @@ -69,6 +70,18 @@ nf-core/seqinspector will generate the following MultiQC reports: Results generated by MultiQC collate pipeline QC from supported tools e.g. FastQC. The pipeline has special steps which also allow the software versions to be reported in the MultiQC output for future traceability. For more information about how to use MultiQC reports, see . +### SeqkitStats + +
+Output files + +- `seqkit/` + - `*.tsv`: Per sample TSV file with summary statistics. + +
+ +[FastQC](http://www.bioinformatics.babraham.ac.uk/projects/fastqc/) gives general quality metrics about your sequenced reads. It provides information about the quality score distribution across your reads, per base sequence content (%A/T/G/C), adapter contamination and overrepresented sequences. For further reading and documentation see the [FastQC help pages](http://www.bioinformatics.babraham.ac.uk/projects/fastqc/Help/). + ### Pipeline information
diff --git a/modules.json b/modules.json index 70f3486c..da456548 100644 --- a/modules.json +++ b/modules.json @@ -14,6 +14,11 @@ "branch": "master", "git_sha": "19ca321db5d8bd48923262c2eca6422359633491", "installed_by": ["modules"] + }, + "seqkit/stats": { + "branch": "master", + "git_sha": "666652151335353eef2fcd58880bcef5bc2928e1", + "installed_by": ["modules"] } } }, diff --git a/nextflow.config b/nextflow.config index 70e433a0..08f9c6f6 100644 --- a/nextflow.config +++ b/nextflow.config @@ -240,7 +240,7 @@ manifest { description = """Pipeline to QC your sequences""" mainScript = 'main.nf' nextflowVersion = '!>=23.04.0' - version = '1.0dev' + version = '3.0.2' doi = '' } diff --git a/workflows/seqinspector.nf b/workflows/seqinspector.nf index 1ba00c62..b0bf95cb 100644 --- a/workflows/seqinspector.nf +++ b/workflows/seqinspector.nf @@ -9,6 +9,8 @@ include { FASTQC } from '../modules/nf-core/fastqc/main' include { MULTIQC as MULTIQC_GLOBAL } from '../modules/nf-core/multiqc/main' include { MULTIQC as MULTIQC_PER_TAG } from '../modules/nf-core/multiqc/main' +include { SEQKIT_STATS } from '../modules/nf-core/seqkit/stats/main' + include { paramsSummaryMap } from 'plugin/nf-validation' include { paramsSummaryMultiqc } from '../subworkflows/nf-core/utils_nfcore_pipeline' include { softwareVersionsToYAML } from '../subworkflows/nf-core/utils_nfcore_pipeline' @@ -138,6 +140,14 @@ workflow SEQINSPECTOR { Channel.empty().toList() ) + // + // MODULE: Run SEQKIT_STATS + // + SEQKIT_STATS ( + ch_samplesheet + ) + ch_versions = ch_versions.mix(SEQKIT_STATS.out.versions.first()) + emit: global_report = MULTIQC_GLOBAL.out.report.toList() // channel: /path/to/multiqc_report.html grouped_reports = MULTIQC_PER_TAG.out.report.toList() // channel: [ /path/to/multiqc_report.html ] From 352e1733e6246d0c31422984e34be838c0953d28 Mon Sep 17 00:00:00 2001 From: Sarah Griffiths Date: Tue, 29 Oct 2024 15:20:21 +0000 Subject: [PATCH 2/8] update docs --- docs/output.md | 2 +- 1 file changed, 
1 insertion(+), 1 deletion(-) diff --git a/docs/output.md b/docs/output.md index 45afbb18..659bf9d9 100644 --- a/docs/output.md +++ b/docs/output.md @@ -70,7 +70,7 @@ Results generated by MultiQC collate pipeline QC from supported tools e.g. FastQ
-[FastQC](http://www.bioinformatics.babraham.ac.uk/projects/fastqc/) gives general quality metrics about your sequenced reads. It provides information about the quality score distribution across your reads, per base sequence content (%A/T/G/C), adapter contamination and overrepresented sequences. For further reading and documentation see the [FastQC help pages](http://www.bioinformatics.babraham.ac.uk/projects/fastqc/Help/). +[SeqkitStats](https://bioinf.shenwei.me/seqkit/usage/#stats) gives general quality metrics about your sequenced reads, including average read lengths, GC (%) and N50 values. For further reading and documentation see the [Seqkit help pages](https://bioinf.shenwei.me/seqkit/). ### Pipeline information From ee77c9ad2b39b512b7a46c440846507f729be727 Mon Sep 17 00:00:00 2001 From: Sarah Griffiths Date: Tue, 29 Oct 2024 15:23:14 +0000 Subject: [PATCH 3/8] docs --- nextflow.config | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nextflow.config b/nextflow.config index 38c05c1d..c6e66120 100644 --- a/nextflow.config +++ b/nextflow.config @@ -227,7 +227,7 @@ manifest { description = """Pipeline to QC your sequences""" mainScript = 'main.nf' nextflowVersion = '!>=24.04.2' - version = '3.0.2' + version = '1.0dev' doi = '' } From e4ac008cf1e68ccd4cfd2d80e6adca1b464aea36 Mon Sep 17 00:00:00 2001 From: Sarah Griffiths Date: Tue, 29 Oct 2024 15:25:49 +0000 Subject: [PATCH 4/8] fix-ci --- assets/multiqc_config.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/assets/multiqc_config.yml b/assets/multiqc_config.yml index 9ef5ef5a..e960d07e 100644 --- a/assets/multiqc_config.yml +++ b/assets/multiqc_config.yml @@ -1,7 +1,7 @@ report_comment: > - This report has been generated by the nf-core/seqinspector + This report has been generated by the nf-core/seqinspector analysis pipeline. For information about how to interpret these results, please see the - documentation. + documentation. 
report_section_order: "nf-core-seqinspector-methods-description": order: -1000 From 5c0e56df3f90b7fe13bcd8013b7f2179964356d1 Mon Sep 17 00:00:00 2001 From: Sarah Griffiths Date: Tue, 29 Oct 2024 15:37:00 +0000 Subject: [PATCH 5/8] actually add the module --- modules/nf-core/seqkit/stats/environment.yml | 5 + modules/nf-core/seqkit/stats/main.nf | 34 +++ modules/nf-core/seqkit/stats/meta.yml | 48 ++++ .../nf-core/seqkit/stats/tests/main.nf.test | 141 ++++++++++++ .../seqkit/stats/tests/main.nf.test.snap | 212 ++++++++++++++++++ modules/nf-core/seqkit/stats/tests/tags.yml | 2 + 6 files changed, 442 insertions(+) create mode 100644 modules/nf-core/seqkit/stats/environment.yml create mode 100644 modules/nf-core/seqkit/stats/main.nf create mode 100644 modules/nf-core/seqkit/stats/meta.yml create mode 100644 modules/nf-core/seqkit/stats/tests/main.nf.test create mode 100644 modules/nf-core/seqkit/stats/tests/main.nf.test.snap create mode 100644 modules/nf-core/seqkit/stats/tests/tags.yml diff --git a/modules/nf-core/seqkit/stats/environment.yml b/modules/nf-core/seqkit/stats/environment.yml new file mode 100644 index 00000000..41f3e7de --- /dev/null +++ b/modules/nf-core/seqkit/stats/environment.yml @@ -0,0 +1,5 @@ +channels: + - conda-forge + - bioconda +dependencies: + - bioconda::seqkit=2.8.1 diff --git a/modules/nf-core/seqkit/stats/main.nf b/modules/nf-core/seqkit/stats/main.nf new file mode 100644 index 00000000..117c6052 --- /dev/null +++ b/modules/nf-core/seqkit/stats/main.nf @@ -0,0 +1,34 @@ +process SEQKIT_STATS { + tag "$meta.id" + label 'process_low' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/seqkit:2.8.1--h9ee0642_0' : + 'biocontainers/seqkit:2.8.1--h9ee0642_0' }" + + input: + tuple val(meta), path(reads) + + output: + tuple val(meta), path("*.tsv"), emit: stats + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '--all' + def prefix = task.ext.prefix ?: "${meta.id}" + """ + seqkit stats \\ + --tabular \\ + $args \\ + $reads > '${prefix}.tsv' + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + seqkit: \$( seqkit version | sed 's/seqkit v//' ) + END_VERSIONS + """ +} diff --git a/modules/nf-core/seqkit/stats/meta.yml b/modules/nf-core/seqkit/stats/meta.yml new file mode 100644 index 00000000..797712b5 --- /dev/null +++ b/modules/nf-core/seqkit/stats/meta.yml @@ -0,0 +1,48 @@ +name: "seqkit_stats" +description: simple statistics of FASTA/Q files +keywords: + - seqkit + - fasta + - stats +tools: + - "seqkit": + description: Cross-platform and ultrafast toolkit for FASTA/Q file manipulation, + written by Wei Shen. + homepage: https://bioinf.shenwei.me/seqkit/usage/ + documentation: https://bioinf.shenwei.me/seqkit/usage/ + tool_dev_url: https://github.com/shenwei356/seqkit/ + doi: "10.1371/journal.pone.0163962" + licence: ["MIT"] + identifier: biotools:seqkit +input: + - - meta: + type: map + description: > + Groovy Map containing sample information e.g. [ id:'test', single_end:false + ] + - reads: + type: file + description: > + Either FASTA or FASTQ files. + pattern: "*.{fa,fna,faa,fasta,fq,fastq}[.gz]" +output: + - stats: + - meta: + type: map + description: > + Groovy Map containing sample information e.g. [ id:'test', single_end:false + ] + - "*.tsv": + type: file + description: > + Tab-separated output file with basic sequence statistics. 
+ pattern: "*.tsv" + - versions: + - versions.yml: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@Midnighter" +maintainers: + - "@Midnighter" diff --git a/modules/nf-core/seqkit/stats/tests/main.nf.test b/modules/nf-core/seqkit/stats/tests/main.nf.test new file mode 100644 index 00000000..2cd4eb49 --- /dev/null +++ b/modules/nf-core/seqkit/stats/tests/main.nf.test @@ -0,0 +1,141 @@ +nextflow_process { + + name "Test Process SEQKIT_STATS" + script "../main.nf" + process "SEQKIT_STATS" + + tag "modules" + tag "modules_nfcore" + tag "seqkit" + tag "seqkit/stats" + + test("single_end") { + + when { + process { + """ + input[0] = [[ id:'test', single_end:true ], // meta map + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_1.fastq.gz', checkIfExists: true) + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + + test("paired_end") { + + when { + process { + """ + input[0] = [[ id:'test', single_end:false ], // meta map + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_1.fastq.gz', checkIfExists: true), + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_2.fastq.gz', checkIfExists: true) + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + + test("nanopore") { + + when { + process { + """ + input[0] = [[ id:'test', single_end:true ], // meta map + file(params.modules_testdata_base_path + 'genomics/sarscov2/nanopore/fastq/test.fastq.gz', checkIfExists: true) + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + + test("genome_fasta") { + + when { + process { + """ + input[0] = [[ id:'test', single_end:false ], // meta map + file(params.modules_testdata_base_path + 
'genomics/sarscov2/genome/genome.fasta', checkIfExists: true) + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + + test("transcriptome_fasta") { + + when { + process { + """ + input[0] = [[ id:'test', single_end:false ], // meta map + file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/transcriptome.fasta', checkIfExists: true) + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + + test("single_end - stub") { + + options "-stub" + + when { + process { + """ + input[0] = [[ id:'test', single_end:true ], // meta map + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_1.fastq.gz', checkIfExists: true) + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + +} diff --git a/modules/nf-core/seqkit/stats/tests/main.nf.test.snap b/modules/nf-core/seqkit/stats/tests/main.nf.test.snap new file mode 100644 index 00000000..43aa9fe2 --- /dev/null +++ b/modules/nf-core/seqkit/stats/tests/main.nf.test.snap @@ -0,0 +1,212 @@ +{ + "nanopore": { + "content": [ + { + "0": [ + [ + { + "id": "test", + "single_end": true + }, + "test.tsv:md5,49ea41204974d82fb9bbb30572485ccb" + ] + ], + "1": [ + "versions.yml:md5,e5810dac4e10040b1a74b8672a829734" + ], + "stats": [ + [ + { + "id": "test", + "single_end": true + }, + "test.tsv:md5,49ea41204974d82fb9bbb30572485ccb" + ] + ], + "versions": [ + "versions.yml:md5,e5810dac4e10040b1a74b8672a829734" + ] + } + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-05-08T08:35:18.621881722" + }, + "genome_fasta": { + "content": [ + { + "0": [ + [ + { + "id": "test", + "single_end": false + }, + "test.tsv:md5,af2521fa0c7e1a0683e60df6292429de" + ] + ], + "1": [ + "versions.yml:md5,e5810dac4e10040b1a74b8672a829734" + ], + "stats": [ + [ + { + "id": 
"test", + "single_end": false + }, + "test.tsv:md5,af2521fa0c7e1a0683e60df6292429de" + ] + ], + "versions": [ + "versions.yml:md5,e5810dac4e10040b1a74b8672a829734" + ] + } + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-05-08T08:35:28.193047869" + }, + "transcriptome_fasta": { + "content": [ + { + "0": [ + [ + { + "id": "test", + "single_end": false + }, + "test.tsv:md5,48d5664c3214880f3509226cb839eb17" + ] + ], + "1": [ + "versions.yml:md5,e5810dac4e10040b1a74b8672a829734" + ], + "stats": [ + [ + { + "id": "test", + "single_end": false + }, + "test.tsv:md5,48d5664c3214880f3509226cb839eb17" + ] + ], + "versions": [ + "versions.yml:md5,e5810dac4e10040b1a74b8672a829734" + ] + } + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-05-08T08:35:39.129387891" + }, + "single_end": { + "content": [ + { + "0": [ + [ + { + "id": "test", + "single_end": true + }, + "test.tsv:md5,97eb700f7e607dba8371e82d5de8a1a2" + ] + ], + "1": [ + "versions.yml:md5,e5810dac4e10040b1a74b8672a829734" + ], + "stats": [ + [ + { + "id": "test", + "single_end": true + }, + "test.tsv:md5,97eb700f7e607dba8371e82d5de8a1a2" + ] + ], + "versions": [ + "versions.yml:md5,e5810dac4e10040b1a74b8672a829734" + ] + } + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-05-08T08:34:57.889856349" + }, + "paired_end": { + "content": [ + { + "0": [ + [ + { + "id": "test", + "single_end": false + }, + "test.tsv:md5,97eb700f7e607dba8371e82d5de8a1a2" + ] + ], + "1": [ + "versions.yml:md5,e5810dac4e10040b1a74b8672a829734" + ], + "stats": [ + [ + { + "id": "test", + "single_end": false + }, + "test.tsv:md5,97eb700f7e607dba8371e82d5de8a1a2" + ] + ], + "versions": [ + "versions.yml:md5,e5810dac4e10040b1a74b8672a829734" + ] + } + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-05-08T08:35:09.114729727" + }, + "single_end - stub": { + "content": [ + { + "0": [ + [ + { + 
"id": "test", + "single_end": true + }, + "test.tsv:md5,97eb700f7e607dba8371e82d5de8a1a2" + ] + ], + "1": [ + "versions.yml:md5,e5810dac4e10040b1a74b8672a829734" + ], + "stats": [ + [ + { + "id": "test", + "single_end": true + }, + "test.tsv:md5,97eb700f7e607dba8371e82d5de8a1a2" + ] + ], + "versions": [ + "versions.yml:md5,e5810dac4e10040b1a74b8672a829734" + ] + } + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-05-08T08:35:49.267660809" + } +} \ No newline at end of file diff --git a/modules/nf-core/seqkit/stats/tests/tags.yml b/modules/nf-core/seqkit/stats/tests/tags.yml new file mode 100644 index 00000000..7f33c7df --- /dev/null +++ b/modules/nf-core/seqkit/stats/tests/tags.yml @@ -0,0 +1,2 @@ +seqkit/stats: + - "modules/nf-core/seqkit/stats/**" From 918000a7c65a62a05bf9871bc1162434a91cdc50 Mon Sep 17 00:00:00 2001 From: Sarah Griffiths Date: Tue, 29 Oct 2024 16:20:34 +0000 Subject: [PATCH 6/8] changelog --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index b0b12de1..7a40ac39 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -12,6 +12,7 @@ Initial release of nf-core/seqinspector, created with the [nf-core](https://nf-c - [#20](https://github.com/nf-core/seqinspector/pull/20) Use tags to generate group reports - [#13](https://github.com/nf-core/seqinspector/pull/13) Generate reports per run, per project and per lane. - [#49](https://github.com/nf-core/seqinspector/pull/49) Merge with template 3.0.2. +- [#40](https://github.com/nf-core/seqinspector/pull/59) Seqkit Stats TSV output. 
### `Fixed` From dd0cea0e00bbb93e2689348a90b5eec8cb71e1f8 Mon Sep 17 00:00:00 2001 From: Sarah Griffiths Date: Wed, 30 Oct 2024 10:14:56 +0000 Subject: [PATCH 7/8] fix gnarly spacing --- nextflow.config | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nextflow.config b/nextflow.config index ab6d08d5..38eb3127 100644 --- a/nextflow.config +++ b/nextflow.config @@ -227,7 +227,7 @@ manifest { description = """Pipeline to QC your sequences""" mainScript = 'main.nf' nextflowVersion = '!>=24.04.2' - version = '1.0dev' + version = '1.0dev' doi = '' } From 4fa2731797b72bbe21bcbce2edc1640e0e3d6a9f Mon Sep 17 00:00:00 2001 From: Sarah Griffiths Date: Wed, 30 Oct 2024 10:38:55 +0000 Subject: [PATCH 8/8] unmangle modules.json --- modules.json | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/modules.json b/modules.json index 334f22dd..3a3635b8 100644 --- a/modules.json +++ b/modules.json @@ -19,6 +19,11 @@ "branch": "master", "git_sha": "666652151335353eef2fcd58880bcef5bc2928e1", "installed_by": ["modules"] + }, + "seqtk/sample": { + "branch": "master", + "git_sha": "666652151335353eef2fcd58880bcef5bc2928e1", + "installed_by": ["modules"] } } },