diff --git a/CHANGELOG.md b/CHANGELOG.md index 09cffbcb..611c987c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -12,6 +12,7 @@ Initial release of nf-core/seqinspector, created with the [nf-core](https://nf-c - [#20](https://github.com/nf-core/seqinspector/pull/20) Use tags to generate group reports - [#13](https://github.com/nf-core/seqinspector/pull/13) Generate reports per run, per project and per lane. - [#49](https://github.com/nf-core/seqinspector/pull/49) Merge with template 3.0.2. +- [#59](https://github.com/nf-core/seqinspector/pull/59) Seqkit Stats TSV output. - [#50](https://github.com/nf-core/seqinspector/pull/50) Add an optional subsampling step. - [#51](https://github.com/nf-core/seqinspector/pull/51) Add nf-test to CI. - [#63](https://github.com/nf-core/seqinspector/pull/63) Contribution guidelines added about displaying results for new tools diff --git a/CITATIONS.md b/CITATIONS.md index 8a4e350b..e1afbe92 100644 --- a/CITATIONS.md +++ b/CITATIONS.md @@ -18,6 +18,10 @@ > Ewels P, Magnusson M, Lundin S, Käller M. MultiQC: summarize analysis results for multiple tools and samples in a single report. Bioinformatics. 2016 Oct 1;32(19):3047-8. doi: 10.1093/bioinformatics/btw354. Epub 2016 Jun 16. PubMed PMID: 27312411; PubMed Central PMCID: PMC5039924. +- [Seqkit](https://doi.org/10.1002/imt2.191) + +> Wei Shen, Botond Sipos, and Liuyang Zhao. 2024. SeqKit2: A Swiss Army Knife for Sequence and Alignment Processing. iMeta e191. [doi:10.1002/imt2.191](https://doi.org/10.1002/imt2.191). + - [Seqtk](https://github.com/lh3/seqtk) ## Software packaging/containerisation tools diff --git a/README.md b/README.md index 6cf36dcc..c37d61e2 100644 --- a/README.md +++ b/README.md @@ -34,6 +34,7 @@ 1. Subsample reads ([`Seqtk`](https://github.com/lh3/seqtk)) 2. Read QC ([`FastQC`](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/)) 3. Present QC for raw reads ([`MultiQC`](http://multiqc.info/)) +4. 
Provide Seqkit summary stats ([`SeqkitStats`](https://bioinf.shenwei.me/seqkit/usage/#stats)) ## Usage diff --git a/conf/modules.config b/conf/modules.config index d3c597b3..7c1b21e6 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -26,6 +26,10 @@ process { ext.args = '--quiet' } + withName: SEQKIT_STATS { + ext.args = '' + } + withName: 'MULTIQC_GLOBAL' { ext.args = { params.multiqc_title ? "--title \"$params.multiqc_title\"" : '' } publishDir = [ diff --git a/docs/output.md b/docs/output.md index 2d4efb02..eb7a4622 100644 --- a/docs/output.md +++ b/docs/output.md @@ -13,6 +13,7 @@ The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes d - [Seqtk](#seqtk) - Subsample a specific number of reads per sample - [FastQC](#fastqc) - Raw read QC - [MultiQC](#multiqc) - Aggregate report describing results and QC from the whole pipeline +- [SeqkitStats](#seqkitstats) - Per sample TSV file with summary statistics - [Pipeline information](#pipeline-information) - Report metrics generated during the workflow execution ### Seqtk @@ -72,6 +73,18 @@ nf-core/seqinspector will generate the following MultiQC reports: Results generated by MultiQC collate pipeline QC from supported tools e.g. FastQC. The pipeline has special steps which also allow the software versions to be reported in the MultiQC output for future traceability. For more information about how to use MultiQC reports, see <http://multiqc.info>. +### SeqkitStats + +<details markdown="1">
+<summary>Output files</summary> + +- `seqkit/` + - `*.tsv`: Per sample TSV file with summary statistics. + +</details>
+ +[SeqkitStats](https://bioinf.shenwei.me/seqkit/usage/#stats) gives simple statistics such as the number of sequences, min/max length, N50, Q20%, Q30% and GC%. For further reading and documentation see the [Seqkit help pages](https://bioinf.shenwei.me/seqkit/). + ### Pipeline information
diff --git a/modules.json b/modules.json index 7e57ea15..3a3635b8 100644 --- a/modules.json +++ b/modules.json @@ -15,6 +15,11 @@ "git_sha": "cf17ca47590cc578dfb47db1c2a44ef86f89976d", "installed_by": ["modules"] }, + "seqkit/stats": { + "branch": "master", + "git_sha": "666652151335353eef2fcd58880bcef5bc2928e1", + "installed_by": ["modules"] + }, "seqtk/sample": { "branch": "master", "git_sha": "666652151335353eef2fcd58880bcef5bc2928e1", diff --git a/modules/nf-core/seqkit/stats/environment.yml b/modules/nf-core/seqkit/stats/environment.yml new file mode 100644 index 00000000..41f3e7de --- /dev/null +++ b/modules/nf-core/seqkit/stats/environment.yml @@ -0,0 +1,5 @@ +channels: + - conda-forge + - bioconda +dependencies: + - bioconda::seqkit=2.8.1 diff --git a/modules/nf-core/seqkit/stats/main.nf b/modules/nf-core/seqkit/stats/main.nf new file mode 100644 index 00000000..117c6052 --- /dev/null +++ b/modules/nf-core/seqkit/stats/main.nf @@ -0,0 +1,34 @@ +process SEQKIT_STATS { + tag "$meta.id" + label 'process_low' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/seqkit:2.8.1--h9ee0642_0' : + 'biocontainers/seqkit:2.8.1--h9ee0642_0' }" + + input: + tuple val(meta), path(reads) + + output: + tuple val(meta), path("*.tsv"), emit: stats + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '--all' + def prefix = task.ext.prefix ?: "${meta.id}" + """ + seqkit stats \\ + --tabular \\ + $args \\ + $reads > '${prefix}.tsv' + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + seqkit: \$( seqkit version | sed 's/seqkit v//' ) + END_VERSIONS + """ +} diff --git a/modules/nf-core/seqkit/stats/meta.yml b/modules/nf-core/seqkit/stats/meta.yml new file mode 100644 index 00000000..797712b5 --- /dev/null +++ b/modules/nf-core/seqkit/stats/meta.yml @@ -0,0 +1,48 @@ +name: "seqkit_stats" +description: simple statistics of FASTA/Q files +keywords: + - seqkit + - fasta + - stats +tools: + - "seqkit": + description: Cross-platform and ultrafast toolkit for FASTA/Q file manipulation, + written by Wei Shen. + homepage: https://bioinf.shenwei.me/seqkit/usage/ + documentation: https://bioinf.shenwei.me/seqkit/usage/ + tool_dev_url: https://github.com/shenwei356/seqkit/ + doi: "10.1371/journal.pone.0163962" + licence: ["MIT"] + identifier: biotools:seqkit +input: + - - meta: + type: map + description: > + Groovy Map containing sample information e.g. [ id:'test', single_end:false + ] + - reads: + type: file + description: > + Either FASTA or FASTQ files. + pattern: "*.{fa,fna,faa,fasta,fq,fastq}[.gz]" +output: + - stats: + - meta: + type: map + description: > + Groovy Map containing sample information e.g. [ id:'test', single_end:false + ] + - "*.tsv": + type: file + description: > + Tab-separated output file with basic sequence statistics. 
+ pattern: "*.tsv" + - versions: + - versions.yml: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@Midnighter" +maintainers: + - "@Midnighter" diff --git a/modules/nf-core/seqkit/stats/tests/main.nf.test b/modules/nf-core/seqkit/stats/tests/main.nf.test new file mode 100644 index 00000000..2cd4eb49 --- /dev/null +++ b/modules/nf-core/seqkit/stats/tests/main.nf.test @@ -0,0 +1,141 @@ +nextflow_process { + + name "Test Process SEQKIT_STATS" + script "../main.nf" + process "SEQKIT_STATS" + + tag "modules" + tag "modules_nfcore" + tag "seqkit" + tag "seqkit/stats" + + test("single_end") { + + when { + process { + """ + input[0] = [[ id:'test', single_end:true ], // meta map + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_1.fastq.gz', checkIfExists: true) + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + + test("paired_end") { + + when { + process { + """ + input[0] = [[ id:'test', single_end:false ], // meta map + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_1.fastq.gz', checkIfExists: true), + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_2.fastq.gz', checkIfExists: true) + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + + test("nanopore") { + + when { + process { + """ + input[0] = [[ id:'test', single_end:true ], // meta map + file(params.modules_testdata_base_path + 'genomics/sarscov2/nanopore/fastq/test.fastq.gz', checkIfExists: true) + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + + test("genome_fasta") { + + when { + process { + """ + input[0] = [[ id:'test', single_end:false ], // meta map + file(params.modules_testdata_base_path + 
'genomics/sarscov2/genome/genome.fasta', checkIfExists: true) + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + + test("transcriptome_fasta") { + + when { + process { + """ + input[0] = [[ id:'test', single_end:false ], // meta map + file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/transcriptome.fasta', checkIfExists: true) + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + + test("single_end - stub") { + + options "-stub" + + when { + process { + """ + input[0] = [[ id:'test', single_end:true ], // meta map + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_1.fastq.gz', checkIfExists: true) + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + +} diff --git a/modules/nf-core/seqkit/stats/tests/main.nf.test.snap b/modules/nf-core/seqkit/stats/tests/main.nf.test.snap new file mode 100644 index 00000000..43aa9fe2 --- /dev/null +++ b/modules/nf-core/seqkit/stats/tests/main.nf.test.snap @@ -0,0 +1,212 @@ +{ + "nanopore": { + "content": [ + { + "0": [ + [ + { + "id": "test", + "single_end": true + }, + "test.tsv:md5,49ea41204974d82fb9bbb30572485ccb" + ] + ], + "1": [ + "versions.yml:md5,e5810dac4e10040b1a74b8672a829734" + ], + "stats": [ + [ + { + "id": "test", + "single_end": true + }, + "test.tsv:md5,49ea41204974d82fb9bbb30572485ccb" + ] + ], + "versions": [ + "versions.yml:md5,e5810dac4e10040b1a74b8672a829734" + ] + } + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-05-08T08:35:18.621881722" + }, + "genome_fasta": { + "content": [ + { + "0": [ + [ + { + "id": "test", + "single_end": false + }, + "test.tsv:md5,af2521fa0c7e1a0683e60df6292429de" + ] + ], + "1": [ + "versions.yml:md5,e5810dac4e10040b1a74b8672a829734" + ], + "stats": [ + [ + { + "id": 
"test", + "single_end": false + }, + "test.tsv:md5,af2521fa0c7e1a0683e60df6292429de" + ] + ], + "versions": [ + "versions.yml:md5,e5810dac4e10040b1a74b8672a829734" + ] + } + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-05-08T08:35:28.193047869" + }, + "transcriptome_fasta": { + "content": [ + { + "0": [ + [ + { + "id": "test", + "single_end": false + }, + "test.tsv:md5,48d5664c3214880f3509226cb839eb17" + ] + ], + "1": [ + "versions.yml:md5,e5810dac4e10040b1a74b8672a829734" + ], + "stats": [ + [ + { + "id": "test", + "single_end": false + }, + "test.tsv:md5,48d5664c3214880f3509226cb839eb17" + ] + ], + "versions": [ + "versions.yml:md5,e5810dac4e10040b1a74b8672a829734" + ] + } + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-05-08T08:35:39.129387891" + }, + "single_end": { + "content": [ + { + "0": [ + [ + { + "id": "test", + "single_end": true + }, + "test.tsv:md5,97eb700f7e607dba8371e82d5de8a1a2" + ] + ], + "1": [ + "versions.yml:md5,e5810dac4e10040b1a74b8672a829734" + ], + "stats": [ + [ + { + "id": "test", + "single_end": true + }, + "test.tsv:md5,97eb700f7e607dba8371e82d5de8a1a2" + ] + ], + "versions": [ + "versions.yml:md5,e5810dac4e10040b1a74b8672a829734" + ] + } + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-05-08T08:34:57.889856349" + }, + "paired_end": { + "content": [ + { + "0": [ + [ + { + "id": "test", + "single_end": false + }, + "test.tsv:md5,97eb700f7e607dba8371e82d5de8a1a2" + ] + ], + "1": [ + "versions.yml:md5,e5810dac4e10040b1a74b8672a829734" + ], + "stats": [ + [ + { + "id": "test", + "single_end": false + }, + "test.tsv:md5,97eb700f7e607dba8371e82d5de8a1a2" + ] + ], + "versions": [ + "versions.yml:md5,e5810dac4e10040b1a74b8672a829734" + ] + } + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-05-08T08:35:09.114729727" + }, + "single_end - stub": { + "content": [ + { + "0": [ + [ + { + 
"id": "test", + "single_end": true + }, + "test.tsv:md5,97eb700f7e607dba8371e82d5de8a1a2" + ] + ], + "1": [ + "versions.yml:md5,e5810dac4e10040b1a74b8672a829734" + ], + "stats": [ + [ + { + "id": "test", + "single_end": true + }, + "test.tsv:md5,97eb700f7e607dba8371e82d5de8a1a2" + ] + ], + "versions": [ + "versions.yml:md5,e5810dac4e10040b1a74b8672a829734" + ] + } + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-05-08T08:35:49.267660809" + } +} \ No newline at end of file diff --git a/modules/nf-core/seqkit/stats/tests/tags.yml b/modules/nf-core/seqkit/stats/tests/tags.yml new file mode 100644 index 00000000..7f33c7df --- /dev/null +++ b/modules/nf-core/seqkit/stats/tests/tags.yml @@ -0,0 +1,2 @@ +seqkit/stats: + - "modules/nf-core/seqkit/stats/**" diff --git a/workflows/seqinspector.nf b/workflows/seqinspector.nf index 7a2dfae5..d824d856 100644 --- a/workflows/seqinspector.nf +++ b/workflows/seqinspector.nf @@ -10,6 +10,8 @@ include { FASTQC } from '../modules/nf-core/fastqc/main' include { MULTIQC as MULTIQC_GLOBAL } from '../modules/nf-core/multiqc/main' include { MULTIQC as MULTIQC_PER_TAG } from '../modules/nf-core/multiqc/main' +include { SEQKIT_STATS } from '../modules/nf-core/seqkit/stats/main' + include { paramsSummaryMap } from 'plugin/nf-schema' include { paramsSummaryMultiqc } from '../subworkflows/nf-core/utils_nfcore_pipeline' include { softwareVersionsToYAML } from '../subworkflows/nf-core/utils_nfcore_pipeline' @@ -156,6 +158,14 @@ workflow SEQINSPECTOR { [] ) + // + // MODULE: Run SEQKIT_STATS + // + SEQKIT_STATS ( + ch_samplesheet + ) + ch_versions = ch_versions.mix(SEQKIT_STATS.out.versions.first()) + emit: global_report = MULTIQC_GLOBAL.out.report.toList() // channel: [ /path/to/multiqc_report.html ] grouped_reports = MULTIQC_PER_TAG.out.report.toList() // channel: [ /path/to/multiqc_report.html ]