From 5a0a8f959ad7f4a577f87178d639a2e7430e2fd7 Mon Sep 17 00:00:00 2001 From: mschertzer Date: Mon, 4 May 2026 18:35:49 -0400 Subject: [PATCH 01/31] fix: update SQANTI QC command line parameters --- modules/local/isocall_call/main.nf | 8 +++++--- modules/local/sqanti_qc/main.nf | 22 ++++++++++------------ 2 files changed, 15 insertions(+), 15 deletions(-) diff --git a/modules/local/isocall_call/main.nf b/modules/local/isocall_call/main.nf index 59c02cb..ebf0280 100644 --- a/modules/local/isocall_call/main.nf +++ b/modules/local/isocall_call/main.nf @@ -15,7 +15,7 @@ process ISOCALL_CALL { output: tuple val(meta), path("*.isocall.isoforms.gtf.gz"), emit: gtf - tuple val(meta), path("*.isocall.count_matrix.txt"), emit: count_matrix + tuple val(meta), path("*.isocall.count_matrix.csv"), emit: count_matrix path "versions.yml", emit: versions when: @@ -38,7 +38,9 @@ process ISOCALL_CALL { --min-reads-per-isoform $min_read_support \\ --max-bundles-per-gene $max_bundles_per_gene \\ $args - + + mv ${prefix}.isocall.count_matrix.txt ${prefix}.isocall.count_matrix.csv + cat <<-END_VERSIONS > versions.yml "${task.process}": isocall: \$( isocall --version 2>&1 | sed 's/isocall //g' ) @@ -49,7 +51,7 @@ process ISOCALL_CALL { def prefix = task.ext.prefix ?: "${meta.id}" """ touch ${prefix}.isocall.isoforms.gtf.gz - touch ${prefix}.isocall.count_matrix.txt + touch ${prefix}.isocall.count_matrix.csv cat <<-END_VERSIONS > versions.yml "${task.process}": diff --git a/modules/local/sqanti_qc/main.nf b/modules/local/sqanti_qc/main.nf index b2e9697..954cd48 100644 --- a/modules/local/sqanti_qc/main.nf +++ b/modules/local/sqanti_qc/main.nf @@ -30,8 +30,9 @@ process SQANTI_QC { def prefix = task.ext.prefix ?: "${meta.id}" """ + source /conda/miniconda3/etc/profile.d/conda.sh - conda activate SQANTI3.env + conda activate sqanti3 # Decompress GTF if it's gzipped ISOFORMS_INPUT="$isoforms_gtf" @@ -45,16 +46,13 @@ process SQANTI_QC { fi sqanti3_qc.py \\ - --force_id_ignore \\ - --skipORF \\ - 
--output ${prefix}.transcriptome \\ - --dir . \\ - --cpus $task.cpus \\ + --isoforms "\$ISOFORMS_INPUT" \\ + --refGTF $reference_gtf \\ + --refFasta $reference_fasta \\ + -o ${prefix}.transcriptome \\ + -d . \\ --report skip \\ - --fl_count $flnc_count \\ - "\$ISOFORMS_INPUT" \\ - $reference_gtf \\ - $reference_fasta \\ + --fl $flnc_count \\ $args mv ${prefix}.transcriptome_classification.txt ${prefix}.transcriptome.SQANTI_classification.txt @@ -64,7 +62,7 @@ process SQANTI_QC { cat <<-END_VERSIONS > versions.yml "${task.process}": - sqanti3: 5.2.2 + sqanti3: 6.0.1 END_VERSIONS """ @@ -78,7 +76,7 @@ process SQANTI_QC { cat <<-END_VERSIONS > versions.yml "${task.process}": - sqanti3: 5.2.2 + sqanti3: 6.0.1 END_VERSIONS """ } From b877d472e06cd071b2c1728fa61a4cb8057286aa Mon Sep 17 00:00:00 2001 From: = Date: Mon, 27 Apr 2026 16:03:23 -0400 Subject: [PATCH 02/31] fix: ensure FragPipe module uses fixed v24.0 for consistency --- modules/local/fragpipe/main.nf | 10 +++++----- nextflow.config | 1 - nextflow_schema.json | 7 ------- 3 files changed, 5 insertions(+), 13 deletions(-) diff --git a/modules/local/fragpipe/main.nf b/modules/local/fragpipe/main.nf index 4cc2c22..f6cebd6 100644 --- a/modules/local/fragpipe/main.nf +++ b/modules/local/fragpipe/main.nf @@ -3,7 +3,7 @@ process FRAGPIPE { label 'process_high_memory' conda "${moduleDir}/environment.yml" - container "docker://docker.io/fcyucn/fragpipe:${params.fragpipe_version}" + container "docker://docker.io/fcyucn/fragpipe:24.0" input: tuple val(meta), path(mzml_files), path(protein_fasta) @@ -176,8 +176,8 @@ process FRAGPIPE { # Find FragPipe executable if command -v fragpipe &> /dev/null; then FRAGPIPE_CMD="fragpipe" - elif [ -f "/fragpipe_bin/fragpipe-${params.fragpipe_version}/fragpipe-${params.fragpipe_version}/bin/fragpipe" ]; then - FRAGPIPE_CMD="/fragpipe_bin/fragpipe-${params.fragpipe_version}/fragpipe-${params.fragpipe_version}/bin/fragpipe" + elif [ -f 
"/fragpipe_bin/fragpipe-24.0/fragpipe-24.0/bin/fragpipe" ]; then + FRAGPIPE_CMD="/fragpipe_bin/fragpipe-24.0/fragpipe-24.0/bin/fragpipe" elif [ -d "/fragpipe_bin" ]; then FRAGPIPE_CMD=\$(find /fragpipe_bin -name "fragpipe" -type f -executable 2>/dev/null | head -1) if [ -z "\$FRAGPIPE_CMD" ]; then @@ -369,7 +369,7 @@ process FRAGPIPE { cat <<-END_VERSIONS > versions.yml "${task.process}": - fragpipe: \$(\$FRAGPIPE_CMD --version 2>&1 | grep -oP 'FragPipe \\K[0-9.]+' || echo "${params.fragpipe_version}") + fragpipe: \$(\$FRAGPIPE_CMD --version 2>&1 | grep -oP 'FragPipe \\K[0-9.]+' || echo "24.0") msfragger: \$(basename ${msfragger_jar} | grep -oP 'MSFragger-\\K[0-9.]+' || echo "unknown") ionquant: \$(basename ${ionquant_jar} | grep -oP 'IonQuant-\\K[0-9.]+' || echo "unknown") END_VERSIONS @@ -385,7 +385,7 @@ process FRAGPIPE { cat <<-END_VERSIONS > versions.yml "${task.process}": - fragpipe: ${params.fragpipe_version} + fragpipe: 24.0 msfragger: 4.1 ionquant: 1.10.12 END_VERSIONS diff --git a/nextflow.config b/nextflow.config index ce4ad57..96d6e82 100644 --- a/nextflow.config +++ b/nextflow.config @@ -94,7 +94,6 @@ params { // FragPipe options fragpipe_workflow = null // Path to custom FragPipe workflow file (optional - auto-downloads DIA_SpecLib_Quant.workflow for DIA or LFQ-MBR.workflow for DDA if not specified) - fragpipe_version = '24.0' // FragPipe container version fragpipe_decoy_tag = 'rev_' // Decoy sequence prefix tag for database preparation fragpipe_threads = null // Number of threads for FragPipe (default: uses task.cpus from process label) fragpipe_tools_dir = "${projectDir}/fragpipe_tools" // Directory to cache downloaded MSFragger/IonQuant tools diff --git a/nextflow_schema.json b/nextflow_schema.json index 614c6c5..89e169c 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -476,13 +476,6 @@ "fa_icon": "fas fa-search", "enum": ["fragpipe", "metamorpheus"] }, - "fragpipe_version": { - "type": "string", - "default": "24.0", - "description": 
"FragPipe version to use.", - "help_text": "Specify the FragPipe version for proteomics analysis.", - "fa_icon": "fas fa-tag" - }, "fragpipe_tools_dir": { "type": "string", "format": "directory-path", From 453805c42fbc87163a76ea5f7d601dc85573b264 Mon Sep 17 00:00:00 2001 From: Julia Lewandowski <68295300+JTL-lab@users.noreply.github.com> Date: Mon, 27 Apr 2026 16:24:03 -0400 Subject: [PATCH 03/31] Revert "feat: update SQANTI container versions to latest v6.0.1" --- modules/local/sqanti_protein/main.nf | 2 +- modules/local/sqanti_qc/main.nf | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/modules/local/sqanti_protein/main.nf b/modules/local/sqanti_protein/main.nf index 5a14540..cd072c4 100644 --- a/modules/local/sqanti_protein/main.nf +++ b/modules/local/sqanti_protein/main.nf @@ -3,7 +3,7 @@ process SQANTI_PROTEIN { label 'process_long' conda "${moduleDir}/environment.yml" - container 'docker://docker.io/anaconesalab/sqanti3:v6.0.1' + container 'docker://docker.io/anaconesalab/sqanti3:v5.5.4' input: tuple val(meta), path(cds_gtf) diff --git a/modules/local/sqanti_qc/main.nf b/modules/local/sqanti_qc/main.nf index 954cd48..391b18e 100644 --- a/modules/local/sqanti_qc/main.nf +++ b/modules/local/sqanti_qc/main.nf @@ -3,7 +3,7 @@ process SQANTI_QC { label 'process_long' conda "${moduleDir}/environment.yml" - container 'docker://docker.io/anaconesalab/sqanti3:v6.0.1' + container 'docker://docker.io/anaconesalab/sqanti3:5.2.2' input: tuple val(meta), path(isoforms_gtf), path(flnc_count) From 2c9d57bcc73826330ba52feb1b4f039801e7d0ca Mon Sep 17 00:00:00 2001 From: = Date: Mon, 27 Apr 2026 17:44:51 -0400 Subject: [PATCH 04/31] refactor: ensure default singularity cache in work directory --- conf/slurm.config | 3 +-- modules/local/metamorpheus/main.nf | 2 +- nextflow.config | 2 +- 3 files changed, 3 insertions(+), 4 deletions(-) diff --git a/conf/slurm.config b/conf/slurm.config index 197eef5..c5bcc99 100644 --- a/conf/slurm.config +++ 
b/conf/slurm.config @@ -57,7 +57,6 @@ singularity { pullTimeout = '120 min' // Fix for FUSE mounting issues with Singularity 4.x to use SIF format instead runOptions = '--disable-cache --userns' - // You can specify a cache directory here for use in HPC environments to avoid re-converting containers on each run, e.g. - //cacheDir = '/my/dir/singularity_cache' + cacheDir = "${projectDir}/work/singularity" } diff --git a/modules/local/metamorpheus/main.nf b/modules/local/metamorpheus/main.nf index f7b042f..a991ead 100644 --- a/modules/local/metamorpheus/main.nf +++ b/modules/local/metamorpheus/main.nf @@ -13,7 +13,7 @@ process METAMORPHEUS { # Auto-extract MetaMorpheus default files from container on first run METAMORPH_DATA_DIR="${projectDir}/assets/metamorpheus_data" - CONTAINER="\${NXF_SINGULARITY_CACHEDIR:-${projectDir}/singularity_cache}/docker.io-smithchemwisc-metamorpheus-latest.img" + CONTAINER="\${NXF_SINGULARITY_CACHEDIR:-${projectDir}/work/singularity}/docker.io-smithchemwisc-metamorpheus-latest.img" if [ ! 
-f "\${METAMORPH_DATA_DIR}/Mods/ProteaseMods.txt" ]; then LOCK_FILE="\${METAMORPH_DATA_DIR}/.extraction.lock" diff --git a/nextflow.config b/nextflow.config index 96d6e82..99103bb 100644 --- a/nextflow.config +++ b/nextflow.config @@ -305,7 +305,7 @@ env { R_ENVIRON_USER = "/.Renviron" JULIA_DEPOT_PATH = "/usr/local/share/julia" SINGULARITY_TMPDIR = "${projectDir}/work/singularity_tmp" - SINGULARITY_CACHEDIR = "${projectDir}/singularity" + SINGULARITY_CACHEDIR = "${projectDir}/work/singularity" } // Set bash options From 126dcc49c5ec63438a27a1f7a2325551e446591a Mon Sep 17 00:00:00 2001 From: mschertzer Date: Fri, 17 Apr 2026 13:01:33 -0400 Subject: [PATCH 05/31] fix: separate GENCODE and custom protein FASTA inputs in BUILD_PROTEOME_REFERENCE Previously, params.protein_fasta was used for both GENCODE and custom protein FASTAs, causing header parsing failures when a custom/LRP FASTA was passed into the GENCODE slot (expected 8-field pipe-delimited headers). Changes: - Rename params.protein_fasta to params.custom_protein_fasta - Add params.lrp_protein_fasta for pre-computed LRP proteomes (proteomics-only runs) - GENCODE protein FASTA (pc_translations.fa) now auto-resolved from --genome via gencode_refs - BUILD_PROTEOME_REFERENCE module input tuple expanded from 4 to 5 elements (lrp_fasta, counts, custom_fasta, gencode_protein_fasta) - Each FASTA passed as named arg to R script (--lrp_fasta, --custom_fasta, --gencode_fasta) - GENCODE protein FASTA decompression gated on protein samples, not RNA samples - Add gencode_refs entries for v40-v42; fix v45 protein_fasta URL pointing to v46 - Fix FragPipe workflow file missing trailing newline (upstream Nesvilab #2730) --- conf/gencode_references.config | 29 +++- .../local/build_proteome_reference/main.nf | 7 +- nextflow.config | 5 +- workflows/lrp2.nf | 146 ++++++++++++++---- 4 files changed, 151 insertions(+), 36 deletions(-) diff --git a/conf/gencode_references.config b/conf/gencode_references.config index 8209a60..4e7565d 
100644 --- a/conf/gencode_references.config +++ b/conf/gencode_references.config @@ -50,7 +50,7 @@ params { fasta = 'https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_45/GRCh38.primary_assembly.genome.fa.gz' gtf = 'https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_45/gencode.v45.annotation.gtf.gz' transcripts = 'https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_45/gencode.v45.transcripts.fa.gz' - protein_fasta = 'https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_46/gencode.v46.pc_translations.fa.gz' + protein_fasta = 'https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_45/gencode.v45.pc_translations.fa.gz' mito_name = 'chrM' } 'GRCh38.p14.v44' { @@ -71,6 +71,33 @@ params { protein_fasta = 'https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_43/gencode.v43.pc_translations.fa.gz' mito_name = 'chrM' } + 'GRCh38.p13.v42' { + release = 42 + genome_build = 'GRCh38.p13' + fasta = 'https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_42/GRCh38.primary_assembly.genome.fa.gz' + gtf = 'https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_42/gencode.v42.annotation.gtf.gz' + transcripts = 'https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_42/gencode.v42.transcripts.fa.gz' + protein_fasta = 'https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_42/gencode.v42.pc_translations.fa.gz' + mito_name = 'chrM' + } + 'GRCh38.p13.v41' { + release = 41 + genome_build = 'GRCh38.p13' + fasta = 'https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_41/GRCh38.primary_assembly.genome.fa.gz' + gtf = 'https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_41/gencode.v41.annotation.gtf.gz' + transcripts = 'https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_41/gencode.v41.transcripts.fa.gz' + protein_fasta = 'https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_41/gencode.v41.pc_translations.fa.gz' + 
mito_name = 'chrM' + } + 'GRCh38.p13.v40' { + release = 40 + genome_build = 'GRCh38.p13' + fasta = 'https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_40/GRCh38.primary_assembly.genome.fa.gz' + gtf = 'https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_40/gencode.v40.annotation.gtf.gz' + transcripts = 'https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_40/gencode.v40.transcripts.fa.gz' + protein_fasta = 'https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_40/gencode.v40.pc_translations.fa.gz' + mito_name = 'chrM' + } 'GRCh37.p13.v19' { release = 19 genome_build = 'GRCh37.p13' diff --git a/modules/local/build_proteome_reference/main.nf b/modules/local/build_proteome_reference/main.nf index ad693c8..948b0be 100644 --- a/modules/local/build_proteome_reference/main.nf +++ b/modules/local/build_proteome_reference/main.nf @@ -6,7 +6,7 @@ process BUILD_PROTEOME_REFERENCE { container "docker://docker.io/jtllab/lrp2-lite:latest" input: - tuple val(meta), path(lrp_fasta), path(counts), path(gencode_fasta) + tuple val(meta), path(lrp_fasta), path(counts), path(custom_fasta), path(gencode_protein_fasta) path build_proteome_reference_script val genome_name val no_gencode @@ -25,13 +25,16 @@ process BUILD_PROTEOME_REFERENCE { // Handle both generic NO_FILE and sample-specific placeholder names (e.g., sample1_NO_LRP_FASTA) def lrp_fasta_arg = (lrp_fasta.name != 'NO_FILE' && !lrp_fasta.name.contains('_NO_LRP_FASTA')) ? "--lrp_fasta ${lrp_fasta}" : "" def counts_arg = (counts.name != 'NO_FILE' && !counts.name.contains('_NO_COUNTS')) ? "--counts ${counts}" : "" - def gencode_fasta_arg = "--gencode_fasta ${gencode_fasta}" + //def gencode_fasta_arg = "--gencode_fasta ${gencode_fasta}" + def gencode_fasta_arg = (gencode_protein_fasta.name != 'NO_FILE' && !gencode_protein_fasta.name.contains('_NO_GENCODE_PROTEIN_FASTA')) ? "--gencode_fasta ${gencode_protein_fasta}" : "" def gencode_flag = no_gencode ? 
"--no_gencode" : "" + def custom_fasta_arg = (custom_fasta.name != 'NO_FILE' && !custom_fasta.name.contains('_NO_CUSTOM_FASTA')) ? "--custom_fasta ${custom_fasta}" : "" """ Rscript ${build_proteome_reference_script} \\ ${lrp_fasta_arg} \\ ${counts_arg} \\ + ${custom_fasta_arg} \\ ${gencode_fasta_arg} \\ --genome_name ${genome_name} \\ ${gencode_flag} \\ diff --git a/nextflow.config b/nextflow.config index 99103bb..6b3941a 100644 --- a/nextflow.config +++ b/nextflow.config @@ -88,8 +88,11 @@ params { protein_search = 'metamorpheus' // Protein search engine: 'fragpipe' or 'metamorpheus' (default: 'metamorpheus') // MetaMorpheus options + + // Build reference proteome options metamorpheus_config = null // Path to MetaMorpheus task configuration TOML file (default: sample_data/SearchTask.toml) - protein_fasta = null // Reference protein FASTA for database search (auto-detected from GENCODE genome if not provided) + lrp_protein_fasta = null // Pre-computed LRP protein FASTA for proteomics-only runs (headers: transcript_id|gene_id|gene_name|pclass|status) + custom_protein_fasta = null // User-provided custom protein FASTA (headers: transcript_id|gene_id at minimum) no_gencode = false // Do not include GENCODE reference in proteome database (default: false - GENCODE is included) // FragPipe options diff --git a/workflows/lrp2.nf b/workflows/lrp2.nf index a549ff5..7aa2504 100644 --- a/workflows/lrp2.nf +++ b/workflows/lrp2.nf @@ -231,49 +231,117 @@ workflow LRP2 { // Determine protein FASTA for proteomics analysis // Priority: 1) User-provided --protein_fasta, 2) Auto-detect from GENCODE genome, 3) Skip if neither available // - def protein_fasta_path = params.protein_fasta + //def protein_fasta_path = params.protein_fasta // If not provided by user, get from GENCODE genome default - if (!protein_fasta_path && params.gencode_refs?.containsKey(params.genome)) { - protein_fasta_path = params.gencode_refs[params.genome].protein_fasta - log.info 
"-${colors.purple}[sheynkmanlab/lrp2]${colors.cyan} Auto-detected protein FASTA from GENCODE genome ${params.genome}: ${protein_fasta_path}${colors.reset}-" + //if (!protein_fasta_path && params.gencode_refs?.containsKey(params.genome)) { + // protein_fasta_path = params.gencode_refs[params.genome].protein_fasta + // log.info "-${colors.purple}[sheynkmanlab/lrp2]${colors.cyan} Auto-detected protein FASTA from GENCODE genome ${params.genome}: ${protein_fasta_path}${colors.reset}-" + //} + + // Resolve GENCODE protein FASTA from --genome (e.g., GRCh38.p14.v49) + def gencode_protein_fasta_path = null + if (params.gencode_refs?.containsKey(params.genome)) { + gencode_protein_fasta_path = params.gencode_refs[params.genome].protein_fasta + log.info "-${colors.purple}[sheynkmanlab/lrp2]${colors.cyan} GENCODE protein FASTA resolved from --genome ${params.genome}: ${gencode_protein_fasta_path}${colors.reset}-" } - - def protein_fasta_file = protein_fasta_path ? file(protein_fasta_path) : null - def is_protein_fasta_gzipped = protein_fasta_file && protein_fasta_file.name.endsWith('.gz') - - if (is_protein_fasta_gzipped) { - ch_protein_fasta_input = channel.of([[ id: 'protein_fasta' ], protein_fasta_file]) - GUNZIP_PROTEIN_FASTA(ch_protein_fasta_input) - ch_protein_fasta = GUNZIP_PROTEIN_FASTA.out.gunzip.map { _meta, file -> file } - } else { - ch_protein_fasta = protein_fasta_file ? 
channel.value(protein_fasta_file) : channel.empty() + + // Resolve custom protein FASTA (optional, user-provided via --custom_protein_fasta) + def custom_protein_fasta_path = params.custom_protein_fasta ?: null + if (custom_protein_fasta_path) { + log.info "-${colors.purple}[sheynkmanlab/lrp2]${colors.cyan} Custom protein FASTA provided: ${custom_protein_fasta_path}${colors.reset}-" } - - // - // SUBWORKFLOW: Run proteomics analysis - // Runs if protein_fasta is available (either user-provided or auto-detected) AND protein samples are present - // + + // LRP protein fasta when only proteomics subworflow is run + def lrp_protein_fasta_path = params.lrp_protein_fasta ?: null + if (lrp_protein_fasta_path) { + log.info "-${colors.purple}[sheynkmanlab/lrp2]${colors.cyan} LRP protein FASTA provided: ${lrp_protein_fasta_path}${colors.reset}-" + } + + if (lrp_protein_fasta_path && custom_protein_fasta_path) { + error "--lrp_protein_fasta and --custom_protein_fasta are mutually exclusive. Provide one or neither." + } + + //def protein_fasta_file = protein_fasta_path ? file(protein_fasta_path) : null + //def is_protein_fasta_gzipped = protein_fasta_file && protein_fasta_file.name.endsWith('.gz') + + //if (is_protein_fasta_gzipped) { + // ch_protein_fasta_input = channel.of([[ id: 'protein_fasta' ], protein_fasta_file]) + // GUNZIP_PROTEIN_FASTA(ch_protein_fasta_input) + // ch_protein_fasta = GUNZIP_PROTEIN_FASTA.out.gunzip.map { _meta, file -> file } + //} else { + // ch_protein_fasta = protein_fasta_file ? 
channel.value(protein_fasta_file) : channel.empty() + //} + + // Log skip if no protein samples ch_protein_count .subscribe { count -> - if (protein_fasta_path && count == 0) { - log.warn "-${colors.purple}[sheynkmanlab/lrp2]${colors.yellow} Protein FASTA available but no protein samples detected - skipping PROTEOMICS subworkflow${colors.reset}-" - } else if (!protein_fasta_path && count > 0) { - log.warn "-${colors.purple}[sheynkmanlab/lrp2]${colors.yellow} Protein samples detected but no protein FASTA available (neither --protein_fasta provided nor auto-detected from GENCODE genome) - skipping PROTEOMICS subworkflow${colors.reset}-" - } else if (!protein_fasta_path && count == 0) { + if (count == 0) { log.info "-${colors.purple}[sheynkmanlab/lrp2]${colors.dim} No protein samples detected - skipping PROTEOMICS subworkflow${colors.reset}-" } } + ch_has_protein_samples = ch_protein_count.map { count -> count > 0 } + + // + // SUBWORKFLOW: Run proteomics analysis + // Runs if protein_fasta is available (either user-provided or auto-detected) AND protein samples are present + // + //ch_protein_count + // .subscribe { count -> + // if (protein_fasta_path && count == 0) { + // log.warn "-${colors.purple}[sheynkmanlab/lrp2]${colors.yellow} Protein FASTA available but no protein samples detected - skipping PROTEOMICS subworkflow${colors.reset}-" + // } else if (!protein_fasta_path && count > 0) { + // log.warn "-${colors.purple}[sheynkmanlab/lrp2]${colors.yellow} Protein samples detected but no protein FASTA available (neither --protein_fasta provided nor auto-detected from GENCODE genome) - skipping PROTEOMICS subworkflow${colors.reset}-" + // } else if (!protein_fasta_path && count == 0) { + // log.info "-${colors.purple}[sheynkmanlab/lrp2]${colors.dim} No protein samples detected - skipping PROTEOMICS subworkflow${colors.reset}-" + // } + // } + // note: pipeline will only execute PROTEOMICS if protein_fasta is available (user-provided or auto-detected) - if 
(protein_fasta_path) { + //if (protein_fasta_path) { + if (gencode_protein_fasta_path || custom_protein_fasta_path || lrp_protein_fasta_path) { ch_metamorpheus_config = channel.value( params.metamorpheus_config ? file(params.metamorpheus_config) : file("${projectDir}/sample_data/SearchTask.toml") ) ch_mm_writable = channel.value(file("${projectDir}/assets/mm_writable_placeholder")) - + + // + // Decompress GENCODE protein FASTA if provided and gzipped (only when protein samples exist) + // + def gencode_protein_fasta_file = gencode_protein_fasta_path ? file(gencode_protein_fasta_path) : null + def is_gencode_protein_fasta_gzipped = gencode_protein_fasta_file && gencode_protein_fasta_file.name.endsWith('.gz') + + if (is_gencode_protein_fasta_gzipped && gencode_protein_fasta_file) { + ch_gencode_protein_fasta_input = ch_has_protein_samples + .filter { it } + .map { _has_protein -> [[ id: 'gencode_protein_fasta' ], gencode_protein_fasta_file] } + GUNZIP_PROTEIN_FASTA(ch_gencode_protein_fasta_input) + ch_gencode_protein_fasta = GUNZIP_PROTEIN_FASTA.out.gunzip.map { _meta, file -> file } + } else if (gencode_protein_fasta_file) { + ch_gencode_protein_fasta = channel.value(gencode_protein_fasta_file) + } else { + ch_gencode_protein_fasta = channel.value(file('NO_FILE')) + } + + // + // Resolve custom protein FASTA channel + // + def custom_protein_fasta_file = custom_protein_fasta_path ? file(custom_protein_fasta_path) : null + ch_custom_protein_fasta = custom_protein_fasta_file + ? channel.value(custom_protein_fasta_file) + : channel.value(file('NO_FILE')) + + // + // Resolve LRP protein FASTA channel (user-provided, for proteomics-only runs) + // + def lrp_protein_fasta_file = lrp_protein_fasta_path ? file(lrp_protein_fasta_path) : null + ch_lrp_protein_fasta = lrp_protein_fasta_file + ? 
channel.value(lrp_protein_fasta_file) + : channel.value(file('NO_FILE')) + // BUILD_PROTEOME_REFERENCE search db creation logic: // - If RNA samples were processed, we build sample-specific references with LRP proteome + GENCODE concatenated // - If no RNA samples then we build GENCODE-only references per sample group @@ -282,7 +350,7 @@ workflow LRP2 { ch_predicted_proteome_fasta = PREDICTED_PROTEOME.out.protein_all_orfs_fasta .map { _meta, fasta -> fasta } .first() - .ifEmpty(file('NO_FILE')) + .ifEmpty(lrp_protein_fasta_file ?: file('NO_FILE')) ch_transcript_counts_with_id = TRANSCRIPTOME.out.hashids_all .map { meta, counts -> [meta.id, counts] } @@ -319,17 +387,31 @@ workflow LRP2 { // Build proteome references per sample group // Prepare input channel for BUILD_PROTEOME_REFERENCE + //ch_build_ref_input = ch_protein_samples_grouped + // .combine(ch_predicted_proteome_fasta) + // .combine(ch_transcript_counts) + // .combine(ch_protein_fasta) + // .map { meta, _files, lrp_fasta, counts, gencode_fasta -> + // // If lrp_fasta or counts are NO_FILE placeholders, create unique ones per sample to avoid Nextflow staging collisions when multiple samples use the same placeholder + // def unique_lrp_fasta = lrp_fasta.name == 'NO_FILE' ? file("${meta.id}_NO_LRP_FASTA") : lrp_fasta + // def unique_counts = counts.name == 'NO_FILE' ? 
file("${meta.id}_NO_COUNTS") : counts + // return [meta, unique_lrp_fasta, unique_counts, gencode_fasta] + // } + ch_build_ref_input = ch_protein_samples_grouped .combine(ch_predicted_proteome_fasta) .combine(ch_transcript_counts) - .combine(ch_protein_fasta) - .map { meta, _files, lrp_fasta, counts, gencode_fasta -> - // If lrp_fasta or counts are NO_FILE placeholders, create unique ones per sample to avoid Nextflow staging collisions when multiple samples use the same placeholder + .combine(ch_custom_protein_fasta) + .combine(ch_gencode_protein_fasta) + .map { meta, _files, lrp_fasta, counts, custom_fasta, gencode_protein_fasta -> + // Create unique placeholder names per sample to avoid Nextflow staging collisions def unique_lrp_fasta = lrp_fasta.name == 'NO_FILE' ? file("${meta.id}_NO_LRP_FASTA") : lrp_fasta def unique_counts = counts.name == 'NO_FILE' ? file("${meta.id}_NO_COUNTS") : counts - return [meta, unique_lrp_fasta, unique_counts, gencode_fasta] + def unique_custom = custom_fasta.name == 'NO_FILE' ? file("${meta.id}_NO_CUSTOM_FASTA") : custom_fasta + def unique_gencode = gencode_protein_fasta.name == 'NO_FILE' ? file("${meta.id}_NO_GENCODE_PROTEIN_FASTA") : gencode_protein_fasta + return [meta, unique_lrp_fasta, unique_counts, unique_custom, unique_gencode] } - + // Script path for build_mass_spec_reference.R ch_build_proteome_script = channel.value(file("${projectDir}/bin/build_mass_spec_reference.R")) From 70f8a130b1f024defe072f9dc0c173cf92d1109a Mon Sep 17 00:00:00 2001 From: mschertzer Date: Wed, 22 Apr 2026 18:12:34 -0400 Subject: [PATCH 06/31] fix: ungate BED peptide mapping from end-to-end LRP requirement, add flexible FASTA/GTF inputs to NOVEL_PEPTIDES Previously, peptide-to-genome BED mapping only ran when the full LRP pipeline was executed end to end. GENCODE and custom/LRP BED mapping now run independently based on their own inputs. 
- Replace --lr_cds_gtf/--lr_orf_fasta with --custom_fasta, --custom_gtf, --gencode_fasta, --gencode_gtf in novel_peptides.R and module - Add params.lrp_gtf and params.custom_gtf for proteomics-only runs - Fallback chain: pipeline output > --lrp_gtf/fasta > --custom_gtf/fasta (mirrors BUILD_PROTEOME_REFERENCE pattern) --- bin/novel_peptides.R | 134 ++++++++++++++++----------- modules/local/novel_peptides/main.nf | 23 +++-- nextflow.config | 12 ++- subworkflows/local/proteomics.nf | 22 +++-- workflows/lrp2.nf | 32 ++++++- 5 files changed, 141 insertions(+), 82 deletions(-) diff --git a/bin/novel_peptides.R b/bin/novel_peptides.R index 9fb698a..bae3753 100644 --- a/bin/novel_peptides.R +++ b/bin/novel_peptides.R @@ -55,10 +55,14 @@ option_list = list( help = "Fragpipe acquisition tpye: 'DIA' or 'DDA' [required for fragpipe]"), make_option(c("--fragpipe_results_dir"), type = "character", default = NULL, help = "Path to FragPipe results directory (e.g., results/S5_PROTEOMICS/M3_FRAGPIPE/sample_name)"), - make_option(c("--lr_cds_gtf"), type = "character", default = NULL, - help = "Path to LR transcript GTF with CDS entries (enables peptide-to-genome mapping)"), - make_option(c("--lr_orf_fasta"), type = "character", default = NULL, - help = "Path to LR protein FASTA (required if --lr_cds_gtf is provided)"), + make_option(c("--custom_gtf"), type = "character", default = NULL, + help = "Path to custom/LRP transcript GTF with CDS entries"), + make_option(c("--custom_fasta"), type = "character", default = NULL, + help = "Path to custom/LRP protein FASTA (headers: transcript_id|gene_id at minimum)"), + make_option(c("--gencode_fasta"), type = "character", default = NULL, + help = "Path to GENCODE protein FASTA (pc_translations.fa)"), + make_option(c("--gencode_gtf"), type = "character", default = NULL, + help = "Path to GENCODE annotation GTF (basic.annotation.gtf)"), make_option(c("--gencode_version"), type = "character", default = "46", help = "GENCODE version for GTF 
download [default: 46]"), make_option(c("--species"), type = "character", default = "human", @@ -90,13 +94,22 @@ if (!opt$species %in% c("human", "mouse")) { stop("--species must be 'human' or 'mouse' to pull proper GENCODE gtf") } -# optional lr fasta and gtf requirements if peptide mapping -if (!is.null(opt$lr_cds_gtf) && is.null(opt$lr_orf_fasta)) { - stop("--lr_orf_fasta is required when --lr_cds_gtf is provided") +# optional custom fasta and gtf requirements if peptide mapping +if (!is.null(opt$custom_gtf) && is.null(opt$custom_fasta)) { + stop("--custom_fasta is required when --custom_gtf is provided") } -if (!is.null(opt$lr_orf_fasta) && is.null(opt$lr_cds_gtf)) { - stop("--lr_cds_gtf is required when --lr_orf_fasta is provided") +if (!is.null(opt$custom_fasta) && is.null(opt$custom_gtf)) { + stop("--custom_gtf is required when --custom_fasta is provided") } + +# gencode fasta and gtf should be provided together +if (!is.null(opt$gencode_fasta) && is.null(opt$gencode_gtf)) { + stop("--gencode_gtf is required when --gencode_fasta is provided") +} +if (!is.null(opt$gencode_gtf) && is.null(opt$gencode_fasta)) { + stop("--gencode_fasta is required when --gencode_gtf is provided") +} + # ============================================================================= # Function to map peptides to genome # ============================================================================= @@ -447,40 +460,48 @@ cat(" Annotated (GENCODE only):", n_annotated_gencode, "\n") cat(" Peptides with best status NMD:", sum(out$best_status == "NMD", na.rm = TRUE), "\n") # ============================================================================= -# Optional: Map peptides to genomic coordinates +# Map peptides to genomic coordinates # ============================================================================= -if (!is.null(opt$lr_cds_gtf) && !is.null(opt$lr_orf_fasta)) { +# GENCODE and custom/LRP are independent — either or both can run +# Each requires its own FASTA + GTF pair + 
+gencode_bed = NULL +custom_bed = NULL + +# --- GENCODE peptides (runs whenever there are GENCODE-only peptides) --- + +gencode_peptides = out %>% + filter(rna_detection_status == "RNA_not_detected") + +if (nrow(gencode_peptides) > 0) { + cat("\n=== GENCODE peptides ===\n") + + gencode_peptides %<>% + select(Sequence, transcript_id, gene_id, PSM) %>% + separate_rows(transcript_id, sep = ",") %>% + separate_rows(gene_id, sep = ",") %>% + filter(transcript_id != "NA") - # --- GENCODE peptides --- - gencode_bed = NULL - gencode_peptides = out %>% - filter(rna_detection_status == "RNA_not_detected") + # Remove peptides mapping to multiple genes + multi_gene = gencode_peptides %>% + distinct(Sequence, gene_id) %>% + count(Sequence) %>% + filter(n > 1) + cat("Removing", nrow(multi_gene), "peptides mapping to multiple genes\n") - if (nrow(gencode_peptides) > 0) { - cat("\n=== GENCODE peptides ===\n") - - gencode_peptides %<>% - select(Sequence, transcript_id, gene_id, PSM) %>% - separate_rows(transcript_id, sep = ",") %>% - separate_rows(gene_id, sep = ",") %>% - filter(transcript_id != "NA") - - # Remove peptides mapping to multiple genes - multi_gene = gencode_peptides %>% - distinct(Sequence, gene_id) %>% - count(Sequence) %>% - filter(n > 1) - cat("Removing", nrow(multi_gene), "peptides mapping to multiple genes\n") - - unique_gencode_peptides = gencode_peptides %>% - filter(!Sequence %in% multi_gene$Sequence) %>% - distinct(Sequence, transcript_id, PSM) %>% - group_by(Sequence, PSM) %>% - slice(1) %>% - ungroup() + unique_gencode_peptides = gencode_peptides %>% + filter(!Sequence %in% multi_gene$Sequence) %>% + distinct(Sequence, transcript_id, PSM) %>% + group_by(Sequence, PSM) %>% + slice(1) %>% + ungroup() - # Download gencode fasta and gtf + # Use provided GENCODE files, fall back to download + if (!is.null(opt$gencode_fasta) && !is.null(opt$gencode_gtf)) { + gencode_fa_local_path = opt$gencode_fasta + gencode_gtf_local_path = opt$gencode_gtf + } else { if 
(opt$species == "human") { gencode_base = paste0("https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_", opt$gencode_version) gencode_ver = paste0("v", opt$gencode_version) @@ -501,14 +522,17 @@ if (!is.null(opt$lr_cds_gtf) && !is.null(opt$lr_orf_fasta)) { if (!file.exists(gencode_gtf_local_path)) { download.file(gencode_gtf, gencode_gtf_local_path, mode = "wb") } - - # Map GENCODE peptides - gencode_bed = map_peptides_to_genome(unique_gencode_peptides, gencode_fa_local_path, gencode_gtf_local_path) - } - # --- LRP peptides --- - cat("\n=== LRP Peptides ===\n") + # Map GENCODE peptides + gencode_bed = map_peptides_to_genome(unique_gencode_peptides, gencode_fa_local_path, gencode_gtf_local_path) + +} + +# --- Custom/LRP peptides (only when custom/lrp fasta + gtf provided) --- +if (!is.null(opt$custom_fasta) && !is.null(opt$custom_gtf)) { + cat("\n=== Custom/LRP Peptides ===\n") + lr_peptides = out %>% filter(rna_detection_status == "RNA_detected") %>% select(Sequence, transcript_id, gene_id, PSM) %>% @@ -530,18 +554,18 @@ if (!is.null(opt$lr_cds_gtf) && !is.null(opt$lr_orf_fasta)) { slice(1) %>% ungroup() %>% mutate(transcript_id = as.character(transcript_id)) - - # Map LR peptides - lr_bed = map_peptides_to_genome(unique_lr_peptides, opt$lr_orf_fasta, opt$lr_cds_gtf) - - all_bed = bind_rows(gencode_bed, lr_bed) - - if (nrow(all_bed) > 0) { - bed_outfile = file.path(opt$outdir, paste0(opt$sample_name, ".proteomics.all_peptides.bed")) - write_tsv(all_bed %>% select(1:12), bed_outfile, col_names = FALSE) - cat("\nWrote", nrow(all_bed), "BED12 entries to", bed_outfile, "\n") - } + # Map custom/LRP peptides + custom_bed = map_peptides_to_genome(unique_lr_peptides, opt$custom_fasta, opt$custom_gtf) } +all_bed = bind_rows(gencode_bed, custom_bed) + +if (nrow(all_bed) > 0) { + bed_outfile = file.path(opt$outdir, paste0(opt$sample_name, ".proteomics.all_peptides.bed")) + write_tsv(all_bed %>% select(1:12), bed_outfile, col_names = FALSE) + cat("\nWrote", 
nrow(all_bed), "BED12 entries to", bed_outfile, "\n") +} + + cat("\nDone.\n") \ No newline at end of file diff --git a/modules/local/novel_peptides/main.nf b/modules/local/novel_peptides/main.nf index 53b93d9..d135be6 100644 --- a/modules/local/novel_peptides/main.nf +++ b/modules/local/novel_peptides/main.nf @@ -6,7 +6,7 @@ process NOVEL_PEPTIDES { container "docker://docker.io/jtllab/lrp2-lite:latest" input: - tuple val(meta), path(fragpipe_peptide_file), path(reference_fasta), path(lr_cds_gtf), path(lr_orf_fasta) + tuple val(meta), path(fragpipe_peptide_file), path(reference_fasta), path(custom_gtf), path(custom_fasta), path(gencode_gtf), path(gencode_fasta) path novel_peptides_script val gencode_version @@ -23,17 +23,22 @@ process NOVEL_PEPTIDES { def prefix = task.ext.prefix ?: "${meta.id}" def ms_search_software = meta.protein_search ?: 'fragpipe' def acquisition_type = meta.mass_spec_type ?: 'DDA' - def has_lr_data = (!lr_cds_gtf.name.contains('_NO_GTF') && !lr_orf_fasta.name.contains('_NO_FASTA')) - def lr_gtf_arg = has_lr_data ? "--lr_cds_gtf ${lr_cds_gtf}" : "" - def lr_fasta_arg = has_lr_data ? "--lr_orf_fasta ${lr_orf_fasta}" : "" + def has_custom_data = (!custom_gtf.name.contains('_NO_GTF') && !custom_fasta.name.contains('_NO_FASTA')) + def custom_gtf_arg = has_custom_data ? "--custom_gtf ${custom_gtf}" : "" + def custom_fasta_arg = has_custom_data ? "--custom_fasta ${custom_fasta}" : "" + def has_gencode_data = (!gencode_gtf.name.contains('_NO_GTF') && !gencode_fasta.name.contains('_NO_FASTA')) + def gencode_gtf_arg = has_gencode_data ? "--gencode_gtf ${gencode_gtf}" : "" + def gencode_fasta_arg = has_gencode_data ? "--gencode_fasta ${gencode_fasta}" : "" """ echo "NOVEL PEPTIDES CLASSIFICATION: ${meta.id}" echo "Sample: ${prefix}" echo "Search software: ${ms_search_software}" echo "Acquisition type: ${acquisition_type}" - echo "LR CDS GTF: ${has_lr_data ? lr_cds_gtf : 'N/A (no matched RNA)'}" - echo "LR ORF FASTA: ${has_lr_data ? 
lr_orf_fasta : 'N/A (no matched RNA)'}" + echo "Custom GTF: ${has_custom_data ? custom_gtf : 'N/A (no custom data)'}" + echo "Custom FASTA: ${has_custom_data ? custom_fasta : 'N/A (no custom data)'}" + echo "GENCODE GTF: ${has_gencode_data ? gencode_gtf : 'N/A (will use fallback)'}" + echo "GENCODE FASTA: ${has_gencode_data ? gencode_fasta : 'N/A (will use fallback)'}" echo "FragPipe peptide file: ${fragpipe_peptide_file}" echo "" @@ -58,8 +63,10 @@ process NOVEL_PEPTIDES { --ms_search_software ${ms_search_software} \\ --acquisition_type ${acquisition_type} \\ --fragpipe_results_dir \$FRAGPIPE_RESULTS_DIR \\ - ${lr_gtf_arg} \\ - ${lr_fasta_arg} \\ + ${custom_gtf_arg} \\ + ${custom_fasta_arg} \\ + ${gencode_gtf_arg} \\ + ${gencode_fasta_arg} \\ --gencode_version ${gencode_version} \\ --outdir . \\ $args diff --git a/nextflow.config b/nextflow.config index 6b3941a..e1ccd00 100644 --- a/nextflow.config +++ b/nextflow.config @@ -88,12 +88,14 @@ params { protein_search = 'metamorpheus' // Protein search engine: 'fragpipe' or 'metamorpheus' (default: 'metamorpheus') // MetaMorpheus options - - // Build reference proteome options metamorpheus_config = null // Path to MetaMorpheus task configuration TOML file (default: sample_data/SearchTask.toml) - lrp_protein_fasta = null // Pre-computed LRP protein FASTA for proteomics-only runs (headers: transcript_id|gene_id|gene_name|pclass|status) - custom_protein_fasta = null // User-provided custom protein FASTA (headers: transcript_id|gene_id at minimum) - no_gencode = false // Do not include GENCODE reference in proteome database (default: false - GENCODE is included) + + // Build reference proteome and novel peptides options + lrp_protein_fasta = null // Pre-computed LRP protein FASTA for proteomics-only runs (headers: transcript_id|gene_id|gene_name|pclass|status) + lrp_gtf = null // LRP CDS GTF from a previous run (paired with lrp_protein_fasta) + custom_protein_fasta = null // User-provided custom protein FASTA (headers: 
transcript_id|gene_id at minimum) + custom_gtf = null // User-provided custom GTF with CDS entries (paired with custom_protein_fasta) + no_gencode = false // Do not include GENCODE reference in proteome database (default: false - GENCODE is included) // FragPipe options fragpipe_workflow = null // Path to custom FragPipe workflow file (optional - auto-downloads DIA_SpecLib_Quant.workflow for DIA or LFQ-MBR.workflow for DDA if not specified) diff --git a/subworkflows/local/proteomics.nf b/subworkflows/local/proteomics.nf index 3688a85..3f1d633 100644 --- a/subworkflows/local/proteomics.nf +++ b/subworkflows/local/proteomics.nf @@ -32,8 +32,10 @@ workflow PROTEOMICS { // Novel peptides inputs rna_sample_ids // channel: List of RNA sample IDs (to check for matched RNA data) - lr_cds_gtf // path: Long-read CDS GTF (only for samples with matched RNA sample) - lr_orf_fasta // path: Long-read ORF FASTA (only for samples with matched RNA sample) + lr_cds_gtf // path: Custom/LRP CDS GTF (only for samples with matched RNA sample) + lr_orf_fasta // path: Custom/LRP ORF FASTA (only for samples with matched RNA sample) + gencode_gtf_for_novel // path: GENCODE annotation GTF (for novel peptides BED mapping) + gencode_fasta_for_novel // path: GENCODE protein FASTA (for novel peptides BED mapping) genome // val: Genome reference name (e.g., 'GRCh38.p14.v49') main: @@ -181,13 +183,15 @@ workflow PROTEOMICS { ch_novel_peptides_input = ch_fragpipe_peptide_files .combine(ch_protein_dbs, by: 0) .map { sample_name, meta, peptide_file, protein_fasta -> - def rna_ids = rna_sample_ids.val - def lr_gtf_file = lr_cds_gtf.val - def lr_fasta_file = lr_orf_fasta.val - def has_rna = rna_ids.contains(sample_name) - def lr_gtf = has_rna ? lr_gtf_file : file("${sample_name}_NO_GTF") - def lr_fasta = has_rna ? 
lr_fasta_file : file("${sample_name}_NO_FASTA") - return [meta, peptide_file, protein_fasta, lr_gtf, lr_fasta] + def custom_gtf = lr_cds_gtf.val + def custom_fasta = lr_orf_fasta.val + def gc_gtf = gencode_gtf_for_novel.val + def gc_fasta = gencode_fasta_for_novel.val + return [meta, peptide_file, protein_fasta, + custom_gtf.name != 'NO_FILE' ? custom_gtf : file("${sample_name}_NO_GTF"), + custom_fasta.name != 'NO_FILE' ? custom_fasta : file("${sample_name}_NO_FASTA"), + gc_gtf.name != 'NO_FILE' ? gc_gtf : file("${sample_name}_GENCODE_NO_GTF"), /* distinct prefix: sharing a staged name with the custom placeholder causes an input file name collision; still contains the _NO_GTF sentinel */ + gc_fasta.name != 'NO_FILE' ? gc_fasta : file("${sample_name}_GENCODE_NO_FASTA")] /* same reasoning; still contains the _NO_FASTA sentinel */ } def novel_peptides_script = file("${projectDir}/bin/novel_peptides.R") diff --git a/workflows/lrp2.nf b/workflows/lrp2.nf index 7aa2504..f6f6c23 100644 --- a/workflows/lrp2.nf +++ b/workflows/lrp2.nf @@ -252,6 +252,12 @@ workflow LRP2 { log.info "-${colors.purple}[sheynkmanlab/lrp2]${colors.cyan} Custom protein FASTA provided: ${custom_protein_fasta_path}${colors.reset}-" } + // Resolve custom GTF (optional, paired with custom_protein_fasta) + def custom_gtf_file = params.custom_gtf ? file(params.custom_gtf) : null + if (custom_gtf_file) { + log.info "-${colors.purple}[sheynkmanlab/lrp2]${colors.cyan} Custom GTF provided: ${custom_gtf_file}${colors.reset}-" + } + // LRP protein fasta when only proteomics subworflow is run def lrp_protein_fasta_path = params.lrp_protein_fasta ?: null if (lrp_protein_fasta_path) { @@ -262,6 +268,13 @@ workflow LRP2 { error "--lrp_protein_fasta and --custom_protein_fasta are mutually exclusive. Provide one or neither." } + // Resolve LRP GTF (optional, for proteomics-only runs with prior LRP output) + def lrp_gtf_file = params.lrp_gtf ? file(params.lrp_gtf) : null + if (lrp_gtf_file) { + log.info "-${colors.purple}[sheynkmanlab/lrp2]${colors.cyan} LRP GTF provided: ${lrp_gtf_file}${colors.reset}-" + } + + //def protein_fasta_file = protein_fasta_path ? 
file(protein_fasta_path) : null //def is_protein_fasta_gzipped = protein_fasta_file && protein_fasta_file.name.endsWith('.gz') @@ -360,17 +373,24 @@ workflow LRP2 { ch_transcript_counts = ch_transcript_counts_with_id .map { rna_id, counts -> counts } - // Extract CDS GTF and ORF FASTA for novel peptides classification + // NOVEL_PEPTIDES Extract CDS GTF and ORF FASTA ch_lr_cds_gtf = PREDICTED_PROTEOME.out.cds_gtf .map { _meta, gtf -> gtf } .first() - .ifEmpty(file('NO_FILE')) + .ifEmpty(lrp_gtf_file ?: custom_gtf_file ?: file('NO_FILE')) ch_lr_orf_fasta = PREDICTED_PROTEOME.out.protein_all_orfs_fasta .map { _meta, fasta -> fasta } .first() - .ifEmpty(file('NO_FILE')) + .ifEmpty(lrp_protein_fasta_file ?: custom_protein_fasta_file ?: file('NO_FILE')) + // Resolve GENCODE annotation GTF for novel peptides BED mapping + // Use the already-decompressed ch_gtf if available, otherwise resolve from params + def gencode_gtf_for_novel = params.gencode_gtf ? file(params.gencode_gtf) : null + ch_gencode_gtf_for_novel = gencode_gtf_for_novel + ? 
channel.value(gencode_gtf_for_novel) + : channel.value(file('NO_FILE')) + // Create a channel that maps each protein sample to its sample_name for grouping // Group protein samples by sample_name (the biosample group) ch_protein_samples_grouped = ch_protein_samples_filtered @@ -451,8 +471,10 @@ workflow LRP2 { params.fragpipe_license_accept, // Novel peptides inputs ch_rna_sample_names, // List of RNA sample names to check for matches - ch_lr_cds_gtf, // LR CDS GTF (for samples with matched RNA) - ch_lr_orf_fasta, // LR ORF FASTA (for samples with matched RNA) + ch_lr_cds_gtf, // Custom/LRP CDS GTF (for samples with matched RNA) + ch_lr_orf_fasta, // Custom/LRP ORF FASTA (for samples with matched RNA) + ch_gencode_gtf_for_novel, // GENCODE annotation GTF (for BED mapping) + ch_gencode_protein_fasta, // GENCODE protein FASTA (for BED mapping) params.genome ) ch_versions = ch_versions.mix(PROTEOMICS.out.versions.ifEmpty([])) From 366587845fd6a846e490a3e52f70d31d8f1bdd34 Mon Sep 17 00:00:00 2001 From: aline2593 Date: Wed, 15 Apr 2026 16:22:42 -0400 Subject: [PATCH 07/31] feat: improve HPC portability for non-SLURM environments MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Add LSF scheduler configuration. - Resolve FASTA database path inside container at runtime in FragPipe  module. Path was previously resolved at Nextflow script generation  time, which could produce a path not accessible inside the  Apptainer/Singularity container. Switched to printf with $(pwd) for  runtime resolution. - Replace heredoc with echo statements for versions.yml generation in FragPipe module to avoid indentation-sensitive parsing failures. 
--- conf/lsf.config | 27 +++++++++++++++++++++++++++ modules/local/fragpipe/main.nf | 29 +++++++++++------------------ nextflow.config | 3 +++ 3 files changed, 41 insertions(+), 18 deletions(-) create mode 100644 conf/lsf.config diff --git a/conf/lsf.config b/conf/lsf.config new file mode 100644 index 0000000..68929f7 --- /dev/null +++ b/conf/lsf.config @@ -0,0 +1,27 @@ +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Config file for LSF HPC cluster (Minerva/NYGC) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +*/ +process { + executor = 'lsf' + queue = 'premium' + clusterOptions = '-P acc_bigbrain' +} + +executor { + name = 'lsf' + perJobMemLimit = true + pollInterval = '2 min' + queueStatInterval = '10 min' + submitRateLimit = '10/1min' + exitReadTimeout = '24h' + queueSize = 50 +} + +singularity { + enabled = true + autoMounts = true + pullTimeout = '120 min' + runOptions = '--disable-cache --userns' +} diff --git a/modules/local/fragpipe/main.nf b/modules/local/fragpipe/main.nf index f6cebd6..a5410da 100644 --- a/modules/local/fragpipe/main.nf +++ b/modules/local/fragpipe/main.nf @@ -142,11 +142,8 @@ process FRAGPIPE { # Update workflow with database path and decoy tag WORK_DIR="\$(pwd)" - - # Ensure workflow file ends with a newline (some upstream versions omit it) - sed -i -e '\$a\\' workflow.workflow - echo "database.db-path=\$WORK_DIR/database_with_decoys.fasta" >> workflow.workflow - echo "database.decoy-tag=${decoy_tag}" >> workflow.workflow + printf "\ndatabase.db-path=%s\n" "\$WORK_DIR/database_with_decoys.fasta" >> workflow.workflow + printf "database.decoy-tag=%s\n" "${decoy_tag}" >> workflow.workflow echo "" @@ -367,12 +364,10 @@ process FRAGPIPE { echo "FragPipe analysis completed for: ${meta.id}" echo "==========================================================================" - cat <<-END_VERSIONS > versions.yml - "${task.process}": - fragpipe: \$(\$FRAGPIPE_CMD 
--version 2>&1 | grep -oP 'FragPipe \\K[0-9.]+' || echo "24.0") - msfragger: \$(basename ${msfragger_jar} | grep -oP 'MSFragger-\\K[0-9.]+' || echo "unknown") - ionquant: \$(basename ${ionquant_jar} | grep -oP 'IonQuant-\\K[0-9.]+' || echo "unknown") - END_VERSIONS + echo '"${task.process}":' > versions.yml # single quotes keep the YAML key quoted, as the replaced heredoc did + echo " fragpipe: \$(\$FRAGPIPE_CMD --version 2>&1 | grep -oP 'FragPipe \\K[0-9.]+' || echo \"${params.fragpipe_version}\")" >> versions.yml + echo " msfragger: \$(basename ${msfragger_jar} | grep -oP 'MSFragger-\\K[0-9.]+' || echo \"unknown\")" >> versions.yml + echo " ionquant: \$(basename ${ionquant_jar} | grep -oP 'IonQuant-\\K[0-9.]+' || echo \"unknown\")" >> versions.yml """ stub: @@ -383,11 +378,9 @@ process FRAGPIPE { touch results/psm.tsv touch results/combined_protein.tsv - cat <<-END_VERSIONS > versions.yml - "${task.process}": - fragpipe: 24.0 - msfragger: 4.1 - ionquant: 1.10.12 - END_VERSIONS + echo '"${task.process}":' > versions.yml + echo " fragpipe: ${params.fragpipe_version}" >> versions.yml + echo " msfragger: 4.1" >> versions.yml + echo " ionquant: 1.10.12" >> versions.yml """ -} \ No newline at end of file +} diff --git a/nextflow.config b/nextflow.config index e1ccd00..700294d 100644 --- a/nextflow.config +++ b/nextflow.config @@ -260,6 +260,9 @@ profiles { apptainer.runOptions = '--nv' singularity.runOptions = '--nv' } + lsf { + includeConfig 'conf/lsf.config' + } slurm { includeConfig 'conf/slurm.config' } From d597acad5e9d06afbe1dd1a7ecc4386673f1e655 Mon Sep 17 00:00:00 2001 From: = Date: Mon, 11 May 2026 14:37:28 -0400 Subject: [PATCH 08/31] fix: cleanup test param values for lsf config --- conf/lsf.config | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/conf/lsf.config b/conf/lsf.config index 68929f7..30193db 100644 --- a/conf/lsf.config +++ b/conf/lsf.config @@ -5,8 +5,8 @@ */ process { executor = 'lsf' - queue = 'premium' - clusterOptions = '-P acc_bigbrain' + queue = '' + clusterOptions = '' } executor { 
From ff52fa885a4079acbe77f0a8bf943abf35fd5088 Mon Sep 17 00:00:00 2001 From: = Date: Mon, 11 May 2026 14:59:32 -0400 Subject: [PATCH 09/31] feat: add --hpc_executor, --hpc_queue, and --hpc_cluster_options as optional run parameters for slurm or lsf configs --- conf/lsf.config | 7 ++++--- conf/slurm.config | 7 ++++--- nextflow.config | 7 +++++++ 3 files changed, 15 insertions(+), 6 deletions(-) diff --git a/conf/lsf.config b/conf/lsf.config index 30193db..96cfabd 100644 --- a/conf/lsf.config +++ b/conf/lsf.config @@ -4,9 +4,10 @@ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */ process { - executor = 'lsf' - queue = '' - clusterOptions = '' + // Users can override these defaults via command-line params: --hpc_executor, --hpc_queue, --hpc_cluster_options + executor = params.hpc_executor ?: 'lsf' + queue = params.hpc_queue ?: '' + clusterOptions = params.hpc_cluster_options ?: '' } executor { diff --git a/conf/slurm.config b/conf/slurm.config index c5bcc99..bd94240 100644 --- a/conf/slurm.config +++ b/conf/slurm.config @@ -6,10 +6,11 @@ process { // Here we specify slurm as the executor explicitly: for -profile param you will also need to specify 'slurm', see docs - executor = 'slurm' + // Users can override these defaults via command-line params: --hpc_executor, --hpc_queue, --hpc_cluster_options + executor = params.hpc_executor ?: 'slurm' // Queue name should match the name used by your HPC cluster (typically 'cpu', 'gpu', etc.) - queue = 'standard' - clusterOptions = '' + queue = params.hpc_queue ?: 'standard' + clusterOptions = params.hpc_cluster_options ?: '' // Here you may specify custom resource allocations for each module tag. 
//withLabel:process_single { diff --git a/nextflow.config b/nextflow.config index 700294d..48bb846 100644 --- a/nextflow.config +++ b/nextflow.config @@ -117,6 +117,13 @@ params { // Accession mapping and novel peptides options //uniprot_fasta = null // UniProt reviewed protein FASTA (for accession mapping) + // HPC cluster options + // ------------------------------------ + // These parameters can override default executor settings in slurm.config or lsf.config + hpc_executor = null // OPTIONAL: Override executor type (e.g., 'slurm', 'lsf', 'local') + hpc_queue = null // OPTIONAL: Override queue/partition name for job submission + hpc_cluster_options = null // OPTIONAL: Override cluster-specific options (e.g., '--account=myaccount') + // Boilerplate options outdir = null publish_dir_mode = 'copy' From 47994bb0a03a74f8d8c6a9560a4f5f8480761556 Mon Sep 17 00:00:00 2001 From: = Date: Tue, 12 May 2026 10:57:23 -0400 Subject: [PATCH 10/31] feat (closes #46): add support for mouse data with preset GENCODE reference support, species-based hexamer model selection for CPAT, fix overwrite bug with RNA samples metadata for MULTISAMPLE_ANALYSIS triggered by running multiple datasets in same repository --- conf/gencode.config | 6 +++ conf/gencode_references.config | 45 +++++++++++++++++++ .../local/utils_nfcore_lrp2_pipeline/main.nf | 28 ++++++++++++ workflows/lrp2.nf | 9 +++- 4 files changed, 87 insertions(+), 1 deletion(-) diff --git a/conf/gencode.config b/conf/gencode.config index 25197be..f6b1299 100644 --- a/conf/gencode.config +++ b/conf/gencode.config @@ -12,4 +12,10 @@ params { fasta = params.genomes && params.genome && params.genomes.containsKey(params.genome) ? params.genomes[params.genome].fasta : null gencode_gtf = params.genomes && params.genome && params.genomes.containsKey(params.genome) ? params.genomes[params.genome].gtf : null gencode_fasta = params.genomes && params.genome && params.genomes.containsKey(params.genome) ? 
params.genomes[params.genome].fasta : null + + // Auto-detect species from genome parameter if using a predefined genome + // Note: if user provides custom gtf/fasta without --genome, they should manually set --species + species = params.genomes && params.genome && params.genomes.containsKey(params.genome) ? + (params.genome.startsWith('GRCm') ? 'mouse' : 'human') : + params.species } diff --git a/conf/gencode_references.config b/conf/gencode_references.config index 4e7565d..1856501 100644 --- a/conf/gencode_references.config +++ b/conf/gencode_references.config @@ -107,5 +107,50 @@ params { protein_fasta = 'https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_19/gencode.v19.pc_translations.fa.gz' mito_name = 'MT' } + 'GRCm39.vM38' { + release = 'M38' + genome_build = 'GRCm39' + fasta = 'https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_mouse/release_M38/GRCm39.primary_assembly.genome.fa.gz' + gtf = 'https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_mouse/release_M38/gencode.vM38.annotation.gtf.gz' + transcripts = 'https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_mouse/release_M38/gencode.vM38.transcripts.fa.gz' + protein_fasta = 'https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_mouse/release_M38/gencode.vM38.pc_translations.fa.gz' + mito_name = 'chrM' + } + 'GRCm39.vM37' { + release = 'M37' + genome_build = 'GRCm39' + fasta = 'https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_mouse/release_M37/GRCm39.primary_assembly.genome.fa.gz' + gtf = 'https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_mouse/release_M37/gencode.vM37.annotation.gtf.gz' + transcripts = 'https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_mouse/release_M37/gencode.vM37.transcripts.fa.gz' + protein_fasta = 'https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_mouse/release_M37/gencode.vM37.pc_translations.fa.gz' + mito_name = 'chrM' + } + 'GRCm39.vM36' { + release = 'M36' + genome_build = 'GRCm39' + fasta = 
'https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_mouse/release_M36/GRCm39.primary_assembly.genome.fa.gz' + gtf = 'https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_mouse/release_M36/gencode.vM36.annotation.gtf.gz' + transcripts = 'https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_mouse/release_M36/gencode.vM36.transcripts.fa.gz' + protein_fasta = 'https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_mouse/release_M36/gencode.vM36.pc_translations.fa.gz' + mito_name = 'chrM' + } + 'GRCm39.vM35' { + release = 'M35' + genome_build = 'GRCm39' + fasta = 'https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_mouse/release_M35/GRCm39.primary_assembly.genome.fa.gz' + gtf = 'https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_mouse/release_M35/gencode.vM35.annotation.gtf.gz' + transcripts = 'https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_mouse/release_M35/gencode.vM35.transcripts.fa.gz' + protein_fasta = 'https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_mouse/release_M35/gencode.vM35.pc_translations.fa.gz' + mito_name = 'chrM' + } + 'GRCm39.vM34' { + release = 'M34' + genome_build = 'GRCm39' + fasta = 'https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_mouse/release_M34/GRCm39.primary_assembly.genome.fa.gz' + gtf = 'https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_mouse/release_M34/gencode.vM34.annotation.gtf.gz' + transcripts = 'https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_mouse/release_M34/gencode.vM34.transcripts.fa.gz' + protein_fasta = 'https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_mouse/release_M34/gencode.vM34.pc_translations.fa.gz' + mito_name = 'chrM' + } } } diff --git a/subworkflows/local/utils_nfcore_lrp2_pipeline/main.nf b/subworkflows/local/utils_nfcore_lrp2_pipeline/main.nf index ee43a76..24ed3fb 100644 --- a/subworkflows/local/utils_nfcore_lrp2_pipeline/main.nf +++ b/subworkflows/local/utils_nfcore_lrp2_pipeline/main.nf @@ -350,6 +350,34 @@ def sanitizeSamplesheet(input_file) { // def validateInputParameters() { 
genomeExistsError() + validateSpeciesParameter() +} + +// +// Validate species parameter when using custom references +// +def validateSpeciesParameter() { + // If using custom gtf/fasta (not from gencode_refs), require manual species setting + def using_custom_refs = (params.fasta && !params.genome) || (params.gencode_gtf && !params.genome) + def has_rna_samples = params.input ? true : false // validated later in samplesheet parsing + + if (using_custom_refs && has_rna_samples && !params.species) { + def error_string = "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n" + + " ERROR: --species parameter is required when using custom fasta/gtf files.\n" + + " You are using custom reference files without the --genome parameter.\n" + + " Please specify --species with either 'human' or 'mouse'.\n" + + " Example: --fasta /path/to/genome.fa --gencode_gtf /path/to/annotation.gtf --species mouse\n" + + "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" + error(error_string) + } + + if (params.species && !['human', 'mouse'].contains(params.species)) { + def error_string = "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n" + + " ERROR: Invalid --species parameter: '${params.species}'\n" + + " Allowed values are: 'human', 'mouse'\n" + + "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" + error(error_string) + } } // diff --git a/workflows/lrp2.nf b/workflows/lrp2.nf index f6f6c23..a28106f 100644 --- a/workflows/lrp2.nf +++ b/workflows/lrp2.nf @@ -181,6 +181,13 @@ workflow LRP2 { // // SUBWORKFLOW: Run predicted proteome analysis (only if RNA samples present) // + // Log species information + if (params.genome && params.gencode_refs?.containsKey(params.genome)) { + log.info "-${colors.purple}[sheynkmanlab/lrp2]${colors.cyan} Auto-detected species from genome ${params.genome}: ${params.species}${colors.reset}-" + } else if (params.species) { + 
log.info "-${colors.purple}[sheynkmanlab/lrp2]${colors.cyan} Species: ${params.species}${colors.reset}-" + } + // Determine species-specific CPAT files def hexamer_file = params.species == 'human' ? file(params.human_hexamer) : file(params.mouse_hexamer) @@ -557,7 +564,7 @@ workflow LRP2 { ch_orfs = PREDICTED_PROTEOME.out.hashids_orf // Create RNA-only metadata file by filtering the original samplesheet - def rna_metadata_file = file("${workDir}/rna_samples_metadata.csv") + def rna_metadata_file = file("${workDir}/${params.dataset_name}_rna_samples_metadata.csv") def sample_path_idx = header_parts.findIndexOf { it.trim() == 'sample_path' } // Filter to RNA samples and create new CSV with 'name' and 'group' columns From 5f97f359f9b70425dec1f2ec0e598349f6442c7f Mon Sep 17 00:00:00 2001 From: = Date: Tue, 12 May 2026 12:27:00 -0400 Subject: [PATCH 11/31] fix: ensure pipeline detects --species param value from --genome param value if provided, but otherwise ensure pipeline halts if user provides custom GTF and FASTA but does not set --species value --- conf/gencode.config | 4 ++-- nextflow.config | 4 ++-- .../local/utils_nfcore_lrp2_pipeline/main.nf | 17 +++++++++++------ workflows/lrp2.nf | 12 +++++++++--- 4 files changed, 24 insertions(+), 13 deletions(-) diff --git a/conf/gencode.config b/conf/gencode.config index f6b1299..579fcff 100644 --- a/conf/gencode.config +++ b/conf/gencode.config @@ -13,8 +13,8 @@ params { gencode_gtf = params.genomes && params.genome && params.genomes.containsKey(params.genome) ? params.genomes[params.genome].gtf : null gencode_fasta = params.genomes && params.genome && params.genomes.containsKey(params.genome) ? 
params.genomes[params.genome].fasta : null - // Auto-detect species from genome parameter if using a predefined genome - // Note: if user provides custom gtf/fasta without --genome, they should manually set --species + // Auto-detect species ONLY from predefined genome (from gencode_refs) + // If user provides custom gtf/fasta, they MUST manually set --species species = params.genomes && params.genome && params.genomes.containsKey(params.genome) ? (params.genome.startsWith('GRCm') ? 'mouse' : 'human') : params.species diff --git a/nextflow.config b/nextflow.config index 48bb846..270d1c9 100644 --- a/nextflow.config +++ b/nextflow.config @@ -14,7 +14,7 @@ params { dataset_name = 'merged' // Name for the merged dataset (used as prefix for output files) // References - genome = 'GRCh38' + genome = null // Genome name for auto-detection (e.g., 'GRCh38.p14.v49', 'GRCm39.vM34') - if not specified, custom references must be provided igenomes_base = 's3://ngi-igenomes/igenomes/' igenomes_ignore = false fasta = null @@ -47,7 +47,7 @@ params { // predicted_proteome options // ------------------------------------ - species = 'human' // Species: 'human' or 'mouse' + species = null // Species: 'human' or 'mouse' (auto-detected from --genome, or must be manually specified when using custom references) min_orf = 75 // Minimum ORF length in nucleotides top_orf = 5 // Number of ORF candidates to report cpat_coding_threshold = null // CPAT coding threshold (auto-set based on species: human=0.364, mouse=0.44) diff --git a/subworkflows/local/utils_nfcore_lrp2_pipeline/main.nf b/subworkflows/local/utils_nfcore_lrp2_pipeline/main.nf index 24ed3fb..65a7ae8 100644 --- a/subworkflows/local/utils_nfcore_lrp2_pipeline/main.nf +++ b/subworkflows/local/utils_nfcore_lrp2_pipeline/main.nf @@ -358,15 +358,16 @@ def validateInputParameters() { // def validateSpeciesParameter() { // If using custom gtf/fasta (not from gencode_refs), require manual species setting - def using_custom_refs = (params.fasta && 
!params.genome) || (params.gencode_gtf && !params.genome) + def is_predefined_genome = params.genome && params.gencode_refs?.containsKey(params.genome) + def using_custom_refs = (params.gencode_fasta || params.gencode_gtf || params.fasta) && !is_predefined_genome def has_rna_samples = params.input ? true : false // validated later in samplesheet parsing if (using_custom_refs && has_rna_samples && !params.species) { def error_string = "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n" + " ERROR: --species parameter is required when using custom fasta/gtf files.\n" + - " You are using custom reference files without the --genome parameter.\n" + + " You are using custom reference files without specifying a predefined --genome.\n" + " Please specify --species with either 'human' or 'mouse'.\n" + - " Example: --fasta /path/to/genome.fa --gencode_gtf /path/to/annotation.gtf --species mouse\n" + + " Example: --gencode_fasta /path/to/genome.fa --gencode_gtf /path/to/annotation.gtf --species mouse\n" + "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" error(error_string) } @@ -422,11 +423,15 @@ def getGenomeAttribute(attribute) { return null } -// -// Exit pipeline if incorrect --genome key provided +// Validation rules: +// 1) Exit pipeline if incorrect --genome key provided +// 2) Skip validation if custom FASTA/GTF files are provided (genome name is auto-detected for naming only) // def genomeExistsError() { - if (params.genomes && params.genome && !params.genomes.containsKey(params.genome)) { + // Check if custom references are being used + def using_custom_refs = params.gencode_fasta || params.gencode_gtf || params.fasta + // Only validate genome name if NOT using custom references + if (!using_custom_refs && params.genomes && params.genome && !params.genomes.containsKey(params.genome)) { def error_string = "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n" + " Genome 
'${params.genome}' not found in any config files provided to the pipeline.\n" + " Currently, the available genome keys are:\n" + diff --git a/workflows/lrp2.nf b/workflows/lrp2.nf index a28106f..f54901e 100644 --- a/workflows/lrp2.nf +++ b/workflows/lrp2.nf @@ -443,11 +443,17 @@ workflow LRP2 { ch_build_proteome_script = channel.value(file("${projectDir}/bin/build_mass_spec_reference.R")) // BUILD_PROTEOME_REFERENCE runs once for each sample group to create per-group sample-specific references - // Pass params.genome directly for file naming (e.g., GRCh38.p14.v49) + // note: currently auto-detects genome name from FASTA filename if not explicitly provided, might need to improve upon this logic + def genome_name = params.genome ?: ( + params.gencode_fasta ? file(params.gencode_fasta).name.tokenize('.')[0] : + params.fasta ? file(params.fasta).name.tokenize('.')[0] : + 'custom' + ) + BUILD_PROTEOME_REFERENCE( ch_build_ref_input, ch_build_proteome_script, - params.genome, + genome_name, params.no_gencode ?: false ) ch_versions = ch_versions.mix(BUILD_PROTEOME_REFERENCE.out.versions.ifEmpty([])) @@ -482,7 +488,7 @@ workflow LRP2 { ch_lr_orf_fasta, // Custom/LRP ORF FASTA (for samples with matched RNA) ch_gencode_gtf_for_novel, // GENCODE annotation GTF (for BED mapping) ch_gencode_protein_fasta, // GENCODE protein FASTA (for BED mapping) - params.genome + genome_name ) ch_versions = ch_versions.mix(PROTEOMICS.out.versions.ifEmpty([])) } From 981520bbe60f6b476133cdf448b2e85bfe334f06 Mon Sep 17 00:00:00 2001 From: = Date: Tue, 12 May 2026 12:48:48 -0400 Subject: [PATCH 12/31] docs: update README.md with details on multi-species support and usage --- README.md | 64 +++++++++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 58 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index 0a4bb8e..2f1d8da 100644 --- a/README.md +++ b/README.md @@ -182,13 +182,65 @@ nextflow run /path/to/LRP2 \ ## Reference Genome Support -Genome support is under active 
development. Currently, it is recommended to use GENCODE reference, with support for any version. RefSeq and custom support coming soon, along with support for mouse. +The pipeline supports multiple reference genome sources for human and mouse: -1. **GENCODE genomes** (recommended) — High-quality annotations with multiple release versions (e.g., `--genome GRCh38.p14.v49`) -2. **RefSeq genomes (via iGenomes)** — Standard genome builds using NCBI/Ensembl annotations (e.g., `--genome GRCh38`) -3. **Custom references** — Provide your own FASTA and GTF files using `--fasta`, `--gencode_fasta`, and `--gencode_gtf` +1. **GENCODE genomes** (recommended) — High-quality annotations with multiple release versions + - Human: `GRCh38.p14.v49`, `GRCh38.p14.v48`, ..., `GRCh38.p14.v44`, `GRCh38.p13.v43`, ..., `GRCh37.p13.v19` + - Mouse: `GRCm39.vM38`, `GRCm39.vM37`, `GRCm39.vM36`, `GRCm39.vM35`, `GRCm39.vM34` +2. **Custom references** — Provide your own FASTA and GTF files by setting the `--gencode_fasta` and `--gencode_gtf` parameters +3. **RefSeq genomes (under development)** — Standard genome builds using NCBI/Ensembl annotations (e.g., `GRCh38`, `GRCm38`) -The pipeline automatically downloads the appropriate FASTA and GTF files based on your `--genome` selection. +Currently, GENCODE references are recommended, and any release version is supported. The pipeline automatically downloads the appropriate FASTA and GTF files based on your `--genome` selection. + +Support for RefSeq / iGenomes is under active development. + +### Viewing Available Genomes + +To see all available preconfigured `--genome` options, run this command from the LRP2 directory: + +```bash +grep -oE "^ '[^']+'" conf/gencode_references.config | tr -d "'" | tr -d ' ' | sort +``` + +### Multi-Species Support + +The pipeline supports both human and mouse data analysis. 
The species being analyzed is automatically detected from the `--genome` parameter when using predefined GENCODE references: + +**Human data:** +```bash +nextflow run /path/to/LRP2 \ + --input samplesheet.csv \ + --outdir results \ + --genome GRCh38.p14.v49 \ + -profile singularity,slurm +``` + +**Mouse data:** +```bash +nextflow run /path/to/LRP2 \ + --input samplesheet.csv \ + --outdir results \ + --genome GRCm39.vM34 \ + -profile singularity,slurm +``` + +Species determines which CPAT model files (human vs mouse-specific models) are used for ORF prediction. + +### Using Custom References + +When providing custom FASTA and GTF files instead of using a predefined `--genome`, you **must explicitly specify** the `--species` parameter: + +```bash +nextflow run /path/to/LRP2 \ + --input samplesheet.csv \ + --outdir results \ + --gencode_fasta /path/to/custom_genome.fa \ + --gencode_gtf /path/to/custom_annotation.gtf \ + --species mouse \ + -profile singularity,slurm +``` + +> **Important**: The `--species` parameter is required when using custom references to ensure the correct CPAT models are used for ORF prediction. Valid values are `human` or `mouse`. ## Parameters @@ -212,7 +264,7 @@ The pipeline automatically downloads the appropriate FASTA and GTF files based o ### Predicted Proteome Subworkflow -- `--species` — `human` or `mouse` (default: `human`) +- `--species` — Species: `human` or `mouse`. Auto-detected from `--genome` when using predefined GENCODE references. **Required** when using custom FASTA/GTF files. 
- `--min_orf` — Minimum ORF length in nucleotides (default: `75`) - `--cpat_coding_threshold` — Coding probability threshold (default: human=0.364, mouse=0.44) - `--protein_class_keep` — Protein categories to retain (default: `FPM,NPC,NPE`) From 7a477aa36f63e181eab2f0beead09d4a1ee26f5b Mon Sep 17 00:00:00 2001 From: = Date: Tue, 12 May 2026 17:16:16 -0400 Subject: [PATCH 13/31] feat: allow for token only run mode with FRAGPIPE_AUTHENTICATE for easier run mode and suppression of second token being sent to user email when running on HPC --- README.md | 17 +- modules/local/fragpipe_authenticate/main.nf | 198 +++++++++++++------- nextflow.config | 17 +- 3 files changed, 139 insertions(+), 93 deletions(-) diff --git a/README.md b/README.md index 2f1d8da..9429d5b 100644 --- a/README.md +++ b/README.md @@ -93,25 +93,17 @@ curl --location --request POST \ 2. Check your email for a 6-digit token. -3. From the `LRP2` directory, you can run the RNA + DDA proteomics test with your registration details two ways: +3. From the `LRP2` directory, you can run the RNA + DDA proteomics test with your token two ways: To run locally: ```bash nextflow run . -profile test_dda,singularity --outdir test_results_dda \ - --fragpipe_first_name "YOUR_FIRST_NAME" \ - --fragpipe_last_name "YOUR_LAST_NAME" \ - --fragpipe_email "YOUR_EMAIL" \ - --fragpipe_institution "YOUR_INSTITUTION" \ --fragpipe_token "YOUR_TOKEN" ``` To submit individual tasks to the SLURM scheduler (recommended): ```bash nextflow run . 
-profile test_dda,singularity,slurm --outdir test_results_dda \ - --fragpipe_first_name "YOUR_FIRST_NAME" \ - --fragpipe_last_name "YOUR_LAST_NAME" \ - --fragpipe_email "YOUR_EMAIL" \ - --fragpipe_institution "YOUR_INSTITUTION" \ --fragpipe_token "YOUR_TOKEN" ``` @@ -163,19 +155,14 @@ nextflow run /path/to/LRP2 \ ### With proteomics (FragPipe) -Requires protein samples in the samplesheet and a FragPipe academic license (see [Run the RNA + DDA proteomics test dataset](#run-the-rna--dda-proteomics-test-dataset) above for license setup). +Requires protein samples in the samplesheet and a FragPipe academic license token (see [Run the RNA + DDA proteomics test dataset](#run-the-rna--dda-proteomics-test-dataset) above for obtaining a token). ```bash nextflow run /path/to/LRP2 \ --input samplesheet.csv \ --outdir results \ --genome GRCh38.p14.v49 \ --protein_search fragpipe \ - --fragpipe_first_name YOUR_FIRST_NAME \ - --fragpipe_last_name YOUR_LAST_NAME \ - --fragpipe_email YOUR_EMAIL \ - --fragpipe_institution YOUR_INSTITUTION \ --fragpipe_token YOUR_TOKEN \ - --fragpipe_license_accept true \ -profile singularity,slurm ``` > **Note**: When both RNA and protein samples are provided, the pipeline searches against the predicted proteome combined with a reference protein FASTA (auto-detected from the genome or provided via `--protein_fasta`). For protein-only samples, only the reference database is used. 
diff --git a/modules/local/fragpipe_authenticate/main.nf b/modules/local/fragpipe_authenticate/main.nf index 0496b94..c78cb33 100644 --- a/modules/local/fragpipe_authenticate/main.nf +++ b/modules/local/fragpipe_authenticate/main.nf @@ -1,14 +1,19 @@ /* * FRAGPIPE_AUTHENTICATE: Interactive license acceptance and tool download * - * This module handles the one-time setup for FragPipe with TWO modes: + * This module handles the one-time setup for FragPipe with THREE modes: * * MODE 1 - INTERACTIVE (Default, like FragNFlow): * - Checks if tools already exist * - If not: displays license, collects user info, sends registration, prompts for token * - If yes: skips download, uses cached tools * - * MODE 2 - NON-INTERACTIVE (For HPC batch jobs): + * MODE 2 - TOKEN-ONLY (Simplified for HPC): + * - User provides only --fragpipe_token parameter + * - Skips registration (user already ran curl command to get token) + * - Goes straight to downloading tools with provided token + * + * MODE 3 - FULL NON-INTERACTIVE (Legacy HPC mode): * - User provides all parameters via command line * - No prompts, fully automated * @@ -49,8 +54,17 @@ process FRAGPIPE_AUTHENTICATE { task.ext.when == null || task.ext.when script: - // Detect mode: if any required param is empty/false, use interactive mode - def interactive_mode = first_name == '' || last_name == '' || email == '' || institution == '' || token == '' || license_accept == false + // Detect mode: + // - MODE 1 (Interactive): All params empty/false + // - MODE 2 (Token-only): Only token provided, rest empty/false + // - MODE 3 (Full non-interactive): All params provided + def has_token = token != '' + def has_user_info = first_name != '' && last_name != '' && email != '' && institution != '' + def has_license = license_accept == true + + def interactive_mode = !has_token && !has_user_info && !has_license + def token_only_mode = has_token && !has_user_info && !has_license + def full_mode = has_token && has_user_info && has_license """ 
#!/bin/bash set -euo pipefail @@ -91,11 +105,25 @@ process FRAGPIPE_AUTHENTICATE { echo "" # - # STEP 2: Determine mode (interactive vs non-interactive) + # STEP 2: Determine mode (interactive vs token-only vs full non-interactive) # INTERACTIVE="${interactive_mode}" + TOKEN_ONLY="${token_only_mode}" + FULL_MODE="${full_mode}" - if [ "\$INTERACTIVE" = "true" ]; then + if [ "\$TOKEN_ONLY" = "true" ]; then + # + # TOKEN-ONLY MODE + # + echo "\${CYAN}==========================================================================" + echo " TOKEN-ONLY MODE" + echo "==========================================================================\${RESET}" + echo "" + echo "Using token provided via --fragpipe_token parameter" + echo "Skipping registration (assuming you already obtained token via curl)" + echo "" + USER_TOKEN="${token}" + elif [ "\$INTERACTIVE" = "true" ]; then # # INTERACTIVE MODE # @@ -201,53 +229,62 @@ process FRAGPIPE_AUTHENTICATE { fi # - # STEP 3: Send registration to Nesvilab + # STEP 3: Get latest version and send registration (skip registration for token-only mode) # - echo "\${YELLOW}Registering with Nesvilab upgrader server...\${RESET}" + # Get latest MSFragger version (needed for all modes for download URLs) + echo "\${YELLOW}Querying latest MSFragger version...\${RESET}" MSFRAGGER_VERSION=\$(curl -s https://msfragger-upgrader.nesvilab.org/upgrader/latest_version.php) echo "Latest MSFragger version: \$MSFRAGGER_VERSION" - - curl --location --request POST \\ - 'https://msfragger-upgrader.nesvilab.org/upgrader/upgrade_download.php' \\ - --form 'transfer="academic"' \\ - --form 'agreement2="true"' \\ - --form 'agreement3="true"' \\ - --form "first_name=\$USER_FIRST_NAME" \\ - --form "last_name=\$USER_LAST_NAME" \\ - --form "email=\$USER_EMAIL" \\ - --form "organization=\$USER_INSTITUTION" \\ - --form "download=\${MSFRAGGER_VERSION}\\\$zip" \\ - --form 'is_fragpipe="true"' \\ - > /dev/null 2>&1 - - echo "\${GREEN}✓ Registration sent successfully!\${RESET}" 
echo "" - # - # STEP 4: Get authentication token - # - if [ "\$INTERACTIVE" = "true" ]; then - echo "\${YELLOW}==========================================================================" - echo " AUTHENTICATION TOKEN REQUIRED" - echo "==========================================================================\${RESET}" - echo "" - echo "A 6-digit verification code has been sent to: \$USER_EMAIL" - echo "\${CYAN}Please check your email (including spam folder)\${RESET}" + # Send registration to Nesvilab (skip for token-only mode since user already did this) + if [ "\$TOKEN_ONLY" = "false" ]; then + echo "\${YELLOW}Registering with Nesvilab upgrader server...\${RESET}" + + curl --location --request POST \\ + 'https://msfragger-upgrader.nesvilab.org/upgrader/upgrade_download.php' \\ + --form 'transfer="academic"' \\ + --form 'agreement2="true"' \\ + --form 'agreement3="true"' \\ + --form "first_name=\$USER_FIRST_NAME" \\ + --form "last_name=\$USER_LAST_NAME" \\ + --form "email=\$USER_EMAIL" \\ + --form "organization=\$USER_INSTITUTION" \\ + --form "download=\${MSFRAGGER_VERSION}\\\$zip" \\ + --form 'is_fragpipe="true"' \\ + > /dev/null 2>&1 + + echo "\${GREEN}✓ Registration sent successfully!\${RESET}" echo "" + fi - # Prompt for token with validation - while true; do - read -p "Enter the 6-digit authentication code: " USER_TOKEN - if [[ "\$USER_TOKEN" =~ ^[0-9]{6}\$ ]]; then - echo "\${GREEN}✓ Token accepted: \$USER_TOKEN\${RESET}" - break - else - echo "\${RED}✗ Invalid token. 
Please enter exactly 6 digits.\${RESET}" - fi - done - else - echo "Using token from parameters: ${token}" - USER_TOKEN="${token}" + # + # STEP 4: Get authentication token (skip for token-only mode - already set) + # + if [ "\$TOKEN_ONLY" = "false" ]; then + if [ "\$INTERACTIVE" = "true" ]; then + echo "\${YELLOW}==========================================================================" + echo " AUTHENTICATION TOKEN REQUIRED" + echo "==========================================================================\${RESET}" + echo "" + echo "A 6-digit verification code has been sent to: \$USER_EMAIL" + echo "\${CYAN}Please check your email (including spam folder)\${RESET}" + echo "" + + # Prompt for token with validation + while true; do + read -p "Enter the 6-digit authentication code: " USER_TOKEN + if [[ "\$USER_TOKEN" =~ ^[0-9]{6}\$ ]]; then + echo "\${GREEN}✓ Token accepted: \$USER_TOKEN\${RESET}" + break + else + echo "\${RED}✗ Invalid token. Please enter exactly 6 digits.\${RESET}" + fi + done + else + echo "Using token from parameters: ${token}" + USER_TOKEN="${token}" + fi fi echo "" @@ -281,28 +318,53 @@ process FRAGPIPE_AUTHENTICATE { echo "" echo "To fix this issue:" echo "" - echo -e " \${CYAN}1.\${RESET} Check your email (\$USER_EMAIL) for a \${CYAN}NEW\${RESET} verification code" - echo -e " \${YELLOW}(The code expires quickly - usually within minutes to hours)\${RESET}" - echo "" - echo -e " \${CYAN}2.\${RESET} If you don't have a recent email, request a new token by running:" - echo "" - echo -e "\${GREEN}" - echo " curl --location --request POST \\\\" - echo " 'https://msfragger-upgrader.nesvilab.org/upgrader/upgrade_download.php' \\\\" - echo " --form 'transfer=\"academic\"' \\\\" - echo " --form 'agreement2=\"true\"' \\\\" - echo " --form 'agreement3=\"true\"' \\\\" - echo " --form 'first_name=\$USER_FIRST_NAME' \\\\" - echo " --form 'last_name=\$USER_LAST_NAME' \\\\" - echo " --form 'email=\$USER_EMAIL' \\\\" - echo " --form 
'organization=\$USER_INSTITUTION' \\\\" - echo " --form 'download=\${MSFRAGGER_VERSION}\\\$zip' \\\\" - echo " --form 'is_fragpipe=\"true\"'" - echo -e "\${RESET}" - echo "" - echo -e " \${CYAN}3.\${RESET} Update your pipeline command with the new token:" - echo "" - echo -e " \${GREEN}--fragpipe_token XXXXXX\${RESET}" + + if [ "\$TOKEN_ONLY" = "true" ]; then + echo -e " \${CYAN}1.\${RESET} Request a new token by running this curl command:" + echo "" + echo -e "\${GREEN}" + echo " curl --location --request POST \\\\" + echo " 'https://msfragger-upgrader.nesvilab.org/upgrader/upgrade_download.php' \\\\" + echo " --form 'transfer=\"academic\"' \\\\" + echo " --form 'agreement2=\"true\"' \\\\" + echo " --form 'agreement3=\"true\"' \\\\" + echo " --form 'first_name=YOUR_FIRST_NAME' \\\\" + echo " --form 'last_name=YOUR_LAST_NAME' \\\\" + echo " --form 'email=YOUR_EMAIL' \\\\" + echo " --form 'organization=YOUR_INSTITUTION' \\\\" + echo " --form 'download=4.4.1\\\\\\\$zip' \\\\" + echo " --form 'is_fragpipe=\"true\"'" + echo -e "\${RESET}" + echo "" + echo -e " \${CYAN}2.\${RESET} Check your email for the new 6-digit token" + echo "" + echo -e " \${CYAN}3.\${RESET} Update your pipeline command with the new token:" + echo "" + echo -e " \${GREEN}--fragpipe_token XXXXXX\${RESET}" + else + echo -e " \${CYAN}1.\${RESET} Check your email (\$USER_EMAIL) for a \${CYAN}NEW\${RESET} verification code" + echo -e " \${YELLOW}(The code expires quickly - usually within minutes to hours)\${RESET}" + echo "" + echo -e " \${CYAN}2.\${RESET} If you don't have a recent email, request a new token by running:" + echo "" + echo -e "\${GREEN}" + echo " curl --location --request POST \\\\" + echo " 'https://msfragger-upgrader.nesvilab.org/upgrader/upgrade_download.php' \\\\" + echo " --form 'transfer=\"academic\"' \\\\" + echo " --form 'agreement2=\"true\"' \\\\" + echo " --form 'agreement3=\"true\"' \\\\" + echo " --form 'first_name=\$USER_FIRST_NAME' \\\\" + echo " --form 
'last_name=\$USER_LAST_NAME' \\\\" + echo " --form 'email=\$USER_EMAIL' \\\\" + echo " --form 'organization=\$USER_INSTITUTION' \\\\" + echo " --form 'download=\${MSFRAGGER_VERSION}\\\$zip' \\\\" + echo " --form 'is_fragpipe=\"true\"'" + echo -e "\${RESET}" + echo "" + echo -e " \${CYAN}3.\${RESET} Update your pipeline command with the new token:" + echo "" + echo -e " \${GREEN}--fragpipe_token XXXXXX\${RESET}" + fi echo "" echo -e " \${CYAN}4.\${RESET} Resume the pipeline run:" echo "" diff --git a/nextflow.config b/nextflow.config index 270d1c9..29f2b7a 100644 --- a/nextflow.config +++ b/nextflow.config @@ -103,16 +103,13 @@ params { fragpipe_threads = null // Number of threads for FragPipe (default: uses task.cpus from process label) fragpipe_tools_dir = "${projectDir}/fragpipe_tools" // Directory to cache downloaded MSFragger/IonQuant tools - // FragPipe Authentication - // If ALL parameters below are null/false, INTERACTIVE mode is used (recommended) - // If ANY parameter is provided, NON-INTERACTIVE mode is used (required for HPC environment) - // - fragpipe_first_name = null // OPTIONAL: Your first name (leave null for interactive mode) - fragpipe_last_name = null // OPTIONAL: Your last name (leave null for interactive mode) - fragpipe_email = null // OPTIONAL: Your email (leave null for interactive mode) - fragpipe_institution = null // OPTIONAL: Your institution (leave null for interactive mode) - fragpipe_token = null // OPTIONAL: 6-digit token from email (leave null for interactive mode) - fragpipe_license_accept = false // OPTIONAL: License acceptance (leave false for interactive mode) + // FragPipe authentication options + fragpipe_first_name = null // OPTIONAL: Your first name (leave null if just using token) + fragpipe_last_name = null // OPTIONAL: Your last name (leave null if just using token) + fragpipe_email = null // OPTIONAL: Your email (leave null if just using token) + fragpipe_institution = null // OPTIONAL: Your institution (leave null if 
just using token) + fragpipe_token = null // 6-digit token from email (REQUIRED unless running in interactive mode) + fragpipe_license_accept = false // OPTIONAL: License acceptance (leave false if just using token) // Accession mapping and novel peptides options //uniprot_fasta = null // UniProt reviewed protein FASTA (for accession mapping) From 003f1df6d62fec20011e673cf32858ae5c6dae82 Mon Sep 17 00:00:00 2001 From: = Date: Tue, 12 May 2026 18:10:15 -0400 Subject: [PATCH 14/31] fix: add fallback so that if fragpipe module exits with any non-zero exit status immediately fails pipeline (bypass issues related to module failing on HPC clusters but pipeline not terminating) --- modules/local/fragpipe/main.nf | 109 ++++++++++++++++----------------- 1 file changed, 53 insertions(+), 56 deletions(-) diff --git a/modules/local/fragpipe/main.nf b/modules/local/fragpipe/main.nf index a5410da..53c4b67 100644 --- a/modules/local/fragpipe/main.nf +++ b/modules/local/fragpipe/main.nf @@ -212,9 +212,47 @@ process FRAGPIPE { echo "FragPipe exited with code: \$FRAGPIPE_EXIT" echo "" + # + # CHECK FOR FRAGPIPE FAILURE - EXIT IMMEDIATELY IF FRAGPIPE FAILED + # + if [ \$FRAGPIPE_EXIT -ne 0 ] && [ \$FRAGPIPE_EXIT -ne 141 ]; then + echo "" + echo "==========================================================================" + echo "ERROR: FragPipe failed with exit code: \$FRAGPIPE_EXIT" + echo "==========================================================================" + echo "" + echo "Diagnostic information:" + echo "--------------------------------------------------------------------------" + echo "Results directory contents:" + ls -lah results/ 2>/dev/null || echo "Results directory not found or empty" + echo "" + echo "Results subdirectories:" + find results -type d -maxdepth 2 2>/dev/null || echo "No subdirectories found" + echo "" + echo "All TSV files found:" + find results -name "*.tsv" -exec ls -lh {} \\; 2>/dev/null || echo "No TSV files found" + echo "" + echo 
"--------------------------------------------------------------------------" + echo "FragPipe log (last 200 lines):" + tail -200 fragpipe_execution.log 2>/dev/null || echo "Execution log not found" + echo "" + echo "FragPipe internal log:" + find results -name "log*.txt" -exec tail -100 {} \\; 2>/dev/null || echo "No internal log file found" + echo "==========================================================================" + echo "" + echo "Common causes of FragPipe failure:" + echo " - Insufficient memory (check SLURM logs for OOM errors)" + echo " - Job timeout (check SLURM walltime limits)" + echo " - Invalid input files or corrupted data" + echo " - Missing or incompatible workflow parameters" + echo "" + exit 1 + fi + # # VERIFY SUCCESSFUL COMPLETION BY CHECKING REQUIRED OUTPUT FILES # + echo "FragPipe exit code check passed (exit code: \$FRAGPIPE_EXIT)" echo "Verifying FragPipe output files..." # FragPipe creates sample-specific subdirectories, so we need to look for files there @@ -253,11 +291,11 @@ process FRAGPIPE { if [ -n "\$MISSING_FILES" ]; then echo "" echo "==========================================================================" - echo "ERROR: FragPipe failed - missing or empty required output files:" + echo "ERROR: FragPipe completed but is missing required output files:" echo "\$MISSING_FILES" echo "==========================================================================" echo "" - echo "FragPipe exit code was: \$FRAGPIPE_EXIT" + echo "This indicates FragPipe did not complete successfully despite exit code 0." 
echo "" echo "Diagnostic information:" echo "--------------------------------------------------------------------------" @@ -278,62 +316,21 @@ process FRAGPIPE { find results -name "log*.txt" -exec tail -100 {} \\; 2>/dev/null || echo "No internal log file found" echo "==========================================================================" exit 1 - elif [ \$FRAGPIPE_EXIT -eq 141 ]; then - echo "" - echo "==========================================================================" - echo "FragPipe exited with SIGPIPE (exit code 141); however, all required output files are present!" - echo "==========================================================================" - echo "" - echo "Output files verified:" - for FILENAME in \$REQUIRED_FILE_NAMES; do - # Use -quit instead of head to avoid SIGPIPE - FOUND_FILE=\$(find results -name "\$(basename \$FILENAME)" -type f 2>/dev/null -quit) - if [ -n "\$FOUND_FILE" ]; then - echo " ✓ \$FOUND_FILE (size: \$(du -h \$FOUND_FILE | cut -f1))" - fi - done - echo "" - elif [ \$FRAGPIPE_EXIT -ne 0 ]; then - echo "" - echo "==========================================================================" - echo "WARNING: FragPipe exited with code: \$FRAGPIPE_EXIT" - echo "==========================================================================" - echo "" - echo "However, all required output files are present and non-empty." - echo "This is a known issue where FragPipe reports non-zero exit codes even on successful completion." - echo "Treating this as successful based on output file verification." 
- echo "" - echo "Output files verified:" - for FILENAME in \$REQUIRED_FILE_NAMES; do - # Use -quit instead of head to avoid SIGPIPE - FOUND_FILE=\$(find results -name "\$(basename \$FILENAME)" -type f 2>/dev/null -quit) - if [ -n "\$FOUND_FILE" ]; then - echo " ✓ \$FOUND_FILE (size: \$(du -h \$FOUND_FILE | cut -f1))" - fi - done - echo "" - echo "Diagnostic information (last 50 lines of FragPipe log):" - echo "--------------------------------------------------------------------------" - tail -50 fragpipe_execution.log 2>/dev/null || echo "Execution log not found" - echo "--------------------------------------------------------------------------" - echo "" - echo "Continuing with successful completion status..." - echo "" - else - echo "" - echo "FragPipe completed successfully!" - echo "" - echo "Output files verified:" - for FILENAME in \$REQUIRED_FILE_NAMES; do - # Use -quit instead of head to avoid SIGPIPE - FOUND_FILE=\$(find results -name "\$(basename \$FILENAME)" -type f 2>/dev/null -quit) - if [ -n "\$FOUND_FILE" ]; then - echo " ✓ \$FOUND_FILE (size: \$(du -h \$FOUND_FILE | cut -f1))" - fi - done - echo "" fi + echo "" + echo "FragPipe completed successfully!" 
+ echo "" + echo "Output files verified:" + for FILENAME in \$REQUIRED_FILE_NAMES; do + # Use -quit instead of head to avoid SIGPIPE + FOUND_FILE=\$(find results -name "\$(basename \$FILENAME)" -type f 2>/dev/null -quit) + if [ -n "\$FOUND_FILE" ]; then + echo " ✓ \$FOUND_FILE (size: \$(du -h \$FOUND_FILE | cut -f1))" + fi + done + echo "" + # # POST-PROCESSING: Extract PSM table # From b791c05a0733dc579f087e3dcbdc3d64ca85324b Mon Sep 17 00:00:00 2001 From: = Date: Tue, 12 May 2026 19:42:04 -0400 Subject: [PATCH 15/31] feat: redundant process cleanup so that ISOCALL_PROFILE and GUNZIP_FASTA for genome FASTA and GTF do not run when only protein samples with PROTEOMICS runs, modify channel setup in main workflow so that only RNA subworkflow processes shown in interactive mode when running with only RNA samples and vice versa with protein samples --- subworkflows/local/pacbio_isocall.nf | 8 +- workflows/lrp2.nf | 247 ++++++++++++++++----------- 2 files changed, 153 insertions(+), 102 deletions(-) diff --git a/subworkflows/local/pacbio_isocall.nf b/subworkflows/local/pacbio_isocall.nf index 7aed7d3..55cc3e2 100644 --- a/subworkflows/local/pacbio_isocall.nf +++ b/subworkflows/local/pacbio_isocall.nf @@ -50,10 +50,14 @@ workflow PACBIO_ISOCALL { ch_versions = ch_versions.mix(ISOCALL_PROFILE.out.versions) // - // MODULE: Prepare known isoforms database from GTF (runs independently) + // MODULE: Prepare known isoforms database from GTF (only runs if samples are present) // + ch_gtf_for_prep = ISOCALL_ALIGN.out.bam + .map { _meta, _bam -> reference_gtf_gz } + .first() + ISOCALL_PREP ( - reference_gtf_gz + ch_gtf_for_prep ) ch_versions = ch_versions.mix(ISOCALL_PREP.out.versions) diff --git a/workflows/lrp2.nf b/workflows/lrp2.nf index f54901e..2bc996d 100644 --- a/workflows/lrp2.nf +++ b/workflows/lrp2.nf @@ -39,22 +39,33 @@ workflow LRP2 { def colors = logColours(params.monochrome_logs) // - // Decompress reference files if they are gzipped (e.g. 
GENCODE files gzipped by default) + // Separate RNA and protein samples based on sample_type metadata + // Note: Genome FASTA decompression is deferred until after we determine if RNA samples are present // - def fasta_file = params.fasta ? file(params.fasta) : null - def is_fasta_gzipped = fasta_file && fasta_file.name.endsWith('.gz') + // Parse samplesheet synchronously to determine if RNA samples exist (for conditional subworkflow execution) + def samplesheet_file = file(params.input) + def samplesheet_content = samplesheet_file.text + def samplesheet_lines = samplesheet_content.split('\n') + def samplesheet_header_parts = samplesheet_lines[0].split(',') + def sample_type_idx = samplesheet_header_parts.findIndexOf { it.trim() == 'sample_type' } - if (is_fasta_gzipped) { - ch_fasta_input = channel.of([[ id: 'genome_fasta' ], fasta_file]) - GUNZIP_FASTA(ch_fasta_input) - ch_fasta = GUNZIP_FASTA.out.gunzip.map { _meta, file -> file }.first() - } else { - ch_fasta = fasta_file ? channel.value(fasta_file) : channel.empty() + def has_rna_samples_sync = false + def has_protein_samples_sync = false + + samplesheet_lines.drop(1).each { line -> + if (line.trim()) { + def parts = line.split(',') + if (parts.size() > sample_type_idx) { + def sample_type = parts[sample_type_idx].trim().toLowerCase() + if (sample_type == 'rna') { + has_rna_samples_sync = true + } else if (sample_type == 'protein') { + has_protein_samples_sync = true + } + } + } } - // - // Separate RNA and protein samples based on sample_type metadata - // // Create separate RNA and protein channels using filter ch_rna_samples = ch_samplesheet .filter { meta, _data -> @@ -94,18 +105,37 @@ workflow LRP2 { .set { ch_rna_count } // - // Handle GTF and gencode_fasta decompression ONLY if RNA samples are present + // Handle genome FASTA, GTF, and gencode_fasta decompression ONLY if RNA samples are present // + def fasta_file = params.fasta ? 
file(params.fasta) : null + def is_fasta_gzipped = fasta_file && fasta_file.name.endsWith('.gz') def gtf_file = params.gencode_gtf ? file(params.gencode_gtf) : null def is_gtf_gzipped = gtf_file && gtf_file.name.endsWith('.gz') def gencode_fasta_file = params.gencode_fasta ? file(params.gencode_fasta) : null def is_gencode_fasta_gzipped = gencode_fasta_file && gencode_fasta_file.name.endsWith('.gz') - // Only process GTF/FASTA files if RNA samples are present + // Only process genome FASTA, GTF, and gencode FASTA files if RNA samples are present ch_rna_count .map { count -> count > 0 } .set { ch_has_rna_samples } + // Decompress genome FASTA only if RNA samples are present + ch_fasta_input_conditional = ch_has_rna_samples + .filter { has_rna -> has_rna && is_fasta_gzipped && fasta_file != null } + .map { _has_rna -> [[ id: 'genome_fasta' ], fasta_file] } + + if (is_fasta_gzipped && fasta_file) { + GUNZIP_FASTA(ch_fasta_input_conditional) + ch_fasta = GUNZIP_FASTA.out.gunzip.map { _meta, file -> file } + } else if (fasta_file) { + ch_fasta = ch_has_rna_samples + .filter { has_rna -> has_rna } + .map { _has_rna -> fasta_file } + .ifEmpty(channel.value(fasta_file)) + } else { + ch_fasta = channel.empty() + } + ch_gtf_input_conditional = ch_has_rna_samples .filter { has_rna -> has_rna && gtf_file != null } .map { _has_rna -> [[ id: 'gencode_gtf' ], gtf_file] } @@ -145,71 +175,79 @@ workflow LRP2 { } // - // SUBWORKFLOW: Run PacBio IsoCall analysis (only if RNA samples present) + // Define sample metadata file (used by both RNA subworkflows and multisample analysis) // - // IsoCall requires config TOML and gzipped GTF reference - ch_isocall_config = channel.value(file("${projectDir}/bin/isocall_config.toml")) - - PACBIO_ISOCALL ( - ch_rna_samples_filtered, - ch_fasta, - ch_gtf_gz, - ch_isocall_config - ) - ch_versions = ch_versions.mix(PACBIO_ISOCALL.out.versions.ifEmpty([])) + sample_metadata_file = params.sample_metadata ?: params.input // - // SUBWORKFLOW: Run 
SQANTI3 QC and filtering (only if RNA samples present) + // RNA-specific subworkflows (only execute if RNA samples are present) // - // Note: samplesheet is used as sample_metadata - sample_metadata_file = params.sample_metadata ?: params.input + if (has_rna_samples_sync) { + // + // SUBWORKFLOW: Run PacBio IsoCall analysis + // + // IsoCall requires config TOML and gzipped GTF reference + ch_isocall_config = channel.value(file("${projectDir}/bin/isocall_config.toml")) + + PACBIO_ISOCALL ( + ch_rna_samples_filtered, + ch_fasta, + ch_gtf_gz, + ch_isocall_config + ) + ch_versions = ch_versions.mix(PACBIO_ISOCALL.out.versions.ifEmpty([])) - TRANSCRIPTOME ( - PACBIO_ISOCALL.out.called_gtf - .join(PACBIO_ISOCALL.out.count_matrix, by: 0) - .map { meta, gtf, count -> - [meta, gtf, count] }, - ch_gtf, - ch_gencode_fasta, - file(sample_metadata_file), - file(params.filter_script), - file(params.hashlib_script), - file(params.generate_hashids_script) - ) - ch_versions = ch_versions.mix(TRANSCRIPTOME.out.versions.ifEmpty([])) + // + // SUBWORKFLOW: Run SQANTI3 QC and filtering + // - // - // SUBWORKFLOW: Run predicted proteome analysis (only if RNA samples present) - // - // Log species information - if (params.genome && params.gencode_refs?.containsKey(params.genome)) { - log.info "-${colors.purple}[sheynkmanlab/lrp2]${colors.cyan} Auto-detected species from genome ${params.genome}: ${params.species}${colors.reset}-" - } else if (params.species) { - log.info "-${colors.purple}[sheynkmanlab/lrp2]${colors.cyan} Species: ${params.species}${colors.reset}-" - } + TRANSCRIPTOME ( + PACBIO_ISOCALL.out.called_gtf + .join(PACBIO_ISOCALL.out.count_matrix, by: 0) + .map { meta, gtf, count -> + [meta, gtf, count] }, + ch_gtf, + ch_gencode_fasta, + file(sample_metadata_file), + file(params.filter_script), + file(params.hashlib_script), + file(params.generate_hashids_script) + ) + ch_versions = ch_versions.mix(TRANSCRIPTOME.out.versions.ifEmpty([])) + + // + // SUBWORKFLOW: Run predicted 
proteome analysis + // + // Log species information + if (params.genome && params.gencode_refs?.containsKey(params.genome)) { + log.info "-${colors.purple}[sheynkmanlab/lrp2]${colors.cyan} Auto-detected species from genome ${params.genome}: ${params.species}${colors.reset}-" + } else if (params.species) { + log.info "-${colors.purple}[sheynkmanlab/lrp2]${colors.cyan} Species: ${params.species}${colors.reset}-" + } - // Determine species-specific CPAT files - def hexamer_file = params.species == 'human' ? - file(params.human_hexamer) : file(params.mouse_hexamer) - def logit_model = params.species == 'human' ? - file(params.human_logit_model) : file(params.mouse_logit_model) - - PREDICTED_PROTEOME ( - TRANSCRIPTOME.out.corrected_fasta_filtered - .join(TRANSCRIPTOME.out.corrected_gtf_filtered, by: 0) - .join(TRANSCRIPTOME.out.classification_filtered, by: 0) - .join(TRANSCRIPTOME.out.hashids_filtered, by: 0) - .join(TRANSCRIPTOME.out.hashids_mapping, by: 0) - .map { meta, fasta, gtf, classification, hashids, mapping -> - [meta, fasta, gtf, classification, hashids, mapping] }, - ch_gtf, - hexamer_file, - logit_model, - file(params.filter_cpat_script), - file(params.sqanti_protein_script), - file(params.protein_class_script) - ) - ch_versions = ch_versions.mix(PREDICTED_PROTEOME.out.versions.ifEmpty([])) + // Determine species-specific CPAT files + def hexamer_file = params.species == 'human' ? + file(params.human_hexamer) : file(params.mouse_hexamer) + def logit_model = params.species == 'human' ? 
+ file(params.human_logit_model) : file(params.mouse_logit_model) + + PREDICTED_PROTEOME ( + TRANSCRIPTOME.out.corrected_fasta_filtered + .join(TRANSCRIPTOME.out.corrected_gtf_filtered, by: 0) + .join(TRANSCRIPTOME.out.classification_filtered, by: 0) + .join(TRANSCRIPTOME.out.hashids_filtered, by: 0) + .join(TRANSCRIPTOME.out.hashids_mapping, by: 0) + .map { meta, fasta, gtf, classification, hashids, mapping -> + [meta, fasta, gtf, classification, hashids, mapping] }, + ch_gtf, + hexamer_file, + logit_model, + file(params.filter_cpat_script), + file(params.sqanti_protein_script), + file(params.protein_class_script) + ) + ch_versions = ch_versions.mix(PREDICTED_PROTEOME.out.versions.ifEmpty([])) + } // // SUBWORKFLOW: Run proteomics analysis (only if protein samples are present) @@ -318,9 +356,9 @@ workflow LRP2 { // } // } - // note: pipeline will only execute PROTEOMICS if protein_fasta is available (user-provided or auto-detected) + // note: pipeline will only execute PROTEOMICS if protein_fasta is available (user-provided or auto-detected) AND protein samples exist //if (protein_fasta_path) { - if (gencode_protein_fasta_path || custom_protein_fasta_path || lrp_protein_fasta_path) { + if ((gencode_protein_fasta_path || custom_protein_fasta_path || lrp_protein_fasta_path) && has_protein_samples_sync) { ch_metamorpheus_config = channel.value( params.metamorpheus_config ? 
file(params.metamorpheus_config) : @@ -367,29 +405,38 @@ workflow LRP2 { // - If no RNA samples then we build GENCODE-only references per sample group // Extract outputs from RNA subworkflows if available (keep the RNA sample meta.id for use in filtering by CPM column name) - ch_predicted_proteome_fasta = PREDICTED_PROTEOME.out.protein_all_orfs_fasta - .map { _meta, fasta -> fasta } - .first() - .ifEmpty(lrp_protein_fasta_file ?: file('NO_FILE')) - - ch_transcript_counts_with_id = TRANSCRIPTOME.out.hashids_all - .map { meta, counts -> [meta.id, counts] } - .first() - .ifEmpty(['NO_RNA_SAMPLE', file('NO_FILE')]) - - ch_transcript_counts = ch_transcript_counts_with_id - .map { rna_id, counts -> counts } - - // NOVEL_PEPTIDES Extract CDS GTF and ORF FASTA - ch_lr_cds_gtf = PREDICTED_PROTEOME.out.cds_gtf - .map { _meta, gtf -> gtf } - .first() - .ifEmpty(lrp_gtf_file ?: custom_gtf_file ?: file('NO_FILE')) - - ch_lr_orf_fasta = PREDICTED_PROTEOME.out.protein_all_orfs_fasta - .map { _meta, fasta -> fasta } - .first() - .ifEmpty(lrp_protein_fasta_file ?: custom_protein_fasta_file ?: file('NO_FILE')) + if (has_rna_samples_sync) { + ch_predicted_proteome_fasta = PREDICTED_PROTEOME.out.protein_all_orfs_fasta + .map { _meta, fasta -> fasta } + .first() + .ifEmpty(lrp_protein_fasta_file ?: file('NO_FILE')) + + ch_transcript_counts_with_id = TRANSCRIPTOME.out.hashids_all + .map { meta, counts -> [meta.id, counts] } + .first() + .ifEmpty(['NO_RNA_SAMPLE', file('NO_FILE')]) + + ch_transcript_counts = ch_transcript_counts_with_id + .map { rna_id, counts -> counts } + + // NOVEL_PEPTIDES Extract CDS GTF and ORF FASTA + ch_lr_cds_gtf = PREDICTED_PROTEOME.out.cds_gtf + .map { _meta, gtf -> gtf } + .first() + .ifEmpty(lrp_gtf_file ?: custom_gtf_file ?: file('NO_FILE')) + + ch_lr_orf_fasta = PREDICTED_PROTEOME.out.protein_all_orfs_fasta + .map { _meta, fasta -> fasta } + .first() + .ifEmpty(lrp_protein_fasta_file ?: custom_protein_fasta_file ?: file('NO_FILE')) + } else { + // No RNA 
samples - use placeholder/user-provided files for proteomics-only mode + ch_predicted_proteome_fasta = channel.value(lrp_protein_fasta_file ?: file('NO_FILE')) + ch_transcript_counts_with_id = channel.value(['NO_RNA_SAMPLE', file('NO_FILE')]) + ch_transcript_counts = channel.value(file('NO_FILE')) + ch_lr_cds_gtf = channel.value(lrp_gtf_file ?: custom_gtf_file ?: file('NO_FILE')) + ch_lr_orf_fasta = channel.value(lrp_protein_fasta_file ?: custom_protein_fasta_file ?: file('NO_FILE')) + } // Resolve GENCODE annotation GTF for novel peptides BED mapping // Use the already-decompressed ch_gtf if available, otherwise resolve from params @@ -507,12 +554,12 @@ workflow LRP2 { def header_parts = lines[0].split(',') def sample_name_idx = header_parts.findIndexOf { it.trim() == 'sample_name' } def condition_idx = header_parts.findIndexOf { it.trim() == 'condition' } - def sample_type_idx = header_parts.findIndexOf { it.trim() == 'sample_type' } + def multisample_sample_type_idx = header_parts.findIndexOf { it.trim() == 'sample_type' } def samples_per_condition = [:].withDefault { [] } lines.drop(1).each { line -> if (line.trim()) { def parts = line.split(',') - if (parts.size() > sample_type_idx && parts[sample_type_idx].trim().toLowerCase() == 'rna') { + if (parts.size() > multisample_sample_type_idx && parts[multisample_sample_type_idx].trim().toLowerCase() == 'rna') { def condition = parts[condition_idx].trim() def sample_name = parts[sample_name_idx].trim() samples_per_condition[condition] << sample_name @@ -578,7 +625,7 @@ workflow LRP2 { lines.drop(1).each { line -> if (line.trim()) { def parts = line.split(',') - if (parts.size() > sample_type_idx && parts[sample_type_idx].trim().toLowerCase() == 'rna') { + if (parts.size() > multisample_sample_type_idx && parts[multisample_sample_type_idx].trim().toLowerCase() == 'rna') { def name = parts[sample_name_idx].trim() def path = parts[sample_path_idx].trim() def group = parts[condition_idx].trim() From 
fce657affd74e24c2d3d2195e4df2670d1b97bf4 Mon Sep 17 00:00:00 2001 From: = Date: Wed, 13 May 2026 10:40:52 -0400 Subject: [PATCH 16/31] feat (closes #33): remove deprecated --fasta param, rename --gencode_gtf and --gencode_fasta to --gtf and --fasta --- README.md | 6 +- conf/gencode.config | 5 +- main.nf | 4 +- nextflow.config | 5 +- nextflow_schema.json | 22 ++----- .../local/utils_nfcore_lrp2_pipeline/main.nf | 8 +-- workflows/lrp2.nf | 60 +++++++------------ 7 files changed, 39 insertions(+), 71 deletions(-) diff --git a/README.md b/README.md index 9429d5b..0d25fb7 100644 --- a/README.md +++ b/README.md @@ -174,7 +174,7 @@ The pipeline supports multiple multi-species (human, mouse) reference genome sou 1. **GENCODE genomes** (recommended) — High-quality annotations with multiple release versions - Human: `GRCh38.p14.v49`, `GRCh38.p14.v48`, ..., `GRCh38.p14.v44`, `GRCh38.p13.v43`, ..., `GRCh37.p13.v19` - Mouse: `GRCm39.vM38`, `GRCm39.vM37`, `GRCm39.vM36`, `GRCm39.vM35`, `GRCm39.vM34` -2. **Custom references** — Provide your own FASTA and GTF files by setting the `--gencode_fasta` and `--gencode_gtf` parameters +2. **Custom references** — Provide your own FASTA and GTF files by setting the `--fasta` and `--gtf` parameters 3. **RefSeq genomes (under development)** — Standard genome builds using NCBI/Ensembl annotations (e.g., `GRCh38`, `GRCm38`) Currently, it is recommended to use GENCODE reference, with support for any version. The pipeline automatically downloads the appropriate FASTA and GTF files based on your `--genome` selection. 
@@ -221,8 +221,8 @@ When providing custom FASTA and GTF files instead of using a predefined `--genom nextflow run /path/to/LRP2 \ --input samplesheet.csv \ --outdir results \ - --gencode_fasta /path/to/custom_genome.fa \ - --gencode_gtf /path/to/custom_annotation.gtf \ + --fasta /path/to/custom_genome.fa \ + --gtf /path/to/custom_annotation.gtf \ --species mouse \ -profile singularity,slurm ``` diff --git a/conf/gencode.config b/conf/gencode.config index 579fcff..f8317bf 100644 --- a/conf/gencode.config +++ b/conf/gencode.config @@ -8,10 +8,9 @@ params { // Merge GENCODE references into params.genomes (GENCODE takes precedence if there are conflicts) genomes = (params.genomes ?: [:]) + (params.gencode_refs ?: [:]) - // Auto-set fasta, gencode_gtf, and gencode_fasta from merged genomes if not already provided + // Auto-set gtf and fasta from merged genomes if not already provided + gtf = params.genomes && params.genome && params.genomes.containsKey(params.genome) ? params.genomes[params.genome].gtf : null fasta = params.genomes && params.genome && params.genomes.containsKey(params.genome) ? params.genomes[params.genome].fasta : null - gencode_gtf = params.genomes && params.genome && params.genomes.containsKey(params.genome) ? params.genomes[params.genome].gtf : null - gencode_fasta = params.genomes && params.genome && params.genomes.containsKey(params.genome) ? 
params.genomes[params.genome].fasta : null // Auto-detect species ONLY from predefined genome (from gencode_refs) // If user provides custom gtf/fasta, they MUST manually set --species diff --git a/main.nf b/main.nf index 423154e..5f9629a 100644 --- a/main.nf +++ b/main.nf @@ -24,8 +24,8 @@ include { getGenomeAttribute } from './subworkflows/local/utils_nfcore_lrp2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */ -// NOTE: params.fasta is set in nextflow.config after igenomes config is loaded -// Users can override with --fasta if needed +// NOTE: params.fasta and params.gtf are set in nextflow.config after gencode config is loaded +// Users can override with --fasta and --gtf if needed /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/nextflow.config b/nextflow.config index 29f2b7a..033d8b0 100644 --- a/nextflow.config +++ b/nextflow.config @@ -17,7 +17,6 @@ params { genome = null // Genome name for auto-detection (e.g., 'GRCh38', 'GRCm39.M34') - if not specified, custom references must be provided igenomes_base = 's3://ngi-igenomes/igenomes/' igenomes_ignore = false - fasta = null // #################################### // MODULE OPTIONS @@ -36,8 +35,8 @@ params { filter_script = "${projectDir}/bin/02_filter_sqanti_transcripts.R" hashlib_script = "${projectDir}/bin/hashlib_id_generator.py" - gencode_gtf = null // GENCODE reference GTF file (auto-set from igenomes) - gencode_fasta = null // GENCODE reference FASTA file (auto-set from igenomes) + gtf = null // Reference GTF file (auto-set from igenomes) + fasta = null // Reference genome FASTA file (auto-set from igenomes) sample_metadata = null // Sample metadata CSV file (defaults to samplesheet if not provided) protein_coding_filter = true // Filter to keep only protein-coding genes internal_priming_filter = true // Filter to remove internal priming artifacts diff --git a/nextflow_schema.json b/nextflow_schema.json 
index 89e169c..053ebb6 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -62,16 +62,6 @@ "fa_icon": "fas fa-book", "help_text": "If using a reference genome configured in the pipeline using iGenomes, use this parameter to give the ID for the reference. This is then used to build the full paths for all required reference genome files e.g. `--genome GRCh38`. \n\nSee the [nf-core website docs](https://nf-co.re/usage/reference_genomes) for more details." }, - "fasta": { - "type": "string", - "format": "file-path", - "exists": true, - "mimetype": "text/plain", - "pattern": "^\\S+\\.fn?a(sta)?(\\.gz)?$", - "description": "Path to FASTA genome file.", - "help_text": "This parameter is *mandatory* if `--genome` is not specified. If you don't have a BWA index available this will be generated for you automatically. Combine with `--save_reference` to save BWA index for future runs.", - "fa_icon": "far fa-file-code" - }, "igenomes_ignore": { "type": "boolean", "description": "Do not load the iGenomes reference config.", @@ -173,20 +163,20 @@ "fa_icon": "fas fa-filter", "description": "Options for the SQANTI3 QC and filtering modules.", "properties": { - "gencode_gtf": { + "gtf": { "type": "string", "format": "file-path", "exists": true, - "description": "Path to GENCODE reference GTF file.", - "help_text": "Path to the GENCODE GTF file matching your reference genome. If not provided, will be automatically set from iGenomes configuration using the genome parameter.", + "description": "Path to reference GTF file.", + "help_text": "Path to the GTF file matching your reference genome. If not provided, will be automatically set from iGenomes configuration using the genome parameter.", "fa_icon": "far fa-file-code" }, - "gencode_fasta": { + "fasta": { "type": "string", "format": "file-path", "exists": true, - "description": "Path to GENCODE reference FASTA file.", - "help_text": "Path to the GENCODE FASTA file matching your reference genome. 
If not provided, will be automatically set from iGenomes configuration using the genome parameter.", + "description": "Path to reference genome FASTA file.", + "help_text": "Path to the genome FASTA file matching your reference genome. If not provided, will be automatically set from iGenomes configuration using the genome parameter.", "fa_icon": "far fa-file-code" }, "sample_metadata": { diff --git a/subworkflows/local/utils_nfcore_lrp2_pipeline/main.nf b/subworkflows/local/utils_nfcore_lrp2_pipeline/main.nf index 65a7ae8..a49e239 100644 --- a/subworkflows/local/utils_nfcore_lrp2_pipeline/main.nf +++ b/subworkflows/local/utils_nfcore_lrp2_pipeline/main.nf @@ -359,7 +359,7 @@ def validateInputParameters() { def validateSpeciesParameter() { // If using custom gtf/fasta (not from gencode_refs), require manual species setting def is_predefined_genome = params.genome && params.gencode_refs?.containsKey(params.genome) - def using_custom_refs = (params.gencode_fasta || params.gencode_gtf || params.fasta) && !is_predefined_genome + def using_custom_refs = (params.fasta || params.gtf) && !is_predefined_genome def has_rna_samples = params.input ? 
true : false // validated later in samplesheet parsing if (using_custom_refs && has_rna_samples && !params.species) { @@ -367,7 +367,7 @@ def validateSpeciesParameter() { " ERROR: --species parameter is required when using custom fasta/gtf files.\n" + " You are using custom reference files without specifying a predefined --genome.\n" + " Please specify --species with either 'human' or 'mouse'.\n" + - " Example: --gencode_fasta /path/to/genome.fa --gencode_gtf /path/to/annotation.gtf --species mouse\n" + + " Example: --fasta /path/to/genome.fa --gtf /path/to/annotation.gtf --species mouse\n" + "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" error(error_string) } @@ -423,13 +423,13 @@ def getGenomeAttribute(attribute) { return null } -// Validation rules: +// Validation rules: // 1) Exit pipeline if incorrect --genome key provided // 2) Skip validation if custom FASTA/GTF files are provided (genome name is auto-detected for naming only) // def genomeExistsError() { // Check if custom references are being used - def using_custom_refs = params.gencode_fasta || params.gencode_gtf || params.fasta + def using_custom_refs = params.fasta || params.gtf // Only validate genome name if NOT using custom references if (!using_custom_refs && params.genomes && params.genome && !params.genomes.containsKey(params.genome)) { def error_string = "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n" + diff --git a/workflows/lrp2.nf b/workflows/lrp2.nf index 2bc996d..746f112 100644 --- a/workflows/lrp2.nf +++ b/workflows/lrp2.nf @@ -105,40 +105,21 @@ workflow LRP2 { .set { ch_rna_count } // - // Handle genome FASTA, GTF, and gencode_fasta decompression ONLY if RNA samples are present + // Handle GTF and FASTA decompression ONLY if RNA samples are present // + def gtf_file = params.gtf ? file(params.gtf) : null + def is_gtf_gzipped = gtf_file && gtf_file.name.endsWith('.gz') def fasta_file = params.fasta ? 
file(params.fasta) : null def is_fasta_gzipped = fasta_file && fasta_file.name.endsWith('.gz') - def gtf_file = params.gencode_gtf ? file(params.gencode_gtf) : null - def is_gtf_gzipped = gtf_file && gtf_file.name.endsWith('.gz') - def gencode_fasta_file = params.gencode_fasta ? file(params.gencode_fasta) : null - def is_gencode_fasta_gzipped = gencode_fasta_file && gencode_fasta_file.name.endsWith('.gz') - // Only process genome FASTA, GTF, and gencode FASTA files if RNA samples are present + // Only process GTF and FASTA files if RNA samples are present ch_rna_count .map { count -> count > 0 } .set { ch_has_rna_samples } - // Decompress genome FASTA only if RNA samples are present - ch_fasta_input_conditional = ch_has_rna_samples - .filter { has_rna -> has_rna && is_fasta_gzipped && fasta_file != null } - .map { _has_rna -> [[ id: 'genome_fasta' ], fasta_file] } - - if (is_fasta_gzipped && fasta_file) { - GUNZIP_FASTA(ch_fasta_input_conditional) - ch_fasta = GUNZIP_FASTA.out.gunzip.map { _meta, file -> file } - } else if (fasta_file) { - ch_fasta = ch_has_rna_samples - .filter { has_rna -> has_rna } - .map { _has_rna -> fasta_file } - .ifEmpty(channel.value(fasta_file)) - } else { - ch_fasta = channel.empty() - } - ch_gtf_input_conditional = ch_has_rna_samples .filter { has_rna -> has_rna && gtf_file != null } - .map { _has_rna -> [[ id: 'gencode_gtf' ], gtf_file] } + .map { _has_rna -> [[ id: 'gtf' ], gtf_file] } if (is_gtf_gzipped && gtf_file) { // GTF is already gzipped - decompress for TRANSCRIPTOME and keep original for PACBIO_ISOCALL @@ -160,18 +141,18 @@ workflow LRP2 { ch_gtf_gz = channel.empty() } - ch_gencode_fasta_input_conditional = ch_has_rna_samples - .filter { has_rna -> has_rna && is_gencode_fasta_gzipped && gencode_fasta_file != null } - .map { _has_rna -> [[ id: 'gencode_fasta' ], gencode_fasta_file] } + ch_fasta_input_conditional = ch_has_rna_samples + .filter { has_rna -> has_rna && is_fasta_gzipped && fasta_file != null } + .map { _has_rna -> 
[[ id: 'fasta' ], fasta_file] } - if (is_gencode_fasta_gzipped) { - GUNZIP_GENCODE_FASTA(ch_gencode_fasta_input_conditional) - ch_gencode_fasta = GUNZIP_GENCODE_FASTA.out.gunzip.map { _meta, file -> file } + if (is_fasta_gzipped) { + GUNZIP_GENCODE_FASTA(ch_fasta_input_conditional) + ch_fasta = GUNZIP_GENCODE_FASTA.out.gunzip.map { _meta, file -> file } } else { - ch_gencode_fasta = ch_has_rna_samples - .filter { has_rna -> has_rna && gencode_fasta_file != null } - .map { _has_rna -> gencode_fasta_file } - .ifEmpty(gencode_fasta_file ? channel.value(gencode_fasta_file) : channel.empty()) + ch_fasta = ch_has_rna_samples + .filter { has_rna -> has_rna && fasta_file != null } + .map { _has_rna -> fasta_file } + .ifEmpty(fasta_file ? channel.value(fasta_file) : channel.empty()) } // @@ -207,7 +188,7 @@ workflow LRP2 { .map { meta, gtf, count -> [meta, gtf, count] }, ch_gtf, - ch_gencode_fasta, + ch_fasta, file(sample_metadata_file), file(params.filter_script), file(params.hashlib_script), @@ -440,9 +421,9 @@ workflow LRP2 { // Resolve GENCODE annotation GTF for novel peptides BED mapping // Use the already-decompressed ch_gtf if available, otherwise resolve from params - def gencode_gtf_for_novel = params.gencode_gtf ? file(params.gencode_gtf) : null - ch_gencode_gtf_for_novel = gencode_gtf_for_novel - ? channel.value(gencode_gtf_for_novel) + def gtf_for_novel = params.gtf ? file(params.gtf) : null + ch_gtf_for_novel = gtf_for_novel + ? channel.value(gtf_for_novel) : channel.value(file('NO_FILE')) // Create a channel that maps each protein sample to its sample_name for grouping @@ -492,7 +473,6 @@ workflow LRP2 { // BUILD_PROTEOME_REFERENCE runs once for each sample group to create per-group sample-specific references // note: currently auto-detects genome name from FASTA filename if not explicitly provided, might need to improve upon this logic def genome_name = params.genome ?: ( - params.gencode_fasta ? 
file(params.gencode_fasta).name.tokenize('.')[0] : params.fasta ? file(params.fasta).name.tokenize('.')[0] : 'custom' ) @@ -533,7 +513,7 @@ workflow LRP2 { ch_rna_sample_names, // List of RNA sample names to check for matches ch_lr_cds_gtf, // Custom/LRP CDS GTF (for samples with matched RNA) ch_lr_orf_fasta, // Custom/LRP ORF FASTA (for samples with matched RNA) - ch_gencode_gtf_for_novel, // GENCODE annotation GTF (for BED mapping) + ch_gtf_for_novel, // Reference annotation GTF (for BED mapping) ch_gencode_protein_fasta, // GENCODE protein FASTA (for BED mapping) genome_name ) From 966d7dca79ad0c83ec24ca55944fb863f70ab479 Mon Sep 17 00:00:00 2001 From: = Date: Wed, 13 May 2026 16:11:39 -0400 Subject: [PATCH 17/31] fix: correct typo with GUNZIP_GENCODE_FASTA instead of GUNZIP_FASTA following deprecated --fasta param removal, simplify container directives for isocall modules to avoid issue with unbound task variable when running with only singularity --- modules/local/gzip/main.nf | 4 +--- modules/local/isocall_call/main.nf | 4 +--- modules/local/isocall_merge/main.nf | 4 +--- modules/local/isocall_prep/main.nf | 4 +--- modules/local/isocall_profile/main.nf | 4 +--- modules/local/isoseq_align/main.nf | 4 +--- subworkflows/local/pacbio_isocall.nf | 6 +----- workflows/lrp2.nf | 9 ++++----- 8 files changed, 11 insertions(+), 28 deletions(-) diff --git a/modules/local/gzip/main.nf b/modules/local/gzip/main.nf index 59db011..d136069 100644 --- a/modules/local/gzip/main.nf +++ b/modules/local/gzip/main.nf @@ -3,9 +3,7 @@ process GZIP { label 'process_single' conda "${moduleDir}/environment.yml" - container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
- 'oras://community.wave.seqera.io/library/gzip:latest' : - 'community.wave.seqera.io/library/gzip:latest' }" + container "community.wave.seqera.io/library/gzip:latest" input: tuple val(meta), path(archive) diff --git a/modules/local/isocall_call/main.nf b/modules/local/isocall_call/main.nf index ebf0280..76f6bd5 100644 --- a/modules/local/isocall_call/main.nf +++ b/modules/local/isocall_call/main.nf @@ -3,9 +3,7 @@ process ISOCALL_CALL { label 'process_low' conda "${moduleDir}/environment.yml" - container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'docker://quay.io/pacbio/isocall:0.15.0_build1' : - 'quay.io/pacbio/isocall:0.15.0_build1' }" + container "quay.io/pacbio/isocall:0.15.0_build1" input: tuple val(meta), path(merged_profile) diff --git a/modules/local/isocall_merge/main.nf b/modules/local/isocall_merge/main.nf index 4b10d22..7bea971 100644 --- a/modules/local/isocall_merge/main.nf +++ b/modules/local/isocall_merge/main.nf @@ -3,9 +3,7 @@ process ISOCALL_MERGE { label 'process_single' conda "${moduleDir}/environment.yml" - container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'docker://quay.io/pacbio/isocall:0.15.0_build1' : - 'quay.io/pacbio/isocall:0.15.0_build1' }" + container "quay.io/pacbio/isocall:0.15.0_build1" input: tuple val(meta), path(profiles) diff --git a/modules/local/isocall_prep/main.nf b/modules/local/isocall_prep/main.nf index bd53803..b4e412c 100644 --- a/modules/local/isocall_prep/main.nf +++ b/modules/local/isocall_prep/main.nf @@ -3,9 +3,7 @@ process ISOCALL_PREP { label 'process_single' conda "${moduleDir}/environment.yml" - container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
- 'docker://quay.io/pacbio/isocall:0.15.0_build1' : - 'quay.io/pacbio/isocall:0.15.0_build1' }" + container "quay.io/pacbio/isocall:0.15.0_build1" input: path(gtf) diff --git a/modules/local/isocall_profile/main.nf b/modules/local/isocall_profile/main.nf index ac458e5..f7c4093 100644 --- a/modules/local/isocall_profile/main.nf +++ b/modules/local/isocall_profile/main.nf @@ -3,9 +3,7 @@ process ISOCALL_PROFILE { label 'process_low' conda "${moduleDir}/environment.yml" - container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'docker://quay.io/pacbio/isocall:0.15.0_build1' : - 'quay.io/pacbio/isocall:0.15.0_build1' }" + container "quay.io/pacbio/isocall:0.15.0_build1" input: tuple val(meta), path(aligned_bam) diff --git a/modules/local/isoseq_align/main.nf b/modules/local/isoseq_align/main.nf index 16f4788..db6b003 100644 --- a/modules/local/isoseq_align/main.nf +++ b/modules/local/isoseq_align/main.nf @@ -3,9 +3,7 @@ process ISOSEQ_ALIGN { label 'process_high' conda "${moduleDir}/environment.yml" - container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
- 'https://depot.galaxyproject.org/singularity/pbmm2:1.14.99--h9ee0642_0' : - 'quay.io/biocontainers/pbmm2:1.14.99--h9ee0642_0' }" + container "quay.io/biocontainers/pbmm2:1.14.99--h9ee0642_0" input: tuple val(meta), path(bam) diff --git a/subworkflows/local/pacbio_isocall.nf b/subworkflows/local/pacbio_isocall.nf index 55cc3e2..eafe338 100644 --- a/subworkflows/local/pacbio_isocall.nf +++ b/subworkflows/local/pacbio_isocall.nf @@ -52,12 +52,8 @@ workflow PACBIO_ISOCALL { // // MODULE: Prepare known isoforms database from GTF (only runs if samples are present) // - ch_gtf_for_prep = ISOCALL_ALIGN.out.bam - .map { _meta, _bam -> reference_gtf_gz } - .first() - ISOCALL_PREP ( - ch_gtf_for_prep + reference_gtf_gz ) ch_versions = ch_versions.mix(ISOCALL_PREP.out.versions) diff --git a/workflows/lrp2.nf b/workflows/lrp2.nf index 746f112..9b2ddd8 100644 --- a/workflows/lrp2.nf +++ b/workflows/lrp2.nf @@ -3,9 +3,8 @@ IMPORT MODULES / SUBWORKFLOWS / FUNCTIONS ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */ -include { GUNZIP as GUNZIP_FASTA } from '../modules/nf-core/gunzip/main' include { GUNZIP as GUNZIP_GTF } from '../modules/nf-core/gunzip/main' -include { GUNZIP as GUNZIP_GENCODE_FASTA } from '../modules/nf-core/gunzip/main' +include { GUNZIP as GUNZIP_FASTA } from '../modules/nf-core/gunzip/main' include { GUNZIP as GUNZIP_PROTEIN_FASTA } from '../modules/nf-core/gunzip/main' include { GZIP as GZIP_GTF } from '../modules/local/gzip/main' include { BUILD_PROTEOME_REFERENCE } from '../modules/local/build_proteome_reference/main' @@ -146,8 +145,8 @@ workflow LRP2 { .map { _has_rna -> [[ id: 'fasta' ], fasta_file] } if (is_fasta_gzipped) { - GUNZIP_GENCODE_FASTA(ch_fasta_input_conditional) - ch_fasta = GUNZIP_GENCODE_FASTA.out.gunzip.map { _meta, file -> file } + GUNZIP_FASTA(ch_fasta_input_conditional) + ch_fasta = GUNZIP_FASTA.out.gunzip.map { _meta, file -> file } } else { ch_fasta = ch_has_rna_samples .filter { has_rna -> 
has_rna && fasta_file != null } @@ -167,7 +166,7 @@ workflow LRP2 { // // SUBWORKFLOW: Run PacBio IsoCall analysis // - // IsoCall requires config TOML and gzipped GTF reference + // IsoCall requires config TOML and gzipped GTF reference for ISOCALL_PREP ch_isocall_config = channel.value(file("${projectDir}/bin/isocall_config.toml")) PACBIO_ISOCALL ( From 10f6ac26b60f634f9b10959dcb61cfb6d6727c28 Mon Sep 17 00:00:00 2001 From: = Date: Thu, 14 May 2026 10:20:10 -0400 Subject: [PATCH 18/31] feat (closes #69): add log output for all modules to save stdout logging in user-friendly location --- modules/local/build_proteome_reference/main.nf | 4 ++++ modules/local/cpat_orf/main.nf | 4 ++++ modules/local/differential_expression/main.nf | 4 ++++ modules/local/filter_cpat/main.nf | 4 ++++ modules/local/filter_transcriptome/main.nf | 4 ++++ modules/local/fragpipe/main.nf | 4 ++++ modules/local/generate_hashids/main.nf | 5 ++++- modules/local/isocall_call/main.nf | 4 ++++ modules/local/isocall_merge/main.nf | 4 ++++ modules/local/isocall_prep/main.nf | 4 ++++ modules/local/isocall_profile/main.nf | 4 ++++ modules/local/isoseq_align/main.nf | 4 ++++ modules/local/leafcutter_longread/main.nf | 5 ++++- modules/local/metamorpheus/main.nf | 4 ++++ modules/local/msconvert_mzml/main.nf | 4 ++++ modules/local/novel_peptides/main.nf | 4 ++++ modules/local/protein_classification/main.nf | 4 ++++ modules/local/sqanti_protein/main.nf | 4 ++++ modules/local/sqanti_qc/main.nf | 5 ++++- 19 files changed, 76 insertions(+), 3 deletions(-) diff --git a/modules/local/build_proteome_reference/main.nf b/modules/local/build_proteome_reference/main.nf index 948b0be..3c8e23b 100644 --- a/modules/local/build_proteome_reference/main.nf +++ b/modules/local/build_proteome_reference/main.nf @@ -14,6 +14,7 @@ process BUILD_PROTEOME_REFERENCE { output: tuple val(meta), path("*.proteomics.reference.fasta"), emit: reference_fasta tuple val(meta), path("*.proteomics.reference.tsv"), emit: reference_tsv + tuple 
val(meta), path("*_S5_PROTEOMICS_M1_BUILD_PROTEOME_REFERENCE_log.txt"), emit: log path "versions.yml", emit: versions when: @@ -31,6 +32,8 @@ process BUILD_PROTEOME_REFERENCE { def custom_fasta_arg = (custom_fasta.name != 'NO_FILE' && !custom_fasta.name.contains('_NO_CUSTOM_FASTA')) ? "--custom_fasta ${custom_fasta}" : "" """ + exec > >(tee ${prefix}_S5_PROTEOMICS_M1_BUILD_PROTEOME_REFERENCE_log.txt) 2>&1 + Rscript ${build_proteome_reference_script} \\ ${lrp_fasta_arg} \\ ${counts_arg} \\ @@ -54,6 +57,7 @@ process BUILD_PROTEOME_REFERENCE { """ touch test.proteomics.reference.fasta touch test.proteomics.reference.tsv + touch ${prefix}_S5_PROTEOMICS_M1_BUILD_PROTEOME_REFERENCE_log.txt cat <<-END_VERSIONS > versions.yml "${task.process}": diff --git a/modules/local/cpat_orf/main.nf b/modules/local/cpat_orf/main.nf index 91dddda..540c9a9 100644 --- a/modules/local/cpat_orf/main.nf +++ b/modules/local/cpat_orf/main.nf @@ -16,6 +16,7 @@ process CPAT_ORF { tuple val(meta), path("*.predicted_proteome.CPAT.ORF_seqs.fa"), emit: orf_seqs tuple val(meta), path("*.predicted_proteome.CPAT.no_ORF.txt"), emit: no_orf tuple val(meta), path("*.predicted_proteome.CPAT.error"), emit: error_log + tuple val(meta), path("*_S3_PREDICTED_PROTEOME_M1_CPAT_ORF_log.txt"), emit: log path "versions.yml", emit: versions when: @@ -28,6 +29,8 @@ process CPAT_ORF { def top_orf = task.ext.top_orf ?: params.top_orf """ + exec > >(tee ${prefix}_S3_PREDICTED_PROTEOME_M1_CPAT_ORF_log.txt) 2>&1 + cpat.py \\ -x $hexamer_file \\ -d $logit_model \\ @@ -55,6 +58,7 @@ process CPAT_ORF { touch ${prefix}.predicted_proteome.CPAT.ORF_seqs.fa touch ${prefix}.predicted_proteome.CPAT.no_ORF.txt touch ${prefix}.predicted_proteome.CPAT.error + touch ${prefix}_S3_PREDICTED_PROTEOME_M1_CPAT_ORF_log.txt cat <<-END_VERSIONS > versions.yml "${task.process}": diff --git a/modules/local/differential_expression/main.nf b/modules/local/differential_expression/main.nf index 7543345..a7b3f05 100644 --- 
a/modules/local/differential_expression/main.nf +++ b/modules/local/differential_expression/main.nf @@ -29,6 +29,7 @@ process DIFFERENTIAL_EXPRESSION { tuple val(meta), path("differential_ORF_expression/*_DE_ORF_MD_plot.pdf"), emit: de_orf_plot tuple val(meta), path("differential_transcript_usage/*_DTU_transcript_DRIMSeq_summary.txt"), emit: dtu_summary tuple val(meta), path("differential_ORF_usage/*_DU_ORF_DRIMSeq_summary.txt"), emit: du_orf_summary + tuple val(meta), path("*_S4_MULTISAMPLE_ANALYSIS_M2_DIFFERENTIAL_EXPRESSION_log.txt"), emit: log path "versions.yml", emit: versions when: @@ -41,6 +42,8 @@ process DIFFERENTIAL_EXPRESSION { def drimseq_isoform_prop = drimseq_min_isoform_prop ?: 0.05 """ + exec > >(tee ${prefix}_S4_MULTISAMPLE_ANALYSIS_M2_DIFFERENTIAL_EXPRESSION_log.txt) 2>&1 + # Output subdirectories for organizing results mkdir -p differential_gene_expression mkdir -p differential_transcript_expression @@ -105,6 +108,7 @@ process DIFFERENTIAL_EXPRESSION { touch differential_transcript_usage/${prefix}_DTU_transcript_DRIMSeq_summary.txt touch differential_ORF_usage/${prefix}_DU_ORF_DRIMSeq_summary.txt + touch ${prefix}_S4_MULTISAMPLE_ANALYSIS_M2_DIFFERENTIAL_EXPRESSION_log.txt cat <<-END_VERSIONS > versions.yml "${task.process}": diff --git a/modules/local/filter_cpat/main.nf b/modules/local/filter_cpat/main.nf index 9455fb0..eb98d3d 100644 --- a/modules/local/filter_cpat/main.nf +++ b/modules/local/filter_cpat/main.nf @@ -13,6 +13,7 @@ process FILTER_CPAT { output: tuple val(meta), path("*.predicted_proteome.CPAT_ORFs_mapped.tsv"), emit: all_orfs_mapped tuple val(meta), path("*.predicted_proteome.best_ORF.gtf"), emit: cds_gtf + tuple val(meta), path("*_S3_PREDICTED_PROTEOME_M2_FILTER_CPAT_log.txt"), emit: log path "versions.yml", emit: versions when: @@ -24,6 +25,8 @@ process FILTER_CPAT { def cpat_coding_threshold = task.ext.cpat_coding_threshold ?: params.cpat_coding_threshold """ + exec > >(tee 
${prefix}_S3_PREDICTED_PROTEOME_M2_FILTER_CPAT_log.txt) 2>&1 + # Ensure R can find packages in the container export R_LIBS_USER="" export R_LIBS="/usr/local/lib/R/site-library:/usr/lib/R/site-library:/usr/lib/R/library" @@ -52,6 +55,7 @@ process FILTER_CPAT { """ touch ${prefix}.predicted_proteome.CPAT_ORFs_mapped.tsv touch ${prefix}.predicted_proteome.best_ORF.gtf + touch ${prefix}_S3_PREDICTED_PROTEOME_M2_FILTER_CPAT_log.txt cat <<-END_VERSIONS > versions.yml "${task.process}": diff --git a/modules/local/filter_transcriptome/main.nf b/modules/local/filter_transcriptome/main.nf index e0b8a52..6aaf979 100644 --- a/modules/local/filter_transcriptome/main.nf +++ b/modules/local/filter_transcriptome/main.nf @@ -19,6 +19,7 @@ process FILTER_TRANSCRIPTOME { tuple val(meta), path("*.transcriptome.filtered.fasta"), emit: corrected_fasta_filtered tuple val(meta), path("*.transcriptome.filtered_hashids_with_cpm.txt"), emit: hashids_filtered tuple val(meta), path("*.transcriptome.all_hashids_with_cpm.txt"), emit: hashids_all + tuple val(meta), path("*_S2_TRANSCRIPTOME_M3_FILTER_TRANSCRIPTOME_log.txt"), emit: log path "versions.yml", emit: versions when: @@ -34,6 +35,8 @@ process FILTER_TRANSCRIPTOME { def transcript_class_keep = task.ext.transcript_class_keep ?: params.transcript_class_keep """ + exec > >(tee ${prefix}_S2_TRANSCRIPTOME_M3_FILTER_TRANSCRIPTOME_log.txt) 2>&1 + # Ensure R can find packages in the container export R_LIBS_USER="" export R_LIBS="/usr/local/lib/R/site-library:/usr/lib/R/site-library:/usr/lib/R/library" @@ -67,6 +70,7 @@ process FILTER_TRANSCRIPTOME { touch ${prefix}.transcriptome.filtered.fasta touch ${prefix}.transcriptome.filtered_hashids_with_cpm.txt touch ${prefix}.transcriptome.all_hashids_with_cpm.txt + touch ${prefix}_S2_TRANSCRIPTOME_M3_FILTER_TRANSCRIPTOME_log.txt cat <<-END_VERSIONS > versions.yml "${task.process}": diff --git a/modules/local/fragpipe/main.nf b/modules/local/fragpipe/main.nf index 53c4b67..0d2b8e7 100644 --- 
a/modules/local/fragpipe/main.nf +++ b/modules/local/fragpipe/main.nf @@ -17,6 +17,7 @@ process FRAGPIPE { tuple val(meta), path("results/**"), emit: results tuple val(meta), path("results/peptide.tsv"), optional: true, emit: peptide_tsv tuple val(meta), path("results/combined_peptide.tsv"), optional: true, emit: combined_peptide_tsv + tuple val(meta), path("*_S5_PROTEOMICS_M3_FRAGPIPE_log.txt"), emit: log path "versions.yml", emit: versions when: @@ -34,6 +35,8 @@ process FRAGPIPE { def use_custom_workflow = custom_workflow.name != 'NO_FILE' def workflow_name = data_type == 'DIA' ? 'DIA_SpecLib_Quant' : 'LFQ-MBR' """ + exec > >(tee ${prefix}_S5_PROTEOMICS_M3_FRAGPIPE_log.txt) 2>&1 + echo "==========================================================================" echo " FRAGPIPE ANALYSIS: ${meta.id}" echo "==========================================================================" @@ -374,6 +377,7 @@ process FRAGPIPE { touch ${prefix}.tsv touch results/psm.tsv touch results/combined_protein.tsv + touch ${prefix}_S5_PROTEOMICS_M3_FRAGPIPE_log.txt echo "\"${task.process}\":" > versions.yml echo " fragpipe: ${params.fragpipe_version}" >> versions.yml diff --git a/modules/local/generate_hashids/main.nf b/modules/local/generate_hashids/main.nf index 434ff24..812bc12 100644 --- a/modules/local/generate_hashids/main.nf +++ b/modules/local/generate_hashids/main.nf @@ -14,6 +14,7 @@ process GENERATE_HASHIDS { output: tuple val(meta), path("*.transcriptome.hashids_mapping.txt"), emit: hashids_mapping tuple val(meta), path("*.transcriptome.psl"), emit: corrected_psl + tuple val(meta), path("*_S2_TRANSCRIPTOME_M2_GENERATE_HASHIDS_log.txt"), emit: log path "versions.yml", emit: versions when: @@ -24,7 +25,8 @@ process GENERATE_HASHIDS { def prefix = task.ext.prefix ?: "${meta.id}" """ - + exec > >(tee ${prefix}_S2_TRANSCRIPTOME_M2_GENERATE_HASHIDS_log.txt) 2>&1 + # Ensure R can find packages in the container export R_LIBS_USER="" export 
R_LIBS="/usr/local/lib/R/site-library:/usr/lib/R/site-library:/usr/lib/R/library" @@ -50,6 +52,7 @@ process GENERATE_HASHIDS { """ touch ${prefix}.transcriptome.hashids_mapping.txt touch ${prefix}.transcriptome.psl + touch ${prefix}_S2_TRANSCRIPTOME_M2_GENERATE_HASHIDS_log.txt cat <<-END_VERSIONS > versions.yml "${task.process}": diff --git a/modules/local/isocall_call/main.nf b/modules/local/isocall_call/main.nf index 76f6bd5..060f2e7 100644 --- a/modules/local/isocall_call/main.nf +++ b/modules/local/isocall_call/main.nf @@ -14,6 +14,7 @@ process ISOCALL_CALL { output: tuple val(meta), path("*.isocall.isoforms.gtf.gz"), emit: gtf tuple val(meta), path("*.isocall.count_matrix.csv"), emit: count_matrix + tuple val(meta), path("*_S1_PACBIO_ISOCALL_M5_ISOCALL_CALL_log.txt"), emit: log path "versions.yml", emit: versions when: @@ -26,6 +27,8 @@ process ISOCALL_CALL { def min_read_support = task.ext.min_read_support ?: params.min_read_support def max_bundles_per_gene = task.ext.max_bundles_per_gene ?: params.max_bundles_per_gene """ + exec > >(tee ${prefix}_S1_PACBIO_ISOCALL_M5_ISOCALL_CALL_log.txt) 2>&1 + isocall call \\ --threads $threads \\ --merged-profile $merged_profile \\ @@ -50,6 +53,7 @@ process ISOCALL_CALL { """ touch ${prefix}.isocall.isoforms.gtf.gz touch ${prefix}.isocall.count_matrix.csv + touch ${prefix}_S1_PACBIO_ISOCALL_M5_ISOCALL_CALL_log.txt cat <<-END_VERSIONS > versions.yml "${task.process}": diff --git a/modules/local/isocall_merge/main.nf b/modules/local/isocall_merge/main.nf index 7bea971..096f4a2 100644 --- a/modules/local/isocall_merge/main.nf +++ b/modules/local/isocall_merge/main.nf @@ -10,6 +10,7 @@ process ISOCALL_MERGE { output: tuple val(meta), path("*.isocall.merged_profiles.gz"), emit: merged_profile + tuple val(meta), path("*_S1_PACBIO_ISOCALL_M4_ISOCALL_MERGE_log.txt"), emit: log path "versions.yml", emit: versions when: @@ -20,6 +21,8 @@ process ISOCALL_MERGE { def prefix = task.ext.prefix ?: "${meta.id}" def profile_list = 
profiles.collect{ it }.join(' ') """ + exec > >(tee ${prefix}_S1_PACBIO_ISOCALL_M4_ISOCALL_MERGE_log.txt) 2>&1 + isocall merge \\ --profiles $profile_list \\ --output ${prefix}.isocall.merged_profiles.gz \\ @@ -35,6 +38,7 @@ process ISOCALL_MERGE { def prefix = task.ext.prefix ?: "${meta.id}" """ touch ${prefix}.isocall.merged_profiles.gz + touch ${prefix}_S1_PACBIO_ISOCALL_M4_ISOCALL_MERGE_log.txt cat <<-END_VERSIONS > versions.yml "${task.process}": diff --git a/modules/local/isocall_prep/main.nf b/modules/local/isocall_prep/main.nf index b4e412c..5194775 100644 --- a/modules/local/isocall_prep/main.nf +++ b/modules/local/isocall_prep/main.nf @@ -10,6 +10,7 @@ process ISOCALL_PREP { output: path("*.isoforms.gz"), emit: isoforms + path("*_S1_PACBIO_ISOCALL_M1_ISOCALL_PREP_log.txt"), emit: log path "versions.yml", emit: versions when: @@ -19,6 +20,8 @@ process ISOCALL_PREP { def args = task.ext.args ?: '' def prefix = task.ext.prefix ?: "${gtf.baseName}" """ + exec > >(tee ${prefix}_S1_PACBIO_ISOCALL_M1_ISOCALL_PREP_log.txt) 2>&1 + isocall prep-isoforms \\ --gtf $gtf \\ --output ${prefix}.isoforms.gz \\ @@ -34,6 +37,7 @@ process ISOCALL_PREP { def prefix = task.ext.prefix ?: "${gtf.baseName}" """ touch ${prefix}.isoforms.gz + touch ${prefix}_S1_PACBIO_ISOCALL_M1_ISOCALL_PREP_log.txt cat <<-END_VERSIONS > versions.yml "${task.process}": diff --git a/modules/local/isocall_profile/main.nf b/modules/local/isocall_profile/main.nf index f7c4093..a02137f 100644 --- a/modules/local/isocall_profile/main.nf +++ b/modules/local/isocall_profile/main.nf @@ -10,6 +10,7 @@ process ISOCALL_PROFILE { output: tuple val(meta), path("*_profile.gz"), emit: profile + tuple val(meta), path("*_S1_PACBIO_ISOCALL_M3_ISOCALL_PROFILE_log.txt"), emit: log path "versions.yml", emit: versions when: @@ -20,6 +21,8 @@ process ISOCALL_PROFILE { def prefix = task.ext.prefix ?: "${meta.id}" def sample_id = meta.sample_name ?: prefix """ + exec > >(tee 
${prefix}_S1_PACBIO_ISOCALL_M3_ISOCALL_PROFILE_log.txt) 2>&1 + isocall profile \\ --reads $aligned_bam \\ --sample ${sample_id} \\ @@ -36,6 +39,7 @@ process ISOCALL_PROFILE { def prefix = task.ext.prefix ?: "${meta.id}" """ touch ${prefix}_profile.gz + touch ${prefix}_S1_PACBIO_ISOCALL_M3_ISOCALL_PROFILE_log.txt cat <<-END_VERSIONS > versions.yml "${task.process}": diff --git a/modules/local/isoseq_align/main.nf b/modules/local/isoseq_align/main.nf index db6b003..61d2837 100644 --- a/modules/local/isoseq_align/main.nf +++ b/modules/local/isoseq_align/main.nf @@ -12,6 +12,7 @@ process ISOSEQ_ALIGN { output: tuple val(meta), path("*.aligned.bam"), emit: bam tuple val(meta), path("*.aligned.bam.bai"), emit: bai + tuple val(meta), path("*_S1_PACBIO_ISOCALL_M2_ISOSEQ_ALIGN_log.txt"), emit: log path "versions.yml", emit: versions when: @@ -21,6 +22,8 @@ process ISOSEQ_ALIGN { def args = task.ext.args ?: '--preset ISOSEQ --sort' def prefix = task.ext.prefix ?: "${meta.id}" """ + exec > >(tee ${prefix}_S1_PACBIO_ISOCALL_M2_ISOSEQ_ALIGN_log.txt) 2>&1 + pbmm2 align \\ $args \\ $reference_fasta \\ @@ -38,6 +41,7 @@ process ISOSEQ_ALIGN { """ touch ${prefix}.aligned.bam touch ${prefix}.aligned.bam.bai + touch ${prefix}_S1_PACBIO_ISOCALL_M2_ISOSEQ_ALIGN_log.txt cat <<-END_VERSIONS > versions.yml "${task.process}": diff --git a/modules/local/leafcutter_longread/main.nf b/modules/local/leafcutter_longread/main.nf index 5d7a8ac..7b604f0 100644 --- a/modules/local/leafcutter_longread/main.nf +++ b/modules/local/leafcutter_longread/main.nf @@ -28,6 +28,7 @@ process LEAFCUTTER_LONGREAD { tuple val(meta), path("lr_leafcutter.Rplots.pdf"), emit: rplots, optional: true tuple val(meta), path("*.lr_leafcutter.ds_cluster_significance.txt"), emit: cluster_significance, optional: true tuple val(meta), path("*.lr_leafcutter.ds_effect_sizes.txt"), emit: effect_sizes, optional: true + tuple val(meta), path("*_S4_MULTISAMPLE_ANALYSIS_M1_LEAFCUTTER_LONGREAD_log.txt"), emit: log path 
"versions.yml", emit: versions when: @@ -42,7 +43,8 @@ process LEAFCUTTER_LONGREAD { def threads = leafcutter_threads ?: task.cpus """ - + exec > >(tee ${prefix}_S4_MULTISAMPLE_ANALYSIS_M1_LEAFCUTTER_LONGREAD_log.txt) 2>&1 + # Ensure R can find packages in the container export R_LIBS_USER="" export R_LIBS="/usr/local/lib/R/site-library:/usr/lib/R/site-library:/usr/lib/R/library" @@ -119,6 +121,7 @@ process LEAFCUTTER_LONGREAD { touch lr_leafcutter.Rplots.pdf touch ${prefix}.lr_leafcutter.ds_cluster_significance.txt touch ${prefix}.lr_leafcutter.ds_effect_sizes.txt + touch ${prefix}_S4_MULTISAMPLE_ANALYSIS_M1_LEAFCUTTER_LONGREAD_log.txt cat <<-END_VERSIONS > versions.yml "${task.process}": diff --git a/modules/local/metamorpheus/main.nf b/modules/local/metamorpheus/main.nf index a991ead..e44d47b 100644 --- a/modules/local/metamorpheus/main.nf +++ b/modules/local/metamorpheus/main.nf @@ -62,6 +62,7 @@ process METAMORPHEUS { output: tuple val(meta), path("*.psmtsv"), emit: psm_table tuple val(meta), path("results/**"), emit: results + tuple val(meta), path("*_S5_PROTEOMICS_M3_METAMORPHEUS_log.txt"), emit: log path "versions.yml", emit: versions when: @@ -83,6 +84,8 @@ process METAMORPHEUS { def mzml_list = renamed_files.join(' ') """ + exec > >(tee ${prefix}_S5_PROTEOMICS_M3_METAMORPHEUS_log.txt) 2>&1 + # Rename mzML files to simple names for cleaner MetaMorpheus output ${symlink_commands.join('\n ')} @@ -110,6 +113,7 @@ process METAMORPHEUS { mkdir -p results touch ${prefix}.psmtsv touch results/AllPeptides.psmtsv + touch ${prefix}_S5_PROTEOMICS_M3_METAMORPHEUS_log.txt cat <<-END_VERSIONS > versions.yml "${task.process}": diff --git a/modules/local/msconvert_mzml/main.nf b/modules/local/msconvert_mzml/main.nf index 02de00a..31ee129 100644 --- a/modules/local/msconvert_mzml/main.nf +++ b/modules/local/msconvert_mzml/main.nf @@ -11,6 +11,7 @@ process MSCONVERT_MZML { output: tuple val(meta), path("*.mzML"), emit: mzml + tuple val(meta), 
path("*_S5_PROTEOMICS_M2_MSCONVERT_MZML_log.txt"), emit: log path "versions.yml", emit: versions when: @@ -24,6 +25,8 @@ process MSCONVERT_MZML { def output_name = "${prefix}.mzML" """ + exec > >(tee ${prefix}_S5_PROTEOMICS_M2_MSCONVERT_MZML_log.txt) 2>&1 + wine msconvert \\ $raw_file \\ --outdir . \\ @@ -47,6 +50,7 @@ process MSCONVERT_MZML { def prefix = task.ext.prefix ?: "${meta.id}" """ touch ${prefix}.mzML + touch ${prefix}_S5_PROTEOMICS_M2_MSCONVERT_MZML_log.txt cat <<-END_VERSIONS > versions.yml "${task.process}": diff --git a/modules/local/novel_peptides/main.nf b/modules/local/novel_peptides/main.nf index d135be6..bf5668b 100644 --- a/modules/local/novel_peptides/main.nf +++ b/modules/local/novel_peptides/main.nf @@ -13,6 +13,7 @@ process NOVEL_PEPTIDES { output: tuple val(meta), path("*.proteomics.novel_peptides.tsv"), emit: novel_peptides tuple val(meta), path("*.proteomics.all_peptides.bed"), optional: true, emit: peptides_bed + tuple val(meta), path("*_S5_PROTEOMICS_M4_NOVEL_PEPTIDES_log.txt"), emit: log path "versions.yml", emit: versions when: @@ -31,6 +32,8 @@ process NOVEL_PEPTIDES { def gencode_fasta_arg = has_gencode_data ? 
"--gencode_fasta ${gencode_fasta}" : "" """ + exec > >(tee ${prefix}_S5_PROTEOMICS_M4_NOVEL_PEPTIDES_log.txt) 2>&1 + echo "NOVEL PEPTIDES CLASSIFICATION: ${meta.id}" echo "Sample: ${prefix}" echo "Search software: ${ms_search_software}" @@ -92,6 +95,7 @@ process NOVEL_PEPTIDES { def prefix = task.ext.prefix ?: "${meta.id}" """ touch ${prefix}.proteomics.novel_peptides.tsv + touch ${prefix}_S5_PROTEOMICS_M4_NOVEL_PEPTIDES_log.txt cat <<-END_VERSIONS > versions.yml "${task.process}": diff --git a/modules/local/protein_classification/main.nf b/modules/local/protein_classification/main.nf index 47c2e95..c6b726f 100644 --- a/modules/local/protein_classification/main.nf +++ b/modules/local/protein_classification/main.nf @@ -16,6 +16,7 @@ process PROTEIN_CLASSIFICATION { tuple val(meta), path("*.predicted_proteome.collapsed_high_confidence_ORF_hashids_with_cpm.txt"), emit: hashids_orf tuple val(meta), path("*.predicted_proteome.collapsed_high_confidence_ORF.gtf"), emit: protein_gtf tuple val(meta), path("*.predicted_proteome.collapsed_high_confidence_ORF.bed"), emit: protein_bed + tuple val(meta), path("*_S3_PREDICTED_PROTEOME_M4_PROTEIN_CLASSIFICATION_log.txt"), emit: log path "versions.yml", emit: versions when: @@ -28,6 +29,8 @@ process PROTEIN_CLASSIFICATION { def protein_class_keep = task.ext.protein_class_keep ?: params.protein_class_keep """ + exec > >(tee ${prefix}_S3_PREDICTED_PROTEOME_M4_PROTEIN_CLASSIFICATION_log.txt) 2>&1 + export R_LIBS_USER="" export R_LIBS="/usr/local/lib/R/site-library:/usr/lib/R/site-library:/usr/lib/R/library" @@ -58,6 +61,7 @@ process PROTEIN_CLASSIFICATION { touch ${prefix}.predicted_proteome.collapsed_high_confidence_ORF_hashids_with_cpm.txt touch ${prefix}.predicted_proteome.collapsed_high_confidence_ORF.gtf touch ${prefix}.predicted_proteome.collapsed_high_confidence_ORF.bed + touch ${prefix}_S3_PREDICTED_PROTEOME_M4_PROTEIN_CLASSIFICATION_log.txt cat <<-END_VERSIONS > versions.yml "${task.process}": diff --git 
a/modules/local/sqanti_protein/main.nf b/modules/local/sqanti_protein/main.nf index cd072c4..70911b5 100644 --- a/modules/local/sqanti_protein/main.nf +++ b/modules/local/sqanti_protein/main.nf @@ -12,6 +12,7 @@ process SQANTI_PROTEIN { output: tuple val(meta), path("*.predicted_proteome.best_ORF_SQANTI_classification.tsv"), emit: protein_classification + tuple val(meta), path("*_S3_PREDICTED_PROTEOME_M3_SQANTI_PROTEIN_log.txt"), emit: log path "versions.yml", emit: versions when: @@ -22,6 +23,8 @@ process SQANTI_PROTEIN { def prefix = task.ext.prefix ?: "${meta.id}" """ + exec > >(tee ${prefix}_S3_PREDICTED_PROTEOME_M3_SQANTI_PROTEIN_log.txt) 2>&1 + source /conda/miniconda3/etc/profile.d/conda.sh conda activate sqanti3 export SQANTI_PATH=\$(dirname \$(which sqanti3_qc.py)) @@ -51,6 +54,7 @@ process SQANTI_PROTEIN { def prefix = task.ext.prefix ?: "${meta.id}" """ touch ${prefix}.predicted_proteome.best_ORF_SQANTI_classification.tsv + touch ${prefix}_S3_PREDICTED_PROTEOME_M3_SQANTI_PROTEIN_log.txt cat <<-END_VERSIONS > versions.yml "${task.process}": diff --git a/modules/local/sqanti_qc/main.nf b/modules/local/sqanti_qc/main.nf index 391b18e..0328057 100644 --- a/modules/local/sqanti_qc/main.nf +++ b/modules/local/sqanti_qc/main.nf @@ -20,6 +20,7 @@ process SQANTI_QC { tuple val(meta), path("*.transcriptome.junctions.txt"), emit: junctions // tuple val(meta), path("*.transcriptome.params.txt"), emit: params // tuple val(meta), path("refAnnotation.*.genePred"), emit: refannotation_genepred + tuple val(meta), path("*_S2_TRANSCRIPTOME_M1_SQANTI_QC_log.txt"), emit: log path "versions.yml", emit: versions when: @@ -30,7 +31,8 @@ process SQANTI_QC { def prefix = task.ext.prefix ?: "${meta.id}" """ - + exec > >(tee ${prefix}_S2_TRANSCRIPTOME_M1_SQANTI_QC_log.txt) 2>&1 + source /conda/miniconda3/etc/profile.d/conda.sh conda activate sqanti3 @@ -73,6 +75,7 @@ process SQANTI_QC { touch ${prefix}.transcriptome.gtf touch ${prefix}.transcriptome.fasta touch 
${prefix}.transcriptome.junctions.txt + touch ${prefix}_S2_TRANSCRIPTOME_M1_SQANTI_QC_log.txt cat <<-END_VERSIONS > versions.yml "${task.process}": From 10fd94504ea9de483c85b50be450e48571a2a5d7 Mon Sep 17 00:00:00 2001 From: = Date: Thu, 14 May 2026 13:05:41 -0400 Subject: [PATCH 19/31] feat (closes #113): add --chunks param to SQANTI_QC for parallel chunking speedup proportional to cpus allocated, bump PROCESS_LONG tag up to 4 cpus by default to support chunking optimization --- conf/base.config | 2 +- modules/local/sqanti_qc/main.nf | 8 ++++++-- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/conf/base.config b/conf/base.config index ae3731d..45f490c 100644 --- a/conf/base.config +++ b/conf/base.config @@ -51,7 +51,7 @@ process { time = { 8.h * task.attempt } } withLabel:process_long { - cpus = { 1 } + cpus = { 4 } memory = { 32.GB * task.attempt } time = { 20.h * task.attempt } } diff --git a/modules/local/sqanti_qc/main.nf b/modules/local/sqanti_qc/main.nf index 0328057..c2e9eb6 100644 --- a/modules/local/sqanti_qc/main.nf +++ b/modules/local/sqanti_qc/main.nf @@ -51,8 +51,12 @@ process SQANTI_QC { --isoforms "\$ISOFORMS_INPUT" \\ --refGTF $reference_gtf \\ --refFasta $reference_fasta \\ - -o ${prefix}.transcriptome \\ - -d . \\ + --force_id_ignore \\ + --skipORF \\ + --output ${prefix}.transcriptome \\ + --dir . 
\\ + --cpus $task.cpus \\ + --chunks $task.cpus \\ --report skip \\ --fl $flnc_count \\ $args From f393fe65dd1b719454677e9b485987a13107ef38 Mon Sep 17 00:00:00 2001 From: = Date: Tue, 19 May 2026 10:33:25 -0400 Subject: [PATCH 20/31] fix: update slurm and lsf exitReadTimeout from 24h to 48h to support long-running Fragpipe processes for large datasets --- conf/lsf.config | 2 +- conf/slurm.config | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/conf/lsf.config b/conf/lsf.config index 96cfabd..a075ab9 100644 --- a/conf/lsf.config +++ b/conf/lsf.config @@ -16,7 +16,7 @@ executor { pollInterval = '2 min' queueStatInterval = '10 min' submitRateLimit = '10/1min' - exitReadTimeout = '24h' + exitReadTimeout = '48h' queueSize = 50 } diff --git a/conf/slurm.config b/conf/slurm.config index bd94240..93d1fd9 100644 --- a/conf/slurm.config +++ b/conf/slurm.config @@ -47,7 +47,7 @@ executor { // Throttle job submissions submitRateLimit = '10/1min' // Time to wait for job exit info - exitReadTimeout = '24h' + exitReadTimeout = '48h' } // Here we set up some defaults for how singularity will be run by Nextflow. From dce5b931cf461507084dbaeaa73fea333273c48e Mon Sep 17 00:00:00 2001 From: = Date: Wed, 20 May 2026 16:29:24 -0400 Subject: [PATCH 21/31] docs: update CITATIONS.md --- CITATIONS.md | 77 +++++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 73 insertions(+), 4 deletions(-) diff --git a/CITATIONS.md b/CITATIONS.md index 2a76abd..53aa244 100644 --- a/CITATIONS.md +++ b/CITATIONS.md @@ -10,13 +10,82 @@ ## Pipeline tools -- [FastQC](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/) +### PacBio Long-Read Processing -> Andrews, S. (2010). FastQC: A Quality Control Tool for High Throughput Sequence Data [Online]. +- [PacBio Isocall](https://github.com/PacificBiosciences/isocall) -- [MultiQC](https://pubmed.ncbi.nlm.nih.gov/27312411/) +> Citation to be added pending Isocall paper publication. -> Ewels P, Magnusson M, Lundin S, Käller M. 
MultiQC: summarize analysis results for multiple tools and samples in a single report. Bioinformatics. 2016 Oct 1;32(19):3047-8. doi: 10.1093/bioinformatics/btw354. Epub 2016 Jun 16. PubMed PMID: 27312411; PubMed Central PMCID: PMC5039924. +- [pbmm2 (minimap2)](https://github.com/PacificBiosciences/pbmm2) + +> Li H. Minimap2: pairwise alignment for nucleotide sequences. Bioinformatics. 2018 Sep 15;34(18):3094-3100. doi: 10.1093/bioinformatics/bty191. PubMed PMID: 29750242; PubMed Central PMCID: PMC6137996. + +- [PacBio pbtk](https://github.com/PacificBiosciences/pbtk) + +> PacBio BAM toolkit for working with PacBio BAM files. Available at: https://github.com/PacificBiosciences/pbtk + +- [PacBio IsoSeq](https://github.com/PacificBiosciences/IsoSeq) + +> For IsoSeq, please refer to PacBio's official documentation for recommended citation: https://github.com/PacificBiosciences/IsoSeq + +### Transcriptome Quality Control and Annotation + +- [SQANTI3](https://github.com/ConesaLab/SQANTI3) + +> Pardo-Palacios FJ, Arzalluz-Luque A, Kondratova L, Salguero P, Mestre-Tomás J, Amorín R, Estevan-Morió E, Liu T, Nanni A, McIntyre L, Tseng E, Conesa A. SQANTI3: curation of long-read transcriptomes for accurate identification of known and novel isoforms. Nat Methods. 2024 Apr;21(4):793-801. doi: 10.1038/s41592-024-02229-2. PubMed PMID: 38509328; PubMed Central PMCID: PMC11093726. + +### ORF and Coding Potential Prediction + +- [CPAT](https://github.com/liguowang/cpat) + +> Wang L, Park HJ, Dasari S, Wang S, Kocher JP, Li W. CPAT: Coding-Potential Assessment Tool using an alignment-free logistic regression model. Nucleic Acids Res. 2013 Apr 1;41(6):e74. doi: 10.1093/nar/gkt006. PubMed PMID: 23335781; PubMed Central PMCID: PMC3616698. + +### Differential Expression and Splicing Analysis + +- [edgeR](https://bioconductor.org/packages/edgeR/) + +> Robinson MD, McCarthy DJ, Smyth GK. edgeR: a Bioconductor package for differential expression analysis of digital gene expression data. 
Bioinformatics. 2010 Jan 1;26(1):139-40. doi: 10.1093/bioinformatics/btp616. PubMed PMID: 19910308; PubMed Central PMCID: PMC2796818. + +- [DRIMSeq](https://bioconductor.org/packages/DRIMSeq/) + +> Nowicka M, Robinson MD. DRIMSeq: a Dirichlet-multinomial framework for multivariate count outcomes in genomics. F1000Research. 2016;5:1356. doi: 10.12688/f1000research.8900.1. PubMed PMID: 27303634; PubMed Central PMCID: PMC5200948. + +- [Leafcutter2](https://github.com/leafcutter2/leafcutter-ds) +> A Python Leafcutter version upon which long-read Leafcutter in the pipeline is based. While citation is pending, please use the original Leafcutter citation for now: Li YI, Knowles DA, Humphrey J, Barbeira AN, Dickinson SP, Im HK, Pritchard JK. Annotation-free quantification of RNA splicing using LeafCutter. Nat Genet. 2018 Jan;50(1):151-158. doi: 10.1038/s41588-017-0004-9. PubMed PMID: 29251728; PubMed Central PMCID: PMC5742080. + +- [Leafcutter](https://github.com/davidaknowles/leafcutter) + +> Li YI, Knowles DA, Humphrey J, Barbeira AN, Dickinson SP, Im HK, Pritchard JK. Annotation-free quantification of RNA splicing using LeafCutter. Nat Genet. 2018 Jan;50(1):151-158. doi: 10.1038/s41588-017-0004-9. PubMed PMID: 29251728; PubMed Central PMCID: PMC5742080. + +### Proteomics - Mass Spectrometry Data Conversion + +- [ProteoWizard (msconvert)](https://proteowizard.sourceforge.io/) + +> Chambers MC, Maclean B, Burke R, Amodei D, Ruderman DL, Neumann S, Gatto L, Fischer B, Pratt B, Egertson J, Hoff K, Kessner D, Tasman N, Shulman N, Frewen B, Baker TA, Brusniak MY, Paulse C, Creasy D, Flashner L, Kani K, Moulding C, Seymour SL, Nuwaysir LM, Lefebvre B, Kuhlmann F, Roark J, Rainer P, Detlev S, Hemenway T, Huhmer A, Langridge J, Connolly B, Chadick T, Holly K, Eckels J, Deutsch EW, Moritz RL, Katz JE, Agus DB, MacCoss M, Tabb DL, Mallick P. A cross-platform toolkit for mass spectrometry and proteomics. Nat Biotechnol. 2012 Oct;30(10):918-20. doi: 10.1038/nbt.2377. 
PubMed PMID: 23051804; PubMed Central PMCID: PMC3471674. + +### Proteomics - Database Search Engines + +- [FragPipe/MSFragger](https://fragpipe.nesvilab.org/) + +> Kong AT, Leprevost FV, Avtonomov DM, Mellacheruvu D, Nesvizhskii AI. MSFragger: ultrafast and comprehensive peptide identification in mass spectrometry-based proteomics. Nat Methods. 2017 May;14(5):513-520. doi: 10.1038/nmeth.4256. PubMed PMID: 28394336; PubMed Central PMCID: PMC5409104. + +- [IonQuant](http://ionquant.nesvilab.org/) + +> Yu F, Haynes SE, Nesvizhskii AI. IonQuant Enables Accurate and Sensitive Label-Free Quantification With FDR-Controlled Match-Between-Runs. Mol Cell Proteomics. 2021;20:100077. doi: 10.1016/j.mcpro.2021.100077. PubMed PMID: 33813024; PubMed Central PMCID: PMC8131922. + +- [Philosopher](https://philosopher.nesvilab.org/) + +> da Veiga Leprevost F, Haynes SE, Avtonomov DM, Chang HY, Shanmugam AK, Mellacheruvu D, Kong AT, Nesvizhskii AI. Philosopher: a versatile toolkit for shotgun proteomics data analysis. Nat Methods. 2020 Sep;17(9):869-870. doi: 10.1038/s41592-020-0912-y. PubMed PMID: 32669682; PubMed Central PMCID: PMC7509848. + +- [MetaMorpheus](https://github.com/smith-chem-wisc/MetaMorpheus) + +> Solntsev SK, Shortreed MR, Frey BL, Smith LM. Enhanced Global Post-translational Modification Discovery with MetaMorpheus. J Proteome Res. 2018 May 4;17(5):1844-1851. doi: 10.1021/acs.jproteome.7b00873. PubMed PMID: 29578715. + +### Python and R Packages + +- [Biopython](https://biopython.org/) + +> Cock PJA, Antao T, Chang JT, Chapman BA, Cox CJ, Dalke A, Friedberg I, Hamelryck T, Kauff F, Wilczynski B, de Hoon MJL. Biopython: freely available Python tools for computational molecular biology and bioinformatics. Bioinformatics. 2009 Jun 1;25(11):1422-3. doi: 10.1093/bioinformatics/btp163. PubMed PMID: 19304878; PubMed Central PMCID: PMC2682512.
## Software packaging/containerisation tools From dc1f52efc6da7d3ddfe3ba9b759581ca82ae3bbe Mon Sep 17 00:00:00 2001 From: = Date: Thu, 21 May 2026 17:19:21 -0400 Subject: [PATCH 22/31] docs: update README.md with containers and HPC primer, more specific input file details, example profile usage --- README.md | 88 +++++++++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 83 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index 0d25fb7..765d29f 100644 --- a/README.md +++ b/README.md @@ -22,6 +22,7 @@ The LRP2 Nextflow pipeline consists of five subworkflows: | 5. Proteomics *(optional)* | Build custom reference database, convert raw MS files, search with FragPipe or MetaMorpheus, map peptides to isoforms | > **Note**: Differential usage analyses (DTU/DPU with DRIMSeq) require biological replicates. Datasets without replicates may lead to unexpected results with the pipeline currently. + > **Note**: The PROTEOMICS subworkflow can only run when protein samples (sample_type='protein') are provided in the samplesheet. When both RNA and protein samples are present, it searches against a concatenated database of the predicted proteome from Stage 3 plus the reference protein FASTA. For protein-only samples, it uses only the reference protein database via `--protein_fasta` (or auto-detected from the GENCODE genome if specified). Select the search engine with `--protein_search` (options: `fragpipe` or `metamorpheus`, default: `fragpipe`). 
## Quick Start @@ -31,6 +32,24 @@ The LRP2 Nextflow pipeline consists of five subworkflows: - **Nextflow** ≥ 24.04.2 ([install guide](https://www.nextflow.io/docs/latest/install.html)) - **Singularity/Apptainer** or **Docker** for containerized dependencies +### System Requirements and HPC Recommendations + +**LRP2 is primarily designed for High-Performance Computing (HPC) environments.** Although the test dataset can run on a local machine (minimum of 4 CPUs and 32GB RAM recommended), real-world datasets require substantial computational resources to be processed efficiently. + +#### A Primer on Containers + +LRP2 uses containers to manage software dependencies. Containers allow for the packaging of libraries, code, and configurations such that each tool in the pipeline can be reliably run in any computing environment without compatibility issues. + +To run LRP2, you **must have one** of the following installed: + +**Singularity/Apptainer** (required for HPC): Most HPC systems have Singularity or Apptainer pre-installed as a module. You can check this by running `module avail singularity` or `module avail apptainer`. If not available, contact your HPC administrator or see [Apptainer installation guide](https://apptainer.org/docs/admin/main/installation.html), + +**Docker** (may be used for local systems): +- Installation guides: [Docker Desktop](https://docs.docker.com/get-docker/) (Mac/Windows) or [Docker Engine](https://docs.docker.com/engine/install/) (Linux) +> **Note**: Docker typically requires root/admin privileges, and may not be available on shared HPC systems. We therefore strongly recommend the use of Singularity/Apptainer. + +The pipeline will automatically pull and cache container images on first run. Singularity images are cached in `work/singularity/` by default. + ### Clone the repository ```bash git clone https://github.com/sheynkman-lab/LRP2.git @@ -111,20 +130,29 @@ nextflow run . 
-profile test_dda,singularity,slurm --outdir test_results_dda \ ## Preparing Input Data +### Input File Requirements + +**RNA samples** must be provided as **PacBio full-length non-chimeric (FLNC) reads** as produced by PacBio Isoseq refine, in either BAM or FASTQ format. It is assumed that input files are **post-processed** and have already undergone deconcatenation, demultiplexing, and primer removal. **Do NOT** provide raw subreads or CCS reads directly from the sequencer. + +**Protein samples** may be either **DDA** or **DIA**, and can be provided in `.mzML` format or any other non-mzML format (e.g. `.raw`). + +### Samplesheet Structure + Prepare a samplesheet CSV describing your input data: ![Samplesheet structure](assets/samplesheet_structure.png) -> **Note**: Each RNA sample must have a unique `sample_name`, these are used by Isocall to label count matrix columns. For protein samples, all raw files originating from the same biological sample (e.g., multiple fractions or injection replicates) should share the same `sample_name` so they are combined and searched together in FragPipe. This `sample_name` should match the corresponding RNA sample to link the RNA and protein data. The predicted proteome from that sample will be included in the proteomics search database. -> **Note**: If a protein sample has no matched RNA sample, give it a unique `sample_name` that does not match any RNA sample; in this case, only the GENCODE reference proteome will be used as the proteomics search database. +> **Note**: Each RNA sample must have a unique `sample_name`, as these are used by Isocall to label count matrix columns. For protein samples, all raw files originating from the same biological sample (e.g., multiple fractions or injection replicates) should share the same `sample_name` so they are combined and searched together in FragPipe. This `sample_name` should match the corresponding RNA sample to link the RNA and protein data.
The predicted proteome from that sample will be included in the proteomics search database. If a protein sample has no matched RNA sample, you can assign it a unique `sample_name` that does not match any RNA sample. In this case, only the GENCODE reference proteome will be used as the proteomics search database. **Required columns:** - `sample_name`: Each RNA sample must have a distinct value. Do not include any spaces in this value. -- `sample_path`: Absolute or relative path to the file. RNA samples should be PacBio FLNC `.bam` or `.fastq` files. Protein samples should be `.raw` or `.mzML` files. +- `sample_path`: Absolute path to the input file. + - RNA samples should be PacBio FLNC `.bam` or `.fastq` files + - Protein samples should be `.raw` or `.mzML` files - `condition`: Sample group (e.g., "control", "treatment"). Used for differential analysis, which performs pairwise comparisons between groups. Two or more groups are supported. If you do not want differential analysis, assign the same condition to all samples. Do not include any spaces in this value. -- `sample_type`: Either `RNA` or `protein`. -- `mass_spec_type`: `DDA` or `DIA`. Required for protein samples. For RNA samples, specify `none`. +- `sample_type`: Must be either `RNA` or `protein`. +- `mass_spec_type`: Must be either `DDA` or `DIA`. Required for protein samples. For RNA samples, specify `none` for this column. ## Running the Pipeline @@ -167,6 +195,56 @@ nextflow run /path/to/LRP2 \ ``` > **Note**: When both RNA and protein samples are provided, the pipeline searches against the predicted proteome combined with a reference protein FASTA (auto-detected from the genome or provided via `--protein_fasta`). For protein-only samples, only the reference database is used. +## Profile Options + +Nextflow profiles control how the pipeline executes. Multiple profiles can be combined using commas (e.g., `-profile test_rna,singularity,slurm`). 
+ +### Container Profiles (choose ONE) + +| Profile | Description | Best For | +|---------|-------------|----------| +| `singularity` | Use Singularity/Apptainer containers | **HPC systems** (most common) | +| `docker` | Use Docker containers | Local machines with Docker installed | +| `conda` | Use Conda environments | Systems without container support (slower) | + +> **Note**: We have extensively tested the pipeline with `singularity` locally and on HPC systems, and recommend its usage. You may use `docker` on local machines. We do not recommend the use of `conda` except as a last resort due to it lacking the same reproducibility as containers. + +### Executor Profiles (optional) + +| Profile | Description | When to Use | +|---------|-------------|-------------| +| `slurm` | Submit jobs to SLURM scheduler on an HPC | HPC environment | +| `lsf` | Submit jobs to LSF scheduler on an HPC | HPC environment | + +If you do not specify an executor, LRP2 will run locally on the current node, which is suitable for an interactive session in an HPC environment or on a local machine. + +> **Important**: When using `slurm` or `lsf`, Nextflow submits individual pipeline tasks as separate jobs. Without a scheduler profile, all tasks run on the node where you launch Nextflow (requires sufficient resources). If you intend to run locally, you may need to lower resource requirements in `conf/base.config`. 
+ +### Test Profiles + +| Profile | Description | Dataset | +|---------|-------------|---------| +| `test_rna` | RNA-only test dataset | Runs RNA subworkflows (S1 - S4) | +| `test_dda` | RNA + DDA proteomics test | Runs all subworkflows (S1 - S5) with FragPipe DDA search | +| `test_dia` | RNA + DIA proteomics test | Runs all subworkflows (S1 - S5) with FragPipe DIA search | + +### Example Profile Combinations + +**HPC with SLURM (recommended for production):** +```bash +-profile singularity,slurm +``` + +**Quick RNA test on HPC:** +```bash +-profile test_rna,singularity,slurm +``` + +**Local machine with Docker:** +```bash +-profile docker +``` + ## Reference Genome Support The pipeline supports multiple multi-species (human, mouse) reference genome sources: From 8ea713e98041b10f964ddb7c2b1db01e821bceaf Mon Sep 17 00:00:00 2001 From: = Date: Thu, 21 May 2026 17:27:18 -0400 Subject: [PATCH 23/31] docs: update README.md formatting, add more specifics --- README.md | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index 765d29f..ba49513 100644 --- a/README.md +++ b/README.md @@ -42,10 +42,10 @@ LRP2 uses containers to manage software dependencies. Containers allow for the p To run LRP2, you **must have one** of the following installed: -**Singularity/Apptainer** (required for HPC): Most HPC systems have Singularity or Apptainer pre-installed as a module. You can check this by running `module avail singularity` or `module avail apptainer`. If not available, contact your HPC administrator or see [Apptainer installation guide](https://apptainer.org/docs/admin/main/installation.html), +1. **Singularity/Apptainer** (required for HPC): Most HPC systems have Singularity or Apptainer pre-installed as a module. You can check this by running `module avail singularity` or `module avail apptainer`. 
If not available, contact your HPC administrator or see the [Apptainer installation guide](https://apptainer.org/docs/admin/main/installation.html). -**Docker** (may be used for local systems): -- Installation guides: [Docker Desktop](https://docs.docker.com/get-docker/) (Mac/Windows) or [Docker Engine](https://docs.docker.com/engine/install/) (Linux) +2. **Docker** (may be used for local systems): + - Installation guides: [Docker Desktop](https://docs.docker.com/get-docker/) (Mac/Windows) or [Docker Engine](https://docs.docker.com/engine/install/) (Linux) > **Note**: Docker typically requires root/admin privileges, and may not be available on shared HPC systems. We therefore strongly recommend the use of Singularity/Apptainer. The pipeline will automatically pull and cache container images on first run. Singularity images are cached in `work/singularity/` by default. @@ -63,13 +63,14 @@ Start a persistent terminal session so the pipeline keeps running if you lose yo screen -S lrp2 ``` > **Tip**: To detach from screen, press `Ctrl+A` then `D`. To reattach later: `screen -r lrp2` -> **Tip**: UVA Rivanna only supports `screen`, but you can use `tmux new -s lrp2` on other systems if preferred. + +> **Tip**: Certain HPC systems (e.g. UVA Rivanna) only support `screen`, but you can use the `tmux` terminal multiplexer by running `tmux new -s lrp2` on other systems if supported/preferred. Request an interactive job with enough resources for the test dataset: ```bash ijob -c 4 --mem=64G -p your_slurm_partition -A your_allocation --time=4:00:00 ``` -> **Note**: Adjust for your HPC system. Replace `your_slurm_partition` with your SLURM partition and `your_allocation` with your SLURM allocation group. The `-c` (CPUs) and `--mem` values above are sufficient for the test dataset. +> **Note**: Adjust for your HPC system. Replace `your_slurm_partition` with your SLURM partition and `your_allocation` with your SLURM allocation group. 
The `-c` (CPUs), `--mem` (memory), and `--time` values above are sufficient for the test dataset, but should be increased for larger datasets. Load the required modules: ```bash From e164bbbf63bf59eb725e08b7463121fdf53666c7 Mon Sep 17 00:00:00 2001 From: = Date: Thu, 21 May 2026 17:31:33 -0400 Subject: [PATCH 24/31] docs: remove lsf from readme contact mention since it is now supported --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index ba49513..aa3d06d 100644 --- a/README.md +++ b/README.md @@ -179,7 +179,7 @@ nextflow run /path/to/LRP2 \ --genome GRCh38.p14.v49 \ -profile singularity,slurm ``` -> **Note**: Running on a non-SLURM scheduler (e.g., LSF, PBS)? Contact us for support: cwp5au@virginia.edu +> **Note**: Running on a non-SLURM scheduler (e.g., PBS)? Contact us for support: cwp5au@virginia.edu > **Note**: Differential analysis will run automatically when two or more conditions are specified in the samplesheet. ### With proteomics (FragPipe) From 8ac27967dd23f2b47ae2f7ed38f3fc481bc710b7 Mon Sep 17 00:00:00 2001 From: = Date: Tue, 26 May 2026 16:55:57 -0400 Subject: [PATCH 25/31] feat: update SQANTI_QC module to use awk for in-place replacement of single-sample column naming to match multi-sample name format of FL.{sample_id} --- modules/local/sqanti_qc/main.nf | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/modules/local/sqanti_qc/main.nf b/modules/local/sqanti_qc/main.nf index c2e9eb6..82e0f12 100644 --- a/modules/local/sqanti_qc/main.nf +++ b/modules/local/sqanti_qc/main.nf @@ -60,7 +60,15 @@ process SQANTI_QC { --report skip \\ --fl $flnc_count \\ $args - + + # Fix single-sample column naming to use "FL.{sample_id}", which is consist with formatting used for multi-sample runs + if head -1 ${prefix}.transcriptome_classification.txt | grep -qE '\\tFL\\t|\\tFL\\.\\t'; then + echo "Single sample detected - renaming FL column to FL.${meta.id}" + awk -v sample="${meta.id}" 
'NR==1 {gsub(/\\tFL\\t|\\tFL\\.\\t/, "\\tFL."sample"\\t")} {print}' \\ + ${prefix}.transcriptome_classification.txt > ${prefix}.transcriptome_classification.tmp.txt + mv ${prefix}.transcriptome_classification.tmp.txt ${prefix}.transcriptome_classification.txt + fi + mv ${prefix}.transcriptome_classification.txt ${prefix}.transcriptome.SQANTI_classification.txt mv ${prefix}.transcriptome_corrected.gtf ${prefix}.transcriptome.gtf mv ${prefix}.transcriptome_corrected.fasta ${prefix}.transcriptome.fasta From c1dfd3fe15c9b56fc99cea2a9b323fdcb2dd47f1 Mon Sep 17 00:00:00 2001 From: = Date: Tue, 26 May 2026 17:34:43 -0400 Subject: [PATCH 26/31] feat: bump back to v6.0.1 --- modules/local/sqanti_qc/main.nf | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/modules/local/sqanti_qc/main.nf b/modules/local/sqanti_qc/main.nf index 82e0f12..1f30a66 100644 --- a/modules/local/sqanti_qc/main.nf +++ b/modules/local/sqanti_qc/main.nf @@ -3,7 +3,7 @@ process SQANTI_QC { label 'process_long' conda "${moduleDir}/environment.yml" - container 'docker://docker.io/anaconesalab/sqanti3:5.2.2' + container 'docker://docker.io/anaconesalab/sqanti3:v6.0.1' input: tuple val(meta), path(isoforms_gtf), path(flnc_count) @@ -48,9 +48,6 @@ process SQANTI_QC { fi sqanti3_qc.py \\ - --isoforms "\$ISOFORMS_INPUT" \\ - --refGTF $reference_gtf \\ - --refFasta $reference_fasta \\ --force_id_ignore \\ --skipORF \\ --output ${prefix}.transcriptome \\ @@ -58,7 +55,10 @@ process SQANTI_QC { --cpus $task.cpus \\ --chunks $task.cpus \\ --report skip \\ - --fl $flnc_count \\ + --fl_count $flnc_count \\ + "\$ISOFORMS_INPUT" \\ + $reference_gtf \\ + $reference_fasta \\ $args # Fix single-sample column naming to use "FL.{sample_id}", which is consist with formatting used for multi-sample runs @@ -91,7 +91,7 @@ process SQANTI_QC { cat <<-END_VERSIONS > versions.yml "${task.process}": - sqanti3: 6.0.1 + sqanti3: 5.2.2 END_VERSIONS """ } From 62c46a5a2cd2c562798fdd739c2d556dbca828c3 Mon Sep 
17 00:00:00 2001 From: = Date: Wed, 27 May 2026 11:45:35 -0400 Subject: [PATCH 27/31] fix: restore logic from 5a0a8f959ad7f4a577f87178d639a2e7430e2fd7 with single-sample column naming fix incorporated --- modules/local/isocall_call/main.nf | 12 +++++------- modules/local/sqanti_qc/main.nf | 27 ++++++++++----------------- 2 files changed, 15 insertions(+), 24 deletions(-) diff --git a/modules/local/isocall_call/main.nf b/modules/local/isocall_call/main.nf index dfec9c1..ebf0280 100644 --- a/modules/local/isocall_call/main.nf +++ b/modules/local/isocall_call/main.nf @@ -3,7 +3,9 @@ process ISOCALL_CALL { label 'process_low' conda "${moduleDir}/environment.yml" - container "quay.io/pacbio/isocall:0.15.0_build1" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'docker://quay.io/pacbio/isocall:0.15.0_build1' : + 'quay.io/pacbio/isocall:0.15.0_build1' }" input: tuple val(meta), path(merged_profile) @@ -13,8 +15,7 @@ process ISOCALL_CALL { output: tuple val(meta), path("*.isocall.isoforms.gtf.gz"), emit: gtf - tuple val(meta), path("*.isocall.count_matrix.txt"), emit: count_matrix - tuple val(meta), path("*_S1_PACBIO_ISOCALL_M5_ISOCALL_CALL_log.txt"), emit: log + tuple val(meta), path("*.isocall.count_matrix.csv"), emit: count_matrix path "versions.yml", emit: versions when: @@ -27,8 +28,6 @@ process ISOCALL_CALL { def min_read_support = task.ext.min_read_support ?: params.min_read_support def max_bundles_per_gene = task.ext.max_bundles_per_gene ?: params.max_bundles_per_gene """ - exec > >(tee ${prefix}_S1_PACBIO_ISOCALL_M5_ISOCALL_CALL_log.txt) 2>&1 - isocall call \\ --threads $threads \\ --merged-profile $merged_profile \\ @@ -52,8 +51,7 @@ process ISOCALL_CALL { def prefix = task.ext.prefix ?: "${meta.id}" """ touch ${prefix}.isocall.isoforms.gtf.gz - touch ${prefix}.isocall.count_matrix.txt - touch ${prefix}_S1_PACBIO_ISOCALL_M5_ISOCALL_CALL_log.txt + touch ${prefix}.isocall.count_matrix.csv cat 
<<-END_VERSIONS > versions.yml "${task.process}": diff --git a/modules/local/sqanti_qc/main.nf b/modules/local/sqanti_qc/main.nf index 1f30a66..245007d 100644 --- a/modules/local/sqanti_qc/main.nf +++ b/modules/local/sqanti_qc/main.nf @@ -20,7 +20,6 @@ process SQANTI_QC { tuple val(meta), path("*.transcriptome.junctions.txt"), emit: junctions // tuple val(meta), path("*.transcriptome.params.txt"), emit: params // tuple val(meta), path("refAnnotation.*.genePred"), emit: refannotation_genepred - tuple val(meta), path("*_S2_TRANSCRIPTOME_M1_SQANTI_QC_log.txt"), emit: log path "versions.yml", emit: versions when: @@ -31,8 +30,7 @@ process SQANTI_QC { def prefix = task.ext.prefix ?: "${meta.id}" """ - exec > >(tee ${prefix}_S2_TRANSCRIPTOME_M1_SQANTI_QC_log.txt) 2>&1 - + source /conda/miniconda3/etc/profile.d/conda.sh conda activate sqanti3 @@ -48,19 +46,15 @@ process SQANTI_QC { fi sqanti3_qc.py \\ - --force_id_ignore \\ - --skipORF \\ - --output ${prefix}.transcriptome \\ - --dir . \\ - --cpus $task.cpus \\ - --chunks $task.cpus \\ + --isoforms "\$ISOFORMS_INPUT" \\ + --refGTF $reference_gtf \\ + --refFasta $reference_fasta \\ + -o ${prefix}.transcriptome \\ + -d . 
\\ --report skip \\ - --fl_count $flnc_count \\ - "\$ISOFORMS_INPUT" \\ - $reference_gtf \\ - $reference_fasta \\ + --fl $flnc_count \\ $args - + # Fix single-sample column naming to use "FL.{sample_id}", which is consist with formatting used for multi-sample runs if head -1 ${prefix}.transcriptome_classification.txt | grep -qE '\\tFL\\t|\\tFL\\.\\t'; then echo "Single sample detected - renaming FL column to FL.${meta.id}" @@ -68,7 +62,7 @@ process SQANTI_QC { ${prefix}.transcriptome_classification.txt > ${prefix}.transcriptome_classification.tmp.txt mv ${prefix}.transcriptome_classification.tmp.txt ${prefix}.transcriptome_classification.txt fi - + mv ${prefix}.transcriptome_classification.txt ${prefix}.transcriptome.SQANTI_classification.txt mv ${prefix}.transcriptome_corrected.gtf ${prefix}.transcriptome.gtf mv ${prefix}.transcriptome_corrected.fasta ${prefix}.transcriptome.fasta @@ -87,11 +81,10 @@ process SQANTI_QC { touch ${prefix}.transcriptome.gtf touch ${prefix}.transcriptome.fasta touch ${prefix}.transcriptome.junctions.txt - touch ${prefix}_S2_TRANSCRIPTOME_M1_SQANTI_QC_log.txt cat <<-END_VERSIONS > versions.yml "${task.process}": - sqanti3: 5.2.2 + sqanti3: 6.0.1 END_VERSIONS """ } From a7baba2f9d2d1f7d640b3870459ecbb50cc2cdae Mon Sep 17 00:00:00 2001 From: = Date: Wed, 27 May 2026 14:02:11 -0400 Subject: [PATCH 28/31] fix: correct tab formatting for ansi-c style quoting for bash environment --- modules/local/sqanti_qc/main.nf | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/modules/local/sqanti_qc/main.nf b/modules/local/sqanti_qc/main.nf index 245007d..7cf0c49 100644 --- a/modules/local/sqanti_qc/main.nf +++ b/modules/local/sqanti_qc/main.nf @@ -55,10 +55,11 @@ process SQANTI_QC { --fl $flnc_count \\ $args - # Fix single-sample column naming to use "FL.{sample_id}", which is consist with formatting used for multi-sample runs - if head -1 ${prefix}.transcriptome_classification.txt | grep -qE '\\tFL\\t|\\tFL\\.\\t'; then - echo 
"Single sample detected - renaming FL column to FL.${meta.id}" - awk -v sample="${meta.id}" 'NR==1 {gsub(/\\tFL\\t|\\tFL\\.\\t/, "\\tFL."sample"\\t")} {print}' \\ + # Fix single-sample column naming to use "FL.{sample_id}", which is consistent with formatting used for multi-sample runs + TAB=\$'\\t' + if head -1 ${prefix}.transcriptome_classification.txt | grep -qE "\${TAB}FL\${TAB}|\${TAB}FL\\.\${TAB}"; then + SAMPLE_NAME=\$(head -1 $flnc_count | awk -F',' '{print \$2}') + awk -v sample="\${SAMPLE_NAME}" 'NR==1 {gsub(/\\tFL\\t|\\tFL\\.\\t/, "\\tFL."sample"\\t")} {print}' \\ ${prefix}.transcriptome_classification.txt > ${prefix}.transcriptome_classification.tmp.txt mv ${prefix}.transcriptome_classification.tmp.txt ${prefix}.transcriptome_classification.txt fi From 1eff04b04c7de72e3994c0991a394adb42279455 Mon Sep 17 00:00:00 2001 From: = Date: Thu, 28 May 2026 11:53:17 -0400 Subject: [PATCH 29/31] chore: remove test_dia test profile (pending update to dia test data) --- conf/test.config | 15 --------------- nextflow.config | 1 - 2 files changed, 16 deletions(-) diff --git a/conf/test.config b/conf/test.config index 7a1265e..923760d 100644 --- a/conf/test.config +++ b/conf/test.config @@ -7,7 +7,6 @@ Usage examples: nextflow run sheynkmanlab/lrp2 -profile test_rna, --outdir nextflow run sheynkmanlab/lrp2 -profile test_dda, --outdir - nextflow run sheynkmanlab/lrp2 -profile test_dia, --outdir ---------------------------------------------------------------------------------------- */ @@ -44,18 +43,4 @@ profiles { fragpipe_license_accept = true } } - - test_dia { - params { - config_profile_name = 'RNA + DIA mass spec test profile' - config_profile_description = 'Test dataset with RNA and DIA mass spectrometry samples' - - input = "${projectDir}/sample_data/samplesheet_rna_dia.csv" - dataset_name = 'lrptest_dia' - - genome = 'GRCh38.p14.v49' - protein_search = 'fragpipe' - fragpipe_license_accept = true - } - } } diff --git a/nextflow.config b/nextflow.config index 
033d8b0..5fd6b13 100644 --- a/nextflow.config +++ b/nextflow.config @@ -271,7 +271,6 @@ profiles { } test_rna { includeConfig 'conf/test.config' } test_dda { includeConfig 'conf/test.config' } - test_dia { includeConfig 'conf/test.config' } test_full { includeConfig 'conf/test_full.config' } } From b8979a731462cc51974e8d5f5f355a6f233873d6 Mon Sep 17 00:00:00 2001 From: = Date: Thu, 28 May 2026 13:33:46 -0400 Subject: [PATCH 30/31] chore: update nextflow_schema.json with added hpc cluster option params --- nextflow_schema.json | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/nextflow_schema.json b/nextflow_schema.json index 053ebb6..d00728c 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -528,6 +528,33 @@ } } }, + "hpc_cluster_options": { + "title": "HPC cluster options", + "type": "object", + "fa_icon": "fas fa-server", + "description": "Options for HPC cluster execution.", + "help_text": "These parameters can override default executor settings in slurm.config or lsf.config.", + "properties": { + "hpc_executor": { + "type": "string", + "description": "Override executor type (e.g., 'slurm', 'lsf', 'local').", + "help_text": "Specify the executor to use for job submission on HPC clusters.", + "fa_icon": "fas fa-cogs" + }, + "hpc_queue": { + "type": "string", + "description": "Override queue/partition name for job submission.", + "help_text": "Specify the queue or partition to use for job submission on HPC clusters.", + "fa_icon": "fas fa-list" + }, + "hpc_cluster_options": { + "type": "string", + "description": "Override cluster-specific options (e.g., '--account=myaccount').", + "help_text": "Additional cluster-specific options to pass to the job scheduler.", + "fa_icon": "fas fa-sliders-h" + } + } + }, "generic_options": { "title": "Generic options", "type": "object", @@ -660,6 +687,9 @@ { "$ref": "#/$defs/differential_analysis_options" }, + { + "$ref": "#/$defs/hpc_cluster_options" + }, { "$ref": 
"#/$defs/institutional_config_options" }, From 1ba7b549562c69ffff3af01c83fa5e27715c8044 Mon Sep 17 00:00:00 2001 From: = Date: Thu, 28 May 2026 17:20:15 -0400 Subject: [PATCH 31/31] docs: update nextflow_schema.json so that --help command shows all relevant params --- nextflow_schema.json | 291 ++++++++++++++++++++++++------------------- 1 file changed, 163 insertions(+), 128 deletions(-) diff --git a/nextflow_schema.json b/nextflow_schema.json index d00728c..5558ea0 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -6,7 +6,7 @@ "type": "object", "$defs": { "input_output_options": { - "title": "Input/output options", + "title": "Input/Output Options", "type": "object", "fa_icon": "fas fa-terminal", "description": "Define where the pipeline should find input data and save output data.", @@ -25,7 +25,7 @@ }, "dataset_name": { "type": "string", - "default": "merged_dataset", + "default": "merged", "description": "Name for the merged dataset.", "help_text": "This name will be used as a prefix for all output files. It should be a descriptive name for your project or experiment.", "fa_icon": "fas fa-tag" @@ -42,25 +42,36 @@ "fa_icon": "fas fa-envelope", "help_text": "Set this parameter to your e-mail address to get a summary e-mail with details of the run sent to you when the workflow exits. If set in your user config file (`~/.nextflow/config`) then you don't need to specify this on the command line for every run.", "pattern": "^([a-zA-Z0-9_\\-\\.]+)@([a-zA-Z0-9_\\-\\.]+)\\.([a-zA-Z]{2,5})$" - }, - "multiqc_title": { - "type": "string", - "description": "MultiQC report title. 
Printed as page header, used for filename if not otherwise specified.", - "fa_icon": "fas fa-file-signature" } } }, "reference_genome_options": { - "title": "Reference genome options", + "title": "Genome Reference Options", "type": "object", "fa_icon": "fas fa-dna", "description": "Reference genome related files and options required for the workflow.", "properties": { "genome": { "type": "string", - "description": "Name of iGenomes reference.", + "description": "Name of genome reference. This automatically populates the paths for the reference FASTA and GTF files if using GENCODE references.", "fa_icon": "fas fa-book", - "help_text": "If using a reference genome configured in the pipeline using iGenomes, use this parameter to give the ID for the reference. This is then used to build the full paths for all required reference genome files e.g. `--genome GRCh38`. \n\nSee the [nf-core website docs](https://nf-co.re/usage/reference_genomes) for more details." + "help_text": "If using a reference genome configured in the pipeline using GENCODE, use this parameter to give the ID for the reference. This is then used to build the full paths for all required reference genome files e.g. `--genome GRCh38`. \n\nSee the [nf-core website docs](https://nf-co.re/usage/reference_genomes) for more details." + }, + "gtf": { + "type": "string", + "format": "file-path", + "exists": true, + "description": "Path to custom reference GTF file (optional).", + "help_text": "Path to the GTF file matching your reference genome. If not provided, will be automatically set from iGenomes configuration using the genome parameter.", + "fa_icon": "far fa-file-code" + }, + "fasta": { + "type": "string", + "format": "file-path", + "exists": true, + "description": "Path to reference genome FASTA file (optional).", + "help_text": "Path to the genome FASTA file matching your reference genome. 
If not provided, will be automatically set from iGenomes configuration using the genome parameter.", + "fa_icon": "far fa-file-code" }, "igenomes_ignore": { "type": "boolean", @@ -128,7 +139,7 @@ } }, "isoseq_collapse_options": { - "title": "PacBio IsoCall options", + "title": "S1 PacBio IsoCall Options", "type": "object", "fa_icon": "fas fa-dna", "description": "Options for the PacBio IsoCall transcript calling module.", @@ -158,42 +169,19 @@ } }, "sqanti_transcript_options": { - "title": "SQANTI3 transcript QC and filtering options", + "title": "S2 Transcriptome Options", "type": "object", "fa_icon": "fas fa-filter", "description": "Options for the SQANTI3 QC and filtering modules.", "properties": { - "gtf": { - "type": "string", - "format": "file-path", - "exists": true, - "description": "Path to reference GTF file.", - "help_text": "Path to the GTF file matching your reference genome. If not provided, will be automatically set from iGenomes configuration using the genome parameter.", - "fa_icon": "far fa-file-code" - }, - "fasta": { - "type": "string", - "format": "file-path", - "exists": true, - "description": "Path to reference genome FASTA file.", - "help_text": "Path to the genome FASTA file matching your reference genome. 
If not provided, will be automatically set from iGenomes configuration using the genome parameter.", - "fa_icon": "far fa-file-code" - }, - "sample_metadata": { - "type": "string", - "format": "file-path", - "exists": true, - "description": "Path to sample metadata CSV file.", - "help_text": "Sample metadata file used for SQANTI3 filtering.", - "fa_icon": "fas fa-file-csv" - }, "generate_hashids_script": { "type": "string", "format": "file-path", "exists": true, "description": "Path to hash ID generation R script.", "help_text": "R script used to generate hash IDs for transcripts.", - "fa_icon": "far fa-file-code" + "fa_icon": "far fa-file-code", + "hidden": true }, "filter_script": { "type": "string", @@ -201,7 +189,8 @@ "exists": true, "description": "Path to SQANTI filtering R script.", "help_text": "R script used to filter SQANTI3 QC output.", - "fa_icon": "far fa-file-code" + "fa_icon": "far fa-file-code", + "hidden": true }, "hashlib_script": { "type": "string", @@ -209,7 +198,8 @@ "exists": true, "description": "Path to hashlib ID generator Python script.", "help_text": "Python script used to generate hash IDs for transcripts.", - "fa_icon": "far fa-file-code" + "fa_icon": "far fa-file-code", + "hidden": true }, "sqanti_path": { "type": "string", @@ -217,7 +207,8 @@ "default": "/project/sheynkman/programs/SQANTI3-5.5/src/utilities/cupcake/", "description": "Path to SQANTI3 installation directory.", "help_text": "Path to the SQANTI3 cupcake utilities directory.", - "fa_icon": "fas fa-folder" + "fa_icon": "fas fa-folder", + "hidden": true }, "protein_coding_filter": { "type": "boolean", @@ -251,86 +242,8 @@ } } }, - "differential_analysis_options": { - "title": "Differential analysis options", - "type": "object", - "fa_icon": "fas fa-chart-line", - "description": "Options for multi-sample differential expression and splicing analysis.", - "properties": { - "run_differential_analysis": { - "type": "boolean", - "default": false, - "description": "Enable 
multi-sample differential analysis subworkflow.", - "help_text": "When enabled, runs differential splicing (leafcutter) and differential expression/usage analyses (edgeR/DRIMSeq) for all pairwise comparisons between unique conditions in the samplesheet.", - "fa_icon": "fas fa-play" - }, - "min_samples_per_intron": { - "type": "integer", - "default": 2, - "description": "Minimum number of samples per intron for leafcutter differential splicing.", - "fa_icon": "fas fa-filter" - }, - "min_samples_per_group": { - "type": "integer", - "default": 1, - "description": "Minimum number of samples per group for leafcutter differential splicing.", - "fa_icon": "fas fa-users" - }, - "min_usage_ratio": { - "type": "number", - "default": 0.01, - "description": "Minimum junction usage ratio for minicutter filtering.", - "fa_icon": "fas fa-percent" - }, - "lr_leafcutter_script": { - "type": "string", - "format": "file-path", - "exists": true, - "description": "Path to long-read leafcutter R script.", - "help_text": "R script used for long-read leafcutter clustering analysis.", - "fa_icon": "far fa-file-code" - }, - "leafcutter_ds_script": { - "type": "string", - "format": "file-path", - "exists": true, - "description": "Path to leafcutter differential splicing Python script.", - "help_text": "Python script used for leafcutter differential splicing analysis (leafcutter_ds.py).", - "fa_icon": "far fa-file-code" - }, - "multisample_script": { - "type": "string", - "format": "file-path", - "exists": true, - "description": "Path to multi-sample analysis R script.", - "help_text": "R script used for differential expression and usage analysis with edgeR and DRIMSeq.", - "fa_icon": "far fa-file-code" - }, - "leafcutter_threads": { - "type": "integer", - "default": 12, - "description": "Number of threads for leafcutter differential splicing.", - "help_text": "Specify the number of threads to use for leafcutter differential splicing analysis. 
If not specified, defaults to using the number of CPUs allocated to the process (12 for process_high).", - "fa_icon": "fas fa-microchip" - }, - "drimseq_min_gene_expr": { - "type": "integer", - "default": 10, - "description": "Minimum gene-level expression for DRIMSeq filtering.", - "help_text": "Genes with total counts below this threshold will be filtered out before differential usage analysis.", - "fa_icon": "fas fa-filter" - }, - "drimseq_min_isoform_prop": { - "type": "number", - "default": 0.05, - "description": "Minimum isoform/feature proportion for DRIMSeq filtering.", - "help_text": "Isoforms/features with proportion below this threshold will be filtered out before differential usage analysis.", - "fa_icon": "fas fa-percentage" - } - } - }, "predicted_proteome_options": { - "title": "Predicted proteome options", + "title": "S3 Predicted Proteome Options", "type": "object", "fa_icon": "fas fa-dna", "description": "Options for CPAT ORF calling and protein classification.", @@ -374,6 +287,12 @@ "help_text": "Logit model used by CPAT for ORF prediction in mouse samples.", "fa_icon": "far fa-file-code" }, + "cpat_coding_threshold": { + "type": "number", + "description": "CPAT coding probability threshold.", + "help_text": "Coding probability threshold for CPAT. 
Auto-set based on species if not specified (human=0.364, mouse=0.44).", + "fa_icon": "fas fa-balance-scale" + }, "min_orf": { "type": "integer", "default": 75, @@ -392,7 +311,8 @@ "exists": true, "description": "Path to CPAT filtering R script.", "help_text": "R script used to filter CPAT output and call best ORFs.", - "fa_icon": "far fa-file-code" + "fa_icon": "far fa-file-code", + "hidden": true }, "sqanti_protein_script": { "type": "string", @@ -400,7 +320,8 @@ "exists": true, "description": "Path to SQANTI protein classification Python script.", "help_text": "Python script used for SQANTI3 protein classification.", - "fa_icon": "far fa-file-code" + "fa_icon": "far fa-file-code", + "hidden": true }, "protein_class_script": { "type": "string", @@ -408,7 +329,8 @@ "exists": true, "description": "Path to protein classification R script.", "help_text": "R script used for 5'UTR and protein classification.", - "fa_icon": "far fa-file-code" + "fa_icon": "far fa-file-code", + "hidden": true }, "protein_class_keep": { "type": "string", @@ -425,8 +347,82 @@ } } }, + "differential_analysis_options": { + "title": "S4 Multisample Analysis Options", + "type": "object", + "fa_icon": "fas fa-chart-line", + "description": "Options for multi-sample differential expression and splicing analysis.", + "properties": { + "min_samples_per_intron": { + "type": "integer", + "default": 2, + "description": "Minimum number of samples per intron for leafcutter differential splicing.", + "fa_icon": "fas fa-filter" + }, + "min_samples_per_group": { + "type": "integer", + "default": 1, + "description": "Minimum number of samples per group for leafcutter differential splicing.", + "fa_icon": "fas fa-users" + }, + "min_usage_ratio": { + "type": "number", + "default": 0.01, + "description": "Minimum junction usage ratio for minicutter filtering.", + "fa_icon": "fas fa-percent" + }, + "lr_leafcutter_script": { + "type": "string", + "format": "file-path", + "exists": true, + "description": "Path to 
long-read leafcutter R script.", + "help_text": "R script used for long-read leafcutter clustering analysis.", + "fa_icon": "far fa-file-code", + "hidden": true + }, + "leafcutter_ds_script": { + "type": "string", + "format": "file-path", + "exists": true, + "description": "Path to leafcutter differential splicing Python script.", + "help_text": "Python script used for leafcutter differential splicing analysis (leafcutter_ds.py).", + "fa_icon": "far fa-file-code", + "hidden": true + }, + "multisample_script": { + "type": "string", + "format": "file-path", + "exists": true, + "description": "Path to multi-sample analysis R script.", + "help_text": "R script used for differential expression and usage analysis with edgeR and DRIMSeq.", + "fa_icon": "far fa-file-code", + "hidden": true + }, + "leafcutter_threads": { + "type": "integer", + "default": 12, + "description": "Number of threads for leafcutter differential splicing.", + "help_text": "Specify the number of threads to use for leafcutter differential splicing analysis. 
If not specified, defaults to using the number of CPUs allocated to the process (12 for process_high).", + "fa_icon": "fas fa-microchip" + }, + "drimseq_min_gene_expr": { + "type": "integer", + "default": 10, + "description": "Minimum gene-level expression for DRIMSeq filtering.", + "help_text": "Genes with total counts below this threshold will be filtered out before differential usage analysis.", + "fa_icon": "fas fa-filter" + }, + "drimseq_min_isoform_prop": { + "type": "number", + "default": 0.05, + "description": "Minimum isoform/feature proportion for DRIMSeq filtering.", + "help_text": "Isoforms/features with proportion below this threshold will be filtered out before differential usage analysis.", + "fa_icon": "fas fa-percentage" + } + } + }, "proteomics_options": { - "title": "Proteomics options", + "title": "S5 Proteomics Options", "type": "object", "fa_icon": "fas fa-microscope", "description": "Options for mass spectrometry proteomics analysis.", @@ -458,6 +454,34 @@ "help_text": "When enabled, only LRP-predicted or custom protein sequences will be used for proteomics searches, excluding GENCODE reference proteins.", "fa_icon": "fas fa-ban" }, + "lrp_protein_fasta": { + "type": "string", + "format": "file-path", + "description": "Path to pre-computed LRP protein FASTA for proteomics-only runs.", + "help_text": "Pre-computed LRP protein FASTA file (headers: transcript_id|gene_id|gene_name|pclass|status). 
Use this for proteomics-only runs with previously generated LRP protein sequences.", + "fa_icon": "far fa-file-code" + }, + "lrp_gtf": { + "type": "string", + "format": "file-path", + "description": "Path to LRP CDS GTF from a previous run.", + "help_text": "LRP CDS GTF file from a previous run, paired with lrp_protein_fasta for proteomics-only runs.", + "fa_icon": "far fa-file-code" + }, + "custom_protein_fasta": { + "type": "string", + "format": "file-path", + "description": "Path to user-provided custom protein FASTA.", + "help_text": "User-provided custom protein FASTA file (headers: transcript_id|gene_id at minimum) for proteomics analysis.", + "fa_icon": "far fa-file-code" + }, + "custom_gtf": { + "type": "string", + "format": "file-path", + "description": "Path to user-provided custom GTF with CDS entries.", + "help_text": "User-provided custom GTF file with CDS entries, paired with custom_protein_fasta.", + "fa_icon": "far fa-file-code" + }, "protein_search": { "type": "string", "default": "fragpipe", @@ -477,26 +501,30 @@ "type": "string", "description": "User's first name for FragPipe registration (optional for interactive mode).", "help_text": "Required for non-interactive FragPipe setup. Leave empty for interactive mode.", - "fa_icon": "fas fa-user" + "fa_icon": "fas fa-user", + "hidden": true }, "fragpipe_last_name": { "type": "string", "description": "User's last name for FragPipe registration (optional for interactive mode).", "help_text": "Required for non-interactive FragPipe setup. Leave empty for interactive mode.", - "fa_icon": "fas fa-user" + "fa_icon": "fas fa-user", + "hidden": true }, "fragpipe_email": { "type": "string", "description": "Email address for FragPipe registration (optional for interactive mode).", "help_text": "Required for non-interactive FragPipe setup. 
Leave empty for interactive mode.", "fa_icon": "fas fa-envelope", - "pattern": "^([a-zA-Z0-9_\\-\\.]+)@([a-zA-Z0-9_\\-\\.]+)\\.([a-zA-Z]{2,5})$" + "pattern": "^([a-zA-Z0-9_\\-\\.]+)@([a-zA-Z0-9_\\-\\.]+)\\.([a-zA-Z]{2,5})$", + "hidden": true }, "fragpipe_institution": { "type": "string", "description": "Institution/organization for FragPipe registration (optional for interactive mode).", "help_text": "Required for non-interactive FragPipe setup. Leave empty for interactive mode.", - "fa_icon": "fas fa-university" + "fa_icon": "fas fa-university", + "hidden": true }, "fragpipe_token": { "type": "string", @@ -510,7 +538,8 @@ "default": false, "description": "Accept FragPipe license terms (required for non-interactive mode).", "help_text": "Set to true to accept FragPipe license terms for non-interactive setup.", - "fa_icon": "fas fa-file-contract" + "fa_icon": "fas fa-file-contract", + "hidden": true }, "fragpipe_workflow": { "type": "string", @@ -519,6 +548,12 @@ "help_text": "Custom .workflow file for FragPipe. If not provided, default workflows will be auto-downloaded (LFQ-MBR for DDA, DIA_SpecLib_Quant for DIA).", "fa_icon": "fas fa-file" }, + "fragpipe_threads": { + "type": "integer", + "description": "Number of threads for FragPipe.", + "help_text": "Number of threads for FragPipe analysis. If not specified, defaults to using task.cpus from process label.", + "fa_icon": "fas fa-microchip" + }, "fragpipe_decoy_tag": { "type": "string", "default": "rev_",