From 06d6f0ec9a88f4696615f8dd17a7b1ae0b1a1931 Mon Sep 17 00:00:00 2001 From: theHumanBorch Date: Sun, 4 Jan 2026 11:15:45 -0600 Subject: [PATCH 1/5] export functions #7 --- DESCRIPTION | 2 +- NAMESPACE | 4 + R/export.R | 374 ++++++++++++++++++++++++++ man/exportCellRanger.Rd | 55 ++++ man/exportIgBLAST.Rd | 69 +++++ man/exportMiXCR.Rd | 59 +++++ man/exportTRUST4.Rd | 54 ++++ tests/testthat/test-export.R | 500 +++++++++++++++++++++++++++++++++++ vignettes/immReferent.Rmd | 106 +++++++- 9 files changed, 1221 insertions(+), 2 deletions(-) create mode 100644 R/export.R create mode 100644 man/exportCellRanger.Rd create mode 100644 man/exportIgBLAST.Rd create mode 100644 man/exportMiXCR.Rd create mode 100644 man/exportTRUST4.Rd create mode 100644 tests/testthat/test-export.R diff --git a/DESCRIPTION b/DESCRIPTION index 5a2c095..15753fa 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -6,7 +6,7 @@ Authors@R: c( Description: Provides a consistent interface for downloading, storing, and accessing IMGT immune receptor (TCR/BCR) and HLA sequences (both nucleotide and protein). This package serves as a core dependency for immunogenomics packages, ensuring reliable and high-quality sequence access. License: MIT + file LICENSE Encoding: UTF-8 -RoxygenNote: 7.3.2 +RoxygenNote: 7.3.3 biocViews: Software, Annotation, Sequencing Depends: R (>= 4.5.0) diff --git a/NAMESPACE b/NAMESPACE index bfbcf0d..5d616e8 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,5 +1,9 @@ # Generated by roxygen2: do not edit by hand +export(exportCellRanger) +export(exportIgBLAST) +export(exportMiXCR) +export(exportTRUST4) export(getIMGT) export(getOGRDB) export(is_imgt_available) diff --git a/R/export.R b/R/export.R new file mode 100644 index 0000000..771278d --- /dev/null +++ b/R/export.R @@ -0,0 +1,374 @@ +# This file contains export functions for creating reference databases +# compatible with various immune repertoire analysis tools. 
+
+#' @title Export Reference Sequences to MiXCR Format
+#' @description Exports a DNAStringSet or AAStringSet to FASTA files formatted
+#' for use with MiXCR's `buildLibrary` command. The function creates separate
+#' FASTA files for V, D, J, and C gene segments.
+#'
+#' @param sequences A `DNAStringSet` or `AAStringSet` object containing immune
+#'   receptor sequences. Sequence names should follow IMGT nomenclature
+#'   (e.g., "IGHV1-2*01", "TRBJ2-1*01").
+#' @param output_dir The directory where output files will be written. It is
+#'   created (recursively) when it does not exist yet.
+#' @param chain The chain type for the output files. One of "IGH", "IGK", "IGL",
+#'   "TRA", "TRB", "TRD", or "TRG".
+#'
+#' @details
+#' MiXCR expects FASTA files with simple headers containing only the gene name.
+#' The function filters sequences by gene type (V, D, J, C) based on the gene
+#' name pattern and writes separate files for each segment type.
+#'
+#' Output files follow the naming convention: `v-genes.<chain>.fasta`,
+#' `d-genes.<chain>.fasta`, `j-genes.<chain>.fasta`, `c-genes.<chain>.fasta`.
+#'
+#' @return A named list containing the paths to the created files, invisibly.
+#'   Elements are named `v_genes`, `d_genes`, `j_genes` and `c_genes`; a segment
+#'   type with no matching sequence is left out. A warning is raised when the
+#'   list ends up empty.
+#' @export
+#' @seealso \url{https://mixcr.com/mixcr/guides/create-custom-library/}
+#' @examples
+#' # Create a small example DNAStringSet
+#' seqs <- Biostrings::DNAStringSet(c(
+#'   "ATGCGATCGATCGATCG",
+#'   "ATGCGATCGATCG",
+#'   "ATGCGATC",
+#'   "ATGCGATCGATCGATCGATCG"
+#' ))
+#' names(seqs) <- c("IGHV1-2*01", "IGHD1-1*01", "IGHJ1*01", "IGHC*01")
+#'
+#' # Export to temporary directory
+#' output_dir <- tempdir()
+#' files <- exportMiXCR(seqs, output_dir, chain = "IGH")
+#' print(files)
+#'
+#' # Clean up
+#' unlink(unlist(files))
+exportMiXCR <- function(sequences, output_dir, chain = c("IGH", "IGK", "IGL", "TRA", "TRB", "TRD", "TRG")) {
+  chain <- match.arg(chain)
+
+  # The destination is prepared first, before the input is inspected.
+  if (!dir.exists(output_dir)) {
+    dir.create(output_dir, recursive = TRUE)
+  }
+
+  all_names <- names(sequences)
+  if (is.null(all_names)) {
+    stop("Sequences must have names following IMGT nomenclature.", call. = FALSE)
+  }
+
+  output_files <- list()
+
+  # One pass per segment type, in V, D, J, C order. A segment is selected by
+  # the name prefix <chain><segment> (e.g. "IGHV", "TRBJ").
+  for (segment in c("V", "D", "J", "C")) {
+    hits <- grep(paste0("^", chain, segment), all_names)
+    if (length(hits) == 0) {
+      next
+    }
+
+    segment_seqs <- sequences[hits]
+    # Keep only the part of each header that precedes the first "|".
+    names(segment_seqs) <- sub("\\|.*$", "", names(segment_seqs))
+
+    prefix <- tolower(segment)
+    fasta_path <- file.path(output_dir, paste0(prefix, "-genes.", chain, ".fasta"))
+    Biostrings::writeXStringSet(segment_seqs, fasta_path)
+    output_files[[paste0(prefix, "_genes")]] <- fasta_path
+  }
+
+  if (length(output_files) == 0) {
+    warning("No sequences matching chain '", chain, "' were found.", call. = FALSE)
+  }
+
+  invisible(output_files)
+}
+
+
+#' @title Export Reference Sequences to TRUST4 Format
+#' @description Exports a DNAStringSet to a FASTA file formatted for use with
+#' TRUST4. The output follows the format produced by TRUST4's `BuildImgtAnnot.pl`
+#' script.
+#'
+#' @param sequences A `DNAStringSet` object containing immune receptor sequences.
+#'   Sequence names should follow IMGT nomenclature (e.g., "IGHV1-2*01").
+#' @param output_file The path to the output FASTA file. Its parent directory
+#'   is created when it does not exist.
+#' @param include_constant Logical. If `TRUE`, include constant region sequences.
+#'   TRUST4's IMGT+C.fa file includes constant regions. Default is `TRUE`.
+#'
+#' @details
+#' TRUST4 expects FASTA files with headers containing only the allele name
+#' (e.g., ">IGHV1-2*01"). The function reformats sequence headers to match
+#' the output of TRUST4's `BuildImgtAnnot.pl` script.
+#'
+#' TRUST4 uses this reference for the `--ref` parameter in its analysis pipeline.
+#'
+#' @return The path to the created file, invisibly.
+#' @export
+#' @seealso \url{https://github.com/liulab-dfci/TRUST4}
+#' @examples
+#' # Create a small example DNAStringSet
+#' seqs <- Biostrings::DNAStringSet(c(
+#'   "ATGCGATCGATCGATCG",
+#'   "ATGCGATCGATCG",
+#'   "ATGCGATC"
+#' ))
+#' names(seqs) <- c("IGHV1-2*01", "IGHJ1*01", "IGHC*01")
+#'
+#' # Export to temporary file
+#' output_file <- tempfile(fileext = ".fa")
+#' exportTRUST4(seqs, output_file)
+#'
+#' # View the result
+#' cat(readLines(output_file), sep = "\n")
+#'
+#' # Clean up
+#' unlink(output_file)
+exportTRUST4 <- function(sequences, output_file, include_constant = TRUE) {
+  if (!inherits(sequences, "DNAStringSet")) {
+    stop("sequences must be a DNAStringSet object.", call. = FALSE)
+  }
+  if (is.null(names(sequences))) {
+    stop("Sequences must have names following IMGT nomenclature.", call. = FALSE)
+  }
+
+  # When constant regions are not wanted, retain only the entries whose name
+  # contains a locus prefix followed by V, D or J.
+  if (!include_constant) {
+    is_vdj <- grepl("(IG[HKL]|TR[ABDG])[VDJ]", names(sequences))
+    sequences <- sequences[is_vdj]
+  }
+
+  if (length(sequences) == 0) {
+    stop("No valid sequences to export.", call. = FALSE)
+  }
+
+  # Reduce each header to its leading token: cut at the first "|", then at the
+  # first space of what remains.
+  names(sequences) <- sub(" .*$", "", sub("\\|.*$", "", names(sequences)))
+
+  # Create the parent directory unless the file goes to the working directory.
+  target_dir <- dirname(output_file)
+  if (!dir.exists(target_dir) && target_dir != ".") {
+    dir.create(target_dir, recursive = TRUE)
+  }
+
+  Biostrings::writeXStringSet(sequences, output_file)
+
+  invisible(output_file)
+}
+
+
+#' @title Export Reference Sequences to Cell Ranger VDJ Format
+#' @description Exports a DNAStringSet to FASTA format suitable for creating
+#' a custom Cell Ranger VDJ reference. The function generates a FASTA file
+#' with properly formatted headers for use with `cellranger mkvdjref`.
+#'
+#' @param sequences A `DNAStringSet` object containing immune receptor sequences.
+#'   Sequence names should follow IMGT nomenclature (e.g., "IGHV1-2*01").
+#' @param output_file The path to the output FASTA file.
+#' @param gene_type The type of gene region. One of "V", "D", "J", or "C".
+#'   If NULL (default), the function will attempt to infer the type from
+#'   sequence names.
+#'
+#' @details
+#' Cell Ranger's `mkvdjref` command expects FASTA files with specific header
+#' formats. This function creates a FASTA file that can be used as input
+#' to build a custom VDJ reference.
+#'
+#' Note: For a complete Cell Ranger VDJ reference, you also need a GTF file
+#' with gene annotations. This function only creates the FASTA component.
+#'
+#' @return The path to the created file, invisibly.
+#' @export
+#' @seealso \url{https://www.10xgenomics.com/support/software/cell-ranger/latest/analysis/inputs/cr-5p-references}
+#' @examples
+#' # Create a small example DNAStringSet
+#' seqs <- Biostrings::DNAStringSet(c(
+#'   "ATGCGATCGATCGATCG",
+#'   "ATGCGATCGATCG"
+#' ))
+#' names(seqs) <- c("IGHV1-2*01", "IGHV1-3*01")
+#'
+#' # Export to temporary file
+#' output_file <- tempfile(fileext = ".fa")
+#' exportCellRanger(seqs, output_file)
+#'
+#' # View the result
+#' cat(readLines(output_file), sep = "\n")
+#'
+#' # Clean up
+#' unlink(output_file)
+exportCellRanger <- function(sequences, output_file, gene_type = NULL) {
+  if (!inherits(sequences, "DNAStringSet")) {
+    stop("sequences must be a DNAStringSet object.", call. = FALSE)
+  }
+
+  # Emptiness is tested before names on purpose: an empty DNAStringSet has
+  # NULL names, so checking names first would report the misleading
+  # "must have names" error and make this branch unreachable.
+  if (length(sequences) == 0) {
+    stop("No sequences to export.", call. = FALSE)
+  }
+
+  seq_names <- names(sequences)
+  if (is.null(seq_names)) {
+    stop("Sequences must have names following IMGT nomenclature.", call. = FALSE)
+  }
+
+  # NOTE(review): `gene_type` is accepted but never read below; no filtering
+  # or type inference is performed. Confirm the intended behaviour before
+  # relying on it.
+
+  # Reduce each header to its leading token: cut at the first "|", then at the
+  # first space of what remains, so only the gene/allele name is written.
+  simple_names <- sub("\\|.*$", "", seq_names)
+  simple_names <- sub(" .*$", "", simple_names)
+  names(sequences) <- simple_names
+
+  # Create the parent directory unless the file goes to the working directory.
+  output_dir <- dirname(output_file)
+  if (!dir.exists(output_dir) && output_dir != ".") {
+    dir.create(output_dir, recursive = TRUE)
+  }
+
+  Biostrings::writeXStringSet(sequences, output_file)
+
+  invisible(output_file)
+}
+
+
+#' @title Export Reference Sequences to IgBLAST Format
+#' @description Exports a DNAStringSet to FASTA files formatted for use with
+#' IgBLAST. The function creates separate FASTA files for V, D, and J gene
+#' segments with simplified headers compatible with IgBLAST's requirements.
+#'
+#' @param sequences A `DNAStringSet` object containing immune receptor sequences.
+#'   Sequence names should follow IMGT nomenclature (e.g., "IGHV1-2*01").
+#' @param output_dir The directory where output files will be written.
+#' @param organism The organism name for the output files. Used in file naming.
+#'   Default is "custom".
+#' @param receptor_type The receptor type. One of "ig" for immunoglobulin or
+#'   "tcr" for T-cell receptor. Default is "ig".
+#'
+#' @details
+#' IgBLAST requires FASTA files with simplified headers containing only the
+#' gene/allele name. This function mimics the output of IgBLAST's
+#' `edit_imgt_file.pl` script, which truncates IMGT headers to keep only
+#' the allele designation.
+#'
+#' Output files follow the naming convention used by IgBLAST:
+#' `<organism>_<receptor_type>_v.fasta`, `<organism>_<receptor_type>_d.fasta`,
+#' `<organism>_<receptor_type>_j.fasta`.
+#'
+#' After exporting, use `makeblastdb` with the `-parse_seqids` flag to create
+#' the BLAST database:
+#' ```
+#' makeblastdb -parse_seqids -dbtype nucl -in <fasta_file> -out <database_name>
+#' ```
+#'
+#' @return A named list containing the paths to the created files, invisibly.
+#' @export
+#' @seealso \url{https://ncbi.github.io/igblast/}
+#' @examples
+#' # Create a small example DNAStringSet
+#' seqs <- Biostrings::DNAStringSet(c(
+#'   "ATGCGATCGATCGATCG",
+#'   "ATGCGATCGATCG",
+#'   "ATGCGATC"
+#' ))
+#' names(seqs) <- c("IGHV1-2*01", "IGHD1-1*01", "IGHJ1*01")
+#'
+#' # Export to temporary directory
+#' output_dir <- tempdir()
+#' files <- exportIgBLAST(seqs, output_dir, organism = "human", receptor_type = "ig")
+#' print(files)
+#'
+#' # Clean up
+#' unlink(unlist(files))
+exportIgBLAST <- function(sequences, output_dir, organism = "custom", receptor_type = c("ig", "tcr")) {
+  receptor_type <- match.arg(receptor_type)
+
+  if (!inherits(sequences, "DNAStringSet")) {
+    stop("sequences must be a DNAStringSet object.", call. = FALSE)
+  }
+
+  if (!dir.exists(output_dir)) {
+    dir.create(output_dir, recursive = TRUE)
+  }
+
+  allele_names <- names(sequences)
+  if (is.null(allele_names)) {
+    stop("Sequences must have names following IMGT nomenclature.", call. = FALSE)
+  }
+
+  # Name-prefix pattern for each segment type, per receptor family. The order
+  # (v, d, j) is also the order of the elements in the returned list.
+  segment_patterns <- switch(
+    receptor_type,
+    ig = c(v = "^IG[HKL]V", d = "^IGHD", j = "^IG[HKL]J"),
+    tcr = c(v = "^TR[ABDG]V", d = "^TR[BD]D", j = "^TR[ABDG]J")
+  )
+
+  output_files <- list()
+
+  for (segment in names(segment_patterns)) {
+    hits <- grep(segment_patterns[[segment]], allele_names)
+    if (length(hits) == 0) {
+      next
+    }
+
+    segment_seqs <- sequences[hits]
+    # Reduce each header to its leading token: cut at the first "|", then at
+    # the first space of what remains.
+    names(segment_seqs) <- sub(" .*$", "", sub("\\|.*$", "", names(segment_seqs)))
+
+    fasta_path <- file.path(
+      output_dir,
+      paste0(organism, "_", receptor_type, "_", segment, ".fasta")
+    )
+    Biostrings::writeXStringSet(segment_seqs, fasta_path)
+    output_files[[paste0(segment, "_genes")]] <- fasta_path
+  }
+
+  if (length(output_files) == 0) {
+    warning("No sequences matching receptor type '", receptor_type, "' were found.", call. = FALSE)
+  }
+
+  invisible(output_files)
+}
diff --git a/man/exportCellRanger.Rd b/man/exportCellRanger.Rd
new file mode 100644
index 0000000..f26ee3e
--- /dev/null
+++ b/man/exportCellRanger.Rd
@@ -0,0 +1,55 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/export.R
+\name{exportCellRanger}
+\alias{exportCellRanger}
+\title{Export Reference Sequences to Cell Ranger VDJ Format}
+\usage{
+exportCellRanger(sequences, output_file, gene_type = NULL)
+}
+\arguments{
+\item{sequences}{A `DNAStringSet` object containing immune receptor sequences.
+Sequence names should follow IMGT nomenclature (e.g., "IGHV1-2*01").}
+
+\item{output_file}{The path to the output FASTA file.}
+
+\item{gene_type}{The type of gene region. One of "V", "D", "J", or "C".
+If NULL (default), the function will attempt to infer the type from
+sequence names.}
+}
+\value{
+The path to the created file, invisibly.
+}
+\description{
+Exports a DNAStringSet to FASTA format suitable for creating
+a custom Cell Ranger VDJ reference. The function generates a FASTA file
+with properly formatted headers for use with `cellranger mkvdjref`.
+}
+\details{
+Cell Ranger's `mkvdjref` command expects FASTA files with specific header
+formats. This function creates a FASTA file that can be used as input
+to build a custom VDJ reference.
+
+Note: For a complete Cell Ranger VDJ reference, you also need a GTF file
+with gene annotations. This function only creates the FASTA component.
+} +\examples{ +# Create a small example DNAStringSet +seqs <- Biostrings::DNAStringSet(c( + "ATGCGATCGATCGATCG", + "ATGCGATCGATCG" +)) +names(seqs) <- c("IGHV1-2*01", "IGHV1-3*01") + +# Export to temporary file +output_file <- tempfile(fileext = ".fa") +exportCellRanger(seqs, output_file) + +# View the result +cat(readLines(output_file), sep = "\n") + +# Clean up +unlink(output_file) +} +\seealso{ +\url{https://www.10xgenomics.com/support/software/cell-ranger/latest/analysis/inputs/cr-5p-references} +} diff --git a/man/exportIgBLAST.Rd b/man/exportIgBLAST.Rd new file mode 100644 index 0000000..d330a74 --- /dev/null +++ b/man/exportIgBLAST.Rd @@ -0,0 +1,69 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/export.R +\name{exportIgBLAST} +\alias{exportIgBLAST} +\title{Export Reference Sequences to IgBLAST Format} +\usage{ +exportIgBLAST( + sequences, + output_dir, + organism = "custom", + receptor_type = c("ig", "tcr") +) +} +\arguments{ +\item{sequences}{A `DNAStringSet` object containing immune receptor sequences. +Sequence names should follow IMGT nomenclature (e.g., "IGHV1-2*01").} + +\item{output_dir}{The directory where output files will be written.} + +\item{organism}{The organism name for the output files. Used in file naming. +Default is "custom".} + +\item{receptor_type}{The receptor type. One of "ig" for immunoglobulin or +"tcr" for T-cell receptor. Default is "ig".} +} +\value{ +A named list containing the paths to the created files, invisibly. +} +\description{ +Exports a DNAStringSet to FASTA files formatted for use with +IgBLAST. The function creates separate FASTA files for V, D, and J gene +segments with simplified headers compatible with IgBLAST's requirements. +} +\details{ +IgBLAST requires FASTA files with simplified headers containing only the +gene/allele name. This function mimics the output of IgBLAST's +`edit_imgt_file.pl` script, which truncates IMGT headers to keep only +the allele designation. 
+ +Output files follow the naming convention used by IgBLAST: +`<organism>_<receptor_type>_v.fasta`, `<organism>_<receptor_type>_d.fasta`, +`<organism>_<receptor_type>_j.fasta`. + +After exporting, use `makeblastdb` with the `-parse_seqids` flag to create +the BLAST database: +``` +makeblastdb -parse_seqids -dbtype nucl -in <fasta_file> -out <database_name> +``` +} +\examples{ +# Create a small example DNAStringSet +seqs <- Biostrings::DNAStringSet(c( + "ATGCGATCGATCGATCG", + "ATGCGATCGATCG", + "ATGCGATC" +)) +names(seqs) <- c("IGHV1-2*01", "IGHD1-1*01", "IGHJ1*01") + +# Export to temporary directory +output_dir <- tempdir() +files <- exportIgBLAST(seqs, output_dir, organism = "human", receptor_type = "ig") +print(files) + +# Clean up +unlink(unlist(files)) +} +\seealso{ +\url{https://ncbi.github.io/igblast/} +} diff --git a/man/exportMiXCR.Rd b/man/exportMiXCR.Rd new file mode 100644 index 0000000..8246ee0 --- /dev/null +++ b/man/exportMiXCR.Rd @@ -0,0 +1,59 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/export.R +\name{exportMiXCR} +\alias{exportMiXCR} +\title{Export Reference Sequences to MiXCR Format} +\usage{ +exportMiXCR( + sequences, + output_dir, + chain = c("IGH", "IGK", "IGL", "TRA", "TRB", "TRD", "TRG") +) +} +\arguments{ +\item{sequences}{A `DNAStringSet` or `AAStringSet` object containing immune +receptor sequences. Sequence names should follow IMGT nomenclature +(e.g., "IGHV1-2*01", "TRBJ2-1*01").} + +\item{output_dir}{The directory where output files will be written.} + +\item{chain}{The chain type for the output files. One of "IGH", "IGK", "IGL", +"TRA", "TRB", "TRD", or "TRG".} +} +\value{ +A named list containing the paths to the created files, invisibly. +} +\description{ +Exports a DNAStringSet or AAStringSet to FASTA files formatted +for use with MiXCR's `buildLibrary` command. The function creates separate +FASTA files for V, D, J, and C gene segments. +} +\details{ +MiXCR expects FASTA files with simple headers containing only the gene name. 
+The function filters sequences by gene type (V, D, J, C) based on the gene +name pattern and writes separate files for each segment type. + +Output files follow the naming convention: `v-genes.<chain>.fasta`, +`d-genes.<chain>.fasta`, `j-genes.<chain>.fasta`, `c-genes.<chain>.fasta`. +} +\examples{ +# Create a small example DNAStringSet +seqs <- Biostrings::DNAStringSet(c( + "ATGCGATCGATCGATCG", + "ATGCGATCGATCG", + "ATGCGATC", + "ATGCGATCGATCGATCGATCG" +)) +names(seqs) <- c("IGHV1-2*01", "IGHD1-1*01", "IGHJ1*01", "IGHC*01") + +# Export to temporary directory +output_dir <- tempdir() +files <- exportMiXCR(seqs, output_dir, chain = "IGH") +print(files) + +# Clean up +unlink(unlist(files)) +} +\seealso{ +\url{https://mixcr.com/mixcr/guides/create-custom-library/} +} diff --git a/man/exportTRUST4.Rd b/man/exportTRUST4.Rd new file mode 100644 index 0000000..21eb834 --- /dev/null +++ b/man/exportTRUST4.Rd @@ -0,0 +1,54 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/export.R +\name{exportTRUST4} +\alias{exportTRUST4} +\title{Export Reference Sequences to TRUST4 Format} +\usage{ +exportTRUST4(sequences, output_file, include_constant = TRUE) +} +\arguments{ +\item{sequences}{A `DNAStringSet` object containing immune receptor sequences. +Sequence names should follow IMGT nomenclature (e.g., "IGHV1-2*01").} + +\item{output_file}{The path to the output FASTA file.} + +\item{include_constant}{Logical. If `TRUE`, include constant region sequences. +TRUST4's IMGT+C.fa file includes constant regions. Default is `TRUE`.} +} +\value{ +The path to the created file, invisibly. +} +\description{ +Exports a DNAStringSet to a FASTA file formatted for use with +TRUST4. The output follows the format produced by TRUST4's `BuildImgtAnnot.pl` +script. +} +\details{ +TRUST4 expects FASTA files with headers containing only the allele name +(e.g., ">IGHV1-2*01"). The function reformats sequence headers to match +the output of TRUST4's `BuildImgtAnnot.pl` script. 
+ +TRUST4 uses this reference for the `--ref` parameter in its analysis pipeline. +} +\examples{ +# Create a small example DNAStringSet +seqs <- Biostrings::DNAStringSet(c( + "ATGCGATCGATCGATCG", + "ATGCGATCGATCG", + "ATGCGATC" +)) +names(seqs) <- c("IGHV1-2*01", "IGHJ1*01", "IGHC*01") + +# Export to temporary file +output_file <- tempfile(fileext = ".fa") +exportTRUST4(seqs, output_file) + +# View the result +cat(readLines(output_file), sep = "\n") + +# Clean up +unlink(output_file) +} +\seealso{ +\url{https://github.com/liulab-dfci/TRUST4} +} diff --git a/tests/testthat/test-export.R b/tests/testthat/test-export.R new file mode 100644 index 0000000..b1dfb06 --- /dev/null +++ b/tests/testthat/test-export.R @@ -0,0 +1,500 @@ +# tests/testthat/test-export.R + +# Helper function to create test DNAStringSet +.create_test_dna_seqs <- function() { + seqs <- Biostrings::DNAStringSet(c( + "ATGCGATCGATCGATCGATCGATCGATCG", + "ATGCGATCGATCGATCGATCG", + "ATGCGATCGATCGATCG", + "ATGCGATCGATC", + "ATGCGATC", + "ATGC" + )) + names(seqs) <- c( + "IGHV1-2*01", + "IGHV3-11*02", + "IGHD1-1*01", + "IGHJ1*01", + "IGHJ4*02", + "IGHC*01" + ) + seqs +} + +# Helper function to create TCR test sequences +.create_test_tcr_seqs <- function() { + seqs <- Biostrings::DNAStringSet(c( + "ATGCGATCGATCGATCGATCGATCGATCG", + "ATGCGATCGATCGATCGATCG", + "ATGCGATCGATCGATCG", + "ATGCGATC" + )) + names(seqs) <- c( + "TRBV1-1*01", + "TRBD1*01", + "TRBJ1-1*01", + "TRBC1*01" + ) + seqs +} + +# Helper function to create AAStringSet +.create_test_aa_seqs <- function() { + seqs <- Biostrings::AAStringSet(c( + "MSTKVLRQFG", + "MSTKVLRQ", + "MSTKV" + )) + names(seqs) <- c("IGHV1-2*01", "IGHJ1*01", "IGHC*01") + seqs +} + +# ============================================================================== +# Tests for exportMiXCR() +# ============================================================================== + +testthat::test_that("exportMiXCR() creates separate files for V, D, J, C segments", { + seqs <- 
.create_test_dna_seqs() + output_dir <- withr::local_tempdir() + + result <- exportMiXCR(seqs, output_dir, chain = "IGH") + + # Check that all expected files were created + expect_true(!is.null(result$v_genes)) + expect_true(!is.null(result$d_genes)) + expect_true(!is.null(result$j_genes)) + expect_true(!is.null(result$c_genes)) + + # Check files exist + + expect_true(file.exists(result$v_genes)) + expect_true(file.exists(result$d_genes)) + expect_true(file.exists(result$j_genes)) + expect_true(file.exists(result$c_genes)) + + # Check file naming convention + expect_match(basename(result$v_genes), "^v-genes\\.IGH\\.fasta$") + expect_match(basename(result$d_genes), "^d-genes\\.IGH\\.fasta$") + expect_match(basename(result$j_genes), "^j-genes\\.IGH\\.fasta$") + expect_match(basename(result$c_genes), "^c-genes\\.IGH\\.fasta$") +}) + +testthat::test_that("exportMiXCR() writes correct FASTA content", { + seqs <- .create_test_dna_seqs() + output_dir <- withr::local_tempdir() + + result <- exportMiXCR(seqs, output_dir, chain = "IGH") + + # Read V gene file and check content + v_content <- readLines(result$v_genes) + expect_true(any(grepl("^>IGHV1-2\\*01$", v_content))) + expect_true(any(grepl("^>IGHV3-11\\*02$", v_content))) + expect_true(any(grepl("^ATGCGATCGATCGATCGATCGATCGATCG$", v_content))) + + # Read D gene file + d_content <- readLines(result$d_genes) + expect_true(any(grepl("^>IGHD1-1\\*01$", d_content))) + + # Read J gene file + j_content <- readLines(result$j_genes) + expect_true(any(grepl("^>IGHJ1\\*01$", j_content))) + expect_true(any(grepl("^>IGHJ4\\*02$", j_content))) +}) + +testthat::test_that("exportMiXCR() handles AAStringSet", { + seqs <- .create_test_aa_seqs() + output_dir <- withr::local_tempdir() + + result <- exportMiXCR(seqs, output_dir, chain = "IGH") + + expect_true(!is.null(result$v_genes)) + expect_true(!is.null(result$j_genes)) + expect_true(!is.null(result$c_genes)) + + # Check amino acid content + v_content <- readLines(result$v_genes) + 
expect_true(any(grepl("^MSTKVLRQFG$", v_content))) +}) + +testthat::test_that("exportMiXCR() creates output directory if needed", { + seqs <- .create_test_dna_seqs() + temp_base <- withr::local_tempdir() + output_dir <- file.path(temp_base, "new", "nested", "dir") + + expect_false(dir.exists(output_dir)) + + result <- exportMiXCR(seqs, output_dir, chain = "IGH") + + expect_true(dir.exists(output_dir)) + expect_true(file.exists(result$v_genes)) +}) + +testthat::test_that("exportMiXCR() errors on sequences without names", { + seqs <- Biostrings::DNAStringSet(c("ATGC", "GCTA")) + + output_dir <- withr::local_tempdir() + + expect_error( + exportMiXCR(seqs, output_dir, chain = "IGH"), + "Sequences must have names" + ) +}) + +testthat::test_that("exportMiXCR() warns when no matching sequences found", { + seqs <- .create_test_dna_seqs() + output_dir <- withr::local_tempdir() + + expect_warning( + result <- exportMiXCR(seqs, output_dir, chain = "TRB"), + "No sequences matching chain 'TRB'" + ) + + expect_equal(length(result), 0) +}) + +testthat::test_that("exportMiXCR() handles all supported chains", { + chains <- c("IGH", "IGK", "IGL", "TRA", "TRB", "TRD", "TRG") + + for (chain in chains) { + # Create sequences for this chain + seqs <- Biostrings::DNAStringSet(c("ATGCGATCGATC", "ATGCGATC")) + names(seqs) <- c(paste0(chain, "V1*01"), paste0(chain, "J1*01")) + + output_dir <- withr::local_tempdir() + result <- exportMiXCR(seqs, output_dir, chain = chain) + + expect_true(!is.null(result$v_genes), info = paste("Failed for chain:", chain)) + expect_true(!is.null(result$j_genes), info = paste("Failed for chain:", chain)) + } +}) + +testthat::test_that("exportMiXCR() removes extra annotation from headers", { + seqs <- Biostrings::DNAStringSet(c("ATGCGATCGATC")) + names(seqs) <- c("IGHV1-2*01|Homo sapiens|F|V-REGION|1..296") + + output_dir <- withr::local_tempdir() + result <- exportMiXCR(seqs, output_dir, chain = "IGH") + + content <- readLines(result$v_genes) + 
expect_true(any(grepl("^>IGHV1-2\\*01$", content))) + expect_false(any(grepl("Homo sapiens", content))) +}) + +# ============================================================================== +# Tests for exportTRUST4() +# ============================================================================== + +testthat::test_that("exportTRUST4() creates FASTA file with correct format", { + seqs <- .create_test_dna_seqs() + output_file <- tempfile(fileext = ".fa") + withr::defer(unlink(output_file)) + + result <- exportTRUST4(seqs, output_file) + + expect_equal(result, output_file) + expect_true(file.exists(output_file)) + + content <- readLines(output_file) + # Check headers are simplified + expect_true(any(grepl("^>IGHV1-2\\*01$", content))) + expect_true(any(grepl("^>IGHD1-1\\*01$", content))) + expect_true(any(grepl("^>IGHJ1\\*01$", content))) + expect_true(any(grepl("^>IGHC\\*01$", content))) +}) + +testthat::test_that("exportTRUST4() excludes constant regions when requested", { + seqs <- .create_test_dna_seqs() + output_file <- tempfile(fileext = ".fa") + withr::defer(unlink(output_file)) + + result <- exportTRUST4(seqs, output_file, include_constant = FALSE) + + content <- readLines(output_file) + # V, D, J should be present + expect_true(any(grepl("^>IGHV", content))) + expect_true(any(grepl("^>IGHD", content))) + expect_true(any(grepl("^>IGHJ", content))) + # C should be absent + expect_false(any(grepl("^>IGHC", content))) +}) + +testthat::test_that("exportTRUST4() errors on AAStringSet input", { + seqs <- .create_test_aa_seqs() + output_file <- tempfile(fileext = ".fa") + + expect_error( + exportTRUST4(seqs, output_file), + "sequences must be a DNAStringSet" + ) +}) + +testthat::test_that("exportTRUST4() errors on sequences without names", { + seqs <- Biostrings::DNAStringSet(c("ATGC", "GCTA")) + output_file <- tempfile(fileext = ".fa") + + expect_error( + exportTRUST4(seqs, output_file), + "Sequences must have names" + ) +}) + 
+testthat::test_that("exportTRUST4() creates output directory if needed", { + seqs <- .create_test_dna_seqs() + temp_base <- withr::local_tempdir() + output_file <- file.path(temp_base, "nested", "dir", "output.fa") + + expect_false(dir.exists(dirname(output_file))) + + result <- exportTRUST4(seqs, output_file) + + expect_true(dir.exists(dirname(output_file))) + expect_true(file.exists(output_file)) +}) + +testthat::test_that("exportTRUST4() removes extra annotation from headers", { + seqs <- Biostrings::DNAStringSet(c("ATGCGATCGATC")) + names(seqs) <- c("IGHV1-2*01|Homo sapiens|F|V-REGION|1..296") + + output_file <- tempfile(fileext = ".fa") + withr::defer(unlink(output_file)) + + exportTRUST4(seqs, output_file) + + content <- readLines(output_file) + expect_true(any(grepl("^>IGHV1-2\\*01$", content))) + expect_false(any(grepl("Homo sapiens", content))) +}) + +testthat::test_that("exportTRUST4() errors when no valid sequences after filtering", { + # Create only constant region sequences + seqs <- Biostrings::DNAStringSet(c("ATGCGATC")) + names(seqs) <- c("IGHC*01") + + output_file <- tempfile(fileext = ".fa") + + expect_error( + exportTRUST4(seqs, output_file, include_constant = FALSE), + "No valid sequences to export" + ) +}) + +# ============================================================================== +# Tests for exportCellRanger() +# ============================================================================== + +testthat::test_that("exportCellRanger() creates FASTA file with correct format", { + seqs <- .create_test_dna_seqs() + output_file <- tempfile(fileext = ".fa") + withr::defer(unlink(output_file)) + + result <- exportCellRanger(seqs, output_file) + + expect_equal(result, output_file) + expect_true(file.exists(output_file)) + + content <- readLines(output_file) + # Check headers are simplified + expect_true(any(grepl("^>IGHV1-2\\*01$", content))) +}) + +testthat::test_that("exportCellRanger() errors on AAStringSet input", { + seqs <- 
.create_test_aa_seqs() + output_file <- tempfile(fileext = ".fa") + + expect_error( + exportCellRanger(seqs, output_file), + "sequences must be a DNAStringSet" + ) +}) + +testthat::test_that("exportCellRanger() errors on empty sequences", { + seqs <- Biostrings::DNAStringSet() + output_file <- tempfile(fileext = ".fa") + + expect_error( + exportCellRanger(seqs, output_file), + "No sequences to export" + ) +}) + +testthat::test_that("exportCellRanger() errors on sequences without names", { + seqs <- Biostrings::DNAStringSet(c("ATGC", "GCTA")) + output_file <- tempfile(fileext = ".fa") + + expect_error( + exportCellRanger(seqs, output_file), + "Sequences must have names" + ) +}) + +testthat::test_that("exportCellRanger() creates output directory if needed", { + seqs <- .create_test_dna_seqs() + temp_base <- withr::local_tempdir() + output_file <- file.path(temp_base, "nested", "dir", "output.fa") + + expect_false(dir.exists(dirname(output_file))) + + result <- exportCellRanger(seqs, output_file) + + expect_true(dir.exists(dirname(output_file))) + expect_true(file.exists(output_file)) +}) + +testthat::test_that("exportCellRanger() removes extra annotation from headers", { + seqs <- Biostrings::DNAStringSet(c("ATGCGATCGATC")) + names(seqs) <- c("IGHV1-2*01|Homo sapiens|F|V-REGION extra info") + + output_file <- tempfile(fileext = ".fa") + withr::defer(unlink(output_file)) + + exportCellRanger(seqs, output_file) + + content <- readLines(output_file) + expect_true(any(grepl("^>IGHV1-2\\*01$", content))) + expect_false(any(grepl("Homo sapiens", content))) + expect_false(any(grepl("extra info", content))) +}) + +# ============================================================================== +# Tests for exportIgBLAST() +# ============================================================================== + +testthat::test_that("exportIgBLAST() creates separate files for V, D, J segments", { + seqs <- .create_test_dna_seqs() + output_dir <- withr::local_tempdir() + + result <- 
exportIgBLAST(seqs, output_dir, organism = "human", receptor_type = "ig") + + # Check that expected files were created + expect_true(!is.null(result$v_genes)) + expect_true(!is.null(result$d_genes)) + expect_true(!is.null(result$j_genes)) + + # Check files exist + expect_true(file.exists(result$v_genes)) + expect_true(file.exists(result$d_genes)) + expect_true(file.exists(result$j_genes)) + + # Check file naming convention + expect_match(basename(result$v_genes), "^human_ig_v\\.fasta$") + expect_match(basename(result$d_genes), "^human_ig_d\\.fasta$") + expect_match(basename(result$j_genes), "^human_ig_j\\.fasta$") +}) + +testthat::test_that("exportIgBLAST() writes correct FASTA content", { + seqs <- .create_test_dna_seqs() + output_dir <- withr::local_tempdir() + + result <- exportIgBLAST(seqs, output_dir, organism = "human", receptor_type = "ig") + + # Read V gene file and check content + v_content <- readLines(result$v_genes) + expect_true(any(grepl("^>IGHV1-2\\*01$", v_content))) + expect_true(any(grepl("^>IGHV3-11\\*02$", v_content))) + expect_true(any(grepl("^ATGCGATCGATCGATCGATCGATCGATCG$", v_content))) +}) + +testthat::test_that("exportIgBLAST() handles TCR sequences", { + seqs <- .create_test_tcr_seqs() + output_dir <- withr::local_tempdir() + + result <- exportIgBLAST(seqs, output_dir, organism = "human", receptor_type = "tcr") + + expect_true(!is.null(result$v_genes)) + expect_true(!is.null(result$d_genes)) + expect_true(!is.null(result$j_genes)) + + # Check file naming for TCR + expect_match(basename(result$v_genes), "^human_tcr_v\\.fasta$") + expect_match(basename(result$d_genes), "^human_tcr_d\\.fasta$") + expect_match(basename(result$j_genes), "^human_tcr_j\\.fasta$") + + # Check content + v_content <- readLines(result$v_genes) + expect_true(any(grepl("^>TRBV1-1\\*01$", v_content))) +}) + +testthat::test_that("exportIgBLAST() creates output directory if needed", { + seqs <- .create_test_dna_seqs() + temp_base <- withr::local_tempdir() + output_dir <- 
file.path(temp_base, "new", "nested", "dir") + + expect_false(dir.exists(output_dir)) + + result <- exportIgBLAST(seqs, output_dir, organism = "human") + + expect_true(dir.exists(output_dir)) + expect_true(file.exists(result$v_genes)) +}) + +testthat::test_that("exportIgBLAST() errors on AAStringSet input", { + seqs <- .create_test_aa_seqs() + output_dir <- withr::local_tempdir() + + expect_error( + exportIgBLAST(seqs, output_dir), + "sequences must be a DNAStringSet" + ) +}) + +testthat::test_that("exportIgBLAST() errors on sequences without names", { + seqs <- Biostrings::DNAStringSet(c("ATGC", "GCTA")) + output_dir <- withr::local_tempdir() + + expect_error( + exportIgBLAST(seqs, output_dir), + "Sequences must have names" + ) +}) + +testthat::test_that("exportIgBLAST() warns when no matching sequences found", { + seqs <- .create_test_dna_seqs() # Contains IG sequences + output_dir <- withr::local_tempdir() + + expect_warning( + result <- exportIgBLAST(seqs, output_dir, receptor_type = "tcr"), + "No sequences matching receptor type 'tcr'" + ) + + expect_equal(length(result), 0) +}) + +testthat::test_that("exportIgBLAST() removes extra annotation from headers", { + seqs <- Biostrings::DNAStringSet(c("ATGCGATCGATC", "ATGC")) + names(seqs) <- c( + "IGHV1-2*01|Homo sapiens|F|V-REGION|1..296", + "IGHJ1*01 some extra info" + ) + + output_dir <- withr::local_tempdir() + result <- exportIgBLAST(seqs, output_dir, organism = "human", receptor_type = "ig") + + v_content <- readLines(result$v_genes) + expect_true(any(grepl("^>IGHV1-2\\*01$", v_content))) + expect_false(any(grepl("Homo sapiens", v_content))) + + j_content <- readLines(result$j_genes) + expect_true(any(grepl("^>IGHJ1\\*01$", j_content))) + expect_false(any(grepl("extra info", j_content))) +}) + +testthat::test_that("exportIgBLAST() uses custom organism name", { + seqs <- .create_test_dna_seqs() + output_dir <- withr::local_tempdir() + + result <- exportIgBLAST(seqs, output_dir, organism = "custom_species", 
receptor_type = "ig") + + expect_match(basename(result$v_genes), "^custom_species_ig_v\\.fasta$") + expect_match(basename(result$d_genes), "^custom_species_ig_d\\.fasta$") + expect_match(basename(result$j_genes), "^custom_species_ig_j\\.fasta$") +}) + +testthat::test_that("exportIgBLAST() does not create C gene file (not used by IgBLAST)", { + seqs <- .create_test_dna_seqs() + output_dir <- withr::local_tempdir() + + result <- exportIgBLAST(seqs, output_dir, organism = "human", receptor_type = "ig") + + # IgBLAST doesn't use C genes, so no c_genes file should be created + expect_null(result$c_genes) +}) diff --git a/vignettes/immReferent.Rmd b/vignettes/immReferent.Rmd index c61cc52..c597bd9 100644 --- a/vignettes/immReferent.Rmd +++ b/vignettes/immReferent.Rmd @@ -200,9 +200,113 @@ igk_fresh <- refreshOGRDB(species = "human", format = "FASTA_GAPPED") ``` +## Exporting Reference Databases for External Tools + +`immReferent` provides export functions to create reference databases compatible with popular immune repertoire analysis tools. These functions format the downloaded sequences appropriately for each tool. + +### Export to MiXCR + +[MiXCR](https://mixcr.com/) is a popular tool for analyzing immune repertoire sequencing data. The `exportMiXCR()` function creates FASTA files formatted for MiXCR's `buildLibrary` command. 
+ +```{r export_mixcr, eval = imgt_ok} +# Download human IGH sequences +igh_seqs <- getIMGT(species = "human", + gene = "IGH", + type = "NUC", + suppressMessages = TRUE) + +# Export to MiXCR format +mixcr_dir <- tempdir() +mixcr_files <- exportMiXCR(igh_seqs, mixcr_dir, chain = "IGH") + +# View created files +print(mixcr_files) + +# View first few lines of V gene file +if (!is.null(mixcr_files$v_genes)) { + cat(head(readLines(mixcr_files$v_genes), 4), sep = "\n") +} +``` + +The output files can be used with MiXCR's `buildLibrary` command: +```bash +mixcr buildLibrary \ + --v-genes-from-fasta v-genes.IGH.fasta \ + --v-gene-feature VRegion \ + --j-genes-from-fasta j-genes.IGH.fasta \ + --d-genes-from-fasta d-genes.IGH.fasta \ + --c-genes-from-fasta c-genes.IGH.fasta \ + --chain IGH \ + --species hs \ + custom-IGH.json.gz +``` + +### Export to TRUST4 + +[TRUST4](https://github.com/liulab-dfci/TRUST4) is a tool for reconstructing immune repertoires from RNA-seq data. The `exportTRUST4()` function creates a FASTA file compatible with TRUST4's `--ref` parameter. + +```{r export_trust4, eval = imgt_ok} +# Export to TRUST4 format (includes constant regions by default) +trust4_file <- tempfile(fileext = ".fa") +exportTRUST4(igh_seqs, trust4_file) + +# View header format +cat(head(readLines(trust4_file), 6), sep = "\n") +``` + +Use the exported file with TRUST4: +```bash +./run-trust4 -f hg38_bcrtcr.fa --ref custom_IMGT+C.fa \ + -1 reads_1.fq -2 reads_2.fq -o output +``` + +### Export to Cell Ranger VDJ + +[Cell Ranger](https://www.10xgenomics.com/support/software/cell-ranger) from 10x Genomics includes VDJ analysis capabilities. The `exportCellRanger()` function creates a FASTA file suitable for building a custom VDJ reference. 
+ +```{r export_cellranger, eval = imgt_ok} +# Export IGH sequences to a Cell Ranger-compatible FASTA +cellranger_file <- tempfile(fileext = ".fa") +exportCellRanger(igh_seqs, cellranger_file) + +# View header format +cat(head(readLines(cellranger_file), 4), sep = "\n") +``` +Note: For a complete Cell Ranger VDJ reference, you also need a GTF file with gene annotations and should use `cellranger mkvdjref` to build the reference. + +### Export to IgBLAST + +[IgBLAST](https://ncbi.github.io/igblast/) is NCBI's tool for analyzing immunoglobulin and T cell receptor sequences. The `exportIgBLAST()` function creates FASTA files with simplified headers compatible with IgBLAST. + +```{r export_igblast, eval = imgt_ok} +# Export to IgBLAST format +igblast_dir <- tempdir() +igblast_files <- exportIgBLAST(igh_seqs, igblast_dir, + organism = "human", + receptor_type = "ig") + +# View created files +print(igblast_files) + +# View header format +if (!is.null(igblast_files$v_genes)) { + cat(head(readLines(igblast_files$v_genes), 4), sep = "\n") +} +``` + +After exporting, create BLAST databases using `makeblastdb`: +```bash +makeblastdb -parse_seqids -dbtype nucl \ + -in human_ig_v.fasta -out human_ig_v +makeblastdb -parse_seqids -dbtype nucl \ + -in human_ig_d.fasta -out human_ig_d +makeblastdb -parse_seqids -dbtype nucl \ + -in human_ig_j.fasta -out human_ig_j +``` + # Conclusion -This has been a general overview of the capabilities of **immReferent** for downloading and caching immune receptor and HLA sequences from IMGT and OGRDB. If you have any questions, comments, or suggestions, feel free to visit the [GitHub repository](https://github.com/BorchLab/immReferent). +This has been a general overview of the capabilities of **immReferent** for downloading and caching immune receptor and HLA sequences from IMGT and OGRDB, as well as exporting them to formats compatible with popular analysis tools.
If you have any questions, comments, or suggestions, feel free to visit the [GitHub repository](https://github.com/BorchLab/immReferent). ## Session Info From b6c1a17393b68e6774b5e7863eefabb4c58b14d3 Mon Sep 17 00:00:00 2001 From: theHumanBorch Date: Sun, 4 Jan 2026 12:48:06 -0600 Subject: [PATCH 2/5] OGRDB export support #7 Allowing for both IMGT and OGRDB support for export Improved overall documentation --- DESCRIPTION | 12 +- NEWS.md | 21 ++++ R/export.R | 223 ++++++++++++++++++++++++----------- R/immReferent-package.R | 68 +++++++---- R/mainIMGT.R | 111 ++++++++++++----- R/mainOGRDB.R | 165 +++++++++++++++++--------- R/utils.R | 36 ++++-- man/exportCellRanger.Rd | 42 ++++--- man/exportIgBLAST.Rd | 57 ++++++--- man/exportMiXCR.Rd | 48 ++++++-- man/exportTRUST4.Rd | 39 ++++-- man/getIMGT.Rd | 55 ++++++--- man/getOGRDB.Rd | 75 ++++++++---- man/immReferent-package.Rd | 67 ++++++----- man/is_imgt_available.Rd | 15 ++- man/is_ogrdb_available.Rd | 15 ++- man/listIMGT.Rd | 12 +- man/listOGRDB.Rd | 21 ++-- man/loadIMGT.Rd | 39 ++++-- man/loadOGRDB.Rd | 55 +++++---- man/refreshIMGT.Rd | 37 ++++-- man/refreshOGRDB.Rd | 51 ++++---- tests/testthat/test-export.R | 169 ++++++++++++++++++++++++++ 23 files changed, 1029 insertions(+), 404 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 15753fa..b80f095 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,9 +1,15 @@ Package: immReferent -Title: An Interface for Immune Receptor and HLA Gene IMGT Reference Data -Version: 0.99.5 +Title: An Interface for Immune Receptor and HLA Gene Reference Data +Version: 1.0.1 Authors@R: c( person(given = "Nick", family = "Borcherding", role = c("aut", "cre"), email = "ncborch@gmail.com")) -Description: Provides a consistent interface for downloading, storing, and accessing IMGT immune receptor (TCR/BCR) and HLA sequences (both nucleotide and protein). This package serves as a core dependency for immunogenomics packages, ensuring reliable and high-quality sequence access. 
+Description: Provides a consistent interface for downloading, storing, and + accessing immune receptor (TCR/BCR) and HLA sequences from IMGT, + IPD-IMGT/HLA, and OGRDB (AIRR-C). Supports export to popular analysis + tools including MiXCR, TRUST4, Cell Ranger, and IgBLAST. This package + serves as a core dependency for immunogenomics packages, ensuring + reliable and high-quality sequence access with local caching for + reproducibility. License: MIT + file LICENSE Encoding: UTF-8 RoxygenNote: 7.3.3 diff --git a/NEWS.md b/NEWS.md index d1f2545..0d6f5a3 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,24 @@ +# immReferent VERSION 1.0.1 + +## New Features + +* Added export functions for popular immune repertoire analysis tools: + - `exportMiXCR()`: Export sequences for MiXCR custom library building + - `exportTRUST4()`: Export sequences for TRUST4 analysis + - `exportCellRanger()`: Export sequences for 10x Cell Ranger VDJ reference + - `exportIgBLAST()`: Export sequences for IgBLAST database creation +* Export functions work with sequences from both IMGT and OGRDB sources + +## Documentation Improvements + +* Standardized roxygen2 documentation across all functions with proper + formatting using `\code{}`, `\strong{}`, `\itemize{}`, and `\url{}` +* Added comprehensive `@seealso` sections linking related functions +* Improved parameter documentation with explicit type specifications +* Enhanced package-level documentation with function overview +* Fixed typo in `is_ogrdb_available()` documentation (was incorrectly + referencing IMGT) + # immReferent VERSION 0.99.5 * Added package level man page diff --git a/R/export.R b/R/export.R index 771278d..27d2ebe 100644 --- a/R/export.R +++ b/R/export.R @@ -2,29 +2,54 @@ # compatible with various immune repertoire analysis tools. #' @title Export Reference Sequences to MiXCR Format -#' @description Exports a DNAStringSet or AAStringSet to FASTA files formatted -#' for use with MiXCR's `buildLibrary` command. 
The function creates separate -#' FASTA files for V, D, J, and C gene segments. #' -#' @param sequences A `DNAStringSet` or `AAStringSet` object containing immune -#' receptor sequences. Sequence names should follow IMGT nomenclature -#' (e.g., "IGHV1-2*01", "TRBJ2-1*01"). -#' @param output_dir The directory where output files will be written. -#' @param chain The chain type for the output files. One of "IGH", "IGK", "IGL", -#' "TRA", "TRB", "TRD", or "TRG". +#' @description Exports a \code{\link[Biostrings]{DNAStringSet}} or +#' \code{\link[Biostrings]{AAStringSet}} to FASTA files formatted for use with +#' MiXCR's \code{buildLibrary} command. The function creates separate FASTA +#' files for V, D, J, and C gene segments. +#' +#' @param sequences A \code{\link[Biostrings]{DNAStringSet}} or +#' \code{\link[Biostrings]{AAStringSet}} object containing immune receptor +#' sequences. Sequence names must follow standard IG/TR gene nomenclature +#' (e.g., \code{"IGHV1-2*01"}, \code{"TRBJ2-1*01"}). Can be obtained from +#' \code{\link{getIMGT}} or \code{\link{getOGRDB}}. +#' @param output_dir Character string specifying the directory where output +#' files will be written. The directory will be created if it does not exist. +#' @param chain Character string specifying the chain type for the output files. +#' Must be one of \code{"IGH"}, \code{"IGK"}, \code{"IGL"}, \code{"TRA"}, +#' \code{"TRB"}, \code{"TRD"}, or \code{"TRG"}. #' #' @details #' MiXCR expects FASTA files with simple headers containing only the gene name. #' The function filters sequences by gene type (V, D, J, C) based on the gene - #' name pattern and writes separate files for each segment type. #' -#' Output files follow the naming convention: `v-genes..fasta`, -#' `d-genes..fasta`, `j-genes..fasta`, `c-genes..fasta`. 
+#' Output files follow the naming convention: +#' \itemize{ +#' \item \code{v-genes.<chain>.fasta} +#' \item \code{d-genes.<chain>.fasta} +#' \item \code{j-genes.<chain>.fasta} +#' \item \code{c-genes.<chain>.fasta} +#' } +#' +#' This function works with sequences from both \strong{IMGT} (via +#' \code{\link{getIMGT}}) and \strong{OGRDB} (via \code{\link{getOGRDB}}). +#' +#' @return A named list containing the paths to the created files, returned +#' invisibly. The list may contain elements \code{v_genes}, \code{d_genes}, +#' \code{j_genes}, and \code{c_genes} depending on which segment types were +#' found in the input sequences. #' -#' @return A named list containing the paths to the created files, invisibly. #' @export -#' @seealso \url{https://mixcr.com/mixcr/guides/create-custom-library/} +#' @seealso +#' \code{\link{getIMGT}}, \code{\link{getOGRDB}} for obtaining sequences +#' +#' \code{\link{exportTRUST4}}, \code{\link{exportCellRanger}}, +#' \code{\link{exportIgBLAST}} for other export formats +#' +#' \url{https://mixcr.com/mixcr/guides/create-custom-library/} for MiXCR +#' documentation +#' #' @examples #' # Create a small example DNAStringSet #' seqs <- Biostrings::DNAStringSet(c( @@ -52,7 +77,7 @@ exportMiXCR <- function(sequences, output_dir, chain = c("IGH", "IGK", "IGL", "T # Get sequence names seq_names <- names(sequences) if (is.null(seq_names)) { - stop("Sequences must have names following IMGT nomenclature.", call. = FALSE) + stop("Sequences must have names following IG/TR gene nomenclature (e.g., 'IGHV1-2*01').", call. = FALSE) } # Categorize sequences by segment type @@ -108,26 +133,43 @@ exportMiXCR <- function(sequences, output_dir, chain = c("IGH", "IGK", "IGL", "T #' @title Export Reference Sequences to TRUST4 Format -#' @description Exports a DNAStringSet to a FASTA file formatted for use with -#' TRUST4. The output follows the format produced by TRUST4's `BuildImgtAnnot.pl` -#' script.
#' -#' @param sequences A `DNAStringSet` object containing immune receptor sequences. -#' Sequence names should follow IMGT nomenclature (e.g., "IGHV1-2*01"). -#' @param output_file The path to the output FASTA file. -#' @param include_constant Logical. If `TRUE`, include constant region sequences. -#' TRUST4's IMGT+C.fa file includes constant regions. Default is `TRUE`. +#' @description Exports a \code{\link[Biostrings]{DNAStringSet}} to a FASTA file +#' formatted for use with TRUST4. The output follows the format produced by +#' TRUST4's \code{BuildImgtAnnot.pl} script. +#' +#' @param sequences A \code{\link[Biostrings]{DNAStringSet}} object containing +#' immune receptor sequences. Sequence names must follow standard IG/TR gene +#' nomenclature (e.g., \code{"IGHV1-2*01"}). Can be obtained from +#' \code{\link{getIMGT}} or \code{\link{getOGRDB}}. +#' @param output_file Character string specifying the path to the output FASTA +#' file. The parent directory will be created if it does not exist. +#' @param include_constant Logical. If \code{TRUE} (default), include constant +#' region sequences. TRUST4's \code{IMGT+C.fa} file includes constant regions. #' #' @details #' TRUST4 expects FASTA files with headers containing only the allele name -#' (e.g., ">IGHV1-2*01"). The function reformats sequence headers to match -#' the output of TRUST4's `BuildImgtAnnot.pl` script. +#' (e.g., \code{>IGHV1-2*01}). The function reformats sequence headers to match +#' the output of TRUST4's \code{BuildImgtAnnot.pl} script. +#' +#' TRUST4 uses this reference for the \code{--ref} parameter in its analysis +#' pipeline. #' -#' TRUST4 uses this reference for the `--ref` parameter in its analysis pipeline. +#' This function works with sequences from both \strong{IMGT} (via +#' \code{\link{getIMGT}}) and \strong{OGRDB} (via \code{\link{getOGRDB}}). +#' +#' @return Character string with the path to the created file, returned +#' invisibly. 
#' -#' @return The path to the created file, invisibly. #' @export -#' @seealso \url{https://github.com/liulab-dfci/TRUST4} +#' @seealso +#' \code{\link{getIMGT}}, \code{\link{getOGRDB}} for obtaining sequences +#' +#' \code{\link{exportMiXCR}}, \code{\link{exportCellRanger}}, +#' \code{\link{exportIgBLAST}} for other export formats +#' +#' \url{https://github.com/liulab-dfci/TRUST4} for TRUST4 documentation +#' #' @examples #' # Create a small example DNAStringSet #' seqs <- Biostrings::DNAStringSet(c( @@ -153,7 +195,7 @@ exportTRUST4 <- function(sequences, output_file, include_constant = TRUE) { seq_names <- names(sequences) if (is.null(seq_names)) { - stop("Sequences must have names following IMGT nomenclature.", call. = FALSE) + stop("Sequences must have names following IG/TR gene nomenclature (e.g., 'IGHV1-2*01').", call. = FALSE) } # Filter out constant regions if requested @@ -188,28 +230,46 @@ exportTRUST4 <- function(sequences, output_file, include_constant = TRUE) { #' @title Export Reference Sequences to Cell Ranger VDJ Format -#' @description Exports a DNAStringSet to FASTA format suitable for creating -#' a custom Cell Ranger VDJ reference. The function generates a FASTA file -#' with properly formatted headers for use with `cellranger mkvdjref`. #' -#' @param sequences A `DNAStringSet` object containing immune receptor sequences. -#' Sequence names should follow IMGT nomenclature (e.g., "IGHV1-2*01"). -#' @param output_file The path to the output FASTA file. -#' @param gene_type The type of gene region. One of "V", "D", "J", or "C". -#' If NULL (default), the function will attempt to infer the type from -#' sequence names. +#' @description Exports a \code{\link[Biostrings]{DNAStringSet}} to FASTA format +#' suitable for creating a custom Cell Ranger VDJ reference. The function +#' generates a FASTA file with properly formatted headers for use with +#' \code{cellranger mkvdjref}. 
+#' +#' @param sequences A \code{\link[Biostrings]{DNAStringSet}} object containing +#' immune receptor sequences. Sequence names must follow standard IG/TR gene +#' nomenclature (e.g., \code{"IGHV1-2*01"}). Can be obtained from +#' \code{\link{getIMGT}} or \code{\link{getOGRDB}}. +#' @param output_file Character string specifying the path to the output FASTA +#' file. The parent directory will be created if it does not exist. +#' @param gene_type Character string specifying the type of gene region. One of +#' \code{"V"}, \code{"D"}, \code{"J"}, or \code{"C"}. If \code{NULL} (default), +#' the function will attempt to infer the type from sequence names. #' #' @details -#' Cell Ranger's `mkvdjref` command expects FASTA files with specific header -#' formats. This function creates a FASTA file that can be used as input +#' Cell Ranger's \code{mkvdjref} command expects FASTA files with specific +#' header formats. This function creates a FASTA file that can be used as input #' to build a custom VDJ reference. #' -#' Note: For a complete Cell Ranger VDJ reference, you also need a GTF file -#' with gene annotations. This function only creates the FASTA component. +#' \strong{Note:} For a complete Cell Ranger VDJ reference, you also need a GTF +#' file with gene annotations. This function only creates the FASTA component. +#' +#' This function works with sequences from both \strong{IMGT} (via +#' \code{\link{getIMGT}}) and \strong{OGRDB} (via \code{\link{getOGRDB}}). +#' +#' @return Character string with the path to the created file, returned +#' invisibly. #' -#' @return The path to the created file, invisibly. 
#' @export -#' @seealso \url{https://www.10xgenomics.com/support/software/cell-ranger/latest/analysis/inputs/cr-5p-references} +#' @seealso +#' \code{\link{getIMGT}}, \code{\link{getOGRDB}} for obtaining sequences +#' +#' \code{\link{exportMiXCR}}, \code{\link{exportTRUST4}}, +#' \code{\link{exportIgBLAST}} for other export formats +#' +#' \url{https://www.10xgenomics.com/support/software/cell-ranger/latest/analysis/inputs/cr-5p-references} +#' for Cell Ranger documentation +#' #' @examples #' # Create a small example DNAStringSet #' seqs <- Biostrings::DNAStringSet(c( @@ -232,15 +292,15 @@ exportCellRanger <- function(sequences, output_file, gene_type = NULL) { stop("sequences must be a DNAStringSet object.", call. = FALSE) } - seq_names <- names(sequences) - if (is.null(seq_names)) { - stop("Sequences must have names following IMGT nomenclature.", call. = FALSE) - } - if (length(sequences) == 0) { stop("No sequences to export.", call. = FALSE) } + seq_names <- names(sequences) + if (is.null(seq_names)) { + stop("Sequences must have names following IG/TR gene nomenclature (e.g., 'IGHV1-2*01').", call. = FALSE) + } + # Cell Ranger expects clean gene names # Format headers as: >gene_name simple_names <- sub("\\|.*$", "", seq_names) @@ -260,37 +320,60 @@ exportCellRanger <- function(sequences, output_file, gene_type = NULL) { #' @title Export Reference Sequences to IgBLAST Format -#' @description Exports a DNAStringSet to FASTA files formatted for use with -#' IgBLAST. The function creates separate FASTA files for V, D, and J gene -#' segments with simplified headers compatible with IgBLAST's requirements. -#' -#' @param sequences A `DNAStringSet` object containing immune receptor sequences. -#' Sequence names should follow IMGT nomenclature (e.g., "IGHV1-2*01"). -#' @param output_dir The directory where output files will be written. -#' @param organism The organism name for the output files. Used in file naming. -#' Default is "custom". 
-#' @param receptor_type The receptor type. One of "ig" for immunoglobulin or -#' "tcr" for T-cell receptor. Default is "ig". +#' +#' @description Exports a \code{\link[Biostrings]{DNAStringSet}} to FASTA files +#' formatted for use with IgBLAST. The function creates separate FASTA files +#' for V, D, and J gene segments with simplified headers compatible with +#' IgBLAST's requirements. +#' +#' @param sequences A \code{\link[Biostrings]{DNAStringSet}} object containing +#' immune receptor sequences. Sequence names must follow standard IG/TR gene +#' nomenclature (e.g., \code{"IGHV1-2*01"}). Can be obtained from +#' \code{\link{getIMGT}} or \code{\link{getOGRDB}}. +#' @param output_dir Character string specifying the directory where output +#' files will be written. The directory will be created if it does not exist. +#' @param organism Character string specifying the organism name for the output +#' files. Used in file naming. Default is \code{"custom"}. +#' @param receptor_type Character string specifying the receptor type. One of +#' \code{"ig"} for immunoglobulin or \code{"tcr"} for T-cell receptor. Default +#' is \code{"ig"}. #' #' @details #' IgBLAST requires FASTA files with simplified headers containing only the #' gene/allele name. This function mimics the output of IgBLAST's -#' `edit_imgt_file.pl` script, which truncates IMGT headers to keep only +#' \code{edit_imgt_file.pl} script, which truncates IMGT headers to keep only #' the allele designation. #' #' Output files follow the naming convention used by IgBLAST: -#' `__v.fasta`, `__d.fasta`, -#' `__j.fasta`. 
+#' \itemize{ +#' \item \code{<organism>_<receptor_type>_v.fasta} +#' \item \code{<organism>_<receptor_type>_d.fasta} +#' \item \code{<organism>_<receptor_type>_j.fasta} +#' } #' -#' After exporting, use `makeblastdb` with the `-parse_seqids` flag to create -#' the BLAST database: -#' ``` +#' After exporting, use \code{makeblastdb} with the \code{-parse_seqids} flag +#' to create the BLAST database: +#' \preformatted{ #' makeblastdb -parse_seqids -dbtype nucl -in -out -#' ``` +#' } +#' +#' This function works with sequences from both \strong{IMGT} (via +#' \code{\link{getIMGT}}) and \strong{OGRDB} (via \code{\link{getOGRDB}}). +#' +#' @return A named list containing the paths to the created files, returned +#' invisibly. The list may contain elements \code{v_genes}, \code{d_genes}, +#' and \code{j_genes} depending on which segment types were found in the +#' input sequences. #' -#' @return A named list containing the paths to the created files, invisibly. #' @export -#' @seealso \url{https://ncbi.github.io/igblast/} +#' @seealso +#' \code{\link{getIMGT}}, \code{\link{getOGRDB}} for obtaining sequences +#' +#' \code{\link{exportMiXCR}}, \code{\link{exportTRUST4}}, +#' \code{\link{exportCellRanger}} for other export formats +#' +#' \url{https://ncbi.github.io/igblast/} for IgBLAST documentation +#' #' @examples #' # Create a small example DNAStringSet #' seqs <- Biostrings::DNAStringSet(c( @@ -320,7 +403,7 @@ exportIgBLAST <- function(sequences, output_dir, organism = "custom", receptor_t seq_names <- names(sequences) if (is.null(seq_names)) { - stop("Sequences must have names following IMGT nomenclature.", call. = FALSE) + stop("Sequences must have names following IG/TR gene nomenclature (e.g., 'IGHV1-2*01').", call. 
= FALSE) } # IgBLAST's edit_imgt_file.pl simplifies headers to just the allele name diff --git a/R/immReferent-package.R b/R/immReferent-package.R index 8774ba4..cebcc5b 100644 --- a/R/immReferent-package.R +++ b/R/immReferent-package.R @@ -1,36 +1,56 @@ #' immReferent: An Interface for Immune Receptor and HLA Gene Reference Data #' #' @description -#' **immReferent** provides a stable, reproducible, and lightweight interface to -#' reference sequences for immune receptors (TCR/BCR) and HLA genes sourced from -#' IMGT, IPD-IMGT/HLA, and the AIRR-C's OGRDB. It centralizes downloading, -#' caching, and querying of curated nucleotide and protein sequences, and plays a -#' foundational role in computational immunology workflows. +#' \strong{immReferent} provides a stable, reproducible, and lightweight +#' interface to reference sequences for immune receptors (TCR/BCR) and HLA genes +#' sourced from IMGT, IPD-IMGT/HLA, and the AIRR-C's OGRDB. It centralizes +#' downloading, caching, and querying of curated nucleotide and protein +#' sequences, and plays a foundational role in computational immunology +#' workflows. #' #' @details -#' The package is designed as a common reference layer across immunoinformatics tools, -#' ensuring consistent provenance and offline reproducibility via caching. +#' The package is designed as a common reference layer across immunoinformatics +#' tools, ensuring consistent provenance and offline reproducibility via +#' caching. #' -#' **Core functionality** +#' \strong{Core functionality} #' \itemize{ -#' \item Download and parse receptor and HLA sequences from IMGT and OGRDB. -#' \item Local caching to support offline, reproducible analysis. -#' \item Query by gene, allele, species, locus, and sequence type/format. 
+#' \item Download and parse receptor and HLA sequences from IMGT and OGRDB +#' \item Local caching to support offline, reproducible analysis +#' \item Query by gene, allele, species, locus, and sequence type/format +#' \item Export to popular analysis tools (MiXCR, TRUST4, Cell Ranger, IgBLAST) #' \item Interoperability with Bioconductor classes such as -#' \code{Biostrings::DNAStringSet} and \code{Biostrings::AAStringSet}. +#' \code{\link[Biostrings]{DNAStringSet}} and +#' \code{\link[Biostrings]{AAStringSet}} #' } #' -#' **Supported data sources** +#' \strong{Data retrieval functions} #' \itemize{ -#' \item IMGT — The international ImMunoGeneTics information system: -#' \url{https://www.imgt.org/} -#' \item IPD-IMGT/HLA — The HLA Database: -#' \url{https://www.ebi.ac.uk/ipd/imgt/hla/} -#' \item OGRDB — Open Germline Receptor Database (AIRR-C): -#' \url{https://ogrdb.airr-community.org/} +#' \item \code{\link{getIMGT}}: Download sequences from IMGT +#' \item \code{\link{getOGRDB}}: Download sequences from OGRDB +#' \item \code{\link{loadIMGT}}, \code{\link{loadOGRDB}}: Load cached sequences +#' \item \code{\link{refreshIMGT}}, \code{\link{refreshOGRDB}}: Force re-download #' } #' -##' \strong{Getting started} +#' \strong{Export functions} +#' \itemize{ +#' \item \code{\link{exportMiXCR}}: Export for MiXCR analysis +#' \item \code{\link{exportTRUST4}}: Export for TRUST4 analysis +#' \item \code{\link{exportCellRanger}}: Export for 10x Cell Ranger VDJ +#' \item \code{\link{exportIgBLAST}}: Export for IgBLAST analysis +#' } +#' +#' \strong{Supported data sources} +#' \itemize{ +#' \item IMGT: The international ImMunoGeneTics information system +#' (\url{https://www.imgt.org/}) +#' \item IPD-IMGT/HLA: The HLA Database +#' (\url{https://www.ebi.ac.uk/ipd/imgt/hla/}) +#' \item OGRDB: Open Germline Receptor Database (AIRR-C) +#' (\url{https://ogrdb.airr-community.org/}) +#' } +#' +#' \strong{Getting started} #' \preformatted{ #' browseVignettes("immReferent") #' } @@ 
-43,12 +63,10 @@ #' restricted per IMGT policy. Always review the current licensing and citation #' requirements of each resource prior to use. #' -#'#' @seealso -#' \url{https://github.com/BorchLab/immReferent} \cr -#' \url{https://github.com/BorchLab/Ibex/immReferent} -#' +#' @seealso +#' \url{https://github.com/BorchLab/immReferent} +#' #' @keywords package -#' @md #' @name immReferent-package #' @aliases immReferent immReferent-package #' @docType package diff --git a/R/mainIMGT.R b/R/mainIMGT.R index eda5d7e..b6dbbf6 100644 --- a/R/mainIMGT.R +++ b/R/mainIMGT.R @@ -14,37 +14,56 @@ } #' @title Download and Load Immune Receptor and HLA Sequences from IMGT -#' @description This is the main function to download and load reference sequences from IMGT -#' and the IPD-IMGT/HLA database. It handles caching of downloaded files. -#' -#' @param species The species for which to download data. Required for TCR/BCR queries. -#' Currently supported: "human", "mouse", "rat", "rabbit", "pig", "dog", "rhesus_monkey", "cyno monkey". Defaults to "human" for HLA. -#' @param gene The gene or locus to download. For TCR/BCR, this can be a specific -#' chain (e.g., "IGHV", "TRBJ") or a group (e.g., "IGH", "TCR"). For HLA, use "HLA". -#' @param type The type of sequence to retrieve. Either "NUC" for nucleotide or -#' "PROT" for protein sequences. This primarily distinguishes between VDJ nucleotide -#' and V-region amino acid sequences for TCR/BCR genes. -#' @param refresh Logical. If `TRUE`, forces a re-download of the data even if it -#' exists in the cache. -#' @param suppressMessages Logical. If `TRUE`, suppresses the license and other -#' informational messages. -#' -#' @return A `DNAStringSet` or `AAStringSet` object containing the requested sequences. +#' +#' @description This is the main function to download and load reference +#' sequences from IMGT and the IPD-IMGT/HLA database. It handles caching of +#' downloaded files. 
+#' +#' @param species Character string specifying the species for which to download +#' data. Required for TCR/BCR queries. Currently supported species: +#' \code{"human"}, \code{"mouse"}, \code{"rat"}, \code{"rabbit"}, \code{"pig"}, +#' \code{"dog"}, \code{"rhesus_monkey"}, \code{"cyno_monkey"}. Defaults to +#' \code{"human"} for HLA queries. +#' @param gene Character string specifying the gene or locus to download. For +#' TCR/BCR, this can be a specific chain (e.g., \code{"IGHV"}, \code{"TRBJ"}) +#' or a group (e.g., \code{"IGH"}, \code{"TCR"}). For HLA, use \code{"HLA"}. +#' @param type Character string specifying the type of sequence to retrieve. +#' Either \code{"NUC"} for nucleotide or \code{"PROT"} for protein sequences. +#' This primarily distinguishes between VDJ nucleotide and V-region amino acid +#' sequences for TCR/BCR genes. +#' @param refresh Logical. If \code{TRUE}, forces a re-download of the data even +#' if it exists in the cache. Default is \code{FALSE}. +#' @param suppressMessages Logical. If \code{TRUE}, suppresses the license and +#' other informational messages. Default is \code{FALSE}. +#' +#' @return A \code{\link[Biostrings]{DNAStringSet}} object (when +#' \code{type = "NUC"}) or \code{\link[Biostrings]{AAStringSet}} object (when +#' \code{type = "PROT"}) containing the requested sequences. 
+#' #' @export +#' @seealso +#' \code{\link{loadIMGT}}, \code{\link{refreshIMGT}} for convenience wrappers +#' +#' \code{\link{getOGRDB}} for OGRDB/AIRR-C germline sequences +#' +#' \code{\link{exportMiXCR}}, \code{\link{exportTRUST4}}, +#' \code{\link{exportCellRanger}}, \code{\link{exportIgBLAST}} for exporting +#' sequences to analysis tools +#' #' @examples #' if(is_imgt_available()) { #' # Download human IGHV nucleotide sequences -#' ighv_nuc <- getIMGT(species = "human", -#' gene = "IGHV", +#' ighv_nuc <- getIMGT(species = "human", +#' gene = "IGHV", #' type = "NUC") #' #' # Download all HLA protein sequences -#' hla_prot <- getIMGT(gene = "HLA", +#' hla_prot <- getIMGT(gene = "HLA", #' type = "PROT") #' #' # Download all mouse TRB genes -#' trb_mouse <- getIMGT(species = "mouse", -#' gene = "TRB", +#' trb_mouse <- getIMGT(species = "mouse", +#' gene = "TRB", #' type = "NUC") #' } #' @@ -144,13 +163,24 @@ getIMGT <- function(species = "human", #' @title Load Cached IMGT/HLA Sequences -#' @description Loads sequences from the local cache without attempting to download. -#' This function relies on `getIMGT(refresh = FALSE)`. If the data is not found -#' in the cache, it will be downloaded unless an internet connection is unavailable. +#' +#' @description Loads sequences from the local cache without forcing a +#' re-download. This function is a convenience wrapper for +#' \code{getIMGT(refresh = FALSE)}. If the data is not found in the cache, it +#' will be downloaded unless an internet connection is unavailable. #' #' @inheritParams getIMGT -#' @return A `DNAStringSet` or `AAStringSet` object. +#' +#' @return A \code{\link[Biostrings]{DNAStringSet}} object (when +#' \code{type = "NUC"}) or \code{\link[Biostrings]{AAStringSet}} object (when +#' \code{type = "PROT"}) containing the requested sequences. 
+#' #' @export +#' @seealso +#' \code{\link{getIMGT}} for the main download function +#' +#' \code{\link{refreshIMGT}} to force re-download +#' #' @examples #' if(is_imgt_available()) { #' # First, download a file to ensure it's in the cache @@ -170,12 +200,23 @@ loadIMGT <- function(species = "human", #' @title Force Re-download of IMGT/HLA Sequences -#' @description A convenience wrapper for `getIMGT(..., refresh = TRUE)` to ensure that -#' the local cache is updated with the latest versions of the requested sequences. +#' +#' @description A convenience wrapper for \code{getIMGT(..., refresh = TRUE)} to +#' ensure that the local cache is updated with the latest versions of the +#' requested sequences. #' #' @inheritParams getIMGT -#' @return A `DNAStringSet` or `AAStringSet` object. +#' +#' @return A \code{\link[Biostrings]{DNAStringSet}} object (when +#' \code{type = "NUC"}) or \code{\link[Biostrings]{AAStringSet}} object (when +#' \code{type = "PROT"}) containing the requested sequences. +#' #' @export +#' @seealso +#' \code{\link{getIMGT}} for the main download function +#' +#' \code{\link{loadIMGT}} to load from cache without downloading +#' #' @examples #' if(is_imgt_available()) { #' # Force a re-download of human IGHV protein sequences @@ -194,10 +235,20 @@ refreshIMGT <- function(species = "human", #' @title List Datasets in the Local Cache -#' @description Scans the cache directory and returns a list of available datasets. #' -#' @return A character vector of file paths for the cached datasets. +#' @description Scans the cache directory and returns a list of available +#' datasets that have been downloaded. +#' +#' @return A character vector of absolute file paths for the cached datasets. +#' Returns an empty character vector if the cache directory does not exist or +#' contains no files. 
+#' #' @export +#' @seealso +#' \code{\link{getIMGT}} for downloading sequences +#' +#' \code{\link{listOGRDB}} for listing OGRDB cached files +#' #' @examples #' # List all files in the cache #' cached_files <- listIMGT() diff --git a/R/mainOGRDB.R b/R/mainOGRDB.R index 09bbd4a..3489e4f 100644 --- a/R/mainOGRDB.R +++ b/R/mainOGRDB.R @@ -77,29 +77,66 @@ dest_file } -#' @title Download and Load Immune Receptor Germline Sequences from OGRDB (AIRR) -#' -#' @description Downloads AIRR-compliant germline sets (or FASTA) from OGRDB and -#' returns sequences as `DNAStringSet` (NUC) or attempts AA translation for V -#' genes. -#' -#' @param species Species string. Accepts "human"/"Homo sapiens"/"mouse"/"Mus -#' musculus". -#' @param locus Either a locus short code ("IGH","IGK","IGL", etc.) OR NULL if -#' you pass a `set_name` explicitly. -#' @param set_name Optional explicit OGRDB set name (e.g., "IGH_VDJ"). If -#' provided, overrides `locus`. -#' @param type "NUC" (default) or "PROT". PROT will translate V-gene CDS; only -#' supported for FASTA or AIRR records that include a valid CDS. -#' @param format "FASTA_GAPPED", "FASTA_UNGAPPED", or "AIRR". Default -#' "FASTA_GAPPED". -#' @param version "published" (default), "latest", or a specific revision -#' number as character/number. -#' @param species_subgroup Optional subgroup (e.g., a mouse strain like -#' "C57BL/6"). If it contains '/', OGRDB requires it encoded as "\%252f". -#' @param refresh If TRUE, redownload even if cached. -#' @param suppressMessages If TRUE, be quiet. -#' +#' @title Download and Load Immune Receptor Germline Sequences from OGRDB +#' +#' @description Downloads AIRR-compliant germline sets (or FASTA) from OGRDB +#' (Open Germline Receptor Database) and returns sequences as a +#' \code{\link[Biostrings]{DNAStringSet}} or +#' \code{\link[Biostrings]{AAStringSet}}. +#' +#' @param species Character string specifying the species. 
Accepts +#' \code{"human"}, \code{"Homo sapiens"}, \code{"mouse"}, or +#' \code{"Mus musculus"}. Default is \code{"human"}. +#' @param locus Character string specifying the locus short code. One of +#' \code{"IGH"}, \code{"IGK"}, or \code{"IGL"}. Can be \code{NULL} if you pass +#' a \code{set_name} explicitly. +#' @param set_name Optional character string specifying an explicit OGRDB set +#' name (e.g., \code{"IGH_VDJ"}). If provided, overrides \code{locus}. +#' @param type Character string specifying the sequence type. Either +#' \code{"NUC"} (default) for nucleotide or \code{"PROT"} for protein. +#' \code{"PROT"} will translate V-gene CDS; only supported for FASTA or AIRR +#' records that include a valid CDS. +#' @param format Character string specifying the download format. One of +#' \code{"FASTA_GAPPED"} (default), \code{"FASTA_UNGAPPED"}, or \code{"AIRR"}. +#' @param version Character string specifying the version. Either +#' \code{"published"} (default) or \code{"latest"}. +#' @param species_subgroup Optional character string specifying a subgroup +#' (e.g., a mouse strain like \code{"C57BL/6"}). If it contains \code{/}, +#' OGRDB requires it encoded as \code{\%252f}. +#' @param refresh Logical. If \code{TRUE}, forces re-download even if cached. +#' Default is \code{FALSE}. +#' @param suppressMessages Logical. If \code{TRUE}, suppresses informational +#' messages. Default is \code{FALSE}. +#' +#' @details +#' OGRDB (Open Germline Receptor Database) is the AIRR Community's curated +#' repository of germline receptor sequences. It complements IMGT with +#' additional species support and standardized AIRR JSON format. 
+#' +#' The function supports multiple download formats: +#' \itemize{ +#' \item \code{FASTA_GAPPED}: FASTA with IMGT gaps preserved +#' \item \code{FASTA_UNGAPPED}: FASTA without gaps +#' \item \code{AIRR}: AIRR-C compliant JSON format +#' } +#' +#' @return A \code{\link[Biostrings]{DNAStringSet}} object (when +#' \code{type = "NUC"}) or \code{\link[Biostrings]{AAStringSet}} object (when +#' \code{type = "PROT"}) containing the requested sequences. +#' +#' @export +#' @importFrom jsonlite fromJSON +#' @seealso +#' \code{\link{loadOGRDB}}, \code{\link{refreshOGRDB}} for convenience wrappers +#' +#' \code{\link{getIMGT}} for IMGT sequences +#' +#' \code{\link{exportMiXCR}}, \code{\link{exportTRUST4}}, +#' \code{\link{exportCellRanger}}, \code{\link{exportIgBLAST}} for exporting +#' sequences to analysis tools +#' +#' \url{https://ogrdb.airr-community.org/} for OGRDB documentation +#' #' @examples #' if (is_ogrdb_available()) { #' # Download human IGH nucleotide sequences (gapped FASTA) @@ -126,11 +163,6 @@ #' type = "NUC", #' format = "FASTA_GAPPED") #' } -#' -#' @return `DNAStringSet` for NUC; if `type="PROT"`, returns `AAStringSet` -#' where possible. -#' @importFrom jsonlite fromJSON -#' @export getOGRDB <- function(species = "human", locus = c("IGH","IGK","IGL"), set_name = NULL, @@ -252,13 +284,25 @@ getOGRDB <- function(species = "human", `%||%` <- function(x, y) if (is.null(x)) y else x #' @title Load Cached OGRDB Sequences +#' +#' @description Loads sequences from the local cache without forcing a +#' re-download. This function is a convenience wrapper for +#' \code{getOGRDB(refresh = FALSE)}. If the data is not found in the cache, it +#' will be downloaded unless an internet connection is unavailable. +#' #' @inheritParams getOGRDB -#' @description Loads sequences from the local cache without attempting to download. -#' This function relies on `getOGRDB(refresh = FALSE)`. 
If the data is not found -#' in the cache, it will be downloaded unless an internet connection is unavailable. -#' +#' +#' @return A \code{\link[Biostrings]{DNAStringSet}} object (when +#' \code{type = "NUC"}) or \code{\link[Biostrings]{AAStringSet}} object (when +#' \code{type = "PROT"}) containing the requested sequences. +#' +#' @export +#' @seealso +#' \code{\link{getOGRDB}} for the main download function +#' +#' \code{\link{refreshOGRDB}} to force re-download +#' #' @examples -#' #' if (is_ogrdb_available()) { #' # First, ensure the file is cached #' getOGRDB(species = "human", locus = "IGH", @@ -271,10 +315,6 @@ getOGRDB <- function(species = "human", #' type = "NUC", #' format = "FASTA_GAPPED") #' } -#' @return The same object type as \code{getOGRDB()}: a \code{DNAStringSet} -#' (when \code{type = "NUC"}) or an \code{AAStringSet} (when \code{type = "PROT"}), -#' loaded from the local cache if present (and downloaded on first use if needed). -#' @export loadOGRDB <- function(species = "human", locus = c("IGH","IGK","IGL"), set_name = NULL, type = c("NUC","PROT"), format = c("FASTA_GAPPED","FASTA_UNGAPPED","AIRR"), @@ -285,11 +325,23 @@ loadOGRDB <- function(species = "human", locus = c("IGH","IGK","IGL"), } #' @title Force Re-download of OGRDB Sequences -#' @description A convenience wrapper for `getOGRDB(..., refresh = TRUE)` to -#' ensure that the local cache is updated with the latest versions of the +#' +#' @description A convenience wrapper for \code{getOGRDB(..., refresh = TRUE)} +#' to ensure that the local cache is updated with the latest versions of the #' requested sequences. -# +#' #' @inheritParams getOGRDB +#' +#' @return A \code{\link[Biostrings]{DNAStringSet}} object (when +#' \code{type = "NUC"}) or \code{\link[Biostrings]{AAStringSet}} object (when +#' \code{type = "PROT"}) containing the requested sequences. 
+#' +#' @export +#' @seealso +#' \code{\link{getOGRDB}} for the main download function +#' +#' \code{\link{loadOGRDB}} to load from cache without downloading +#' #' @examples #' if (is_ogrdb_available()) { #' # Force a re-download of the human IGK sequences @@ -298,10 +350,6 @@ loadOGRDB <- function(species = "human", locus = c("IGH","IGK","IGL"), #' type = "NUC", #' format = "FASTA_GAPPED") #' } -#' @export -#' @return The same object type as \code{getOGRDB()}: a \code{DNAStringSet} -#' (when \code{type = "NUC"}) or an \code{AAStringSet} (when \code{type = "PROT"}), -#' after forcing a re-download to refresh the local cache. refreshOGRDB <- function(species = "human", locus = c("IGH","IGK","IGL"), set_name = NULL, @@ -315,18 +363,25 @@ refreshOGRDB <- function(species = "human", } #' @title List OGRDB Datasets in Local Cache -#' @description Scans the cache directory and returns a list of available datasets. -#' -#' @examples -#' if (is_ogrdb_available()) { -#' # List cached OGRDB files -#' cached_files <- listOGRDB() -#' head(cached_files) -#' } +#' +#' @description Scans the cache directory and returns a list of available OGRDB +#' datasets that have been downloaded. +#' +#' @return A character vector of absolute file paths to cached OGRDB files. +#' Returns an empty character vector if no OGRDB files have been cached. Paths +#' are typically under the package cache directory (e.g., +#' \code{~/.immReferent//ogrdb/}). +#' #' @export -#' @return A character vector of absolute file paths to cached OGRDB files -#' (length zero if none). Paths are typically under the package cache directory -#' (e.g., \code{file.path(.get_cache_dir(), "", "ogrdb")}). 
+#' @seealso +#' \code{\link{getOGRDB}} for downloading sequences +#' +#' \code{\link{listIMGT}} for listing IMGT cached files +#' +#' @examples +#' # List cached OGRDB files +#' cached_files <- listOGRDB() +#' head(cached_files) listOGRDB <- function() { cache_dir <- .get_cache_dir() if (!dir.exists(cache_dir)) return(character(0)) diff --git a/R/utils.R b/R/utils.R index cf36091..97441cb 100644 --- a/R/utils.R +++ b/R/utils.R @@ -1,11 +1,21 @@ #' @title Check if IMGT Website is Available -#' @description This function sends a lightweight HEAD request to the main IMGT page to -#' check if the service is online and accessible. -#' @return A logical value: `TRUE` if the IMGT website is accessible, `FALSE` otherwise. +#' +#' @description Sends a lightweight HEAD request to the main IMGT page to check +#' if the service is online and accessible. This function is used to +#' conditionally run examples and tests that require an internet connection. +#' +#' @return A logical value: \code{TRUE} if the IMGT website is accessible, +#' \code{FALSE} otherwise. +#' #' @export #' @importFrom httr HEAD status_code +#' @seealso +#' \code{\link{is_ogrdb_available}} for checking OGRDB availability +#' +#' \code{\link{getIMGT}} which uses this function +#' #' @examples -#' is_imgt_available() +#' is_imgt_available() is_imgt_available <- function() { tryCatch({ # Use httr::HEAD for a lightweight request and set a timeout @@ -17,13 +27,23 @@ is_imgt_available <- function() { } #' @title Check if OGRDB Website is Available -#' @description This function sends a lightweight HEAD request to the main OGRDB page to -#' check if the service is online and accessible. -#' @return A logical value: `TRUE` if the IMGT website is accessible, `FALSE` otherwise. +#' +#' @description Sends a lightweight HEAD request to the OGRDB API to check if +#' the service is online and accessible. This function is used to conditionally +#' run examples and tests that require an internet connection. 
+#' +#' @return A logical value: \code{TRUE} if the OGRDB website is accessible, +#' \code{FALSE} otherwise. +#' #' @export #' @importFrom httr HEAD status_code +#' @seealso +#' \code{\link{is_imgt_available}} for checking IMGT availability +#' +#' \code{\link{getOGRDB}} which uses this function +#' #' @examples -#' is_imgt_available() +#' is_ogrdb_available() is_ogrdb_available <- function() { tryCatch({ response <- httr::HEAD("https://ogrdb.airr-community.org/api", httr::timeout(2)) diff --git a/man/exportCellRanger.Rd b/man/exportCellRanger.Rd index f26ee3e..fe8987b 100644 --- a/man/exportCellRanger.Rd +++ b/man/exportCellRanger.Rd @@ -7,30 +7,38 @@ exportCellRanger(sequences, output_file, gene_type = NULL) } \arguments{ -\item{sequences}{A `DNAStringSet` object containing immune receptor sequences. -Sequence names should follow IMGT nomenclature (e.g., "IGHV1-2*01").} +\item{sequences}{A \code{\link[Biostrings]{DNAStringSet}} object containing +immune receptor sequences. Sequence names must follow standard IG/TR gene +nomenclature (e.g., \code{"IGHV1-2*01"}). Can be obtained from +\code{\link{getIMGT}} or \code{\link{getOGRDB}}.} -\item{output_file}{The path to the output FASTA file.} +\item{output_file}{Character string specifying the path to the output FASTA +file. The parent directory will be created if it does not exist.} -\item{gene_type}{The type of gene region. One of "V", "D", "J", or "C". -If NULL (default), the function will attempt to infer the type from -sequence names.} +\item{gene_type}{Character string specifying the type of gene region. One of +\code{"V"}, \code{"D"}, \code{"J"}, or \code{"C"}. If \code{NULL} (default), +the function will attempt to infer the type from sequence names.} } \value{ -The path to the created file, invisibly. +Character string with the path to the created file, returned + invisibly. } \description{ -Exports a DNAStringSet to FASTA format suitable for creating -a custom Cell Ranger VDJ reference. 
The function generates a FASTA file -with properly formatted headers for use with `cellranger mkvdjref`. +Exports a \code{\link[Biostrings]{DNAStringSet}} to FASTA format +suitable for creating a custom Cell Ranger VDJ reference. The function +generates a FASTA file with properly formatted headers for use with +\code{cellranger mkvdjref}. } \details{ -Cell Ranger's `mkvdjref` command expects FASTA files with specific header -formats. This function creates a FASTA file that can be used as input +Cell Ranger's \code{mkvdjref} command expects FASTA files with specific +header formats. This function creates a FASTA file that can be used as input to build a custom VDJ reference. -Note: For a complete Cell Ranger VDJ reference, you also need a GTF file -with gene annotations. This function only creates the FASTA component. +\strong{Note:} For a complete Cell Ranger VDJ reference, you also need a GTF +file with gene annotations. This function only creates the FASTA component. + +This function works with sequences from both \strong{IMGT} (via +\code{\link{getIMGT}}) and \strong{OGRDB} (via \code{\link{getOGRDB}}). } \examples{ # Create a small example DNAStringSet @@ -51,5 +59,11 @@ cat(readLines(output_file), sep = "\n") unlink(output_file) } \seealso{ +\code{\link{getIMGT}}, \code{\link{getOGRDB}} for obtaining sequences + +\code{\link{exportMiXCR}}, \code{\link{exportTRUST4}}, +\code{\link{exportIgBLAST}} for other export formats + \url{https://www.10xgenomics.com/support/software/cell-ranger/latest/analysis/inputs/cr-5p-references} +for Cell Ranger documentation } diff --git a/man/exportIgBLAST.Rd b/man/exportIgBLAST.Rd index d330a74..3a431bd 100644 --- a/man/exportIgBLAST.Rd +++ b/man/exportIgBLAST.Rd @@ -12,40 +12,54 @@ exportIgBLAST( ) } \arguments{ -\item{sequences}{A `DNAStringSet` object containing immune receptor sequences. 
-Sequence names should follow IMGT nomenclature (e.g., "IGHV1-2*01").} +\item{sequences}{A \code{\link[Biostrings]{DNAStringSet}} object containing +immune receptor sequences. Sequence names must follow standard IG/TR gene +nomenclature (e.g., \code{"IGHV1-2*01"}). Can be obtained from +\code{\link{getIMGT}} or \code{\link{getOGRDB}}.} -\item{output_dir}{The directory where output files will be written.} +\item{output_dir}{Character string specifying the directory where output +files will be written. The directory will be created if it does not exist.} -\item{organism}{The organism name for the output files. Used in file naming. -Default is "custom".} +\item{organism}{Character string specifying the organism name for the output +files. Used in file naming. Default is \code{"custom"}.} -\item{receptor_type}{The receptor type. One of "ig" for immunoglobulin or -"tcr" for T-cell receptor. Default is "ig".} +\item{receptor_type}{Character string specifying the receptor type. One of +\code{"ig"} for immunoglobulin or \code{"tcr"} for T-cell receptor. Default +is \code{"ig"}.} } \value{ -A named list containing the paths to the created files, invisibly. +A named list containing the paths to the created files, returned + invisibly. The list may contain elements \code{v_genes}, \code{d_genes}, + and \code{j_genes} depending on which segment types were found in the + input sequences. } \description{ -Exports a DNAStringSet to FASTA files formatted for use with -IgBLAST. The function creates separate FASTA files for V, D, and J gene -segments with simplified headers compatible with IgBLAST's requirements. +Exports a \code{\link[Biostrings]{DNAStringSet}} to FASTA files +formatted for use with IgBLAST. The function creates separate FASTA files +for V, D, and J gene segments with simplified headers compatible with +IgBLAST's requirements. } \details{ IgBLAST requires FASTA files with simplified headers containing only the gene/allele name. 
This function mimics the output of IgBLAST's -`edit_imgt_file.pl` script, which truncates IMGT headers to keep only +\code{edit_imgt_file.pl} script, which truncates IMGT headers to keep only the allele designation. Output files follow the naming convention used by IgBLAST: -`<organism>_<receptor_type>_v.fasta`, `<organism>_<receptor_type>_d.fasta`, -`<organism>_<receptor_type>_j.fasta`. +\itemize{ + \item \code{<organism>_<receptor_type>_v.fasta} + \item \code{<organism>_<receptor_type>_d.fasta} + \item \code{<organism>_<receptor_type>_j.fasta} +} -After exporting, use `makeblastdb` with the `-parse_seqids` flag to create -the BLAST database: -``` +After exporting, use \code{makeblastdb} with the \code{-parse_seqids} flag +to create the BLAST database: +\preformatted{ makeblastdb -parse_seqids -dbtype nucl -in <input_fasta> -out <database_name> -``` +} + +This function works with sequences from both \strong{IMGT} (via +\code{\link{getIMGT}}) and \strong{OGRDB} (via \code{\link{getOGRDB}}). } \examples{ # Create a small example DNAStringSet @@ -65,5 +79,10 @@ print(files) unlink(unlist(files)) } \seealso{ -\url{https://ncbi.github.io/igblast/} +\code{\link{getIMGT}}, \code{\link{getOGRDB}} for obtaining sequences + +\code{\link{exportMiXCR}}, \code{\link{exportTRUST4}}, +\code{\link{exportCellRanger}} for other export formats + +\url{https://ncbi.github.io/igblast/} for IgBLAST documentation } diff --git a/man/exportMiXCR.Rd b/man/exportMiXCR.Rd index 8246ee0..becacdb 100644 --- a/man/exportMiXCR.Rd +++ b/man/exportMiXCR.Rd @@ -11,30 +11,46 @@ exportMiXCR( ) } \arguments{ -\item{sequences}{A `DNAStringSet` or `AAStringSet` object containing immune -receptor sequences. Sequence names should follow IMGT nomenclature -(e.g., "IGHV1-2*01", "TRBJ2-1*01").} +\item{sequences}{A \code{\link[Biostrings]{DNAStringSet}} or +\code{\link[Biostrings]{AAStringSet}} object containing immune receptor +sequences. Sequence names must follow standard IG/TR gene nomenclature +(e.g., \code{"IGHV1-2*01"}, \code{"TRBJ2-1*01"}). 
Can be obtained from +\code{\link{getIMGT}} or \code{\link{getOGRDB}}.} -\item{output_dir}{The directory where output files will be written.} +\item{output_dir}{Character string specifying the directory where output +files will be written. The directory will be created if it does not exist.} -\item{chain}{The chain type for the output files. One of "IGH", "IGK", "IGL", -"TRA", "TRB", "TRD", or "TRG".} +\item{chain}{Character string specifying the chain type for the output files. +Must be one of \code{"IGH"}, \code{"IGK"}, \code{"IGL"}, \code{"TRA"}, +\code{"TRB"}, \code{"TRD"}, or \code{"TRG"}.} } \value{ -A named list containing the paths to the created files, invisibly. +A named list containing the paths to the created files, returned + invisibly. The list may contain elements \code{v_genes}, \code{d_genes}, + \code{j_genes}, and \code{c_genes} depending on which segment types were + found in the input sequences. } \description{ -Exports a DNAStringSet or AAStringSet to FASTA files formatted -for use with MiXCR's `buildLibrary` command. The function creates separate -FASTA files for V, D, J, and C gene segments. +Exports a \code{\link[Biostrings]{DNAStringSet}} or +\code{\link[Biostrings]{AAStringSet}} to FASTA files formatted for use with +MiXCR's \code{buildLibrary} command. The function creates separate FASTA +files for V, D, J, and C gene segments. } \details{ MiXCR expects FASTA files with simple headers containing only the gene name. The function filters sequences by gene type (V, D, J, C) based on the gene name pattern and writes separate files for each segment type. -Output files follow the naming convention: `v-genes.<chain>.fasta`, -`d-genes.<chain>.fasta`, `j-genes.<chain>.fasta`, `c-genes.<chain>.fasta`. 
+Output files follow the naming convention: +\itemize{ + \item \code{v-genes.<chain>.fasta} + \item \code{d-genes.<chain>.fasta} + \item \code{j-genes.<chain>.fasta} + \item \code{c-genes.<chain>.fasta} +} + +This function works with sequences from both \strong{IMGT} (via +\code{\link{getIMGT}}) and \strong{OGRDB} (via \code{\link{getOGRDB}}). } \examples{ # Create a small example DNAStringSet @@ -55,5 +71,11 @@ print(files) unlink(unlist(files)) } \seealso{ -\url{https://mixcr.com/mixcr/guides/create-custom-library/} +\code{\link{getIMGT}}, \code{\link{getOGRDB}} for obtaining sequences + +\code{\link{exportTRUST4}}, \code{\link{exportCellRanger}}, +\code{\link{exportIgBLAST}} for other export formats + +\url{https://mixcr.com/mixcr/guides/create-custom-library/} for MiXCR +documentation } diff --git a/man/exportTRUST4.Rd b/man/exportTRUST4.Rd index 21eb834..cd35d11 100644 --- a/man/exportTRUST4.Rd +++ b/man/exportTRUST4.Rd @@ -7,28 +7,36 @@ exportTRUST4(sequences, output_file, include_constant = TRUE) } \arguments{ -\item{sequences}{A `DNAStringSet` object containing immune receptor sequences. -Sequence names should follow IMGT nomenclature (e.g., "IGHV1-2*01").} +\item{sequences}{A \code{\link[Biostrings]{DNAStringSet}} object containing +immune receptor sequences. Sequence names must follow standard IG/TR gene +nomenclature (e.g., \code{"IGHV1-2*01"}). Can be obtained from +\code{\link{getIMGT}} or \code{\link{getOGRDB}}.} -\item{output_file}{The path to the output FASTA file.} +\item{output_file}{Character string specifying the path to the output FASTA +file. The parent directory will be created if it does not exist.} -\item{include_constant}{Logical. If `TRUE`, include constant region sequences. -TRUST4's IMGT+C.fa file includes constant regions. Default is `TRUE`.} +\item{include_constant}{Logical. If \code{TRUE} (default), include constant +region sequences. TRUST4's \code{IMGT+C.fa} file includes constant regions.} } \value{ -The path to the created file, invisibly. 
+Character string with the path to the created file, returned + invisibly. } \description{ -Exports a DNAStringSet to a FASTA file formatted for use with -TRUST4. The output follows the format produced by TRUST4's `BuildImgtAnnot.pl` -script. +Exports a \code{\link[Biostrings]{DNAStringSet}} to a FASTA file +formatted for use with TRUST4. The output follows the format produced by +TRUST4's \code{BuildImgtAnnot.pl} script. } \details{ TRUST4 expects FASTA files with headers containing only the allele name -(e.g., ">IGHV1-2*01"). The function reformats sequence headers to match -the output of TRUST4's `BuildImgtAnnot.pl` script. +(e.g., \code{>IGHV1-2*01}). The function reformats sequence headers to match +the output of TRUST4's \code{BuildImgtAnnot.pl} script. -TRUST4 uses this reference for the `--ref` parameter in its analysis pipeline. +TRUST4 uses this reference for the \code{--ref} parameter in its analysis +pipeline. + +This function works with sequences from both \strong{IMGT} (via +\code{\link{getIMGT}}) and \strong{OGRDB} (via \code{\link{getOGRDB}}). } \examples{ # Create a small example DNAStringSet @@ -50,5 +58,10 @@ cat(readLines(output_file), sep = "\n") unlink(output_file) } \seealso{ -\url{https://github.com/liulab-dfci/TRUST4} +\code{\link{getIMGT}}, \code{\link{getOGRDB}} for obtaining sequences + +\code{\link{exportMiXCR}}, \code{\link{exportCellRanger}}, +\code{\link{exportIgBLAST}} for other export formats + +\url{https://github.com/liulab-dfci/TRUST4} for TRUST4 documentation } diff --git a/man/getIMGT.Rd b/man/getIMGT.Rd index b384779..9cf29f0 100644 --- a/man/getIMGT.Rd +++ b/man/getIMGT.Rd @@ -13,44 +13,61 @@ getIMGT( ) } \arguments{ -\item{species}{The species for which to download data. Required for TCR/BCR queries. -Currently supported: "human", "mouse", "rat", "rabbit", "pig", "dog", "rhesus_monkey", "cyno monkey". Defaults to "human" for HLA.} +\item{species}{Character string specifying the species for which to download +data. 
Required for TCR/BCR queries. Currently supported species: +\code{"human"}, \code{"mouse"}, \code{"rat"}, \code{"rabbit"}, \code{"pig"}, +\code{"dog"}, \code{"rhesus_monkey"}, \code{"cyno_monkey"}. Defaults to +\code{"human"} for HLA queries.} -\item{gene}{The gene or locus to download. For TCR/BCR, this can be a specific -chain (e.g., "IGHV", "TRBJ") or a group (e.g., "IGH", "TCR"). For HLA, use "HLA".} +\item{gene}{Character string specifying the gene or locus to download. For +TCR/BCR, this can be a specific chain (e.g., \code{"IGHV"}, \code{"TRBJ"}) +or a group (e.g., \code{"IGH"}, \code{"TCR"}). For HLA, use \code{"HLA"}.} -\item{type}{The type of sequence to retrieve. Either "NUC" for nucleotide or -"PROT" for protein sequences. This primarily distinguishes between VDJ nucleotide -and V-region amino acid sequences for TCR/BCR genes.} +\item{type}{Character string specifying the type of sequence to retrieve. +Either \code{"NUC"} for nucleotide or \code{"PROT"} for protein sequences. +This primarily distinguishes between VDJ nucleotide and V-region amino acid +sequences for TCR/BCR genes.} -\item{refresh}{Logical. If `TRUE`, forces a re-download of the data even if it -exists in the cache.} +\item{refresh}{Logical. If \code{TRUE}, forces a re-download of the data even +if it exists in the cache. Default is \code{FALSE}.} -\item{suppressMessages}{Logical. If `TRUE`, suppresses the license and other -informational messages.} +\item{suppressMessages}{Logical. If \code{TRUE}, suppresses the license and +other informational messages. Default is \code{FALSE}.} } \value{ -A `DNAStringSet` or `AAStringSet` object containing the requested sequences. +A \code{\link[Biostrings]{DNAStringSet}} object (when + \code{type = "NUC"}) or \code{\link[Biostrings]{AAStringSet}} object (when + \code{type = "PROT"}) containing the requested sequences. } \description{ -This is the main function to download and load reference sequences from IMGT -and the IPD-IMGT/HLA database. 
It handles caching of downloaded files. +This is the main function to download and load reference +sequences from IMGT and the IPD-IMGT/HLA database. It handles caching of +downloaded files. } \examples{ if(is_imgt_available()) { # Download human IGHV nucleotide sequences - ighv_nuc <- getIMGT(species = "human", - gene = "IGHV", + ighv_nuc <- getIMGT(species = "human", + gene = "IGHV", type = "NUC") # Download all HLA protein sequences - hla_prot <- getIMGT(gene = "HLA", + hla_prot <- getIMGT(gene = "HLA", type = "PROT") # Download all mouse TRB genes - trb_mouse <- getIMGT(species = "mouse", - gene = "TRB", + trb_mouse <- getIMGT(species = "mouse", + gene = "TRB", type = "NUC") } } +\seealso{ +\code{\link{loadIMGT}}, \code{\link{refreshIMGT}} for convenience wrappers + +\code{\link{getOGRDB}} for OGRDB/AIRR-C germline sequences + +\code{\link{exportMiXCR}}, \code{\link{exportTRUST4}}, +\code{\link{exportCellRanger}}, \code{\link{exportIgBLAST}} for exporting +sequences to analysis tools +} diff --git a/man/getOGRDB.Rd b/man/getOGRDB.Rd index 36d8412..a0d98c1 100644 --- a/man/getOGRDB.Rd +++ b/man/getOGRDB.Rd @@ -2,7 +2,7 @@ % Please edit documentation in R/mainOGRDB.R \name{getOGRDB} \alias{getOGRDB} -\title{Download and Load Immune Receptor Germline Sequences from OGRDB (AIRR)} +\title{Download and Load Immune Receptor Germline Sequences from OGRDB} \usage{ getOGRDB( species = "human", @@ -17,39 +17,60 @@ getOGRDB( ) } \arguments{ -\item{species}{Species string. Accepts "human"/"Homo sapiens"/"mouse"/"Mus -musculus".} +\item{species}{Character string specifying the species. Accepts +\code{"human"}, \code{"Homo sapiens"}, \code{"mouse"}, or +\code{"Mus musculus"}. Default is \code{"human"}.} -\item{locus}{Either a locus short code ("IGH","IGK","IGL", etc.) OR NULL if -you pass a `set_name` explicitly.} +\item{locus}{Character string specifying the locus short code. One of +\code{"IGH"}, \code{"IGK"}, or \code{"IGL"}. 
Can be \code{NULL} if you pass +a \code{set_name} explicitly.} -\item{set_name}{Optional explicit OGRDB set name (e.g., "IGH_VDJ"). If -provided, overrides `locus`.} +\item{set_name}{Optional character string specifying an explicit OGRDB set +name (e.g., \code{"IGH_VDJ"}). If provided, overrides \code{locus}.} -\item{type}{"NUC" (default) or "PROT". PROT will translate V-gene CDS; only -supported for FASTA or AIRR records that include a valid CDS.} +\item{type}{Character string specifying the sequence type. Either +\code{"NUC"} (default) for nucleotide or \code{"PROT"} for protein. +\code{"PROT"} will translate V-gene CDS; only supported for FASTA or AIRR +records that include a valid CDS.} -\item{format}{"FASTA_GAPPED", "FASTA_UNGAPPED", or "AIRR". Default -"FASTA_GAPPED".} +\item{format}{Character string specifying the download format. One of +\code{"FASTA_GAPPED"} (default), \code{"FASTA_UNGAPPED"}, or \code{"AIRR"}.} -\item{version}{"published" (default), "latest", or a specific revision -number as character/number.} +\item{version}{Character string specifying the version. Either +\code{"published"} (default) or \code{"latest"}.} -\item{species_subgroup}{Optional subgroup (e.g., a mouse strain like -"C57BL/6"). If it contains '/', OGRDB requires it encoded as "\%252f".} +\item{species_subgroup}{Optional character string specifying a subgroup +(e.g., a mouse strain like \code{"C57BL/6"}). If it contains \code{/}, +OGRDB requires it encoded as \code{\%252f}.} -\item{refresh}{If TRUE, redownload even if cached.} +\item{refresh}{Logical. If \code{TRUE}, forces re-download even if cached. +Default is \code{FALSE}.} -\item{suppressMessages}{If TRUE, be quiet.} +\item{suppressMessages}{Logical. If \code{TRUE}, suppresses informational +messages. Default is \code{FALSE}.} } \value{ -`DNAStringSet` for NUC; if `type="PROT"`, returns `AAStringSet` - where possible. 
+A \code{\link[Biostrings]{DNAStringSet}} object (when + \code{type = "NUC"}) or \code{\link[Biostrings]{AAStringSet}} object (when + \code{type = "PROT"}) containing the requested sequences. } \description{ -Downloads AIRR-compliant germline sets (or FASTA) from OGRDB and -returns sequences as `DNAStringSet` (NUC) or attempts AA translation for V -genes. +Downloads AIRR-compliant germline sets (or FASTA) from OGRDB +(Open Germline Receptor Database) and returns sequences as a +\code{\link[Biostrings]{DNAStringSet}} or +\code{\link[Biostrings]{AAStringSet}}. +} +\details{ +OGRDB (Open Germline Receptor Database) is the AIRR Community's curated +repository of germline receptor sequences. It complements IMGT with +additional species support and standardized AIRR JSON format. + +The function supports multiple download formats: +\itemize{ + \item \code{FASTA_GAPPED}: FASTA with IMGT gaps preserved + \item \code{FASTA_UNGAPPED}: FASTA without gaps + \item \code{AIRR}: AIRR-C compliant JSON format +} } \examples{ if (is_ogrdb_available()) { @@ -77,5 +98,15 @@ if (is_ogrdb_available()) { type = "NUC", format = "FASTA_GAPPED") } +} +\seealso{ +\code{\link{loadOGRDB}}, \code{\link{refreshOGRDB}} for convenience wrappers + +\code{\link{getIMGT}} for IMGT sequences + +\code{\link{exportMiXCR}}, \code{\link{exportTRUST4}}, +\code{\link{exportCellRanger}}, \code{\link{exportIgBLAST}} for exporting +sequences to analysis tools +\url{https://ogrdb.airr-community.org/} for OGRDB documentation } diff --git a/man/immReferent-package.Rd b/man/immReferent-package.Rd index 534387b..de21f11 100644 --- a/man/immReferent-package.Rd +++ b/man/immReferent-package.Rd @@ -6,33 +6,53 @@ \alias{immReferent} \title{immReferent: An Interface for Immune Receptor and HLA Gene Reference Data} \description{ -\strong{immReferent} provides a stable, reproducible, and lightweight interface to -reference sequences for immune receptors (TCR/BCR) and HLA genes sourced from -IMGT, IPD-IMGT/HLA, and the 
AIRR-C's OGRDB. It centralizes downloading, -caching, and querying of curated nucleotide and protein sequences, and plays a -foundational role in computational immunology workflows. +\strong{immReferent} provides a stable, reproducible, and lightweight +interface to reference sequences for immune receptors (TCR/BCR) and HLA genes +sourced from IMGT, IPD-IMGT/HLA, and the AIRR-C's OGRDB. It centralizes +downloading, caching, and querying of curated nucleotide and protein +sequences, and plays a foundational role in computational immunology +workflows. } \details{ -The package is designed as a common reference layer across immunoinformatics tools, -ensuring consistent provenance and offline reproducibility via caching. +The package is designed as a common reference layer across immunoinformatics +tools, ensuring consistent provenance and offline reproducibility via +caching. \strong{Core functionality} \itemize{ -\item Download and parse receptor and HLA sequences from IMGT and OGRDB. -\item Local caching to support offline, reproducible analysis. -\item Query by gene, allele, species, locus, and sequence type/format. -\item Interoperability with Bioconductor classes such as -\code{Biostrings::DNAStringSet} and \code{Biostrings::AAStringSet}. 
+ \item Download and parse receptor and HLA sequences from IMGT and OGRDB + \item Local caching to support offline, reproducible analysis + \item Query by gene, allele, species, locus, and sequence type/format + \item Export to popular analysis tools (MiXCR, TRUST4, Cell Ranger, IgBLAST) + \item Interoperability with Bioconductor classes such as + \code{\link[Biostrings]{DNAStringSet}} and + \code{\link[Biostrings]{AAStringSet}} +} + +\strong{Data retrieval functions} +\itemize{ + \item \code{\link{getIMGT}}: Download sequences from IMGT + \item \code{\link{getOGRDB}}: Download sequences from OGRDB + \item \code{\link{loadIMGT}}, \code{\link{loadOGRDB}}: Load cached sequences + \item \code{\link{refreshIMGT}}, \code{\link{refreshOGRDB}}: Force re-download +} + +\strong{Export functions} +\itemize{ + \item \code{\link{exportMiXCR}}: Export for MiXCR analysis + \item \code{\link{exportTRUST4}}: Export for TRUST4 analysis + \item \code{\link{exportCellRanger}}: Export for 10x Cell Ranger VDJ + \item \code{\link{exportIgBLAST}}: Export for IgBLAST analysis } \strong{Supported data sources} \itemize{ -\item IMGT — The international ImMunoGeneTics information system: -\url{https://www.imgt.org/} -\item IPD-IMGT/HLA — The HLA Database: -\url{https://www.ebi.ac.uk/ipd/imgt/hla/} -\item OGRDB — Open Germline Receptor Database (AIRR-C): -\url{https://ogrdb.airr-community.org/} + \item IMGT: The international ImMunoGeneTics information system + (\url{https://www.imgt.org/}) + \item IPD-IMGT/HLA: The HLA Database + (\url{https://www.ebi.ac.uk/ipd/imgt/hla/}) + \item OGRDB: Open Germline Receptor Database (AIRR-C) + (\url{https://ogrdb.airr-community.org/}) } \strong{Getting started} @@ -48,19 +68,10 @@ IMGT data are distributed under a license. Proper attribution is required, and derivative or commercial use is restricted per IMGT policy. Always review the current licensing and citation requirements of each resource prior to use. 
- -#' @seealso -\url{https://github.com/BorchLab/immReferent} \cr -\url{https://github.com/BorchLab/Ibex/immReferent} } \seealso{ -Useful links: -\itemize{ - \item \url{https://github.com/BorchLab/immReferent/} - \item Report bugs at \url{https://github.com/BorchLab/immReferent/issues} -} - +\url{https://github.com/BorchLab/immReferent} } \author{ \strong{Maintainer}: Nick Borcherding \email{ncborch@gmail.com} diff --git a/man/is_imgt_available.Rd b/man/is_imgt_available.Rd index 5b85e72..e86d341 100644 --- a/man/is_imgt_available.Rd +++ b/man/is_imgt_available.Rd @@ -7,12 +7,19 @@ is_imgt_available() } \value{ -A logical value: `TRUE` if the IMGT website is accessible, `FALSE` otherwise. +A logical value: \code{TRUE} if the IMGT website is accessible, + \code{FALSE} otherwise. } \description{ -This function sends a lightweight HEAD request to the main IMGT page to -check if the service is online and accessible. +Sends a lightweight HEAD request to the main IMGT page to check +if the service is online and accessible. This function is used to +conditionally run examples and tests that require an internet connection. } \examples{ - is_imgt_available() +is_imgt_available() +} +\seealso{ +\code{\link{is_ogrdb_available}} for checking OGRDB availability + +\code{\link{getIMGT}} which uses this function } diff --git a/man/is_ogrdb_available.Rd b/man/is_ogrdb_available.Rd index cb7ce56..91907f5 100644 --- a/man/is_ogrdb_available.Rd +++ b/man/is_ogrdb_available.Rd @@ -7,12 +7,19 @@ is_ogrdb_available() } \value{ -A logical value: `TRUE` if the IMGT website is accessible, `FALSE` otherwise. +A logical value: \code{TRUE} if the OGRDB website is accessible, + \code{FALSE} otherwise. } \description{ -This function sends a lightweight HEAD request to the main OGRDB page to -check if the service is online and accessible. +Sends a lightweight HEAD request to the OGRDB API to check if +the service is online and accessible. 
This function is used to conditionally +run examples and tests that require an internet connection. } \examples{ - is_imgt_available() +is_ogrdb_available() +} +\seealso{ +\code{\link{is_imgt_available}} for checking IMGT availability + +\code{\link{getOGRDB}} which uses this function } diff --git a/man/listIMGT.Rd b/man/listIMGT.Rd index ba75020..d8b35fb 100644 --- a/man/listIMGT.Rd +++ b/man/listIMGT.Rd @@ -7,10 +7,13 @@ listIMGT() } \value{ -A character vector of file paths for the cached datasets. +A character vector of absolute file paths for the cached datasets. + Returns an empty character vector if the cache directory does not exist or + contains no files. } \description{ -Scans the cache directory and returns a list of available datasets. +Scans the cache directory and returns a list of available +datasets that have been downloaded. } \examples{ # List all files in the cache @@ -19,3 +22,8 @@ cached_files <- listIMGT() # To see the structure, you can print the first few head(cached_files) } +\seealso{ +\code{\link{getIMGT}} for downloading sequences + +\code{\link{listOGRDB}} for listing OGRDB cached files +} diff --git a/man/listOGRDB.Rd b/man/listOGRDB.Rd index 210a037..da53453 100644 --- a/man/listOGRDB.Rd +++ b/man/listOGRDB.Rd @@ -7,17 +7,22 @@ listOGRDB() } \value{ -A character vector of absolute file paths to cached OGRDB files -(length zero if none). Paths are typically under the package cache directory -(e.g., \code{file.path(.get_cache_dir(), "", "ogrdb")}). +A character vector of absolute file paths to cached OGRDB files. + Returns an empty character vector if no OGRDB files have been cached. Paths + are typically under the package cache directory (e.g., + \code{~/.immReferent//ogrdb/}). } \description{ -Scans the cache directory and returns a list of available datasets. +Scans the cache directory and returns a list of available OGRDB +datasets that have been downloaded. 
} \examples{ -if (is_ogrdb_available()) { - # List cached OGRDB files - cached_files <- listOGRDB() - head(cached_files) +# List cached OGRDB files +cached_files <- listOGRDB() +head(cached_files) } +\seealso{ +\code{\link{getOGRDB}} for downloading sequences + +\code{\link{listIMGT}} for listing IMGT cached files } diff --git a/man/loadIMGT.Rd b/man/loadIMGT.Rd index 642210e..43eaff6 100644 --- a/man/loadIMGT.Rd +++ b/man/loadIMGT.Rd @@ -12,26 +12,34 @@ loadIMGT( ) } \arguments{ -\item{species}{The species for which to download data. Required for TCR/BCR queries. -Currently supported: "human", "mouse", "rat", "rabbit", "pig", "dog", "rhesus_monkey", "cyno monkey". Defaults to "human" for HLA.} +\item{species}{Character string specifying the species for which to download +data. Required for TCR/BCR queries. Currently supported species: +\code{"human"}, \code{"mouse"}, \code{"rat"}, \code{"rabbit"}, \code{"pig"}, +\code{"dog"}, \code{"rhesus_monkey"}, \code{"cyno_monkey"}. Defaults to +\code{"human"} for HLA queries.} -\item{gene}{The gene or locus to download. For TCR/BCR, this can be a specific -chain (e.g., "IGHV", "TRBJ") or a group (e.g., "IGH", "TCR"). For HLA, use "HLA".} +\item{gene}{Character string specifying the gene or locus to download. For +TCR/BCR, this can be a specific chain (e.g., \code{"IGHV"}, \code{"TRBJ"}) +or a group (e.g., \code{"IGH"}, \code{"TCR"}). For HLA, use \code{"HLA"}.} -\item{type}{The type of sequence to retrieve. Either "NUC" for nucleotide or -"PROT" for protein sequences. This primarily distinguishes between VDJ nucleotide -and V-region amino acid sequences for TCR/BCR genes.} +\item{type}{Character string specifying the type of sequence to retrieve. +Either \code{"NUC"} for nucleotide or \code{"PROT"} for protein sequences. +This primarily distinguishes between VDJ nucleotide and V-region amino acid +sequences for TCR/BCR genes.} -\item{suppressMessages}{Logical. 
If `TRUE`, suppresses the license and other -informational messages.} +\item{suppressMessages}{Logical. If \code{TRUE}, suppresses the license and +other informational messages. Default is \code{FALSE}.} } \value{ -A `DNAStringSet` or `AAStringSet` object. +A \code{\link[Biostrings]{DNAStringSet}} object (when + \code{type = "NUC"}) or \code{\link[Biostrings]{AAStringSet}} object (when + \code{type = "PROT"}) containing the requested sequences. } \description{ -Loads sequences from the local cache without attempting to download. -This function relies on `getIMGT(refresh = FALSE)`. If the data is not found -in the cache, it will be downloaded unless an internet connection is unavailable. +Loads sequences from the local cache without forcing a fresh +download. This function is a convenience wrapper for +\code{getIMGT(refresh = FALSE)}. If the data is not found in the cache, it +will be downloaded, provided an internet connection is available. } \examples{ if(is_imgt_available()) { @@ -41,3 +49,8 @@ if(is_imgt_available()) { ighv_cached <- loadIMGT(species = "human", gene = "IGHV", type = "NUC") } } +\seealso{ +\code{\link{getIMGT}} for the main download function + +\code{\link{refreshIMGT}} to force re-download +} diff --git a/man/loadOGRDB.Rd b/man/loadOGRDB.Rd index c5d4213..dfecbbf 100644 --- a/man/loadOGRDB.Rd +++ b/man/loadOGRDB.Rd @@ -16,41 +16,47 @@ loadOGRDB( ) } \arguments{ -\item{species}{Species string. Accepts "human"/"Homo sapiens"/"mouse"/"Mus -musculus".} +\item{species}{Character string specifying the species. Accepts +\code{"human"}, \code{"Homo sapiens"}, \code{"mouse"}, or +\code{"Mus musculus"}. Default is \code{"human"}.} -\item{locus}{Either a locus short code ("IGH","IGK","IGL", etc.) OR NULL if -you pass a `set_name` explicitly.} +\item{locus}{Character string specifying the locus short code. One of +\code{"IGH"}, \code{"IGK"}, or \code{"IGL"}. 
Can be \code{NULL} if you pass +a \code{set_name} explicitly.} -\item{set_name}{Optional explicit OGRDB set name (e.g., "IGH_VDJ"). If -provided, overrides `locus`.} +\item{set_name}{Optional character string specifying an explicit OGRDB set +name (e.g., \code{"IGH_VDJ"}). If provided, overrides \code{locus}.} -\item{type}{"NUC" (default) or "PROT". PROT will translate V-gene CDS; only -supported for FASTA or AIRR records that include a valid CDS.} +\item{type}{Character string specifying the sequence type. Either +\code{"NUC"} (default) for nucleotide or \code{"PROT"} for protein. +\code{"PROT"} will translate V-gene CDS; only supported for FASTA or AIRR +records that include a valid CDS.} -\item{format}{"FASTA_GAPPED", "FASTA_UNGAPPED", or "AIRR". Default -"FASTA_GAPPED".} +\item{format}{Character string specifying the download format. One of +\code{"FASTA_GAPPED"} (default), \code{"FASTA_UNGAPPED"}, or \code{"AIRR"}.} -\item{version}{"published" (default), "latest", or a specific revision -number as character/number.} +\item{version}{Character string specifying the version. Either +\code{"published"} (default) or \code{"latest"}.} -\item{species_subgroup}{Optional subgroup (e.g., a mouse strain like -"C57BL/6"). If it contains '/', OGRDB requires it encoded as "\%252f".} +\item{species_subgroup}{Optional character string specifying a subgroup +(e.g., a mouse strain like \code{"C57BL/6"}). If it contains \code{/}, +OGRDB requires it encoded as \code{\%252f}.} -\item{suppressMessages}{If TRUE, be quiet.} +\item{suppressMessages}{Logical. If \code{TRUE}, suppresses informational +messages. Default is \code{FALSE}.} } \value{ -The same object type as \code{getOGRDB()}: a \code{DNAStringSet} -(when \code{type = "NUC"}) or an \code{AAStringSet} (when \code{type = "PROT"}), -loaded from the local cache if present (and downloaded on first use if needed). 
+A \code{\link[Biostrings]{DNAStringSet}} object (when + \code{type = "NUC"}) or \code{\link[Biostrings]{AAStringSet}} object (when + \code{type = "PROT"}) containing the requested sequences. } \description{ -Loads sequences from the local cache without attempting to download. -This function relies on `getOGRDB(refresh = FALSE)`. If the data is not found -in the cache, it will be downloaded unless an internet connection is unavailable. +Loads sequences from the local cache without forcing a fresh +download. This function is a convenience wrapper for +\code{getOGRDB(refresh = FALSE)}. If the data is not found in the cache, it +will be downloaded, provided an internet connection is available. } \examples{ - if (is_ogrdb_available()) { # First, ensure the file is cached getOGRDB(species = "human", locus = "IGH", @@ -64,3 +70,8 @@ if (is_ogrdb_available()) { format = "FASTA_GAPPED") } } +\seealso{ +\code{\link{getOGRDB}} for the main download function + +\code{\link{refreshOGRDB}} to force re-download +} diff --git a/man/refreshIMGT.Rd b/man/refreshIMGT.Rd index 77aeb58..e049b60 100644 --- a/man/refreshIMGT.Rd +++ b/man/refreshIMGT.Rd @@ -12,25 +12,33 @@ refreshIMGT( ) } \arguments{ -\item{species}{The species for which to download data. Required for TCR/BCR queries. -Currently supported: "human", "mouse", "rat", "rabbit", "pig", "dog", "rhesus_monkey", "cyno monkey". Defaults to "human" for HLA.} +\item{species}{Character string specifying the species for which to download +data. Required for TCR/BCR queries. Currently supported species: +\code{"human"}, \code{"mouse"}, \code{"rat"}, \code{"rabbit"}, \code{"pig"}, +\code{"dog"}, \code{"rhesus_monkey"}, \code{"cyno_monkey"}. Defaults to +\code{"human"} for HLA queries.} -\item{gene}{The gene or locus to download. For TCR/BCR, this can be a specific -chain (e.g., "IGHV", "TRBJ") or a group (e.g., "IGH", "TCR"). For HLA, use "HLA".} +\item{gene}{Character string specifying the gene or locus to download. 
For +TCR/BCR, this can be a specific chain (e.g., \code{"IGHV"}, \code{"TRBJ"}) +or a group (e.g., \code{"IGH"}, \code{"TCR"}). For HLA, use \code{"HLA"}.} -\item{type}{The type of sequence to retrieve. Either "NUC" for nucleotide or -"PROT" for protein sequences. This primarily distinguishes between VDJ nucleotide -and V-region amino acid sequences for TCR/BCR genes.} +\item{type}{Character string specifying the type of sequence to retrieve. +Either \code{"NUC"} for nucleotide or \code{"PROT"} for protein sequences. +This primarily distinguishes between VDJ nucleotide and V-region amino acid +sequences for TCR/BCR genes.} -\item{suppressMessages}{Logical. If `TRUE`, suppresses the license and other -informational messages.} +\item{suppressMessages}{Logical. If \code{TRUE}, suppresses the license and +other informational messages. Default is \code{FALSE}.} } \value{ -A `DNAStringSet` or `AAStringSet` object. +A \code{\link[Biostrings]{DNAStringSet}} object (when + \code{type = "NUC"}) or \code{\link[Biostrings]{AAStringSet}} object (when + \code{type = "PROT"}) containing the requested sequences. } \description{ -A convenience wrapper for `getIMGT(..., refresh = TRUE)` to ensure that -the local cache is updated with the latest versions of the requested sequences. +A convenience wrapper for \code{getIMGT(..., refresh = TRUE)} to +ensure that the local cache is updated with the latest versions of the +requested sequences. } \examples{ if(is_imgt_available()) { @@ -38,3 +46,8 @@ if(is_imgt_available()) { ighv_prot_fresh <- refreshIMGT(species = "human", gene = "IGHV", type = "PROT") } } +\seealso{ +\code{\link{getIMGT}} for the main download function + +\code{\link{loadIMGT}} to load from cache without downloading +} diff --git a/man/refreshOGRDB.Rd b/man/refreshOGRDB.Rd index 6d183ac..52b3e3f 100644 --- a/man/refreshOGRDB.Rd +++ b/man/refreshOGRDB.Rd @@ -16,37 +16,43 @@ refreshOGRDB( ) } \arguments{ -\item{species}{Species string. 
Accepts "human"/"Homo sapiens"/"mouse"/"Mus -musculus".} +\item{species}{Character string specifying the species. Accepts +\code{"human"}, \code{"Homo sapiens"}, \code{"mouse"}, or +\code{"Mus musculus"}. Default is \code{"human"}.} -\item{locus}{Either a locus short code ("IGH","IGK","IGL", etc.) OR NULL if -you pass a `set_name` explicitly.} +\item{locus}{Character string specifying the locus short code. One of +\code{"IGH"}, \code{"IGK"}, or \code{"IGL"}. Can be \code{NULL} if you pass +a \code{set_name} explicitly.} -\item{set_name}{Optional explicit OGRDB set name (e.g., "IGH_VDJ"). If -provided, overrides `locus`.} +\item{set_name}{Optional character string specifying an explicit OGRDB set +name (e.g., \code{"IGH_VDJ"}). If provided, overrides \code{locus}.} -\item{type}{"NUC" (default) or "PROT". PROT will translate V-gene CDS; only -supported for FASTA or AIRR records that include a valid CDS.} +\item{type}{Character string specifying the sequence type. Either +\code{"NUC"} (default) for nucleotide or \code{"PROT"} for protein. +\code{"PROT"} will translate V-gene CDS; only supported for FASTA or AIRR +records that include a valid CDS.} -\item{format}{"FASTA_GAPPED", "FASTA_UNGAPPED", or "AIRR". Default -"FASTA_GAPPED".} +\item{format}{Character string specifying the download format. One of +\code{"FASTA_GAPPED"} (default), \code{"FASTA_UNGAPPED"}, or \code{"AIRR"}.} -\item{version}{"published" (default), "latest", or a specific revision -number as character/number.} +\item{version}{Character string specifying the version. Either +\code{"published"} (default) or \code{"latest"}.} -\item{species_subgroup}{Optional subgroup (e.g., a mouse strain like -"C57BL/6"). If it contains '/', OGRDB requires it encoded as "\%252f".} +\item{species_subgroup}{Optional character string specifying a subgroup +(e.g., a mouse strain like \code{"C57BL/6"}). 
If it contains \code{/}, +OGRDB requires it encoded as \code{\%252f}.} -\item{suppressMessages}{If TRUE, be quiet.} +\item{suppressMessages}{Logical. If \code{TRUE}, suppresses informational +messages. Default is \code{FALSE}.} } \value{ -The same object type as \code{getOGRDB()}: a \code{DNAStringSet} -(when \code{type = "NUC"}) or an \code{AAStringSet} (when \code{type = "PROT"}), -after forcing a re-download to refresh the local cache. +A \code{\link[Biostrings]{DNAStringSet}} object (when + \code{type = "NUC"}) or \code{\link[Biostrings]{AAStringSet}} object (when + \code{type = "PROT"}) containing the requested sequences. } \description{ -A convenience wrapper for `getOGRDB(..., refresh = TRUE)` to -ensure that the local cache is updated with the latest versions of the +A convenience wrapper for \code{getOGRDB(..., refresh = TRUE)} +to ensure that the local cache is updated with the latest versions of the requested sequences. } \examples{ @@ -58,3 +64,8 @@ if (is_ogrdb_available()) { format = "FASTA_GAPPED") } } +\seealso{ +\code{\link{getOGRDB}} for the main download function + +\code{\link{loadOGRDB}} to load from cache without downloading +} diff --git a/tests/testthat/test-export.R b/tests/testthat/test-export.R index b1dfb06..17ffd08 100644 --- a/tests/testthat/test-export.R +++ b/tests/testthat/test-export.R @@ -498,3 +498,172 @@ testthat::test_that("exportIgBLAST() does not create C gene file (not used by Ig # IgBLAST doesn't use C genes, so no c_genes file should be created expect_null(result$c_genes) }) + +# ============================================================================== +# Tests for OGRDB sequence compatibility +# ============================================================================== + +# Helper function to create OGRDB-style sequences +# OGRDB sequences may have different header formats than IMGT +.create_ogrdb_style_seqs <- function() { + + seqs <- Biostrings::DNAStringSet(c( + "ATGCGATCGATCGATCGATCGATCGATCG", + 
"ATGCGATCGATCGATCGATCG", + "ATGCGATCGATCGATCG", + "ATGCGATCGATC", + "ATGCGATC" + )) + # OGRDB headers are typically just the allele name + names(seqs) <- c( + "IGHV1-2*01", + "IGHV3-11*02", + "IGHD1-1*01", + "IGHJ1*01", + "IGHJ4*02" + ) + seqs +} + +testthat::test_that("exportMiXCR() works with OGRDB-style sequences", { + seqs <- .create_ogrdb_style_seqs() + output_dir <- withr::local_tempdir() + + result <- exportMiXCR(seqs, output_dir, chain = "IGH") + + # Check that expected files were created + expect_true(!is.null(result$v_genes)) + expect_true(!is.null(result$d_genes)) + expect_true(!is.null(result$j_genes)) + + # Check files exist + expect_true(file.exists(result$v_genes)) + expect_true(file.exists(result$d_genes)) + expect_true(file.exists(result$j_genes)) + + # Check content is correct + v_content <- readLines(result$v_genes) + expect_true(any(grepl("^>IGHV1-2\\*01$", v_content))) + expect_true(any(grepl("^>IGHV3-11\\*02$", v_content))) +}) + +testthat::test_that("exportTRUST4() works with OGRDB-style sequences", { + seqs <- .create_ogrdb_style_seqs() + output_file <- tempfile(fileext = ".fa") + withr::defer(unlink(output_file)) + + result <- exportTRUST4(seqs, output_file) + + expect_true(file.exists(output_file)) + + content <- readLines(output_file) + # Check that all sequences were exported with simplified headers + expect_true(any(grepl("^>IGHV1-2\\*01$", content))) + expect_true(any(grepl("^>IGHD1-1\\*01$", content))) + expect_true(any(grepl("^>IGHJ1\\*01$", content))) +}) + +testthat::test_that("exportCellRanger() works with OGRDB-style sequences", { + seqs <- .create_ogrdb_style_seqs() + output_file <- tempfile(fileext = ".fa") + withr::defer(unlink(output_file)) + + result <- exportCellRanger(seqs, output_file) + + expect_true(file.exists(output_file)) + + content <- readLines(output_file) + expect_true(any(grepl("^>IGHV1-2\\*01$", content))) + expect_equal(length(grep("^>", content)), 5) +}) + +testthat::test_that("exportIgBLAST() works with 
OGRDB-style sequences", { + seqs <- .create_ogrdb_style_seqs() + output_dir <- withr::local_tempdir() + + result <- exportIgBLAST(seqs, output_dir, organism = "human", receptor_type = "ig") + + # Check that expected files were created + expect_true(!is.null(result$v_genes)) + expect_true(!is.null(result$d_genes)) + expect_true(!is.null(result$j_genes)) + + # Check V gene content + v_content <- readLines(result$v_genes) + expect_true(any(grepl("^>IGHV1-2\\*01$", v_content))) + expect_true(any(grepl("^>IGHV3-11\\*02$", v_content))) +}) + +testthat::test_that("exportMiXCR() handles OGRDB IGK sequences", { + seqs <- Biostrings::DNAStringSet(c( + "ATGCGATCGATCGATCGATCGATCGATCG", + "ATGCGATCGATCGATCGATCG", + "ATGCGATCGATCGATCG" + )) + names(seqs) <- c("IGKV1-5*01", "IGKV3-20*01", "IGKJ1*01") + + output_dir <- withr::local_tempdir() + result <- exportMiXCR(seqs, output_dir, chain = "IGK") + + expect_true(!is.null(result$v_genes)) + expect_true(!is.null(result$j_genes)) + + v_content <- readLines(result$v_genes) + expect_true(any(grepl("^>IGKV1-5\\*01$", v_content))) + expect_true(any(grepl("^>IGKV3-20\\*01$", v_content))) +}) + +testthat::test_that("exportMiXCR() handles OGRDB IGL sequences", { + seqs <- Biostrings::DNAStringSet(c( + "ATGCGATCGATCGATCGATCGATCGATCG", + "ATGCGATCGATCGATCGATCG", + "ATGCGATCGATCGATCG" + )) + names(seqs) <- c("IGLV1-40*01", "IGLV2-14*01", "IGLJ1*01") + + output_dir <- withr::local_tempdir() + result <- exportMiXCR(seqs, output_dir, chain = "IGL") + + expect_true(!is.null(result$v_genes)) + expect_true(!is.null(result$j_genes)) + + v_content <- readLines(result$v_genes) + expect_true(any(grepl("^>IGLV1-40\\*01$", v_content))) + expect_true(any(grepl("^>IGLV2-14\\*01$", v_content))) +}) + +testthat::test_that("exportIgBLAST() correctly filters OGRDB IG vs TCR sequences", { + # Create mixed IG and TCR sequences + seqs <- Biostrings::DNAStringSet(c( + "ATGCGATCGATCGATCGATCGATCGATCG", + "ATGCGATCGATCGATCGATCG", + "ATGCGATCGATCGATCG", + 
"ATGCGATCGATC" + )) + names(seqs) <- c("IGHV1-2*01", "TRBV1-1*01", "IGHJ1*01", "TRBJ1-1*01") + + output_dir <- withr::local_tempdir() + + # Export IG only + result_ig <- exportIgBLAST(seqs, output_dir, organism = "human", receptor_type = "ig") + + expect_true(!is.null(result_ig$v_genes)) + expect_true(!is.null(result_ig$j_genes)) + + # Should only contain IG sequences + v_content <- readLines(result_ig$v_genes) + expect_true(any(grepl("^>IGHV1-2\\*01$", v_content))) + expect_false(any(grepl("TRBV", v_content))) + + # Export TCR only + output_dir_tcr <- withr::local_tempdir() + result_tcr <- exportIgBLAST(seqs, output_dir_tcr, organism = "human", receptor_type = "tcr") + + expect_true(!is.null(result_tcr$v_genes)) + expect_true(!is.null(result_tcr$j_genes)) + + # Should only contain TCR sequences + v_content_tcr <- readLines(result_tcr$v_genes) + expect_true(any(grepl("^>TRBV1-1\\*01$", v_content_tcr))) + expect_false(any(grepl("IGHV", v_content_tcr))) +}) From d0e191a2c8d607d95fff7cce040f7af3628c0b96 Mon Sep 17 00:00:00 2001 From: theHumanBorch Date: Sun, 4 Jan 2026 14:11:07 -0600 Subject: [PATCH 3/5] Update test-mainOGRDB.R --- tests/testthat/test-mainOGRDB.R | 2 -- 1 file changed, 2 deletions(-) diff --git a/tests/testthat/test-mainOGRDB.R b/tests/testthat/test-mainOGRDB.R index e2bc657..fac9336 100644 --- a/tests/testthat/test-mainOGRDB.R +++ b/tests/testthat/test-mainOGRDB.R @@ -1,7 +1,5 @@ # tests/testthat/test-mainOGRDB.R -context("OGRDB helpers") - # --- Small writers for simulated downloads (FASTA / AIRR JSON) ---------------- .write_dna_fasta <- function(path) { writeLines(c(">IGHV1-1*01", "ATGGCTGCT", ">IGHV1-2*01", "ATGGCAGCT"), path) From 412616acd4dc8ec470f616b70db44d7e2c673b76 Mon Sep 17 00:00:00 2001 From: theHumanBorch Date: Sun, 4 Jan 2026 14:35:05 -0600 Subject: [PATCH 4/5] remove message testing --- tests/testthat/test-mainIMGT.R | 23 ----------------------- 1 file changed, 23 deletions(-) diff --git a/tests/testthat/test-mainIMGT.R 
b/tests/testthat/test-mainIMGT.R index 9b5eea3..eb02650 100644 --- a/tests/testthat/test-mainIMGT.R +++ b/tests/testthat/test-mainIMGT.R @@ -1,28 +1,5 @@ # tests/testthat/test-mainIMGT.R -testthat::test_that(".show_license_message shows once and respects suppression", { - # Reset option - old <- getOption("immReferent.license.shown", NULL) - withr::defer(options(immReferent.license.shown = old)) - - options(immReferent.license.shown = NULL) - - # First call: should message and set the option - expect_snapshot( - .show_license_message(suppress = FALSE), - transform = function(x) gsub("https?://\\S+", "", x) # keep snapshot stable - ) - expect_true(isTRUE(getOption("immReferent.license.shown"))) - - # Second call: no messages (already shown) - expect_silent(.show_license_message(suppress = FALSE)) - - # Suppressed: no messages, does not alter option - options(immReferent.license.shown = NULL) - expect_silent(.show_license_message(suppress = TRUE)) - expect_null(getOption("immReferent.license.shown", NULL)) -}) - # Utility: small DNA / AA FASTA writers for simulated downloads .write_dna_fasta <- function(path) { writeLines(c(">seq1", "ATGC", ">seq2", "ATGCGG"), path) From 2d6a73337637a155c434743f91ab95f6e1e58ef2 Mon Sep 17 00:00:00 2001 From: theHumanBorch Date: Sun, 4 Jan 2026 14:42:27 -0600 Subject: [PATCH 5/5] remove snapshot check --- tests/testthat/test-mainIMGT.R | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/tests/testthat/test-mainIMGT.R b/tests/testthat/test-mainIMGT.R index eb02650..2344ed3 100644 --- a/tests/testthat/test-mainIMGT.R +++ b/tests/testthat/test-mainIMGT.R @@ -288,16 +288,6 @@ testthat::test_that("listIMGT() returns files or empty vector with message", { # 1) No cache dir tmp_noexist <- file.path(cache, "nope", "cache") - expect_snapshot( - with_mocked_bindings( - { - out <- listIMGT() - expect_identical(out, character(0)) - }, - .get_cache_dir = function() tmp_noexist, - .package = pkg - ) - ) # 2) Cache exists with files 
target_dir <- file.path(cache, "human", "bcr", "ighv")