diff --git a/DESCRIPTION b/DESCRIPTION index 27f165e..02a98bf 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: immGLIPH Title: Grouping of Lymphocyte Interactions by Paratope Hotspots -Version: 0.99.2 +Version: 0.99.3 Authors@R: c( person("Nick", "Borcherding", role = c("aut", "cre"), email = "ncborch@gmail.com") @@ -26,8 +26,7 @@ Depends: Imports: stringdist, igraph, - foreach, - doParallel, + BiocParallel, parallel, stringr, stats, diff --git a/NAMESPACE b/NAMESPACE index 3479bfb..767757e 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -8,6 +8,5 @@ export(getRandomSubsample) export(loadGLIPH) export(plotNetwork) export(runGLIPH) -import(foreach) import(grDevices) import(viridis) diff --git a/NEWS.md b/NEWS.md index 9da9e29..0200bc0 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,36 @@ +# immGLIPH 0.99.4 + +* Added a Validation section to the README with concordance metrics + against the published GLIPH and GLIPH2 cluster vectors from + Glanville et al. (2017) and Huang et al. (2020). With paper-matched + parameters, immGLIPH reproduces the published cluster vectors at + ARI 0.985 (Glanville) and 0.863 (Huang) on the intersection of + shared CDR3s. Full benchmark code lives at + [BorchLab/immGLIPH-benchmark](https://github.com/BorchLab/immGLIPH-benchmark). +* Fixed `clusterScoring()` failure in the clonal-expansion-enrichment + test when a cluster contains more members than the reference pool + has rows. The null draw now uses `replace = TRUE` (bootstrap), + matching the V-gene null and the statistically appropriate choice + for resampling. Surfaced on the Huang 2020 benchmark. + +# immGLIPH 0.99.3 + +* Replaced `foreach`/`doParallel` with `BiocParallel` for parallelization + across all functions per Bioconductor recommendations. +* Replaced `devtools::install_github()` references with + `BiocManager::install()` in vignette and error messages. +* Updated vignette to use `SingleCellExperiment` instead of `Seurat` for + the single-cell workflow example. +* Fixed `combineTCR()` example in vignette (removed obsolete `cells` + argument). +* Made more vignette code chunks evaluable (`clusterScoring()`, + `deNovoTCRs()`, `plotNetwork()` examples). +* Noted that `scRepertoire` and `immApex` are Bioconductor packages. +* Replaced iterative for-loop list growing with vectorized alternatives + (`lapply()`, `vapply()`, `Reduce()`). +* Standardized code spacing around operators and after commas per + Bioconductor coding style. + # immGLIPH 0.99.2 * Fixing roxygen documentation issue creating warnings. diff --git a/R/clusterScoring.R b/R/clusterScoring.R index ed6733f..092552c 100644 --- a/R/clusterScoring.R +++ b/R/clusterScoring.R @@ -100,7 +100,6 @@ #' @references Glanville, Jacob, et al. #' "Identifying specificity groups in the T cell receptor repertoire." Nature 547.7661 (2017): 94. #' @references https://github.com/immunoengineer/gliph -#' @import foreach #' @export clusterScoring <- function(cluster_list, cdr3_sequences, @@ -111,85 +110,124 @@ clusterScoring <- function(cluster_list, gliph_version = 1, sim_depth = 1000, hla_cutoff = 0.1, - n_cores = 1){ + n_cores = 1) { ################################################################## ## Unit-testing ## ################################################################## ### cluster_list - if(!is.list(cluster_list)) stop("parameter 'cluster_list' has to be an object of class 'list'.") + if (!is.list(cluster_list)) { + stop("parameter 'cluster_list' has to be an object of class 'list'.") + } ### cdr3_sequences - if(is.atomic(cdr3_sequences)) cdr3_sequences <- data.frame("CDR3b" = cdr3_sequences) - if(!is.data.frame(cdr3_sequences)) stop("parameter 'cdr3_sequences' has to be an object of class 'data.frame'.") + if (is.atomic(cdr3_sequences)) { + cdr3_sequences <- data.frame("CDR3b" = cdr3_sequences) + } + if (!is.data.frame(cdr3_sequences)) { + stop("parameter 'cdr3_sequences' has to be an object of class 'data.frame'.") + } cdr3_sequences[] <- lapply(cdr3_sequences, as.character) ### refdb_beta - if(!is.data.frame(refdb_beta)){ + if (!is.data.frame(refdb_beta)) { valid_names <- .valid_reference_names() - if(length(refdb_beta) != 1 || !is.character(refdb_beta)){ + if (length(refdb_beta) != 1 || !is.character(refdb_beta)) { stop("refdb_beta must be a data frame or one of: ", paste(sQuote(valid_names), collapse = ", ")) - } else if(!(refdb_beta %in% valid_names)){ + } else if (!(refdb_beta %in% valid_names)) { stop("refdb_beta must be a data frame or one of: ", paste(sQuote(valid_names), collapse = ", ")) } } ### v_usage_freq - if(!is.null(v_usage_freq)){ - if(is.data.frame(v_usage_freq)){ - if(ncol(v_usage_freq) < 2) stop("v_usage_freq has to be a data frame containing V-gene information in the first column and the corresponding frequency in a naive T-cell repertoire in the second column.") - if(nrow(v_usage_freq) < 1) stop("v_usage_freq has to contain at least one row.") - if(suppressWarnings(any(is.na(as.numeric(v_usage_freq[,2])))) == TRUE){ - stop("The second column of v_usage_freq must contain the frequency of the corresponding V-gene in the first column in a naive T-cell repertoire.") - } else v_usage_freq[,2] <- as.numeric(v_usage_freq[,2]) - - } else {stop("v_usage_freq has to be a data frame containing V-gene information in the first column and the corresponding frequency in a naive T-cell repertoire in the second column.")} + if (!is.null(v_usage_freq)) { + if (is.data.frame(v_usage_freq)) { + if (ncol(v_usage_freq) < 2) { + stop("v_usage_freq has to be a data frame containing V-gene ", + "information in the first column and the corresponding ", + "frequency in a naive T-cell repertoire in the second column.") + } + if (nrow(v_usage_freq) < 1) { + stop("v_usage_freq has to contain at least one row.") + } + if (suppressWarnings(any(is.na(as.numeric(v_usage_freq[, 2]))))) { + stop("The second column of v_usage_freq must contain the frequency ", + "of the corresponding V-gene in the first column in a naive ", + "T-cell repertoire.") + } else { + v_usage_freq[, 2] <- as.numeric(v_usage_freq[, 2]) + } + } else { + stop("v_usage_freq has to be a data frame containing V-gene ", + "information in the first column and the corresponding ", + "frequency in a naive T-cell repertoire in the second column.") + } } ### cdr3_length_freq - if(!is.null(cdr3_length_freq)){ - if(is.data.frame(cdr3_length_freq)){ - if(ncol(cdr3_length_freq) < 2) stop("cdr3_length_freq has to be a data frame containing CDR3 lengths in the first column and the corresponding frequency in a naive T-cell repertoire in the second column.") - if(nrow(cdr3_length_freq) < 1) stop("cdr3_length_freq has to contain at least one row.") - if(suppressWarnings(any(is.na(as.numeric(cdr3_length_freq[,2])))) == TRUE){ - stop("The second column of cdr3_length_freq must contain the frequency of the corresponding CDR3 length in the first column in a naive T-cell repertoire.") - } else cdr3_length_freq[,2] <- as.numeric(cdr3_length_freq[,2]) - - } else {stop("cdr3_length_freq has to be a data frame containing CDR3 lengths in the first column and the corresponding frequency in a naive T-cell repertoire in the second column.")} + if (!is.null(cdr3_length_freq)) { + if (is.data.frame(cdr3_length_freq)) { + if (ncol(cdr3_length_freq) < 2) { + stop("cdr3_length_freq has to be a data frame containing CDR3 ", + "lengths in the first column and the corresponding frequency ", + "in a naive T-cell repertoire in the second column.") + } + if (nrow(cdr3_length_freq) < 1) { + stop("cdr3_length_freq has to contain at least one row.") + } + if (suppressWarnings(any(is.na(as.numeric(cdr3_length_freq[, 2]))))) { + stop("The second column of cdr3_length_freq must contain the ", + "frequency of the corresponding CDR3 length in the first ", + "column in a naive T-cell repertoire.") + } else { + cdr3_length_freq[, 2] <- as.numeric(cdr3_length_freq[, 2]) + } + } else { + stop("cdr3_length_freq has to be a data frame containing CDR3 ", + "lengths in the first column and the corresponding frequency ", + "in a naive T-cell repertoire in the second column.") + } } ### ref_cluster_size - if(!(ref_cluster_size %in% c("original", "simulated") || !is.character(ref_cluster_size) || length(ref_cluster_size) > 1)){ + if (!(ref_cluster_size %in% c("original", "simulated") || + !is.character(ref_cluster_size) || + length(ref_cluster_size) > 1)) { stop("ref_cluster_size has to be either 'original' or 'simulated'.") } ### gliph_version - if(!(gliph_version %in% c(1,2))) stop("gliph_version has to be either 1 or 2.") + if (!(gliph_version %in% c(1, 2))) { + stop("gliph_version has to be either 1 or 2.") + } ### sim_depth - if(!is.numeric(sim_depth)) stop("sim_depth has to be numeric") - if(length(sim_depth) > 1) stop("sim_depth has to be a single number") - if(sim_depth < 1) stop("sim_depth must be at least 1") + if (!is.numeric(sim_depth)) stop("sim_depth has to be numeric") + if (length(sim_depth) > 1) stop("sim_depth has to be a single number") + if (sim_depth < 1) stop("sim_depth must be at least 1") sim_depth <- round(sim_depth) ### hla_cutoff - if(!is.numeric(hla_cutoff)) stop("hla_cutoff has to be numeric") - if(length(hla_cutoff) > 1) stop("hla_cutoff has to be a single number") - if(hla_cutoff > 1 || hla_cutoff < 0) stop("hla_cutoff must be between 0 and 1") + if (!is.numeric(hla_cutoff)) stop("hla_cutoff has to be numeric") + if (length(hla_cutoff) > 1) stop("hla_cutoff has to be a single number") + if (hla_cutoff > 1 || hla_cutoff < 0) { + stop("hla_cutoff must be between 0 and 1") + } ### n_cores - if(is.null(n_cores)) n_cores <- max(1, parallel::detectCores()-2) else { - if(!is.numeric(n_cores)) stop("n_cores has to be numeric") - if(length(n_cores) > 1) stop("n_cores has to be a single number") - if(n_cores < 1) stop("n_cores must be at least 1") - - n_cores <- max(1, min(n_cores, parallel::detectCores()-2)) + if (is.null(n_cores)) { + n_cores <- max(1, parallel::detectCores() - 2) + } else { + if (!is.numeric(n_cores)) stop("n_cores has to be numeric") + if (length(n_cores) > 1) stop("n_cores has to be a single number") + if (n_cores < 1) stop("n_cores must be at least 1") + n_cores <- max(1, min(n_cores, parallel::detectCores() - 2)) } ### Early return for empty cluster_list (after all validation) - if(length(cluster_list) == 0) { + if (length(cluster_list) == 0) { message("No clusters to score.") return(data.frame()) } @@ -199,62 +237,71 @@ clusterScoring <- function(cluster_list, ################################################################# ### Which scores can be calculated from the dataset? - score_names <- c("network.size.score","cdr3.length.score") - if("TRBV" %in% colnames(cdr3_sequences)){ + score_names <- c("network.size.score", "cdr3.length.score") + if ("TRBV" %in% colnames(cdr3_sequences)) { vgene_info <- TRUE score_names <- c(score_names, "vgene.score") - } else vgene_info <- FALSE - if("counts" %in% colnames(cdr3_sequences)) { + } else { + vgene_info <- FALSE + } + if ("counts" %in% colnames(cdr3_sequences)) { counts_info <- TRUE cdr3_sequences$counts <- as.numeric(cdr3_sequences$counts) cdr3_sequences$counts[is.na(cdr3_sequences$counts)] <- 1 score_names <- c(score_names, "clonal.expansion.score") - } else counts_info <- FALSE - if("patient" %in% colnames(cdr3_sequences)) patient_info <- TRUE else patient_info <- FALSE - if("HLA" %in% colnames(cdr3_sequences)) hla_info <- TRUE else hla_info <- FALSE - if(hla_info == TRUE && patient_info == TRUE){ - if("patient" %in% colnames(cdr3_sequences) && "HLA" %in% colnames(cdr3_sequences)){ - cdr3_sequences <- cdr3_sequences[cdr3_sequences$HLA != "" & !is.na(cdr3_sequences$HLA),] - if(nrow(cdr3_sequences > 0)) score_names <- c(score_names, "hla.score", "lowest.hlas") else hla_info <- FALSE + } else { + counts_info <- FALSE + } + patient_info <- "patient" %in% colnames(cdr3_sequences) + hla_info <- "HLA" %in% colnames(cdr3_sequences) + if (hla_info && patient_info) { + if ("patient" %in% colnames(cdr3_sequences) && + "HLA" %in% colnames(cdr3_sequences)) { + cdr3_sequences <- cdr3_sequences[ + cdr3_sequences$HLA != "" & !is.na(cdr3_sequences$HLA), ] + if (nrow(cdr3_sequences) > 0) { + score_names <- c(score_names, "hla.score", "lowest.hlas") + } else { + hla_info <- FALSE + } } } ### load or generate reference tables from reference database - # ref_cluster_sizes: data frame containing the cluster size in the first and the probability of observing a cluster with this size - # in a sample from the reference database in the second column - # vgene_ref_frequencies: vector containing the (relative) frequencies of v gene usage - # cdr3_length_ref_frequencies: vector containing the (relative) frequencies of CDR3b lengths utils::data(ref_cluster_sizes, envir = environment(), package = "immGLIPH") ref_cluster_sizes <- ref_cluster_sizes[[ref_cluster_size]] - if(is.character(refdb_beta) && refdb_beta %in% .valid_reference_names()){ + if (is.character(refdb_beta) && refdb_beta %in% .valid_reference_names()) { reference_list <- .get_reference_list() refdb_name <- refdb_beta vgene_ref_frequencies <- reference_list[[refdb_name]]$vgene_frequencies$freq cdr3_length_ref_frequencies <- reference_list[[refdb_name]]$cdr3_length_frequencies$freq } else { # User-provided data frame: compute frequencies from the data - if("TRBV" %in% colnames(refdb_beta)){ + if ("TRBV" %in% colnames(refdb_beta)) { vgene_ref_frequencies <- as.data.frame(table(refdb_beta$TRBV)) - vgene_ref_frequencies <- vgene_ref_frequencies$Freq/sum(vgene_ref_frequencies$Freq) + vgene_ref_frequencies <- vgene_ref_frequencies$Freq / + sum(vgene_ref_frequencies$Freq) } else { reference_list <- .get_reference_list() vgene_ref_frequencies <- reference_list[["gliph_reference"]]$vgene_frequencies$freq } cdr3_length_ref_frequencies <- as.data.frame(table(nchar(refdb_beta$CDR3b))) - cdr3_length_ref_frequencies <- cdr3_length_ref_frequencies$Freq/sum(cdr3_length_ref_frequencies$Freq) + cdr3_length_ref_frequencies <- cdr3_length_ref_frequencies$Freq / + sum(cdr3_length_ref_frequencies$Freq) } - if(!is.null(v_usage_freq)) vgene_ref_frequencies <- as.numeric(v_usage_freq[,2]) - if(!is.null(cdr3_length_freq)) cdr3_length_ref_frequencies <- as.numeric(cdr3_length_freq[,2]) + if (!is.null(v_usage_freq)) { + vgene_ref_frequencies <- as.numeric(v_usage_freq[, 2]) + } + if (!is.null(cdr3_length_freq)) { + cdr3_length_ref_frequencies <- as.numeric(cdr3_length_freq[, 2]) + } ### Obtain the distribution of all patients and HLA alleles in the sample - # all_patients: vector containing all unique patient indices - # all_hlas: vector containing all unique HLA alleles - # all_patient_hlas: list whose elements are named after the patients and contain the patient's HLA alleles in a vector. all_patient_hlas <- c() - if(hla_info == TRUE && patient_info == TRUE){ - cdr3_sequences$patient <- gsub(":.*", "",cdr3_sequences$patient) + if (hla_info && patient_info) { + cdr3_sequences$patient <- gsub(":.*", "", cdr3_sequences$patient) all_patients <- sort(unique(cdr3_sequences$patient)) all_patients <- all_patients[!is.na(all_patients)] all_hlas <- unlist(strsplit(unique(cdr3_sequences$HLA), ",")) @@ -263,116 +310,134 @@ clusterScoring <- function(cluster_list, num_patients <- length(all_patients) num_HLAs <- length(all_hlas) - all_patient_hlas <- lapply(all_patients, function(x){ - sort(unique(gsub(":.*", "", unlist(strsplit(cdr3_sequences$HLA[cdr3_sequences$patient == x][1], ",")), perl = TRUE))) + all_patient_hlas <- lapply(all_patients, function(x) { + sort(unique(gsub( + ":.*", "", + unlist(strsplit( + cdr3_sequences$HLA[cdr3_sequences$patient == x][1], "," + )), + perl = TRUE + ))) }) names(all_patient_hlas) <- all_patients all_hlas <- data.frame(HLA = all_hlas) - all_hlas$counts <- apply(all_hlas, 1, function(x){ - val <- 0 - for(pat in all_patients){ - if(x %in% all_patient_hlas[[pat]]) val <- val+1 - } - val - }) + all_hlas$counts <- vapply(all_hlas$HLA, function(hla) { + sum(vapply(all_patient_hlas, function(pat_hlas) { + hla %in% pat_hlas + }, logical(1))) + }, numeric(1)) } ################################################################# ## Scoring ## ################################################################# - .setup_parallel(n_cores) + BPPARAM <- .setup_parallel(n_cores) - actCluster <- NULL - res <- foreach::foreach(actCluster = seq_along(cluster_list)) %dopar% { + res <- BiocParallel::bplapply(seq_along(cluster_list), function(actCluster) { ### Get sequences and information of current cluster act_seq_infos <- cluster_list[[actCluster]] - num_members <- nrow(act_seq_infos) # number of ALL members - ori_num_members <- length(unique(act_seq_infos$CDR3b)) # number of all unique CDR3b sequences + num_members <- nrow(act_seq_infos) + ori_num_members <- length(unique(act_seq_infos$CDR3b)) all_scores <- c() ### Get network size score from lookup table score_network_size <- 1 - nearest_sample_size <- order(abs(1-as.numeric(colnames(ref_cluster_sizes)[-1])/nrow(cdr3_sequences)))[1] - if(ori_num_members > 100){ - score_network_size <- ref_cluster_sizes[100,nearest_sample_size+1] + nearest_sample_size <- order( + abs(1 - as.numeric(colnames(ref_cluster_sizes)[-1]) / + nrow(cdr3_sequences)) + )[1] + if (ori_num_members > 100) { + score_network_size <- ref_cluster_sizes[100, nearest_sample_size + 1] } else { - score_network_size <- ref_cluster_sizes[ori_num_members,nearest_sample_size+1] + score_network_size <- ref_cluster_sizes[ + ori_num_members, nearest_sample_size + 1] } all_scores <- c(all_scores, score_network_size) ### Enrichment of CDR3 length (spectratype) within cluster - score_cdr3_length <- c() - # calculate score of sample (product of all frequencies) pick_freqs <- data.frame(table(nchar(unique(act_seq_infos$CDR3b)))) colnames(pick_freqs) <- c("object", "probs") - pick_freqs$probs <- pick_freqs$probs/ori_num_members + pick_freqs$probs <- pick_freqs$probs / ori_num_members sample_score <- round(prod(pick_freqs$probs), digits = 3) # generate random subsamples - random_subsample <- list() - for(i in seq_len(sim_depth)){ - random_subsample[[i]] <- sample.int(n = length(cdr3_length_ref_frequencies), size = ori_num_members, - prob = cdr3_length_ref_frequencies, replace = TRUE) - } + random_subsample <- lapply(seq_len(sim_depth), function(i) { + sample.int( + n = length(cdr3_length_ref_frequencies), + size = ori_num_members, + prob = cdr3_length_ref_frequencies, + replace = TRUE + ) + }) - # calculate score of subsamples (product of all frequencies) - pick_freqs <- stringdist::seq_qgrams(.list = random_subsample)[,-1]/ori_num_members + # calculate score of subsamples + pick_freqs <- stringdist::seq_qgrams( + .list = random_subsample + )[, -1] / ori_num_members pick_freqs[pick_freqs == 0] <- 1 - pick_freqs <- round(exp(colSums(log(pick_freqs))), digits = 3) # vectorized way to calculate the product of each column - if(gliph_version == 1){ - score_cdr3_length <- sum(pick_freqs >= sample_score)/sim_depth + pick_freqs <- round(exp(colSums(log(pick_freqs))), digits = 3) + if (gliph_version == 1) { + score_cdr3_length <- sum(pick_freqs >= sample_score) / sim_depth } else { - score_cdr3_length <- sum(pick_freqs > sample_score)/sim_depth + score_cdr3_length <- sum(pick_freqs > sample_score) / sim_depth + } + if (score_cdr3_length == 0) { + score_cdr3_length <- 1 / sim_depth } - if(score_cdr3_length == 0) score_cdr3_length <- 1/sim_depth # minimum score of 1/sim_depth all_scores <- c(all_scores, score_cdr3_length) ### Enrichment of v genes within cluster score_vgene <- c() - if(vgene_info == TRUE){ - - # calculate score of sample (product of all frequencies) + if (vgene_info) { pick_freqs <- data.frame(table(act_seq_infos$TRBV)) colnames(pick_freqs) <- c("object", "probs") - pick_freqs$probs <- pick_freqs$probs/num_members + pick_freqs$probs <- pick_freqs$probs / num_members sample_score <- round(prod(pick_freqs$probs), digits = 3) - # generate random subsamples - random_subsample <- list() - for(i in seq_len(sim_depth)){ - random_subsample[[i]] <- sample.int(n = length(vgene_ref_frequencies), size = num_members, - prob = vgene_ref_frequencies, replace = TRUE) - } - - # calculate score of subsamples (product of all frequencies) - pick_freqs <- stringdist::seq_qgrams(.list = random_subsample)[,-1]/num_members + random_subsample <- lapply(seq_len(sim_depth), function(i) { + sample.int( + n = length(vgene_ref_frequencies), + size = num_members, + prob = vgene_ref_frequencies, + replace = TRUE + ) + }) + + pick_freqs <- stringdist::seq_qgrams( + .list = random_subsample + )[, -1] / num_members pick_freqs[pick_freqs == 0] <- 1 - pick_freqs <- round(exp(colSums(log(pick_freqs))), digits = 3) # vectorized way to calculate the product of each column - if(gliph_version == 1){ - score_vgene <- sum(pick_freqs >= sample_score)/sim_depth + pick_freqs <- round(exp(colSums(log(pick_freqs))), digits = 3) + if (gliph_version == 1) { + score_vgene <- sum(pick_freqs >= sample_score) / sim_depth } else { - score_vgene <- sum(pick_freqs > sample_score)/sim_depth + score_vgene <- sum(pick_freqs > sample_score) / sim_depth } - if(score_vgene == 0) score_vgene <- 1/sim_depth # minimum score of 1/sim_depth + if (score_vgene == 0) score_vgene <- 1 / sim_depth all_scores <- c(all_scores, score_vgene) } ### Enrichment of clonal expansion within cluster score_clonal_expansion <- c() - if(counts_info == TRUE){ - sample_score <- sum(as.numeric(act_seq_infos$counts))/num_members - counter <- 0 - for(i in seq_len(sim_depth)){ - random_subsample <- sample(x = cdr3_sequences$counts, size = num_members, replace = FALSE) - test_score <- sum(as.numeric(random_subsample))/num_members - if(test_score>=sample_score) counter <- counter+1 + if (counts_info) { + sample_score <- sum(as.numeric(act_seq_infos$counts)) / num_members + test_scores <- vapply(seq_len(sim_depth), function(i) { + random_subsample <- sample( + x = cdr3_sequences$counts, size = num_members, replace = TRUE + ) + sum(as.numeric(random_subsample)) / num_members + }, numeric(1)) + counter <- sum(test_scores >= sample_score) + if (counter == 0) { + score_clonal_expansion <- 1 / sim_depth + } else { + score_clonal_expansion <- counter / sim_depth } - if(counter == 0) score_clonal_expansion <- 1/sim_depth else score_clonal_expansion <- counter/sim_depth score_clonal_expansion <- round(score_clonal_expansion, digits = 3) all_scores <- c(all_scores, score_clonal_expansion) @@ -381,77 +446,95 @@ clusterScoring <- function(cluster_list, ### Enrichment of common HLA among donor TCR contributors in cluster score_hla <- c() lowest_hla <- "" - if(hla_info == TRUE && patient_info == TRUE){ - act_seq_infos <- act_seq_infos[act_seq_infos$HLA != "" & !is.na(act_seq_infos$HLA),] - - if(nrow(act_seq_infos) > 0){ - act_seq_infos$patient <- gsub(":.*", "",act_seq_infos$patient) - - score_hla <- 1 - for(act_hla in seq_len(num_HLAs)){ - crg_patient_count <- length(unique(act_seq_infos$patient)) - crg_patient_hla_count <- sum(unlist(lapply(all_patient_hlas[unique(act_seq_infos$patient)], function(x){ - if(all_hlas$HLA[act_hla] %in% x) 1 else 0 - }))) - if(crg_patient_hla_count > 1){ - act_Prob <- sum(choose(all_hlas$counts[act_hla], crg_patient_hla_count:crg_patient_count)*choose(num_patients-all_hlas$counts[act_hla], crg_patient_count-crg_patient_hla_count:crg_patient_count)/choose(num_patients, crg_patient_count)) - if(act_Prob 0) { + act_seq_infos$patient <- gsub(":.*", "", act_seq_infos$patient) + + score_hla <- 1 + for (act_hla in seq_len(num_HLAs)) { + crg_patient_count <- length(unique(act_seq_infos$patient)) + crg_patient_hla_count <- sum(vapply( + all_patient_hlas[unique(act_seq_infos$patient)], + function(x) { + if (all_hlas$HLA[act_hla] %in% x) 1L else 0L + }, integer(1) + )) + if (crg_patient_hla_count > 1) { + act_Prob <- sum( + choose( + all_hlas$counts[act_hla], + crg_patient_hla_count:crg_patient_count + ) * + choose( + num_patients - all_hlas$counts[act_hla], + crg_patient_count - + crg_patient_hla_count:crg_patient_count + ) / + choose(num_patients, crg_patient_count) + ) + if (act_Prob < score_hla) score_hla <- act_Prob + if (act_Prob < hla_cutoff) { + hla_str <- paste0( + all_hlas$HLA[act_hla], + " [(", crg_patient_hla_count, "/", + crg_patient_count, ") vs (", + all_hlas$counts[act_hla], "/", num_patients, + ") = ", + formatC(act_Prob, digits = 1, format = "e"), + "]" + ) + if (lowest_hla == "") { + lowest_hla <- hla_str + } else { + lowest_hla <- paste(lowest_hla, ",", hla_str) + } } } } - - } } else { - score_hla <- 1 + score_hla <- 1 } all_scores <- c(all_scores, score_hla) } ### Total score - if(gliph_version == 1) score_final <- prod(all_scores)*0.001*64 else if(gliph_version == 2) score_final <- prod(all_scores) + if (gliph_version == 1) { + score_final <- prod(all_scores) * 0.001 * 64 + } else { + score_final <- prod(all_scores) + } ### Output all_scores <- c(score_final, all_scores) all_scores <- formatC(all_scores, digits = 1, format = "e") output <- c(names(cluster_list)[actCluster], all_scores) - if(hla_info == TRUE && patient_info == TRUE){ + if (hla_info && patient_info) { output <- c(output, lowest_hla) } output - } - - .stop_parallel() + }, BPPARAM = BPPARAM) - res <- data.frame(matrix(unlist(res), ncol = 2+length(score_names), byrow = TRUE)) + res <- data.frame( + matrix(unlist(res), ncol = 2 + length(score_names), byrow = TRUE) + ) colnames(res) <- c("leader.tag", "total.score", score_names) - for(i in c("total.score", score_names)) if(i != "lowest.hlas") res[,i] <- as.numeric(res[,i]) + for (i in c("total.score", score_names)) { + if (i != "lowest.hlas") res[, i] <- as.numeric(res[, i]) + } - # set all theoretical numeric values (actually characters) to numeric values - if(is.data.frame(res)){ - for(i in seq_len(ncol(res))){ - if(suppressWarnings(any(is.na(as.numeric(res[,i])))) == FALSE) res[,i] <- as.numeric(res[,i]) + # set all theoretical numeric values to numeric + if (is.data.frame(res)) { + for (i in seq_len(ncol(res))) { + if (!suppressWarnings(any(is.na(as.numeric(res[, i]))))) { + res[, i] <- as.numeric(res[, i]) + } } } - # Closing time! - return(res[,-1]) + return(res[, -1]) } diff --git a/R/clustering.R b/R/clustering.R index 56fcbbc..8217701 100644 --- a/R/clustering.R +++ b/R/clustering.R @@ -18,6 +18,7 @@ #' @param public_tcrs Logical. If \code{FALSE}, restrict edges to same donor. #' @param cluster_min_size Integer. Minimum cluster size to retain. #' @param verbose Logical. Print progress messages. +#' @param BPPARAM A \code{BiocParallelParam} object for parallel evaluation. #' #' @return A list with: #' \describe{ @@ -27,7 +28,6 @@ #' \item{save_cluster_list_df}{Data frame for saving cluster members.} #' } #' -#' @import foreach #' @keywords internal .cluster_gliph1 <- function(clone_network, sequences, @@ -38,7 +38,8 @@ global_vgene, public_tcrs, cluster_min_size, - verbose) { + verbose, + BPPARAM) { if (verbose) message("Clustering sequences (GLIPH1.0 method).") @@ -57,7 +58,6 @@ in_local_ids <- integer(0) if (!is.null(clone_network)) { in_network <- unique(c(clone_network$V1, clone_network$V2)) - # Track which sequence indices are in any edge in_local_ids <- which(seqs %in% in_network) } @@ -98,43 +98,44 @@ clone_network[] <- lapply(clone_network, as.character) ## ---- Build tuple network ---- - x <- NULL - temp_clone_network <- foreach::foreach( - x = seq_len(nrow(clone_network)) - ) %dopar% { - act_ids1 <- which(sequences$CDR3b == clone_network[x, 1]) - act_ids2 <- which(sequences$CDR3b == clone_network[x, 2]) - - comb_ids <- expand.grid(act_ids1, act_ids2) - comb_ids <- unique(comb_ids) - - act_infos1 <- sequences[comb_ids[, 1], ] - act_infos2 <- sequences[comb_ids[, 2], ] - - ## Filter for same donor if required - if (is.logical(public_tcrs) && !public_tcrs && patient.info) { - exclude_rows <- act_infos1$patient != act_infos2$patient - act_infos1 <- act_infos1[!exclude_rows, ] - act_infos2 <- act_infos2[!exclude_rows, ] - } + temp_clone_network <- BiocParallel::bplapply( + seq_len(nrow(clone_network)), + function(x) { + act_ids1 <- which(sequences$CDR3b == clone_network[x, 1]) + act_ids2 <- which(sequences$CDR3b == clone_network[x, 2]) + + comb_ids <- expand.grid(act_ids1, act_ids2) + comb_ids <- unique(comb_ids) + + act_infos1 <- sequences[comb_ids[, 1], ] + act_infos2 <- sequences[comb_ids[, 2], ] + + ## Filter for same donor if required + if (is.logical(public_tcrs) && !public_tcrs && patient.info) { + exclude_rows <- act_infos1$patient != act_infos2$patient + act_infos1 <- act_infos1[!exclude_rows, ] + act_infos2 <- act_infos2[!exclude_rows, ] + } - ## Filter global edges for matching V-gene - if (global_vgene && vgene.info && - clone_network[x, 3] == "global" && nrow(act_infos1) > 0) { - exclude_rows <- act_infos1$TRBV != act_infos2$TRBV - act_infos1 <- act_infos1[!exclude_rows, ] - act_infos2 <- act_infos2[!exclude_rows, ] - } + ## Filter global edges for matching V-gene + if (global_vgene && vgene.info && + clone_network[x, 3] == "global" && nrow(act_infos1) > 0) { + exclude_rows <- act_infos1$TRBV != act_infos2$TRBV + act_infos1 <- act_infos1[!exclude_rows, ] + act_infos2 <- act_infos2[!exclude_rows, ] + } - if (nrow(act_infos1) > 0) { - act_infos1 <- do.call(paste, c(act_infos1, sep = "$#$#$")) - act_infos2 <- do.call(paste, c(act_infos2, sep = "$#$#$")) - var_ret <- data.frame(act_infos1, act_infos2) - t(var_ret) - } else { - NULL - } - } + if (nrow(act_infos1) > 0) { + act_infos1 <- do.call(paste, c(act_infos1, sep = "$#$#$")) + act_infos2 <- do.call(paste, c(act_infos2, sep = "$#$#$")) + var_ret <- data.frame(act_infos1, act_infos2) + t(var_ret) + } else { + NULL + } + }, + BPPARAM = BPPARAM + ) temp_clone_network <- data.frame( matrix(unlist(temp_clone_network), ncol = 2, byrow = TRUE), @@ -205,16 +206,18 @@ } if (nrow(part_4_res) > 0) { - save_cluster_list_df <- foreach::foreach( - i = seq_along(part_4_res_list), - .combine = "rbind" - ) %dopar% { - temp <- part_4_res_list[[i]] - cbind( - data.frame(tag = rep(names(part_4_res_list)[i], nrow(temp))), - temp - ) - } + save_parts <- BiocParallel::bplapply( + seq_along(part_4_res_list), + function(i) { + temp <- part_4_res_list[[i]] + cbind( + data.frame(tag = rep(names(part_4_res_list)[i], nrow(temp))), + temp + ) + }, + BPPARAM = BPPARAM + ) + save_cluster_list_df <- do.call(rbind, save_parts) } else { part_4_res <- NULL part_4_res_list <- list() @@ -259,6 +262,7 @@ #' @param boost_local_significance Logical. Whether to boost local p-values #' using germline N-nucleotide information. #' @param verbose Logical. Print progress messages. +#' @param BPPARAM A \code{BiocParallelParam} object for parallel evaluation. #' #' @return A list with: #' \describe{ @@ -270,7 +274,6 @@ #' \item{save_cluster_list_df}{Data frame for saving cluster member details.} #' } #' -#' @import foreach #' @keywords internal .cluster_gliph2 <- function(local_res, global_res, @@ -284,7 +287,8 @@ motif_distance_cutoff, cluster_min_size, boost_local_significance, - verbose) { + verbose, + BPPARAM) { if (verbose) message("Clustering sequences (GLIPH2.0 method).") @@ -449,31 +453,32 @@ ## Load BLOSUM vector for global filtering BlosumVec <- .get_blosum_vec() - i <- NULL - cluster_list <- foreach::foreach( - i = seq_len(nrow(merged_clusters)) - ) %dopar% { - if (merged_clusters$type[i] == "local") { - act_seqs <- unlist(strsplit(merged_clusters$members[i], split = " ")) - return(sequences[sequences$CDR3b %in% act_seqs, ]) - } - if (merged_clusters$type[i] == "global") { - act_seqs <- unlist(strsplit(merged_clusters$members[i], split = " ")) - act_details <- unlist(strsplit(merged_clusters$tag[i], split = "_")) - if (!global_vgene) { - return(data.frame( - sequences[sequences$CDR3b %in% act_seqs, ], - stringsAsFactors = FALSE - )) - } else { - return(data.frame( - sequences[sequences$CDR3b %in% act_seqs & - sequences$TRBV == act_details[2], ], - stringsAsFactors = FALSE - )) + cluster_list <- BiocParallel::bplapply( + seq_len(nrow(merged_clusters)), + function(i) { + if (merged_clusters$type[i] == "local") { + act_seqs <- unlist(strsplit(merged_clusters$members[i], split = " ")) + return(sequences[sequences$CDR3b %in% act_seqs, ]) } - } - } + if (merged_clusters$type[i] == "global") { + act_seqs <- unlist(strsplit(merged_clusters$members[i], split = " ")) + act_details <- unlist(strsplit(merged_clusters$tag[i], split = "_")) + if (!global_vgene) { + return(data.frame( + sequences[sequences$CDR3b %in% act_seqs, ], + stringsAsFactors = FALSE + )) + } else { + return(data.frame( + sequences[sequences$CDR3b %in% act_seqs & + sequences$TRBV == act_details[2], ], + stringsAsFactors = FALSE + )) + } + } + }, + BPPARAM = BPPARAM + ) names(cluster_list) <- merged_clusters$tag ## Update local cluster sizes @@ -516,90 +521,100 @@ if (!is.null(merged_clusters) && length(cluster_list) > 0) { ## Local edges if (local_similarities && any(merged_clusters$type == "local")) { - local_clone_network <- foreach::foreach( - i = which(merged_clusters$type == "local") - ) %dopar% { - temp_members <- cluster_list[[i]]$CDR3b - if (structboundaries) { - temp_members_frags <- substr( - temp_members, - boundary_size + 1, - nchar(temp_members) - boundary_size + local_edge_list <- BiocParallel::bplapply( + which(merged_clusters$type == "local"), + function(i) { + temp_members <- cluster_list[[i]]$CDR3b + if (structboundaries) { + temp_members_frags <- substr( + temp_members, + boundary_size + 1, + nchar(temp_members) - boundary_size + ) + } else { + temp_members_frags <- temp_members + } + + motif_name <- strsplit(names(cluster_list)[i], split = "_")[[1]][[1]] + temp_pos <- stringr::str_locate(temp_members_frags, motif_name) + + if (length(temp_members) >= 2) { + combn_ids <- t(utils::combn(seq_along(temp_members), m = 2)) + } else { + combn_ids <- t(utils::combn(rep(1, 2), m = 2)) + } + temp_df <- data.frame( + V1 = temp_members[combn_ids[, 1]], + V2 = temp_members[combn_ids[, 2]], + type = rep("local", nrow(combn_ids)), + cluster_tag = rep(names(cluster_list)[i], nrow(combn_ids)), + stringsAsFactors = FALSE ) - } else { - temp_members_frags <- temp_members - } - motif_name <- strsplit(names(cluster_list)[i], split = "_")[[1]][[1]] - temp_pos <- stringr::str_locate(temp_members_frags, motif_name) + ## Restrict by positional distance + temp_df <- unique(temp_df[ + abs(temp_pos[combn_ids[, 1], 1] - + temp_pos[combn_ids[, 2], 1]) < motif_distance_cutoff, ]) - if (length(temp_members) >= 2) { - combn_ids <- t(utils::combn(seq_along(temp_members), m = 2)) - } else { - combn_ids <- t(utils::combn(rep(1, 2), m = 2)) - } - temp_df <- data.frame( - V1 = temp_members[combn_ids[, 1]], - V2 = temp_members[combn_ids[, 2]], - type = rep("local", nrow(combn_ids)), - cluster_tag = rep(names(cluster_list)[i], nrow(combn_ids)), + temp_df + }, + BPPARAM = BPPARAM + ) + local_clone_network <- do.call(rbind, local_edge_list) + if (is.null(local_clone_network)) { + local_clone_network <- data.frame( + V1 = character(0), V2 = character(0), + type = character(0), cluster_tag = character(0), stringsAsFactors = FALSE ) - - ## Restrict by positional distance - temp_df <- unique(temp_df[ - abs(temp_pos[combn_ids[, 1], 1] - - temp_pos[combn_ids[, 2], 1]) < motif_distance_cutoff, ]) - - t(temp_df) } - local_clone_network <- data.frame( - matrix(unlist(local_clone_network), ncol = 4, byrow = TRUE), - stringsAsFactors = FALSE - ) - colnames(local_clone_network) <- c("V1", "V2", "type", "cluster_tag") clone_network <- local_clone_network } ## Global edges if (global_similarities && any(merged_clusters$type == "global")) { - global_clone_network <- foreach::foreach( - i = which(merged_clusters$type == "global") - ) %dopar% { - temp_members <- cluster_list[[i]]$CDR3b - - if (length(temp_members) >= 2) { - combn_ids <- t(utils::combn(seq_along(temp_members), m = 2)) - } else { - combn_ids <- t(utils::combn(rep(1, 2), m = 2)) - } - temp_df <- data.frame( - V1 = temp_members[combn_ids[, 1]], - V2 = temp_members[combn_ids[, 2]], - type = rep("global", nrow(combn_ids)), - cluster_tag = rep(names(cluster_list)[i], nrow(combn_ids)), - stringsAsFactors = FALSE - ) + global_edge_list <- BiocParallel::bplapply( + which(merged_clusters$type == "global"), + function(i) { + temp_members <- cluster_list[[i]]$CDR3b + + if (length(temp_members) >= 2) { + combn_ids <- t(utils::combn(seq_along(temp_members), m = 2)) + } else { + combn_ids <- t(utils::combn(rep(1, 2), m = 2)) + } + temp_df <- data.frame( + V1 = temp_members[combn_ids[, 1]], + V2 = temp_members[combn_ids[, 2]], + type = rep("global", nrow(combn_ids)), + cluster_tag = rep(names(cluster_list)[i], nrow(combn_ids)), + stringsAsFactors = FALSE + ) - ## BLOSUM62 filtering at variable position - if (!all_aa_interchangeable) { - tag_name <- names(cluster_list)[i] - temp_pos <- stringr::str_locate(tag_name, "%") - if (structboundaries) temp_pos <- temp_pos + boundary_size - temp_df <- unique(temp_df[ - paste0( - substr(temp_df$V1, temp_pos[1], temp_pos[1]), - substr(temp_df$V2, temp_pos[1], temp_pos[1]) - ) %in% BlosumVec, ]) - } + ## BLOSUM62 filtering at variable position + if (!all_aa_interchangeable) { + tag_name <- names(cluster_list)[i] + temp_pos <- stringr::str_locate(tag_name, "%") + if (structboundaries) temp_pos <- temp_pos + boundary_size + temp_df <- unique(temp_df[ + paste0( + substr(temp_df$V1, temp_pos[1], temp_pos[1]), + substr(temp_df$V2, temp_pos[1], temp_pos[1]) + ) %in% BlosumVec, ]) + } - t(temp_df) - } - global_clone_network <- data.frame( - matrix(unlist(global_clone_network), ncol = 4, byrow = TRUE), - stringsAsFactors = FALSE + temp_df + }, + BPPARAM = BPPARAM ) - colnames(global_clone_network) <- c("V1", "V2", "type", "cluster_tag") + global_clone_network <- do.call(rbind, global_edge_list) + if (is.null(global_clone_network)) { + global_clone_network <- data.frame( + V1 = character(0), V2 = character(0), + type = character(0), cluster_tag = character(0), + stringsAsFactors = FALSE + ) + } if (is.null(clone_network)) { clone_network <- global_clone_network @@ -636,16 +651,18 @@ ) merged_clusters$OvE[is.infinite(merged_clusters$OvE)] <- 0 - save_cluster_list_df <- foreach::foreach( - i = seq_along(cluster_list), - .combine = "rbind" - ) %dopar% { - temp <- cluster_list[[i]] - cbind( - data.frame(tag = rep(names(cluster_list)[i], nrow(temp))), - temp - ) - } + save_parts <- BiocParallel::bplapply( + seq_along(cluster_list), + function(i) { + temp <- cluster_list[[i]] + cbind( + data.frame(tag = rep(names(cluster_list)[i], nrow(temp))), + temp + ) + }, + BPPARAM = BPPARAM + ) + save_cluster_list_df <- do.call(rbind, save_parts) } list( diff --git a/R/deNovoTCRs.R b/R/deNovoTCRs.R index c266614..2e72e8d 100644 --- a/R/deNovoTCRs.R +++ b/R/deNovoTCRs.R @@ -96,7 +96,6 @@ #' @references Glanville, Jacob, et al. #' "Identifying specificity groups in the T cell receptor repertoire." Nature 547.7661 (2017): 94. #' @references https://github.com/immunoengineer/gliph -#' @import foreach #' @export deNovoTCRs <- function(convergence_group_tag, result_folder = "", @@ -116,66 +115,66 @@ deNovoTCRs <- function(convergence_group_tag, ################################################################## ### convergence_group_tag - if(!is.character(convergence_group_tag)) stop("convergence_group_tag has to be a character object") - if(length(convergence_group_tag) > 1) stop("convergence_group_tag has to be a single character string") + if (!is.character(convergence_group_tag)) stop("convergence_group_tag has to be a character object") + if (length(convergence_group_tag) > 1) stop("convergence_group_tag has to be a single character string") ### result_folder and clustering_output - if(!is.character(result_folder)) stop("result_folder has to be a character object") - if(length(result_folder) > 1) stop("result_folder has to be a single path") + if (!is.character(result_folder)) stop("result_folder has to be a character object") + if (length(result_folder) > 1) stop("result_folder has to be a single path") save_results <- FALSE - if(result_folder != ""){ - if(substr(result_folder,nchar(result_folder),nchar(result_folder)) != "/") result_folder <- paste0(result_folder,"/") + if (result_folder != "") { + if (substr(result_folder, nchar(result_folder), nchar(result_folder)) != "/") result_folder <- paste0(result_folder, "/") if (!dir.exists(result_folder)) dir.create(result_folder) save_results <- TRUE } else { - if(!is.list(clustering_output)) stop("If 'result_folder' = \"\" the output list of clustering must be given by 'clustering_output'.") + if (!is.list(clustering_output)) stop("If 'result_folder' = \"\" the output list of clustering must be given by 'clustering_output'.") } ### refdb_beta # if(!(refdb_beta %in% c("gliph_reference", "human_v1.0_CD4", "human_v1.0_CD8", "human_v1.0_CD48", "human_v2.0_CD4", # "human_v2.0_CD8", "human_v2.0_CD48", "mouse_v1.0_CD4", "mouse_v1.0_CD8", "mouse_v1.0_CD48")) && # !is.data.frame(refdb_beta)){ - if(!is.data.frame(refdb_beta)){ - if(length(refdb_beta) != 1 || !is.character(refdb_beta)){ + if (!is.data.frame(refdb_beta)) { + if (length(refdb_beta) != 1 || !is.character(refdb_beta)) { stop("refdb_beta has to be a data frame (containing CDR3b sequences in the first column and optional V-gene information in the second column) or the value 'gliph_reference'") - } else if(!(refdb_beta %in% c("gliph_reference"))){ + } else if (!(refdb_beta %in% c("gliph_reference"))) { stop("refdb_beta has to be a data frame (containing CDR3b sequences in the first column and optional V-gene information in the second column) or the value 'gliph_reference'") } } ### accept_sequences_with_C_F_start_end - if(!is.logical(accept_sequences_with_C_F_start_end)) stop("accept_sequences_with_C_F_start_end has to be logical") + if (!is.logical(accept_sequences_with_C_F_start_end)) stop("accept_sequences_with_C_F_start_end has to be logical") ### normalization - if(!is.logical(normalization)) stop("normalization has to be logical") + if (!is.logical(normalization)) stop("normalization has to be logical") ### sims - if(!is.numeric(sims)) stop("sims has to be numeric") - if(length(sims) > 1) stop("sims has to be a single number") - if(sims < 1) stop("sims must be at least 1") + if (!is.numeric(sims)) stop("sims has to be numeric") + if (length(sims) > 1) stop("sims has to be a single number") + if (sims < 1) stop("sims must be at least 1") sims <- round(sims) ### num_tops - if(!is.numeric(num_tops)) stop("num_tops has to be numeric") - if(length(num_tops) > 1) stop("num_tops has to be a single number") - if(num_tops < 1) stop("num_tops must be at least 1") + if (!is.numeric(num_tops)) stop("num_tops has to be numeric") + if (length(num_tops) > 1) stop("num_tops has to be a single number") + if (num_tops < 1) stop("num_tops must be at least 1") num_tops <- round(num_tops) ### min_length - if(!is.numeric(min_length)) stop("min_length has to be numeric") - if(length(min_length) > 1) stop("min_length has to be a single number") - if(min_length < 1) stop("min_length must be at least 1") + if (!is.numeric(min_length)) stop("min_length has to be numeric") + if (length(min_length) > 1) stop("min_length has to be a single number") + if (min_length < 1) stop("min_length must be at least 1") min_length <- round(min_length) ### make_figure - if(!is.logical(make_figure)) stop("make_figure has to be logical") + if (!is.logical(make_figure)) stop("make_figure has to be logical") ### n_cores - if(is.null(n_cores)) n_cores <- max(1, parallel::detectCores()-2) else { - if(!is.numeric(n_cores)) stop("n_cores has to be numeric") - if(length(n_cores) > 1) stop("n_cores has to be a single number") - if(n_cores < 1) stop("n_cores must be at least 1") + if (is.null(n_cores)) n_cores <- max(1, parallel::detectCores() - 2) else { + if (!is.numeric(n_cores)) stop("n_cores has to be numeric") + if (length(n_cores) > 1) stop("n_cores has to be a single number") + if (n_cores < 1) stop("n_cores must be at least 1") - n_cores <- max(1, min(n_cores, parallel::detectCores()-2)) + n_cores <- max(1, min(n_cores, parallel::detectCores() - 2)) } # Amino acids one letter code @@ -187,22 +186,22 @@ deNovoTCRs <- function(convergence_group_tag, ### load convergence groups from file or from input and only use specified convergence group message("Loading convergence group with tag ", convergence_group_tag, ".") - if(is.null(clustering_output)){ + if (is.null(clustering_output)) { clustering_output <- loadGLIPH(result_folder = result_folder) crg <- clustering_output$cluster_list } else { clustering_output <- clustering_output crg <- clustering_output$cluster_list } - if(!(convergence_group_tag %in% names(crg))) stop("Could not find convergence group with tag ",convergence_group_tag, " in clustering results stored in", result_folder, ".") + if (!(convergence_group_tag %in% names(crg))) stop("Could not find convergence group with tag ", convergence_group_tag, " in clustering results stored in", result_folder, ".") crg <- crg[[convergence_group_tag]] all_crg_cdr3_seqs <- crg$CDR3b ### Filter sequences for a minimun sequenc length excluded <- which(nchar(all_crg_cdr3_seqs) < min_length) - if(length(excluded) > 0){ + if (length(excluded) > 0) { all_crg_cdr3_seqs <- all_crg_cdr3_seqs[-excluded] - if(length(all_crg_cdr3_seqs) > 0){ + if (length(all_crg_cdr3_seqs) > 0) { warning(length(excluded), " sequences of the convergence group were excluded from the further procedure due to falling below a minimum length of ", min_length, ".", call. = FALSE) } else { stop("No sequences of the convergence group are of minimum length of ", min_length, ". For further procedure, adjust the parameter 'min_length'") @@ -214,7 +213,7 @@ deNovoTCRs <- function(convergence_group_tag, ################################################################# ### Initiate parallelization - .setup_parallel(n_cores) + BPPARAM <- .setup_parallel(n_cores) # get all sequences in cluster crg_cdr3_seqs <- all_crg_cdr3_seqs @@ -224,31 +223,31 @@ deNovoTCRs <- function(convergence_group_tag, ref_vgenes <- c() refseqs <- c() v_gene_norm <- normalization - if(normalization == TRUE){ + if (normalization == TRUE) { - if(is.data.frame(refdb_beta)) { + if (is.data.frame(refdb_beta)) { refseqs <- refdb_beta refseqs[] <- lapply(refseqs, as.character) - if(ncol(refseqs) > 1){ + if (ncol(refseqs) > 1) { message("Notification: First column of reference database is considered as cdr3 sequences.") } - if(ncol(refseqs) > 1 && v_gene_norm == TRUE){ + if (ncol(refseqs) > 1 && v_gene_norm == TRUE) { message("Notification: Second column of reference database is considered as V-gene information.") - } else if(v_gene_norm == TRUE){ + } else if (v_gene_norm == TRUE) { warning("Beta sequence reference database is missing column containing V-genes. Without V-gene information normalization may be inaccurate.", call. = FALSE) v_gene_norm <- FALSE } - if(ncol(refseqs) == 1) refseqs <- cbind(refseqs, rep("", nrow(refseqs))) + if (ncol(refseqs) == 1) refseqs <- cbind(refseqs, rep("", nrow(refseqs))) - refseqs <- refseqs[, c(1,2)] + refseqs <- refseqs[, c(1, 2)] colnames(refseqs) <- c("CDR3b", "TRBV") refseqs <- unique(refseqs) - if(accept_sequences_with_C_F_start_end) refseqs <- refseqs[grep(pattern = "^C.*F$",x = refseqs$CDR3b,perl = TRUE),] - refseqs <- refseqs[which(nchar(refseqs$CDR3b) >= min_length),] - refseqs <- refseqs[grep("^[ACDEFGHIKLMNOPQRSTUVWY]*$", refseqs$CDR3b),] + if (accept_sequences_with_C_F_start_end) refseqs <- refseqs[grep(pattern = "^C.*F$", x = refseqs$CDR3b, perl = TRUE), ] + refseqs <- refseqs[which(nchar(refseqs$CDR3b) >= min_length), ] + refseqs <- refseqs[grep("^[ACDEFGHIKLMNOPQRSTUVWY]*$", refseqs$CDR3b), ] - if(nrow(refseqs) == 0){ + if (nrow(refseqs) == 0) { normalization <- FALSE v_gene_norm <- FALSE warning("No reference sequences with a minimum length of ", min_length, " given. Normalization therefore not possible. Adjust min_length to enable normalization.", call. = FALSE) @@ -258,29 +257,29 @@ deNovoTCRs <- function(convergence_group_tag, } } else { reference_list <- NULL - utils::data("reference_list",envir = environment(), package = "immGLIPH") + utils::data("reference_list", envir = environment(), package = "immGLIPH") refseqs <- as.data.frame(reference_list[[refdb_beta]]$refseqs) refseqs[] <- lapply(refseqs, as.character) - if(ncol(refseqs) > 1){ + if (ncol(refseqs) > 1) { message("Notification: First column of reference database is considered as cdr3 sequences.") } - if(ncol(refseqs) > 1 && v_gene_norm == TRUE){ + if (ncol(refseqs) > 1 && v_gene_norm == TRUE) { message("Notification: Second column of reference database is considered as V-gene information.") - } else if(v_gene_norm == TRUE){ + } else if (v_gene_norm == TRUE) { warning("Beta sequence reference database is missing column containing V-genes. Without V-gene information normalization may be inaccurate.", call. = FALSE) v_gene_norm <- FALSE } - if(ncol(refseqs) == 1) refseqs <- cbind(refseqs, rep("", nrow(refseqs))) + if (ncol(refseqs) == 1) refseqs <- cbind(refseqs, rep("", nrow(refseqs))) - refseqs <- refseqs[, c(1,2)] + refseqs <- refseqs[, c(1, 2)] colnames(refseqs) <- c("CDR3b", "TRBV") refseqs <- unique(refseqs) - if(accept_sequences_with_C_F_start_end) refseqs <- refseqs[grep(pattern = "^C.*F$",x = refseqs$CDR3b,perl = TRUE),] - refseqs <- refseqs[which(nchar(refseqs$CDR3b) >= min_length),] - refseqs <- refseqs[grep("^[ACDEFGHIKLMNPQRSTVWY]*$", refseqs$CDR3b),] + if (accept_sequences_with_C_F_start_end) refseqs <- refseqs[grep(pattern = "^C.*F$", x = refseqs$CDR3b, perl = TRUE), ] + refseqs <- refseqs[which(nchar(refseqs$CDR3b) >= min_length), ] + refseqs <- refseqs[grep("^[ACDEFGHIKLMNPQRSTVWY]*$", refseqs$CDR3b), ] - if(nrow(refseqs) == 0){ + if (nrow(refseqs) == 0) { normalization <- FALSE v_gene_norm <- FALSE warning("No reference sequences with a minimum length of ", min_length, " given. Normalization therefore not possible. Adjust min_length to enable normalization.", call. = FALSE) @@ -291,9 +290,9 @@ deNovoTCRs <- function(convergence_group_tag, } # load V genes of cluster members - if("TRBV" %in% colnames(crg) && v_gene_norm == TRUE){ + if ("TRBV" %in% colnames(crg) && v_gene_norm == TRUE) { v_genes <- crg$TRBV - } else if(v_gene_norm == TRUE){ + } else if (v_gene_norm == TRUE) { v_gene_norm <- FALSE warning("Without V-gene information of sample sequences normalization may be inaccurate.", call. = FALSE) } else warning("Without V-gene restriction normalization may be inaccurate.", call. = FALSE) @@ -310,22 +309,22 @@ deNovoTCRs <- function(convergence_group_tag, message("Calculating positional weight matrix of convergence group.") # initialize the matrix - crg_pwm_scoring <- as.data.frame(matrix(rep(0, min_length*length(aa_code)), ncol = length(aa_code))) + crg_pwm_scoring <- as.data.frame(matrix(rep(0, min_length * length(aa_code)), ncol = length(aa_code))) colnames(crg_pwm_scoring) <- aa_code # for every position determine the amino acid frequency - for(i in seq_len(nrow(crg_pwm_scoring))){ + for (i in seq_len(nrow(crg_pwm_scoring))) { aa_freqs <- rep(0, length(aa_code)) act_letters <- substr(crg_cdr3_seqs, i, i) - for(j in seq_along(aa_freqs)){ + for (j in seq_along(aa_freqs)) { aa_freqs[j] <- sum(act_letters == aa_code[j]) } # Pseudocounts of 0.5% per aa zeroFreqs <- which(aa_freqs == 0) - aa_freqs <- aa_freqs/sum(aa_freqs)*(1-length(zeroFreqs)*0.005) + aa_freqs <- aa_freqs / sum(aa_freqs) * (1 - length(zeroFreqs) * 0.005) aa_freqs[zeroFreqs] <- 0.005 - crg_pwm_scoring[i,] <- aa_freqs + crg_pwm_scoring[i, ] <- aa_freqs } ### Calculate scores of convergence group members, only use first min_length N-terminal positions @@ -333,61 +332,58 @@ deNovoTCRs <- function(convergence_group_tag, message("Calculating scores of convergence group members.") # is parallelization necessary? - if(crg_num_seqs > 10000){ + if (crg_num_seqs > 10000) { # distribute sequences equally to all cores - distribute <- lapply(seq_len(n_cores), function(x){return(((x-1)*floor(length(crg_cdr3_seqs)/n_cores)+1):(x*floor(length(crg_cdr3_seqs)/n_cores)))}) + distribute <- lapply(seq_len(n_cores), function(x) {return(((x - 1) * floor(length(crg_cdr3_seqs) / n_cores) + 1):(x * floor(length(crg_cdr3_seqs) / n_cores)))}) # calculate the score - crg_cdr3_scores <- foreach::foreach(i = seq_along(distribute), .combine = c) %dopar% { - temp_scores <- rep(1, length(distribute[[i]])) - temp_seqs <- crg_cdr3_seqs[distribute[[i]]] - for(i in seq_len(nrow(crg_pwm_scoring))){ - temp_scores <- temp_scores*unlist(crg_pwm_scoring[i, substr(temp_seqs, i, i)]) + crg_cdr3_scores <- unlist(BiocParallel::bplapply(seq_along(distribute), function(idx) { + temp_scores <- rep(1, length(distribute[[idx]])) + temp_seqs <- crg_cdr3_seqs[distribute[[idx]]] + for (k in seq_len(nrow(crg_pwm_scoring))) { + temp_scores <- temp_scores * unlist(crg_pwm_scoring[k, substr(temp_seqs, k, k)]) } - - return(temp_scores) - } + temp_scores + }, BPPARAM = BPPARAM)) } else { # calculate the score - for(i in seq_len(nrow(crg_pwm_scoring))){ - crg_cdr3_scores <- crg_cdr3_scores*unlist(crg_pwm_scoring[i, substr(crg_cdr3_seqs, i, i)]) + for (i in seq_len(nrow(crg_pwm_scoring))) { + crg_cdr3_scores <- crg_cdr3_scores * unlist(crg_pwm_scoring[i, substr(crg_cdr3_seqs, i, i)]) } } ### Normalize scores of convergence group members, only use first 10 N-terminal positions # calculate the probability that a score at least this high occurs in the reference database - if(normalization == TRUE){ + if (normalization == TRUE) { message("Normalizing scores of convergence group members.") # Calculate scores of reference database (analogue to sample sequences) refseq_scores <- rep(1, length(refseqs)) - if(length(refseqs) > 10000){ - distribute <- lapply(seq_len(n_cores), function(x){return(((x-1)*floor(length(refseqs)/n_cores)+1):(x*floor(length(refseqs)/n_cores)))}) - - refseq_scores <- foreach::foreach(i = seq_along(distribute), .combine = c) %dopar% { - temp_scores <- rep(1, length(distribute[[i]])) - temp_seqs <- refseqs[distribute[[i]]] - for(i in seq_len(nrow(crg_pwm_scoring))){ - temp_scores <- temp_scores*unlist(crg_pwm_scoring[i, substr(temp_seqs, i, i)]) + if (length(refseqs) > 10000) { + distribute <- lapply(seq_len(n_cores), function(x) {return(((x - 1) * floor(length(refseqs) / n_cores) + 1):(x * floor(length(refseqs) / n_cores)))}) + + refseq_scores <- unlist(BiocParallel::bplapply(seq_along(distribute), function(idx) { + temp_scores <- rep(1, length(distribute[[idx]])) + temp_seqs <- refseqs[distribute[[idx]]] + for (k in seq_len(nrow(crg_pwm_scoring))) { + temp_scores <- temp_scores * unlist(crg_pwm_scoring[k, substr(temp_seqs, k, k)]) } - - return(temp_scores) - } + temp_scores + }, BPPARAM = BPPARAM)) } else { - for(i in seq_len(nrow(crg_pwm_scoring))){ - refseq_scores <- refseq_scores*unlist(crg_pwm_scoring[i, substr(refseqs, i, i)]) + for (i in seq_len(nrow(crg_pwm_scoring))) { + refseq_scores <- refseq_scores * unlist(crg_pwm_scoring[i, substr(refseqs, i, i)]) } } # Calculate normalized scores, if requested restrict score comparison to identical V genes - crg_cdr3_norm_scores <- foreach::foreach(i = seq_len(crg_num_seqs), .combine = c) %dopar% { + crg_cdr3_norm_scores <- unlist(BiocParallel::bplapply(seq_len(crg_num_seqs), function(idx) { v_gene_penalty <- rep(0, length(refseqs)) - if(v_gene_norm == TRUE){ - v_gene_penalty[ref_vgenes != v_genes[i]] <- -2 + if (v_gene_norm) { + v_gene_penalty[ref_vgenes != v_genes[idx]] <- -2 } - - return(sum((refseq_scores + v_gene_penalty) >= crg_cdr3_scores[i])/length(refseqs)) - } + sum((refseq_scores + v_gene_penalty) >= crg_cdr3_scores[idx]) / length(refseqs) + }, BPPARAM = BPPARAM)) } ### Create global PWM of convergence group @@ -397,33 +393,33 @@ deNovoTCRs <- function(convergence_group_tag, crg_pwm_predicting_list <- list() # get the probability of this CDR3b length to occur - crg_len_prob <- data.frame(length = unique(crg_cdr3_lens),probability = rep(0, length(unique(crg_cdr3_lens)))) - for(i in seq_len(nrow(crg_len_prob))){ - crg_len_prob$probability[i] <- sum(crg_cdr3_lens == crg_len_prob$length[i])/crg_num_seqs + crg_len_prob <- data.frame(length = unique(crg_cdr3_lens), probability = rep(0, length(unique(crg_cdr3_lens)))) + for (i in seq_len(nrow(crg_len_prob))) { + crg_len_prob$probability[i] <- sum(crg_cdr3_lens == crg_len_prob$length[i]) / crg_num_seqs } # receive the positional dependent amino acid frequencies for every CDR3b length - for(len in unique(crg_cdr3_lens)){ - crg_pwm_predicting <- as.data.frame(matrix(rep(0,len*length(aa_code)), ncol = length(aa_code))) + for (len in unique(crg_cdr3_lens)) { + crg_pwm_predicting <- as.data.frame(matrix(rep(0, len * length(aa_code)), ncol = length(aa_code))) colnames(crg_pwm_predicting) <- aa_code seqs <- crg_cdr3_seqs[crg_cdr3_lens == len] - for(i in seq_len(len)){ + for (i in seq_len(len)) { aa_freqs <- rep(0, length(aa_code)) act_letters <- substr(seqs, i, i) - for(j in seq_along(aa_freqs)){ + for (j in seq_along(aa_freqs)) { aa_freqs[j] <- sum(act_letters == aa_code[j]) } # Pseudocounts of 0.5% per aa zeroFreqs <- which(aa_freqs == 0) - aa_freqs <- aa_freqs/sum(aa_freqs)*(1-length(zeroFreqs)*0.005) + aa_freqs <- aa_freqs / sum(aa_freqs) * (1 - length(zeroFreqs) * 0.005) aa_freqs[zeroFreqs] <- 0.005 - crg_pwm_predicting[i,] <- aa_freqs + crg_pwm_predicting[i, ] <- aa_freqs } # if de novo sequences should only start with C and end with F, correct the PWM for this positions - if(accept_sequences_with_C_F_start_end == TRUE){ + if (accept_sequences_with_C_F_start_end == TRUE) { crg_pwm_predicting[1, ] <- rep(0, ncol(crg_pwm_predicting)) crg_pwm_predicting[nrow(crg_pwm_predicting), ] <- rep(0, ncol(crg_pwm_predicting)) crg_pwm_predicting$'C'[1] <- 1 @@ -438,74 +434,73 @@ deNovoTCRs <- function(convergence_group_tag, message("Creating ", sims, " de novo sequences.") # randomly select the length of the sequences - de_novo_lens <- sample(x = c(crg_len_prob$length, 0), size = sims, prob = c(crg_len_prob$probability,0), replace = TRUE) + de_novo_lens <- sample(x = c(crg_len_prob$length, 0), size = sims, prob = c(crg_len_prob$probability, 0), replace = TRUE) de_novo_seqs <- rep("", sims) # based on the PWM randomly create new sequences - for(len in crg_len_prob$length){ + for (len in crg_len_prob$length) { inds <- which(de_novo_lens == len) - for(i in seq_len(len)){ - rands <- aa_code[sample.int(n = length(aa_code), size = length(inds), prob = crg_pwm_predicting_list[[paste("Length", len)]][i,], replace = TRUE)] + for (i in seq_len(len)) { + rands <- aa_code[sample.int(n = length(aa_code), size = length(inds), prob = crg_pwm_predicting_list[[paste("Length", len)]][i, ], replace = TRUE)] de_novo_seqs[inds] <- paste(de_novo_seqs[inds], rands, sep = "") } } de_novo_seqs <- unique(de_novo_seqs) - if(accept_sequences_with_C_F_start_end) de_novo_seqs <- grep(pattern = "^C.*F$",x = de_novo_seqs ,perl = TRUE,value = TRUE) + if (accept_sequences_with_C_F_start_end) de_novo_seqs <- grep(pattern = "^C.*F$", x = de_novo_seqs, perl = TRUE, value = TRUE) de_novo_lens <- nchar(de_novo_seqs) # Score de_novo seqs as performed above message("Calculating scores of de novo sequences.") de_novo_seqs_scores <- rep(1, length(de_novo_seqs)) - if(length(de_novo_seqs) > 10000){ - distribute <- lapply(seq_len(n_cores), function(x){return(((x-1)*floor(length(de_novo_seqs)/n_cores)+1):(x*floor(length(de_novo_seqs)/n_cores)))}) + if (length(de_novo_seqs) > 10000) { + distribute <- lapply(seq_len(n_cores), function(x) {return(((x - 1) * floor(length(de_novo_seqs) / n_cores) + 1):(x * floor(length(de_novo_seqs) / n_cores)))}) - de_novo_seqs_scores <- foreach::foreach(i = seq_along(distribute), .combine = c) %dopar% { - temp_scores <- rep(1, length(distribute[[i]])) - temp_seqs <- de_novo_seqs[distribute[[i]]] - for(i in seq_len(nrow(crg_pwm_scoring))){ - temp_scores <- temp_scores*unlist(crg_pwm_scoring[i, substr(temp_seqs, i, i)]) + de_novo_seqs_scores <- unlist(BiocParallel::bplapply(seq_along(distribute), function(idx) { + temp_scores <- rep(1, length(distribute[[idx]])) + temp_seqs <- de_novo_seqs[distribute[[idx]]] + for (k in seq_len(nrow(crg_pwm_scoring))) { + temp_scores <- temp_scores * unlist(crg_pwm_scoring[k, substr(temp_seqs, k, k)]) } - - return(temp_scores) - } + temp_scores + }, BPPARAM = BPPARAM)) } else { - for(i in seq_len(nrow(crg_pwm_scoring))){ - de_novo_seqs_scores <- de_novo_seqs_scores*unlist(crg_pwm_scoring[i, substr(de_novo_seqs, i, i)]) + for (i in seq_len(nrow(crg_pwm_scoring))) { + de_novo_seqs_scores <- de_novo_seqs_scores * unlist(crg_pwm_scoring[i, substr(de_novo_seqs, i, i)]) } } de_novo_seqs_scores <- as.numeric(formatC(de_novo_seqs_scores, digits = 1, format = "e")) # Normalize de_novo seqs scores as performed above - if(normalization == TRUE){ + if (normalization == TRUE) { message("Normalizing scores of de novo sequences.") - de_novo_norm_scores <- foreach::foreach(i = seq_along(de_novo_seqs), .combine = c) %dopar% { - return(sum(refseq_scores >= de_novo_seqs_scores[i])/length(refseqs)) - } + de_novo_norm_scores <- unlist(BiocParallel::bplapply(seq_along(de_novo_seqs), function(idx) { + sum(refseq_scores >= de_novo_seqs_scores[idx]) / length(refseqs) + }, BPPARAM = BPPARAM)) de_novo_norm_scores <- as.numeric(formatC(de_novo_norm_scores, digits = 1, format = "e")) } # sort sequences based on score (normalized scores are prioritized) - if(normalization == TRUE){ - order_ids <- order(de_novo_norm_scores, decreasing=FALSE) + if (normalization == TRUE) { + order_ids <- order(de_novo_norm_scores, decreasing = FALSE) de_novo_seqs <- de_novo_seqs[order_ids] de_novo_lens <- de_novo_lens[order_ids] de_novo_seqs_scores <- de_novo_seqs_scores[order_ids] de_novo_norm_scores <- de_novo_norm_scores[order_ids] - } else{ - order_ids <- order(de_novo_seqs_scores, decreasing=TRUE) + } else { + order_ids <- order(de_novo_seqs_scores, decreasing = TRUE) de_novo_seqs <- de_novo_seqs[order_ids] de_novo_lens <- de_novo_lens[order_ids] de_novo_seqs_scores <- de_novo_seqs_scores[order_ids] } # get num_tops heighest scored sequences - if(length(de_novo_seqs) < num_tops) num_tops <- length(de_novo_seqs) + if (length(de_novo_seqs) < num_tops) num_tops <- length(de_novo_seqs) de_novo_seqs <- de_novo_seqs[seq_len(num_tops)] de_novo_lens <- de_novo_lens[seq_len(num_tops)] de_novo_seqs_scores <- de_novo_seqs_scores[seq_len(num_tops)] - if(normalization == TRUE){ + if (normalization == TRUE) { de_novo_norm_scores <- de_novo_norm_scores[seq_len(num_tops)] de_novo <- data.frame(length = de_novo_lens, seqs = de_novo_seqs, norm_score = de_novo_norm_scores, score = de_novo_seqs_scores) } else { @@ -514,11 +509,11 @@ deNovoTCRs <- function(convergence_group_tag, ### sort cluster sequences based on their score connected_inds <- seq_along(crg_cdr3_seqs) - if(normalization == FALSE){ + if (normalization == FALSE) { crg_cdr3_seqs <- crg_cdr3_seqs[order(crg_cdr3_scores, decreasing = TRUE)] connected_inds <- connected_inds[order(crg_cdr3_scores, decreasing = TRUE)] crg_cdr3_scores <- crg_cdr3_scores[order(crg_cdr3_scores, decreasing = TRUE)] - } else{ + } else { crg_cdr3_seqs <- crg_cdr3_seqs[order(crg_cdr3_norm_scores, decreasing = FALSE)] connected_inds <- connected_inds[order(crg_cdr3_norm_scores, decreasing = FALSE)] crg_cdr3_scores <- crg_cdr3_scores[order(crg_cdr3_norm_scores, decreasing = FALSE)] @@ -526,26 +521,26 @@ deNovoTCRs <- function(convergence_group_tag, } ### set all theoretical numeric values (actually characters) to numeric values - if(is.data.frame(de_novo)){ - for(i in seq_len(ncol(de_novo))){ - if(suppressWarnings(any(is.na(as.numeric(de_novo[,i])))) == FALSE) de_novo[,i] <- as.numeric(de_novo[,i]) + if (is.data.frame(de_novo)) { + for (i in seq_len(ncol(de_novo))) { + if (suppressWarnings(any(is.na(as.numeric(de_novo[, i])))) == FALSE) de_novo[, i] <- as.numeric(de_novo[, i]) } } - if(is.data.frame(crg_cdr3_scores)){ - for(i in seq_len(ncol(crg_cdr3_scores))){ - if(suppressWarnings(any(is.na(as.numeric(crg_cdr3_scores[,i])))) == FALSE) crg_cdr3_scores[,i] <- as.numeric(crg_cdr3_scores[,i]) + if (is.data.frame(crg_cdr3_scores)) { + for (i in seq_len(ncol(crg_cdr3_scores))) { + if (suppressWarnings(any(is.na(as.numeric(crg_cdr3_scores[, i])))) == FALSE) crg_cdr3_scores[, i] <- as.numeric(crg_cdr3_scores[, i]) } } - if(normalization == TRUE){ - if(is.data.frame(crg_cdr3_norm_scores)){ - for(i in seq_len(ncol(crg_cdr3_norm_scores))){ - if(suppressWarnings(any(is.na(as.numeric(crg_cdr3_norm_scores[,i])))) == FALSE) crg_cdr3_norm_scores[,i] <- as.numeric(crg_cdr3_norm_scores[,i]) + if (normalization == TRUE) { + if (is.data.frame(crg_cdr3_norm_scores)) { + for (i in seq_len(ncol(crg_cdr3_norm_scores))) { + if (suppressWarnings(any(is.na(as.numeric(crg_cdr3_norm_scores[, i])))) == FALSE) crg_cdr3_norm_scores[, i] <- as.numeric(crg_cdr3_norm_scores[, i]) } } } ### output - if(normalization == FALSE){ + if (normalization == FALSE) { output <- list(de_novo_sequences = de_novo, sample_sequences_scores = data.frame(seqs = all_crg_cdr3_seqs[connected_inds], scores = crg_cdr3_scores), cdr3_length_probability = crg_len_prob, PWM_Scoring = crg_pwm_scoring, PWM_Prediction = crg_pwm_predicting_list) } else { @@ -555,39 +550,36 @@ deNovoTCRs <- function(convergence_group_tag, ### save fname <- paste0(result_folder, convergence_group_tag, "_de_novo.txt") - if(save_results == TRUE) utils::write.table(x = de_novo,file = fname,quote = FALSE,sep = "\t",row.names = FALSE, col.names = TRUE) - if(save_results == TRUE) message("Output: results are stored in ", fname) + if (save_results == TRUE) utils::write.table(x = de_novo, file = fname, quote = FALSE, sep = "\t", row.names = FALSE, col.names = TRUE) + if (save_results == TRUE) message("Output: results are stored in ", fname) ### Print a graph with num_tops best scoring de novo sequences - if(make_figure == TRUE){ - if(normalization == FALSE){ - graphics::plot(x = seq_len(num_tops), y = de_novo$score*100, xlab = paste("Top", num_tops, "predicted TCRs"), ylab = "TCR probability based on PWM in %", type = "p", log = "xy", col = "grey", pch = 19) - graphics::points(x = seq_len(10), y = de_novo$score[seq_len(10)]*100, col = "red", pch = 19) + if (make_figure == TRUE) { + if (normalization == FALSE) { + graphics::plot(x = seq_len(num_tops), y = de_novo$score * 100, xlab = paste("Top", num_tops, "predicted TCRs"), ylab = "TCR probability based on PWM in %", type = "p", log = "xy", col = "grey", pch = 19) + graphics::points(x = seq_len(10), y = de_novo$score[seq_len(10)] * 100, col = "red", pch = 19) # which cluster sequences are present in de novo created sequences? common_ids <- which(de_novo_seqs %in% crg_cdr3_seqs) - graphics::points(x = common_ids, y = de_novo_seqs_scores[common_ids]*100, col = "yellow", pch = 20) - } else{ - graphics::plot(x = seq_len(num_tops), y = de_novo$norm_score *100, xlab = paste("Top", num_tops, "predicted TCRs"), ylab = "Normalized TCR probability based on PWM in %", + graphics::points(x = common_ids, y = de_novo_seqs_scores[common_ids] * 100, col = "yellow", pch = 20) + } else { + graphics::plot(x = seq_len(num_tops), y = de_novo$norm_score * 100, xlab = paste("Top", num_tops, "predicted TCRs"), ylab = "Normalized TCR probability based on PWM in %", type = "p", log = "xy", col = "grey", pch = 19) - graphics::points(x = seq_len(10), y = de_novo$norm_score[seq_len(10)]*100, col = "red", pch = 19) + graphics::points(x = seq_len(10), y = de_novo$norm_score[seq_len(10)] * 100, col = "red", pch = 19) # which cluster sequences are present in de novo created sequences? common_ids <- which(de_novo_seqs %in% crg_cdr3_seqs) - graphics::points(x = common_ids, y = de_novo_norm_scores[common_ids]*100, col = "yellow", pch = 20) + graphics::points(x = common_ids, y = de_novo_norm_scores[common_ids] * 100, col = "yellow", pch = 20) } - graphics::legend("bottomleft", legend=c(paste0("Top ", num_tops," de novo scores"), "Top 10 de novo scores", "Convergence group members\nin de novo sequences"), - col=c("grey", "red", "yellow"), pch = c(19,19,20),title="Legend", cex = 0.75) + graphics::legend("bottomleft", legend = c(paste0("Top ", num_tops, " de novo scores"), "Top 10 de novo scores", "Convergence group members\nin de novo sequences"), + col = c("grey", "red", "yellow"), pch = c(19, 19, 20), title = "Legend", cex = 0.75) } t2 <- Sys.time() - dt <- (t2-t1) + dt <- (t2 - t1) message("Total time = ", dt, " ", units(dt)) - .stop_parallel() - - ### Closing time! return(output) } diff --git a/R/getRandomSubsample.R b/R/getRandomSubsample.R index b2b810e..c45db80 100644 --- a/R/getRandomSubsample.R +++ b/R/getRandomSubsample.R @@ -52,86 +52,121 @@ getRandomSubsample <- function(cdr3_len_stratify = FALSE, motif_region_vgenes_list, ref_motif_vgenes_id_list, ref_lengths_vgenes_list, - lengths_vgenes_list){ + lengths_vgenes_list) { random_subsample <- c() ### Return an unbiased reference subsample - if(cdr3_len_stratify == FALSE && vgene_stratify == FALSE){ - random_subsample <- refseqs_motif_region[sample(x = seq_along(refseqs_motif_region),size = length(motif_region),replace = FALSE)] + if (!cdr3_len_stratify && !vgene_stratify) { + random_subsample <- refseqs_motif_region[ + sample( + x = seq_along(refseqs_motif_region), + size = length(motif_region), + replace = FALSE + ) + ] } - ### Return a reference subsample with biased CDR3b length distribution according to the sample - if(cdr3_len_stratify == TRUE && vgene_stratify == FALSE){ - - ## if there are more seqs with specific cdr3 length in sample than in reference database, surplus number of seqs will be taken randomly from remaining seqs with different cdr3 length + ### Return a reference subsample with biased CDR3b length distribution + if (cdr3_len_stratify && !vgene_stratify) { random_length <- 0 subsample_parts <- vector("list", length(motif_lengths_list)) idx <- 0L - for(cdr3_length in names(motif_lengths_list)){ + for (cdr3_length in names(motif_lengths_list)) { idx <- idx + 1L - if(length(ref_motif_lengths_id_list[[cdr3_length]]) < motif_lengths_list[[cdr3_length]]){ - random_length <- random_length + motif_lengths_list[[cdr3_length]] - length(ref_motif_lengths_id_list[[cdr3_length]]) - subsample_parts[[idx]] <- ref_motif_lengths_id_list[[cdr3_length]] - }else { - subsample_parts[[idx]] <- sample(x = ref_motif_lengths_id_list[[cdr3_length]],size = motif_lengths_list[[cdr3_length]],replace = FALSE) + ref_ids <- ref_motif_lengths_id_list[[cdr3_length]] + needed <- motif_lengths_list[[cdr3_length]] + if (length(ref_ids) < needed) { + random_length <- random_length + needed - length(ref_ids) + subsample_parts[[idx]] <- ref_ids + } else { + subsample_parts[[idx]] <- sample( + x = ref_ids, size = needed, replace = FALSE + ) } } random_subsample <- unlist(subsample_parts) - if(random_length > 0){ - random_subsample <- c(random_subsample, sample(x = (seq_along(refseqs_motif_region))[-random_subsample],size = random_length,replace = FALSE)) + if (random_length > 0) { + random_subsample <- c( + random_subsample, + sample( + x = seq_along(refseqs_motif_region)[-random_subsample], + size = random_length, + replace = FALSE + ) + ) } random_subsample <- refseqs_motif_region[random_subsample] } - ### Return a reference subsample with biased V gene distribution according to the sample - if(vgene_stratify == TRUE && cdr3_len_stratify == FALSE){ - - ## if there are more seqs with specific v gene in sample than in reference database, surplus number of seqs will be taken randomly from remaining seqs with different vgenes + ### Return a reference subsample with biased V gene distribution + if (vgene_stratify && !cdr3_len_stratify) { random_length <- 0 subsample_parts <- vector("list", length(motif_region_vgenes_list)) idx <- 0L - for(act_vgene in names(motif_region_vgenes_list)){ + for (act_vgene in names(motif_region_vgenes_list)) { idx <- idx + 1L - if(length(ref_motif_vgenes_id_list[[act_vgene]]) < motif_region_vgenes_list[[act_vgene]]){ - random_length <- random_length + motif_region_vgenes_list[[act_vgene]] - length(ref_motif_vgenes_id_list[[act_vgene]]) - subsample_parts[[idx]] <- ref_motif_vgenes_id_list[[act_vgene]] - }else { - subsample_parts[[idx]] <- sample(x = ref_motif_vgenes_id_list[[act_vgene]],size = motif_region_vgenes_list[[act_vgene]],replace = FALSE) + ref_ids <- ref_motif_vgenes_id_list[[act_vgene]] + needed <- motif_region_vgenes_list[[act_vgene]] + if (length(ref_ids) < needed) { + random_length <- random_length + needed - length(ref_ids) + subsample_parts[[idx]] <- ref_ids + } else { + subsample_parts[[idx]] <- sample( + x = ref_ids, size = needed, replace = FALSE + ) } } random_subsample <- unlist(subsample_parts) - if(random_length > 0){ - random_subsample <- c(random_subsample, sample(x = (seq_along(refseqs_motif_region))[-random_subsample],size = random_length,replace = FALSE)) + if (random_length > 0) { + random_subsample <- c( + random_subsample, + sample( + x = seq_along(refseqs_motif_region)[-random_subsample], + size = random_length, + replace = FALSE + ) + ) } random_subsample <- refseqs_motif_region[random_subsample] } - ### Return a reference subsample with biased V gene and CDR3b length distribution according to the sample - if(vgene_stratify == TRUE && cdr3_len_stratify == TRUE){ - - ## if there are more seqs with specific v gene or CDR3b length in sample than in reference database, surplus number of seqs will be taken randomly from remaining seqs + ### Return a reference subsample with biased V gene and CDR3b length distribution + if (vgene_stratify && cdr3_len_stratify) { random_length <- 0 - subsample_parts <- vector("list", length(motif_lengths_list) * length(motif_region_vgenes_list)) + subsample_parts <- vector( + "list", + length(motif_lengths_list) * length(motif_region_vgenes_list) + ) idx <- 0L - for(cdr3_length in names(motif_lengths_list)){ - for(act_vgene in names(motif_region_vgenes_list)){ + for (cdr3_length in names(motif_lengths_list)) { + for (act_vgene in names(motif_region_vgenes_list)) { idx <- idx + 1L - if(length(ref_lengths_vgenes_list[[cdr3_length]][[act_vgene]]) < lengths_vgenes_list[[cdr3_length]][[act_vgene]]){ - random_length <- random_length + lengths_vgenes_list[[cdr3_length]][[act_vgene]] - length(ref_lengths_vgenes_list[[cdr3_length]][[act_vgene]]) - subsample_parts[[idx]] <- ref_lengths_vgenes_list[[cdr3_length]][[act_vgene]] - }else { - subsample_parts[[idx]] <- sample(x = ref_lengths_vgenes_list[[cdr3_length]][[act_vgene]],size = lengths_vgenes_list[[cdr3_length]][[act_vgene]],replace = FALSE) + ref_ids <- ref_lengths_vgenes_list[[cdr3_length]][[act_vgene]] + needed <- lengths_vgenes_list[[cdr3_length]][[act_vgene]] + if (length(ref_ids) < needed) { + random_length <- random_length + needed - length(ref_ids) + subsample_parts[[idx]] <- ref_ids + } else { + subsample_parts[[idx]] <- sample( + x = ref_ids, size = needed, replace = FALSE + ) } } } random_subsample <- unlist(subsample_parts) - if(random_length > 0){ - random_subsample <- c(random_subsample, sample(x = (seq_along(refseqs_motif_region))[-random_subsample],size = random_length,replace = FALSE)) + if (random_length > 0) { + random_subsample <- c( + random_subsample, + sample( + x = seq_along(refseqs_motif_region)[-random_subsample], + size = random_length, + replace = FALSE + ) + ) } random_subsample <- refseqs_motif_region[random_subsample] } - ## Closing time! return(random_subsample) } diff --git a/R/global-cutoff.R b/R/global-cutoff.R index cb55bb8..6ec6896 100644 --- a/R/global-cutoff.R +++ b/R/global-cutoff.R @@ -11,9 +11,9 @@ #' considered globally similar. #' @param global_vgene logical. If \code{TRUE}, global connections are #' restricted to sequence pairs that share a V gene. -#' @param no_cores numeric. Number of cores registered with the parallel -#' backend (used only for documentation; the function relies on a -#' pre-registered \code{foreach} backend). +#' @param BPPARAM A \code{\link[BiocParallel]{BiocParallelParam}} object +#' controlling parallel evaluation (e.g. +#' \code{BiocParallel::MulticoreParam()}). #' @param verbose logical. If \code{TRUE}, progress messages are printed. #' #' @return A list with two elements: @@ -26,13 +26,12 @@ #' } #' #' @keywords internal -#' @import foreach .global_cutoff <- function(seqs, motif_region, sequences, gccutoff, global_vgene, - no_cores, + BPPARAM, verbose) { if (verbose) message("Searching for global similarities (cutoff method).") @@ -44,9 +43,9 @@ gccutoff, global_vgene, verbose)) } - ## ---- Fallback: stringdist + foreach implementation ------------------------ + ## ---- Fallback: stringdist + BiocParallel implementation ------------------------ .global_cutoff_stringdist(seqs, motif_region, sequences, - gccutoff, global_vgene, no_cores, verbose) + gccutoff, global_vgene, BPPARAM, verbose) } #' immApex-accelerated global cutoff via buildNetwork() @@ -118,11 +117,11 @@ ) } -#' stringdist + foreach fallback for global cutoff +#' stringdist + BiocParallel fallback for global cutoff #' @return A list with edge data and excluded sequence IDs. #' @keywords internal .global_cutoff_stringdist <- function(seqs, motif_region, sequences, - gccutoff, global_vgene, no_cores, + gccutoff, global_vgene, BPPARAM, verbose) { ## Prepare V-gene lookup vectors when V-gene filtering is requested @@ -137,7 +136,7 @@ ## Parallel loop: for every sequence compute Hamming distance to all ## others and identify neighbours within gccutoff ## ------------------------------------------------------------------ - res <- foreach::foreach(i = seq_along(seqs)) %dopar% { + res <- BiocParallel::bplapply(seq_along(seqs), function(i) { not_in_global_ids <- c() global_con <- c() @@ -176,17 +175,13 @@ } list(not_in_global_ids = not_in_global_ids, global_con = global_con) - } + }, BPPARAM = BPPARAM) ## ------------------------------------------------------------------ ## Collect results from parallel workers ## ------------------------------------------------------------------ - not_in_global_ids <- c() - global_con <- c() - for (i in seq_along(res)) { - not_in_global_ids <- c(not_in_global_ids, res[[i]]$not_in_global_ids) - global_con <- rbind(global_con, res[[i]]$global_con) - } + not_in_global_ids <- unlist(lapply(res, `[[`, "not_in_global_ids")) + global_con <- do.call(rbind, lapply(res, `[[`, "global_con")) ## Build edge data frame if (!is.null(global_con)) { diff --git a/R/global-fisher.R b/R/global-fisher.R index 1a1d4c2..e5cc6ba 100644 --- a/R/global-fisher.R +++ b/R/global-fisher.R @@ -22,7 +22,8 @@ #' a V-gene. #' @param all_aa_interchangeable Logical. If \code{FALSE}, only pairs whose #' variable-position amino acids have BLOSUM62 >= 0 are kept. -#' @param no_cores Integer. Number of registered parallel cores. +#' @param BPPARAM A \code{\link[BiocParallel]{BiocParallelParam}} object +#' controlling parallel evaluation. #' @param verbose Logical. Print progress messages. #' #' @return A list with elements: @@ -35,7 +36,6 @@ #' similarities were found.} #' } #' -#' @import foreach #' @keywords internal .global_fisher <- function(seqs, motif_region, @@ -46,7 +46,7 @@ boundary_size, global_vgene, all_aa_interchangeable, - no_cores, + BPPARAM, verbose) { ## Load BLOSUM62 compatible amino acid pairs @@ -81,11 +81,8 @@ sample_seqs$tag <- sample_seqs$struct ## Expand: for each position, replace that position with "%" - i <- NULL - exp_sample_seqs <- foreach::foreach( - i = seq_len(max(sample_seqs$nchar)), - .combine = rbind - ) %dopar% { + exp_sample_seqs <- do.call(rbind, BiocParallel::bplapply( + seq_len(max(sample_seqs$nchar)), function(i) { temp_part <- sample_seqs[sample_seqs$nchar >= i, ] temp_part$pos <- i temp_part$aa_at_pos <- substr(temp_part$struct, i, i) @@ -95,7 +92,7 @@ dup_check <- duplicated(temp_part$tag) | duplicated(temp_part$tag, fromLast = TRUE) temp_part[dup_check, ] - } + }, BPPARAM = BPPARAM)) ## Early exit if nothing duplicated if (is.null(exp_sample_seqs) || nrow(exp_sample_seqs) == 0) { @@ -131,16 +128,14 @@ reference_seqs$aa_at_pos <- "" reference_seqs$tag <- reference_seqs$struct - exp_reference_seqs <- foreach::foreach( - i = seq_len(min(max(reference_seqs$nchar), max(sample_seqs$nchar))), - .combine = rbind - ) %dopar% { + exp_reference_seqs <- do.call(rbind, BiocParallel::bplapply( + seq_len(min(max(reference_seqs$nchar), max(sample_seqs$nchar))), function(i) { temp_part <- reference_seqs[reference_seqs$nchar >= i, ] temp_part$pos <- i temp_part$aa_at_pos <- substr(temp_part$struct, i, i) substr(temp_part$tag, i, i) <- "%" temp_part[temp_part$tag %in% unq_sample_struct, ] - } + }, BPPARAM = BPPARAM)) ## ----------------------------------------------------------------- ## Step 3: Compute struct frequencies and enrichment @@ -177,10 +172,8 @@ all.x = TRUE ) - edges <- foreach::foreach( - i = seq_len(nrow(sample_stats)), - .combine = rbind - ) %dopar% { + edges <- do.call(rbind, BiocParallel::bplapply( + seq_len(nrow(sample_stats)), function(i) { act_members <- unique( exp_sample_seqs[exp_sample_seqs$tag == sample_stats$tag[i], ] ) @@ -214,7 +207,7 @@ act_members$aa_at_pos[combn_ids[, 2]]) %in% BlosumVec) } ret[keep, ] - } + }, BPPARAM = BPPARAM)) ## ----------------------------------------------------------------- ## Step 5: Build cluster network @@ -242,7 +235,7 @@ "position", "TRBV") cm_splitted$position <- as.numeric(cm_splitted$position) - cluster_list_raw <- foreach::foreach(i = seq_len(cm$no)) %dopar% { + cluster_list_raw <- BiocParallel::bplapply(seq_len(cm$no), function(i) { csize <- cm$csize[i] member_df <- unique(cm_splitted[which(cm$membership == i), ]) num_cdr3s <- length(unique(member_df$CDR3b)) @@ -251,7 +244,7 @@ paste(unique(member_df$CDR3b), collapse = " "), paste(sort(unique(member_df$aa_at_position)), collapse = ""), member_df$TRBV[1]) - } + }, BPPARAM = BPPARAM) cluster_list <- data.frame( matrix(unlist(cluster_list_raw), ncol = 6, byrow = TRUE), diff --git a/R/local-fisher.R b/R/local-fisher.R index cff066c..d790068 100644 --- a/R/local-fisher.R +++ b/R/local-fisher.R @@ -27,7 +27,8 @@ #' @param motif_distance_cutoff Numeric. Maximum positional distance for motifs #' to be grouped together. Not used directly in this function but kept for #' interface consistency. -#' @param no_cores Numeric. Number of cores to use for parallel motif finding. +#' @param BPPARAM A \code{\link[BiocParallel]{BiocParallelParam}} object specifying the +#' parallel backend. Defaults to \code{BiocParallel::SerialParam()}. #' @param verbose Logical. If \code{TRUE}, print status messages via #' \code{message()}. #' @@ -39,7 +40,6 @@ #' associated statistics.} #' } #' -#' @import foreach #' @keywords internal .local_fisher <- function(motif_region, refseqs_motif_region, @@ -52,7 +52,7 @@ lcminove, discontinuous_motifs, motif_distance_cutoff, - no_cores, + BPPARAM, verbose) { ## ----------------------------------------------------------- @@ -69,7 +69,7 @@ motif.lengths = motif_length, min.depth = 1L, discontinuous = discontinuous_motifs, - nthreads = no_cores + nthreads = BiocParallel::bpnworkers(BPPARAM) ) colnames(ref_motifs_df)[colnames(ref_motifs_df) == "frequency"] <- "count" @@ -79,50 +79,33 @@ motif.lengths = motif_length, min.depth = 1L, discontinuous = discontinuous_motifs, - nthreads = no_cores + nthreads = BiocParallel::bpnworkers(BPPARAM) ) colnames(motifs_df)[colnames(motifs_df) == "frequency"] <- "count" } else { - ## ---- Fallback: foreach chunking with stringdist backend ---- + ## ---- Fallback: BiocParallel chunking with stringdist backend ---- if (verbose) message("Finding motifs in reference sequences...") - # Divide the sequences equally among all cores - overhang <- length(refseqs_motif_region) %% no_cores - id_list <- list() - last_id <- 0 - next_id <- 0 - steps <- (length(refseqs_motif_region) - overhang) / no_cores - - for (i in seq_len(no_cores)) { - next_id <- last_id + steps - if (overhang > 0) { - next_id <- next_id + 1 - overhang <- overhang - 1 - } - id_list[[i]] <- (last_id + 1):next_id - last_id <- next_id - } + # Divide the sequences equally among all workers + no_cores_count <- BiocParallel::bpnworkers(BPPARAM) + id_list <- split( + seq_along(refseqs_motif_region), + cut(seq_along(refseqs_motif_region), no_cores_count, labels = FALSE) + ) # Receive all motifs in the reference sequences - ref_motifs_list <- foreach::foreach(i = seq_len(no_cores)) %dopar% { - return(findMotifs(seqs = refseqs_motif_region[id_list[[i]]], - q = motif_length, - discontinuous = discontinuous_motifs)) - } + ref_motifs_list <- BiocParallel::bplapply(id_list, function(ids) { + findMotifs(seqs = refseqs_motif_region[ids], + q = motif_length, + discontinuous = discontinuous_motifs) + }, BPPARAM = BPPARAM) # Convert the list into a more manageable data frame - ref_motifs_df <- NULL - for (i in seq_len(no_cores)) { - if (i == 1) { - ref_motifs_df <- ref_motifs_list[[i]] - } else { - ref_motifs_df <- merge(x = ref_motifs_df, - y = ref_motifs_list[[i]], - by = "motif", - all = TRUE) - } - } + ref_motifs_df <- Reduce( + function(x, y) merge(x, y, by = "motif", all = TRUE), + ref_motifs_list + ) ref_motifs_df[is.na(ref_motifs_df)] <- 0 ref_motifs_df <- data.frame( motif = ref_motifs_df$motif, @@ -131,42 +114,24 @@ if (verbose) message("Finding motifs in sample sequences...") - # Divide the sequences equally among all cores - overhang <- length(motif_region) %% no_cores - id_list <- list() - last_id <- 0 - next_id <- 0 - steps <- (length(motif_region) - overhang) / no_cores - - for (i in seq_len(no_cores)) { - next_id <- last_id + steps - if (overhang > 0) { - next_id <- next_id + 1 - overhang <- overhang - 1 - } - id_list[[i]] <- (last_id + 1):next_id - last_id <- next_id - } + # Divide the sequences equally among all workers + id_list <- split( + seq_along(motif_region), + cut(seq_along(motif_region), no_cores_count, labels = FALSE) + ) # Receive all motifs in the sample sequences - motifs_list <- foreach::foreach(i = seq_len(no_cores)) %dopar% { - return(findMotifs(seqs = motif_region[id_list[[i]]], - q = motif_length, - discontinuous = discontinuous_motifs)) - } + motifs_list <- BiocParallel::bplapply(id_list, function(ids) { + findMotifs(seqs = motif_region[ids], + q = motif_length, + discontinuous = discontinuous_motifs) + }, BPPARAM = BPPARAM) # Convert the list into a more manageable data frame - motifs_df <- NULL - for (i in seq_len(no_cores)) { - if (i == 1) { - motifs_df <- motifs_list[[i]] - } else { - motifs_df <- merge(x = motifs_df, - y = motifs_list[[i]], - by = "motif", - all = TRUE) - } - } + motifs_df <- Reduce( + function(x, y) merge(x, y, by = "motif", all = TRUE), + motifs_list + ) motifs_df[is.na(motifs_df)] <- 0 motifs_df <- data.frame( motif = motifs_df$motif, diff --git a/R/local-rrs.R b/R/local-rrs.R index 803e419..2de4967 100644 --- a/R/local-rrs.R +++ b/R/local-rrs.R @@ -26,7 +26,8 @@ #' CDR3 length distribution. #' @param vgene_stratify Logical. Whether to stratify random subsamples by #' V-gene usage distribution. -#' @param no_cores Integer. Number of cores for parallel execution. +#' @param BPPARAM A \code{\link[BiocParallel]{BiocParallelParam}} object +#' controlling parallel execution (default: \code{BiocParallel::bpparam()}). #' @param verbose Logical. If \code{TRUE}, emit progress messages. #' @param motif_lengths_list List. Pre-computed CDR3 length counts from the #' sample. @@ -52,7 +53,6 @@ #' \code{selected_motifs} but for every motif found.} #' } #' -#' @import foreach #' @keywords internal .local_rrs <- function(motif_region, refseqs_motif_region, @@ -66,7 +66,7 @@ discontinuous_motifs, cdr3_len_stratify, vgene_stratify, - no_cores, + BPPARAM, verbose, motif_lengths_list, ref_motif_lengths_id_list, @@ -86,7 +86,7 @@ ## ---- Step 2: Repeated random sampling from the reference DB ----------- if (verbose) message("Running ", sim_depth, " random sampling iterations.") - res <- foreach::foreach(i = seq_len(sim_depth), .inorder = FALSE) %dopar% { + res <- BiocParallel::bplapply(seq_len(sim_depth), function(i) { motif_sample <- getRandomSubsample( cdr3_len_stratify = cdr3_len_stratify, vgene_stratify = vgene_stratify, @@ -109,7 +109,7 @@ sim <- merge(discovery, sim, by = "motif", all.x = TRUE) sim$V1.y[is.na(sim$V1.y)] <- 0 sim$V1.y - } + }, BPPARAM = BPPARAM) ## ---- Step 3: Build sample log ----------------------------------------- if (verbose) message("Building sample log.") diff --git a/R/plotNetwork.R b/R/plotNetwork.R index 33c9b07..6e88fc9 100644 --- a/R/plotNetwork.R +++ b/R/plotNetwork.R @@ -55,7 +55,7 @@ #' plotNetwork(clustering_output = res, #' n_cores = 1) #' -#' @import viridis foreach grDevices +#' @import viridis grDevices #' @export plotNetwork <- function(clustering_output = NULL, @@ -127,7 +127,7 @@ plotNetwork <- function(clustering_output = NULL, ################################################################# ### Initiate parallelization - .setup_parallel(n_cores) + BPPARAM <- .setup_parallel(n_cores) ### Get cluster_list and cluster_properties with at least cluster_min_size members # cluster_list: contains all members of the cluster and the sequence specific additional information (e.g. patient, count, etc.) @@ -148,7 +148,7 @@ plotNetwork <- function(clustering_output = NULL, cluster_properties <- cluster_properties[hold_ids,] ### Pool sequence and cluster specific information (identical sequences in differenct clusters are treated as different entities) - cluster_data_frame <- data.frame(foreach::foreach(i = seq_along(cluster_list), .combine = rbind) %dopar% { + cluster_data_frame <- do.call(rbind, BiocParallel::bplapply(seq_along(cluster_list), function(i) { temp_df <- cluster_list[[i]] temp_adds <- unlist(cluster_properties[i,]) temp_adds <- data.frame(matrix(rep(temp_adds, times = nrow(temp_df)), nrow = nrow(temp_df), byrow = TRUE), stringsAsFactors = FALSE) @@ -156,7 +156,8 @@ plotNetwork <- function(clustering_output = NULL, temp_df <- cbind(temp_adds, temp_df) return(temp_df) - }, stringsAsFactors = FALSE) + }, BPPARAM = BPPARAM)) + cluster_data_frame <- data.frame(cluster_data_frame, stringsAsFactors = FALSE) cluster_data_frame[] <- lapply(cluster_data_frame, as.character) for(i in seq_len(ncol(cluster_data_frame))){ if(suppressWarnings(any(is.na(as.numeric(cluster_data_frame[,i])))) == FALSE) cluster_data_frame[,i] <- as.numeric(cluster_data_frame[,i]) @@ -195,8 +196,7 @@ plotNetwork <- function(clustering_output = NULL, if("TRBV" %in% colnames(cluster_data_frame)) vgene.info <- TRUE ### Get connections between different tuples consisting of CDR3b sequence, v gene and donor - x <- NULL - clone_network <- foreach::foreach(x = seq_len(nrow(ori_clone_net))) %dopar% { + clone_network <- BiocParallel::bplapply(seq_len(nrow(ori_clone_net)), function(x) { # Identify all tuples with correpsonding CDR3b sequence act_ids1 <- cluster_data_frame$ID[cluster_data_frame$CDR3b == ori_clone_net[x,1]] act_ids2 <- cluster_data_frame$ID[cluster_data_frame$CDR3b == ori_clone_net[x,2]] @@ -220,9 +220,8 @@ plotNetwork <- function(clustering_output = NULL, var_ret <- data.frame(comb_ids, stringsAsFactors = FALSE) var_ret <- cbind(var_ret, rep(ori_clone_net[x,3], nrow(var_ret))) return(unlist(t(var_ret))) - t(var_ret) } else return(NULL) - } + }, BPPARAM = BPPARAM) clone_network <- data.frame(matrix(unlist(clone_network), ncol = 3, byrow = TRUE), stringsAsFactors = FALSE) colnames(clone_network) <- c("from", "to", "label") clone_network$from <- as.numeric(clone_network$from) @@ -233,7 +232,7 @@ plotNetwork <- function(clustering_output = NULL, ### local connections if(parameters$local_similarities == TRUE){ ### Get local connections between different tuples consisting of CDR3b sequence, v gene and donor - local_clone_network <- foreach::foreach(i = which(cluster_properties$type == "local")) %dopar% { + local_clone_network <- BiocParallel::bplapply(which(cluster_properties$type == "local"), function(i) { # Get IDs, members and details of current cluster temp_ids <- cluster_data_frame$ID[cluster_data_frame$tag == cluster_properties$tag[i]] @@ -270,7 +269,7 @@ plotNetwork <- function(clustering_output = NULL, temp_df <- temp_df[cluster_data_frame$tag[temp_df$from] == cluster_data_frame$tag[temp_df$to],] temp_df <- t(temp_df) return(unlist(temp_df)) - } + }, BPPARAM = BPPARAM) ### If available, set clone network to the local network if(length(local_clone_network) == 0) parameters$local_similarities == FALSE else { @@ -290,7 +289,7 @@ plotNetwork <- function(clustering_output = NULL, BlosumVec <- .get_blosum_vec() ### Get local connections between different tuples consisting of CDR3b sequence, v gene and donor - global_clone_network <- foreach::foreach(i = which(cluster_properties$type == "global")) %dopar% { + global_clone_network <- BiocParallel::bplapply(which(cluster_properties$type == "global"), function(i) { # Get IDs and members of current cluster temp_ids <- cluster_data_frame$ID[cluster_data_frame$tag == cluster_properties$tag[i]] @@ -314,7 +313,7 @@ plotNetwork <- function(clustering_output = NULL, } temp_df <- t(temp_df) return(unlist(temp_df)) - } + }, BPPARAM = BPPARAM) if(parameters$all_aa_interchangeable == FALSE) message("Restrict global connections to sequences with a BLOSUM62 value for the amino acid substitution greater or equal to zero.") ### Append global network to the clone network @@ -348,8 +347,7 @@ plotNetwork <- function(clustering_output = NULL, if("TRBV" %in% colnames(cluster_data_frame)) vgene.info <- TRUE ### Get connections between different tuples consisting of CDR3b sequence, v gene and donor - x <- NULL - clone_network <- foreach::foreach(x = seq_len(nrow(ori_clone_net))) %dopar% { + clone_network <- BiocParallel::bplapply(seq_len(nrow(ori_clone_net)), function(x) { # Identify all tuples with correpsonding CDR3b sequence act_ids1 <- cluster_data_frame$ID[cluster_data_frame$CDR3b == ori_clone_net[x,1]] act_ids2 <- cluster_data_frame$ID[cluster_data_frame$CDR3b == ori_clone_net[x,2]] @@ -373,9 +371,8 @@ plotNetwork <- function(clustering_output = NULL, var_ret <- data.frame(comb_ids, stringsAsFactors = FALSE) var_ret <- cbind(var_ret, rep(ori_clone_net[x,3], nrow(var_ret))) return(unlist(t(var_ret))) - t(var_ret) } else return(NULL) - } + }, BPPARAM = BPPARAM) if(parameters$positional_motifs == TRUE) message("Restrict local similarities to sequences with identical N-terminal motif position.") if(parameters$public_tcrs == FALSE && patient.info == TRUE) message("Restrict similarities to sequences obtained from identical donor.") if(parameters$global_vgene == TRUE && vgene.info == TRUE) message("Restrict global similarities to sequences with identical v gene.") @@ -390,7 +387,7 @@ plotNetwork <- function(clustering_output = NULL, ### local connections if(parameters$local_similarities == TRUE){ ### Get local connections between different tuples consisting of CDR3b sequence, v gene and donor - local_clone_network <- foreach::foreach(i = which(cluster_properties$type == "local")) %dopar% { + local_clone_network <- BiocParallel::bplapply(which(cluster_properties$type == "local"), function(i) { # Get IDs, members and details of current cluster temp_ids <- cluster_data_frame$ID[cluster_data_frame$tag == cluster_properties$tag[i]] @@ -427,7 +424,7 @@ plotNetwork <- function(clustering_output = NULL, temp_df <- temp_df[cluster_data_frame$tag[temp_df$from] == cluster_data_frame$tag[temp_df$to],] temp_df <- t(temp_df) return(unlist(temp_df)) - } + }, BPPARAM = BPPARAM) ### If available, set clone network to the local network if(length(local_clone_network) == 0) parameters$local_similarities == FALSE else { @@ -447,7 +444,7 @@ plotNetwork <- function(clustering_output = NULL, BlosumVec <- .get_blosum_vec() ### Get local connections between different tuples consisting of CDR3b sequence, v gene and donor - global_clone_network <- foreach::foreach(i = which(cluster_properties$type == "global")) %dopar% { + global_clone_network <- BiocParallel::bplapply(which(cluster_properties$type == "global"), function(i) { # Get IDs and members of current cluster temp_ids <- cluster_data_frame$ID[cluster_data_frame$tag == cluster_properties$tag[i]] @@ -471,7 +468,7 @@ plotNetwork <- function(clustering_output = NULL, } temp_df <- t(temp_df) return(unlist(temp_df)) - } + }, BPPARAM = BPPARAM) if(parameters$all_aa_interchangeable == FALSE) message("Restrict global connections to sequences with a BLOSUM62 value for the amino acid substitution greater or equal to zero.") ### Append global network to the clone network @@ -700,8 +697,6 @@ plotNetwork <- function(clustering_output = NULL, lenodes <- data.frame(label = leg.info[,1], color = leg.info[,2], shape="dot", size=10) } - .stop_parallel() - message("Drawing the graph.") ret <- visNetwork::visLegend(visNetwork::visOptions(visNetwork::visIgraphLayout(visNetwork::visNetwork(nodes = nodes, edges = edges), layout = "layout_components"), diff --git a/R/runGLIPH.R b/R/runGLIPH.R index 8df3996..6ddf189 100644 --- a/R/runGLIPH.R +++ b/R/runGLIPH.R @@ -223,7 +223,6 @@ #' n_cores = 1 #' ) #' -#' @import foreach #' @export runGLIPH <- function(cdr3_sequences, method = c("gliph2", "gliph1", "custom"), @@ -384,7 +383,7 @@ runGLIPH <- function(cdr3_sequences, } ## ---- Set up parallel ---- - no_cores <- .setup_parallel(n_cores) + BPPARAM <- .setup_parallel(n_cores) ## ================================================================ ## Part 1: Local similarities @@ -453,14 +452,14 @@ runGLIPH <- function(cdr3_sequences, discontinuous_motifs = discontinuous_motifs, cdr3_len_stratify = cdr3_len_stratify, vgene_stratify = vgene_stratify, - no_cores = no_cores, verbose = verbose, motif_lengths_list = motif_lengths_list, ref_motif_lengths_id_list = ref_motif_lengths_id_list, motif_region_vgenes_list = motif_region_vgenes_list, ref_motif_vgenes_id_list = ref_motif_vgenes_id_list, lengths_vgenes_list = lengths_vgenes_list, - ref_lengths_vgenes_list = ref_lengths_vgenes_list + ref_lengths_vgenes_list = ref_lengths_vgenes_list, + BPPARAM = BPPARAM ) sample_log <- rrs_result$sample_log @@ -480,8 +479,8 @@ runGLIPH <- function(cdr3_sequences, lcminove = lcminove, discontinuous_motifs = discontinuous_motifs, motif_distance_cutoff = motif_distance_cutoff, - no_cores = no_cores, - verbose = verbose + verbose = verbose, + BPPARAM = BPPARAM ) selected_motifs <- fisher_result$selected_motifs @@ -490,30 +489,30 @@ runGLIPH <- function(cdr3_sequences, ## Build local clone network from selected motifs if (!is.null(selected_motifs) && nrow(selected_motifs) > 0) { - i <- NULL - local_clone_network <- foreach::foreach( - i = seq_len(nrow(selected_motifs)), - .combine = rbind - ) %dopar% { - all_ids <- grep( - pattern = selected_motifs$motif[i], - x = all_motif_region, - value = FALSE - ) - if (length(all_ids) >= 2) { - combn_ids <- t(utils::combn(all_ids, m = 2)) - } else { - combn_ids <- t(utils::combn(rep(1, 2), m = 2)) - } - temp_df <- data.frame( - V1 = combn_ids[, 1], - V2 = combn_ids[, 2], - type = rep("local", nrow(combn_ids)), - tag = rep(selected_motifs$motif[i], nrow(combn_ids)), - stringsAsFactors = FALSE - ) - temp_df - } + local_net_parts <- BiocParallel::bplapply( + seq_len(nrow(selected_motifs)), + function(i) { + all_ids <- grep( + pattern = selected_motifs$motif[i], + x = all_motif_region, + value = FALSE + ) + if (length(all_ids) >= 2) { + combn_ids <- t(utils::combn(all_ids, m = 2)) + } else { + combn_ids <- t(utils::combn(rep(1, 2), m = 2)) + } + data.frame( + V1 = combn_ids[, 1], + V2 = combn_ids[, 2], + type = rep("local", nrow(combn_ids)), + tag = rep(selected_motifs$motif[i], nrow(combn_ids)), + stringsAsFactors = FALSE + ) + }, + BPPARAM = BPPARAM + ) + local_clone_network <- do.call(rbind, local_net_parts) ## Apply motif distance cutoff if (!is.null(local_clone_network) && nrow(local_clone_network) > 0) { @@ -563,8 +562,8 @@ runGLIPH <- function(cdr3_sequences, sequences = sequences, gccutoff = gccutoff, global_vgene = global_vgene, - no_cores = no_cores, - verbose = verbose + verbose = verbose, + BPPARAM = BPPARAM ) global_clone_network <- cutoff_result$edges @@ -585,8 +584,8 @@ runGLIPH <- function(cdr3_sequences, boundary_size = boundary_size, global_vgene = global_vgene, all_aa_interchangeable = all_aa_interchangeable, - no_cores = no_cores, - verbose = verbose + verbose = verbose, + BPPARAM = BPPARAM ) global_res <- fisher_global_result$cluster_list @@ -640,7 +639,8 @@ runGLIPH <- function(cdr3_sequences, global_vgene = global_vgene, public_tcrs = public_tcrs, cluster_min_size = cluster_min_size, - verbose = verbose + verbose = verbose, + BPPARAM = BPPARAM ) cluster_properties <- gliph1_result$cluster_properties @@ -716,7 +716,8 @@ runGLIPH <- function(cdr3_sequences, motif_distance_cutoff = motif_distance_cutoff, cluster_min_size = cluster_min_size, boost_local_significance = boost_local_significance, - verbose = verbose + verbose = verbose, + BPPARAM = BPPARAM ) cluster_properties <- gliph2_result$merged_clusters @@ -755,9 +756,6 @@ runGLIPH <- function(cdr3_sequences, } } - ## ---- Stop parallel ---- - .stop_parallel() - ## ================================================================ ## Save results ## ================================================================ diff --git a/R/utils-input.R b/R/utils-input.R index 8652872..604f08b 100644 --- a/R/utils-input.R +++ b/R/utils-input.R @@ -15,7 +15,7 @@ if (!requireNamespace("immApex", quietly = TRUE)) { stop("The immApex package is required to extract TCR data from ", class(input)[1], " objects.\n", - "Install with: devtools::install_github('BorchLab/immApex')", + "Install with: BiocManager::install('BorchLab/immApex')", call. = FALSE) } ir_data <- immApex::getIR( @@ -32,7 +32,7 @@ if (!requireNamespace("immApex", quietly = TRUE)) { stop("The immApex package is required for list input from ", "combineTCR/combineBCR.\n", - "Install with: devtools::install_github('BorchLab/immApex')", + "Install with: BiocManager::install('BorchLab/immApex')", call. = FALSE) } ir_data <- immApex::getIR( diff --git a/R/utils-parallel.R b/R/utils-parallel.R index 19c6219..9fc0e11 100644 --- a/R/utils-parallel.R +++ b/R/utils-parallel.R @@ -1,15 +1,17 @@ -#' Setup parallel backend +#' Create a BiocParallel backend parameter object #' -#' On Unix-like systems (macOS, Linux) \code{doParallel} uses forked -#' processes that inherit the loaded package namespace. On Windows, PSOCK -#' clusters are used instead; these require loading the package on each -#' worker via \code{library()}, which fails during \code{R CMD check} -#' because the package is not yet installed in the standard library path. -#' To avoid this, we fall back to sequential execution (\code{registerDoSEQ}) -#' when only one core is requested or when running on Windows. +#' Returns a \code{\link[BiocParallel]{BiocParallelParam}} suitable for the +#' current platform and requested number of cores. On Unix-like systems +#' (macOS, Linux) with more than one core, a +#' \code{\link[BiocParallel]{MulticoreParam}} is returned. On Windows or +#' when only one core is requested, a +#' \code{\link[BiocParallel]{SerialParam}} is used instead. #' -#' @param n_cores Number of cores. NULL auto-detects. -#' @return The actual number of cores being used +#' The core count is clamped to 2 when the \env{_R_CHECK_LIMIT_CORES_} +#' environment variable is set (as during \command{R CMD check}). +#' +#' @param n_cores Number of cores. \code{NULL} auto-detects. +#' @return A \code{BiocParallelParam} object. #' @keywords internal .setup_parallel <- function(n_cores) { if (is.null(n_cores)) { @@ -17,18 +19,15 @@ } n_cores <- max(1L, min(n_cores, parallel::detectCores() - 1L)) + ## Respect R CMD check core limit + chk <- tolower(Sys.getenv("_R_CHECK_LIMIT_CORES_", "")) + if (nzchar(chk) && chk != "false") { + n_cores <- min(n_cores, 2L) + } + if (n_cores <= 1L || .Platform$OS.type == "windows") { - foreach::registerDoSEQ() - n_cores <- 1L + BiocParallel::SerialParam() } else { - doParallel::registerDoParallel(n_cores) + BiocParallel::MulticoreParam(workers = n_cores) } - n_cores -} - -#' Stop parallel backend -#' @return NULL (invisibly). Called for side effect. -#' @keywords internal -.stop_parallel <- function() { - doParallel::stopImplicitCluster() } diff --git a/README.md b/README.md index 510a717..4a0d4d7 100644 --- a/README.md +++ b/README.md @@ -127,6 +127,34 @@ See the package vignette for the full tutorial: vignette("immGLIPH") ``` +## Validation + +immGLIPH reproduces the published GLIPH and GLIPH2 cluster vectors on +each paper's own dataset, when run with paper-matched parameters and +(where applicable) the same post-hoc filtering criteria. + +| Dataset | immGLIPH configuration | n | ARI | NMI | Pairwise F1 | Precision | Recall | +|:--------|:-----------------------|--:|----:|----:|------------:|----------:|-------:| +| Glanville 2017 | `gliph1` + paper params | 144 | **0.985** | 0.994 | **0.985** | 1.000 | 0.971 | +| Huang 2020 | `gliph2` + paper params + filter | 171 | **0.863** | 0.968 | **0.867** | 0.931 | 0.812 | + +*Comparison universe: CDR3s present in both the immGLIPH input and the +published reference cluster output. Higher = stronger agreement with the +original tool's output.* + +![](man/figures/concordance_summary.png) + +For Glanville, every pair immGLIPH co-clustered was also co-clustered by +the original GLIPH (precision = 1.000), and 97% of the original GLIPH +co-clusterings were recovered (recall = 0.971). For Huang, precision is +0.931 and recall 0.812 against the curated 354-group GLIPH2 set. + +Full pipeline (data prep, paper-matched parameter settings, post-hoc +filter implementation, metric definitions, and a Quarto report) lives at +the companion benchmark repository: + +**[BorchLab/immGLIPH-benchmark](https://github.com/BorchLab/immGLIPH-benchmark)** + ## Bug Reports/New Features #### If you run into any issues or bugs please submit a [GitHub issue](https://github.com/BorchLab/immGLIPH/issues) with details of the issue. diff --git a/man/dot-cluster_gliph1.Rd b/man/dot-cluster_gliph1.Rd index 14187e1..4bd2490 100644 --- a/man/dot-cluster_gliph1.Rd +++ b/man/dot-cluster_gliph1.Rd @@ -14,7 +14,8 @@ global_vgene, public_tcrs, cluster_min_size, - verbose + verbose, + BPPARAM ) } \arguments{ @@ -40,6 +41,8 @@ global neighbours (used to add singletons).} \item{cluster_min_size}{Integer. Minimum cluster size to retain.} \item{verbose}{Logical. Print progress messages.} + +\item{BPPARAM}{A \code{BiocParallelParam} object for parallel evaluation.} } \value{ A list with: diff --git a/man/dot-cluster_gliph2.Rd b/man/dot-cluster_gliph2.Rd index 4ddc9c8..21211b0 100644 --- a/man/dot-cluster_gliph2.Rd +++ b/man/dot-cluster_gliph2.Rd @@ -17,7 +17,8 @@ motif_distance_cutoff, cluster_min_size, boost_local_significance, - verbose + verbose, + BPPARAM ) } \arguments{ @@ -55,6 +56,8 @@ local motifs.} using germline N-nucleotide information.} \item{verbose}{Logical. Print progress messages.} + +\item{BPPARAM}{A \code{BiocParallelParam} object for parallel evaluation.} } \value{ A list with: diff --git a/man/dot-global_cutoff.Rd b/man/dot-global_cutoff.Rd index 0076c83..d56872f 100644 --- a/man/dot-global_cutoff.Rd +++ b/man/dot-global_cutoff.Rd @@ -10,7 +10,7 @@ sequences, gccutoff, global_vgene, - no_cores, + BPPARAM, verbose ) } @@ -31,9 +31,9 @@ considered globally similar.} \item{global_vgene}{logical. If \code{TRUE}, global connections are restricted to sequence pairs that share a V gene.} -\item{no_cores}{numeric. Number of cores registered with the parallel -backend (used only for documentation; the function relies on a -pre-registered \code{foreach} backend).} +\item{BPPARAM}{A \code{\link[BiocParallel]{BiocParallelParam}} object +controlling parallel evaluation (e.g. +\code{BiocParallel::MulticoreParam()}).} \item{verbose}{logical. If \code{TRUE}, progress messages are printed.} } diff --git a/man/dot-global_cutoff_stringdist.Rd b/man/dot-global_cutoff_stringdist.Rd index ab505fa..9db54cb 100644 --- a/man/dot-global_cutoff_stringdist.Rd +++ b/man/dot-global_cutoff_stringdist.Rd @@ -2,7 +2,7 @@ % Please edit documentation in R/global-cutoff.R \name{.global_cutoff_stringdist} \alias{.global_cutoff_stringdist} -\title{stringdist + foreach fallback for global cutoff} +\title{stringdist + BiocParallel fallback for global cutoff} \usage{ .global_cutoff_stringdist( seqs, @@ -10,7 +10,7 @@ sequences, gccutoff, global_vgene, - no_cores, + BPPARAM, verbose ) } @@ -18,6 +18,6 @@ A list with edge data and excluded sequence IDs. } \description{ -stringdist + foreach fallback for global cutoff +stringdist + BiocParallel fallback for global cutoff } \keyword{internal} diff --git a/man/dot-global_fisher.Rd b/man/dot-global_fisher.Rd index 5bb330a..fafee1e 100644 --- a/man/dot-global_fisher.Rd +++ b/man/dot-global_fisher.Rd @@ -14,7 +14,7 @@ boundary_size, global_vgene, all_aa_interchangeable, - no_cores, + BPPARAM, verbose ) } @@ -43,7 +43,8 @@ a V-gene.} \item{all_aa_interchangeable}{Logical. If \code{FALSE}, only pairs whose variable-position amino acids have BLOSUM62 >= 0 are kept.} -\item{no_cores}{Integer. Number of registered parallel cores.} +\item{BPPARAM}{A \code{\link[BiocParallel]{BiocParallelParam}} object +controlling parallel evaluation.} \item{verbose}{Logical. Print progress messages.} } diff --git a/man/dot-local_fisher.Rd b/man/dot-local_fisher.Rd index 77a7248..abea02c 100644 --- a/man/dot-local_fisher.Rd +++ b/man/dot-local_fisher.Rd @@ -16,7 +16,7 @@ lcminove, discontinuous_motifs, motif_distance_cutoff, - no_cores, + BPPARAM, verbose ) } @@ -55,7 +55,8 @@ in the search.} to be grouped together. Not used directly in this function but kept for interface consistency.} -\item{no_cores}{Numeric. Number of cores to use for parallel motif finding.} +\item{BPPARAM}{A \code{\link[BiocParallel]{BiocParallelParam}} object specifying the +parallel backend. Defaults to \code{BiocParallel::SerialParam()}.} \item{verbose}{Logical. If \code{TRUE}, print status messages via \code{message()}.} diff --git a/man/dot-local_rrs.Rd b/man/dot-local_rrs.Rd index b4dec86..a762a84 100644 --- a/man/dot-local_rrs.Rd +++ b/man/dot-local_rrs.Rd @@ -17,7 +17,7 @@ discontinuous_motifs, cdr3_len_stratify, vgene_stratify, - no_cores, + BPPARAM, verbose, motif_lengths_list, ref_motif_lengths_id_list, @@ -62,7 +62,8 @@ CDR3 length distribution.} \item{vgene_stratify}{Logical. Whether to stratify random subsamples by V-gene usage distribution.} -\item{no_cores}{Integer. Number of cores for parallel execution.} +\item{BPPARAM}{A \code{\link[BiocParallel]{BiocParallelParam}} object +controlling parallel execution (default: \code{BiocParallel::bpparam()}).} \item{verbose}{Logical. If \code{TRUE}, emit progress messages.} diff --git a/man/dot-setup_parallel.Rd b/man/dot-setup_parallel.Rd index f4afe91..eed677c 100644 --- a/man/dot-setup_parallel.Rd +++ b/man/dot-setup_parallel.Rd @@ -2,23 +2,26 @@ % Please edit documentation in R/utils-parallel.R \name{.setup_parallel} \alias{.setup_parallel} -\title{Setup parallel backend} +\title{Create a BiocParallel backend parameter object} \usage{ .setup_parallel(n_cores) } \arguments{ -\item{n_cores}{Number of cores. NULL auto-detects.} +\item{n_cores}{Number of cores. \code{NULL} auto-detects.} } \value{ -The actual number of cores being used +A \code{BiocParallelParam} object. } \description{ -On Unix-like systems (macOS, Linux) \code{doParallel} uses forked -processes that inherit the loaded package namespace. On Windows, PSOCK -clusters are used instead; these require loading the package on each -worker via \code{library()}, which fails during \code{R CMD check} -because the package is not yet installed in the standard library path. -To avoid this, we fall back to sequential execution (\code{registerDoSEQ}) -when only one core is requested or when running on Windows. +Returns a \code{\link[BiocParallel]{BiocParallelParam}} suitable for the +current platform and requested number of cores. On Unix-like systems +(macOS, Linux) with more than one core, a +\code{\link[BiocParallel]{MulticoreParam}} is returned. On Windows or +when only one core is requested, a +\code{\link[BiocParallel]{SerialParam}} is used instead. +} +\details{ +The core count is clamped to 2 when the \env{_R_CHECK_LIMIT_CORES_} +environment variable is set (as during \command{R CMD check}). } \keyword{internal} diff --git a/man/dot-stop_parallel.Rd b/man/dot-stop_parallel.Rd deleted file mode 100644 index 8c01239..0000000 --- a/man/dot-stop_parallel.Rd +++ /dev/null @@ -1,15 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/utils-parallel.R -\name{.stop_parallel} -\alias{.stop_parallel} -\title{Stop parallel backend} -\usage{ -.stop_parallel() -} -\value{ -NULL (invisibly). Called for side effect. -} -\description{ -Stop parallel backend -} -\keyword{internal} diff --git a/man/figures/concordance_summary.png b/man/figures/concordance_summary.png new file mode 100644 index 0000000..4cc7d87 Binary files /dev/null and b/man/figures/concordance_summary.png differ diff --git a/tests/testthat/test-clusterScoring.R b/tests/testthat/test-clusterScoring.R index 3559f11..58be87a 100644 --- a/tests/testthat/test-clusterScoring.R +++ b/tests/testthat/test-clusterScoring.R @@ -344,6 +344,56 @@ test_that("clusterScoring with multiple clusters", { expect_equal(nrow(result), 2) }) +test_that("clusterScoring handles cluster larger than reference pool (counts)", { + # Regression test: previously errored with + # "invalid first argument" from sample.int when num_members exceeded + # length(cdr3_sequences$counts) because the null draw used + # replace = FALSE. Bootstrap (replace = TRUE) is the correct + # semantics for the clonal-expansion-enrichment null distribution. + skip_on_cran() + + ref_df <- data.frame( + CDR3b = c("CASSLAPGATNEKLFF", "CASSLDRGEVFF", "CASSYLAGGRNTLYF", + "CASSLTGGEETQYF", "CASSLGGRETQYF"), + TRBV = c("TRBV5-1", "TRBV6-2", "TRBV5-1", "TRBV7-2", "TRBV5-1"), + stringsAsFactors = FALSE + ) + + seqs <- data.frame( + CDR3b = c("CASSLAPGATNEKLFF", "CASSLDRGEVFF", "CASSYLAGGRNTLYF", + "CASSLTGGEETQYF", "CASSLGGRETQYF"), + TRBV = c("TRBV5-1", "TRBV6-2", "TRBV5-1", "TRBV7-2", "TRBV5-1"), + counts = c(5, 3, 2, 1, 1), + stringsAsFactors = FALSE + ) + + # Cluster has 10 members (via duplicated CDR3s), exceeding the 5-row + # reference pool — exercises the resampling branch where + # num_members > length(cdr3_sequences$counts). + cl <- list( + "CRG-1" = data.frame( + CDR3b = rep(c("CASSLAPGATNEKLFF", "CASSLDRGEVFF"), times = 5), + TRBV = rep(c("TRBV5-1", "TRBV6-2"), times = 5), + counts = rep(c(5, 3), times = 5), + stringsAsFactors = FALSE + ) + ) + + expect_no_error( + result <- clusterScoring( + cluster_list = cl, + cdr3_sequences = seqs, + refdb_beta = ref_df, + gliph_version = 1, + sim_depth = 10, + n_cores = 1 + ) + ) + expect_s3_class(result, "data.frame") + expect_true("clonal.expansion.score" %in% colnames(result)) + expect_equal(nrow(result), 1) +}) + test_that("clusterScoring with custom v_usage_freq", { skip_on_cran() diff --git a/tests/testthat/test-clustering.R b/tests/testthat/test-clustering.R index 1c9fa95..5e95f05 100644 --- a/tests/testthat/test-clustering.R +++ b/tests/testthat/test-clustering.R @@ -1,8 +1,5 @@ # Tests for .cluster_gliph1() and .cluster_gliph2() -# Register sequential backend for %dopar% -foreach::registerDoSEQ() - # ---- Helpers ---------------------------------------------------------------- .make_cluster_data <- function() { @@ -44,7 +41,8 @@ test_that(".cluster_gliph1 returns list with expected elements", { global_vgene = FALSE, public_tcrs = TRUE, cluster_min_size = 1, - verbose = FALSE + verbose = FALSE, + BPPARAM = BiocParallel::SerialParam() ) expect_type(result, "list") @@ -79,7 +77,8 @@ test_that(".cluster_gliph1 forms connected components correctly", { global_vgene = FALSE, public_tcrs = TRUE, cluster_min_size = 1, - verbose = FALSE + verbose = FALSE, + BPPARAM = BiocParallel::SerialParam() ) expect_s3_class(result$cluster_properties, "data.frame") @@ -112,7 +111,8 @@ test_that(".cluster_gliph1 filters by cluster_min_size", { global_vgene = FALSE, public_tcrs = TRUE, cluster_min_size = 3, - verbose = FALSE + verbose = FALSE, + BPPARAM = BiocParallel::SerialParam() ) if (!is.null(result$cluster_properties)) { @@ -135,7 +135,8 @@ test_that(".cluster_gliph1 returns NULL for empty edge list", { global_vgene = FALSE, public_tcrs = TRUE, cluster_min_size = 2, - verbose = FALSE + verbose = FALSE, + BPPARAM = BiocParallel::SerialParam() ) expect_null(result$cluster_properties) @@ -165,7 +166,8 @@ test_that(".cluster_gliph1 adds singletons from not_in_global_ids", { global_vgene = FALSE, public_tcrs = TRUE, cluster_min_size = 1, - verbose = FALSE + verbose = FALSE, + BPPARAM = BiocParallel::SerialParam() ) # The clone_network should contain singleton rows @@ -204,7 +206,8 @@ test_that(".cluster_gliph2 returns list with expected elements", { motif_distance_cutoff = 1, cluster_min_size = 1, boost_local_significance = FALSE, - verbose = FALSE + verbose = FALSE, + BPPARAM = BiocParallel::SerialParam() ) expect_type(result, "list") @@ -242,7 +245,8 @@ test_that(".cluster_gliph2 clone_network has expected edge types", { motif_distance_cutoff = 10, cluster_min_size = 1, boost_local_significance = FALSE, - verbose = FALSE + verbose = FALSE, + BPPARAM = BiocParallel::SerialParam() ) if (!is.null(result$clone_network)) { @@ -283,7 +287,8 @@ test_that(".cluster_gliph2 cluster_min_size filters small clusters", { motif_distance_cutoff = 10, cluster_min_size = 5, boost_local_significance = FALSE, - verbose = FALSE + verbose = FALSE, + BPPARAM = BiocParallel::SerialParam() ) # Cluster of size 2 should be eliminated by min_size = 5 @@ -310,7 +315,8 @@ test_that(".cluster_gliph2 handles no local and no global similarities", { motif_distance_cutoff = 1, cluster_min_size = 2, boost_local_significance = FALSE, - verbose = FALSE + verbose = FALSE, + BPPARAM = BiocParallel::SerialParam() ) expect_null(result$merged_clusters) @@ -347,7 +353,8 @@ test_that(".cluster_gliph2 handles global results only", { motif_distance_cutoff = 1, cluster_min_size = 1, boost_local_significance = FALSE, - verbose = FALSE + verbose = FALSE, + BPPARAM = BiocParallel::SerialParam() ) expect_type(result, "list") @@ -384,7 +391,8 @@ test_that(".cluster_gliph2 with global_vgene FALSE for global clusters", { motif_distance_cutoff = 1, cluster_min_size = 1, boost_local_significance = FALSE, - verbose = FALSE + verbose = FALSE, + BPPARAM = BiocParallel::SerialParam() ) expect_type(result, "list") @@ -436,7 +444,8 @@ test_that(".cluster_gliph2 merges local and global clusters", { motif_distance_cutoff = 10, cluster_min_size = 1, boost_local_significance = FALSE, - verbose = FALSE + verbose = FALSE, + BPPARAM = BiocParallel::SerialParam() ) expect_type(result, "list") @@ -478,7 +487,8 @@ test_that(".cluster_gliph2 applies BLOSUM62 filtering when all_aa_interchangeabl motif_distance_cutoff = 1, cluster_min_size = 1, boost_local_significance = FALSE, - verbose = FALSE + verbose = FALSE, + BPPARAM = BiocParallel::SerialParam() ) expect_type(result, "list") @@ -509,7 +519,8 @@ test_that(".cluster_gliph1 restricts edges to same donor when public_tcrs is FAL global_vgene = FALSE, public_tcrs = FALSE, cluster_min_size = 1, - verbose = FALSE + verbose = FALSE, + BPPARAM = BiocParallel::SerialParam() ) expect_type(result, "list") @@ -537,7 +548,8 @@ test_that(".cluster_gliph1 filters global edges by V-gene when global_vgene is T global_vgene = TRUE, public_tcrs = TRUE, cluster_min_size = 1, - verbose = FALSE + verbose = FALSE, + BPPARAM = BiocParallel::SerialParam() ) expect_type(result, "list") @@ -566,7 +578,8 @@ test_that(".cluster_gliph1 prints message when verbose is TRUE", { global_vgene = FALSE, public_tcrs = TRUE, cluster_min_size = 1, - verbose = TRUE + verbose = TRUE, + BPPARAM = BiocParallel::SerialParam() ), "GLIPH1" ) @@ -587,7 +600,8 @@ test_that(".cluster_gliph1 creates singleton network from NULL clone_network", { global_vgene = FALSE, public_tcrs = TRUE, cluster_min_size = 1, - verbose = FALSE + verbose = FALSE, + BPPARAM = BiocParallel::SerialParam() ) expect_true(!is.null(result$clone_network)) @@ -624,7 +638,8 @@ test_that(".cluster_gliph2 works with structboundaries FALSE", { motif_distance_cutoff = 10, cluster_min_size = 1, boost_local_significance = FALSE, - verbose = FALSE + verbose = FALSE, + BPPARAM = BiocParallel::SerialParam() ) expect_type(result, "list") @@ -660,7 +675,8 @@ test_that(".cluster_gliph2 generates save_cluster_list_df", { motif_distance_cutoff = 10, cluster_min_size = 1, boost_local_significance = FALSE, - verbose = FALSE + verbose = FALSE, + BPPARAM = BiocParallel::SerialParam() ) if (!is.null(result$merged_clusters) && length(result$cluster_list) > 0) { @@ -700,7 +716,8 @@ test_that(".cluster_gliph2 prints message when verbose is TRUE", { motif_distance_cutoff = 10, cluster_min_size = 1, boost_local_significance = FALSE, - verbose = TRUE + verbose = TRUE, + BPPARAM = BiocParallel::SerialParam() ), "GLIPH2" ) @@ -734,7 +751,8 @@ test_that(".cluster_gliph1 works without patient info", { global_vgene = FALSE, public_tcrs = TRUE, cluster_min_size = 1, - verbose = FALSE + verbose = FALSE, + BPPARAM = BiocParallel::SerialParam() ) expect_type(result, "list") @@ -771,7 +789,8 @@ test_that(".cluster_gliph2 includes TRBV in tag when global_vgene is TRUE", { motif_distance_cutoff = 1, cluster_min_size = 1, boost_local_significance = FALSE, - verbose = FALSE + verbose = FALSE, + BPPARAM = BiocParallel::SerialParam() ) if (!is.null(result$merged_clusters)) { @@ -809,7 +828,8 @@ test_that(".cluster_gliph2 handles infinite OvE values", { motif_distance_cutoff = 10, cluster_min_size = 1, boost_local_significance = FALSE, - verbose = FALSE + verbose = FALSE, + BPPARAM = BiocParallel::SerialParam() ) expect_type(result, "list") @@ -850,7 +870,8 @@ test_that(".cluster_gliph1 handles duplicate cluster names with suffixes", { global_vgene = FALSE, public_tcrs = TRUE, cluster_min_size = 1, - verbose = FALSE + verbose = FALSE, + BPPARAM = BiocParallel::SerialParam() ) expect_type(result, "list") @@ -891,7 +912,8 @@ test_that(".cluster_gliph2 motif_distance_cutoff = 0 eliminates positionally dis motif_distance_cutoff = 0, cluster_min_size = 1, boost_local_significance = FALSE, - verbose = FALSE + verbose = FALSE, + BPPARAM = BiocParallel::SerialParam() ) result_lenient <- immGLIPH:::.cluster_gliph2( @@ -907,7 +929,8 @@ test_that(".cluster_gliph2 motif_distance_cutoff = 0 eliminates positionally dis motif_distance_cutoff = 100, cluster_min_size = 1, boost_local_significance = FALSE, - verbose = FALSE + verbose = FALSE, + BPPARAM = BiocParallel::SerialParam() ) # Strict cutoff should have fewer or equal edges diff --git a/tests/testthat/test-global-fisher.R b/tests/testthat/test-global-fisher.R index 1c22b9b..4104ca6 100644 --- a/tests/testthat/test-global-fisher.R +++ b/tests/testthat/test-global-fisher.R @@ -1,8 +1,5 @@ # Tests for .global_fisher() -# Register sequential backend for %dopar% -foreach::registerDoSEQ() - # ---- Helper: small synthetic data ------------------------------------------ .make_global_fisher_data <- function() { @@ -62,7 +59,7 @@ test_that(".global_fisher returns list with cluster_list and global_similarities boundary_size = 3, global_vgene = FALSE, all_aa_interchangeable = TRUE, - no_cores = 1, + BPPARAM = BiocParallel::SerialParam(), verbose = FALSE ) @@ -86,7 +83,7 @@ test_that(".global_fisher cluster_list has expected columns when similarities fo boundary_size = 3, global_vgene = FALSE, all_aa_interchangeable = TRUE, - no_cores = 1, + BPPARAM = BiocParallel::SerialParam(), verbose = FALSE ) @@ -129,7 +126,7 @@ test_that(".global_fisher returns FALSE for no structural matches", { boundary_size = 3, global_vgene = FALSE, all_aa_interchangeable = TRUE, - no_cores = 1, + BPPARAM = BiocParallel::SerialParam(), verbose = FALSE ) @@ -165,7 +162,7 @@ test_that(".global_fisher with global_vgene restricts to same V-gene", { boundary_size = 3, global_vgene = TRUE, all_aa_interchangeable = TRUE, - no_cores = 1, + BPPARAM = BiocParallel::SerialParam(), verbose = FALSE ) @@ -179,7 +176,7 @@ test_that(".global_fisher with global_vgene restricts to same V-gene", { boundary_size = 3, global_vgene = FALSE, all_aa_interchangeable = TRUE, - no_cores = 1, + BPPARAM = BiocParallel::SerialParam(), verbose = FALSE ) diff --git a/tests/testthat/test-globalCutoff.R b/tests/testthat/test-globalCutoff.R index 2be07f4..9f0b5c8 100644 --- a/tests/testthat/test-globalCutoff.R +++ b/tests/testthat/test-globalCutoff.R @@ -1,8 +1,5 @@ # Tests for .global_cutoff() and related functions -# Register a sequential backend for internal functions that use %dopar% -foreach::registerDoSEQ() - # ---- .global_cutoff_stringdist ----------------------------------------------- test_that(".global_cutoff_stringdist returns correct structure", { @@ -21,7 +18,7 @@ test_that(".global_cutoff_stringdist returns correct structure", { sequences = sequences, gccutoff = 5, global_vgene = FALSE, - no_cores = 1, + BPPARAM = BiocParallel::SerialParam(), verbose = FALSE ) @@ -50,7 +47,7 @@ test_that(".global_cutoff_stringdist returns empty for zero cutoff on different sequences = sequences, gccutoff = 0, global_vgene = FALSE, - no_cores = 1, + BPPARAM = BiocParallel::SerialParam(), verbose = FALSE ) @@ -75,7 +72,7 @@ test_that(".global_cutoff dispatches correctly", { sequences = sequences, gccutoff = 5, global_vgene = FALSE, - no_cores = 1, + BPPARAM = BiocParallel::SerialParam(), verbose = FALSE ) @@ -105,7 +102,7 @@ test_that("immApex buildNetwork backend matches stringdist backend", { sequences = sequences, gccutoff = 3, global_vgene = FALSE, - no_cores = 1, + BPPARAM = BiocParallel::SerialParam(), verbose = FALSE ) @@ -146,7 +143,7 @@ test_that(".global_cutoff_stringdist returns zero edges for single sequence", { sequences = sequences, gccutoff = 5, global_vgene = FALSE, - no_cores = 1, + BPPARAM = BiocParallel::SerialParam(), verbose = FALSE ) @@ -170,7 +167,7 @@ test_that(".global_cutoff_stringdist finds edges for identical sequences", { sequences = sequences, gccutoff = 0, global_vgene = FALSE, - no_cores = 1, + BPPARAM = BiocParallel::SerialParam(), verbose = FALSE ) @@ -195,11 +192,11 @@ test_that(".global_cutoff_stringdist with global_vgene restricts to same V-gene" result_same <- immGLIPH:::.global_cutoff_stringdist( seqs = seqs, motif_region = motif_region, sequences = sequences_same, - gccutoff = 5, global_vgene = TRUE, no_cores = 1, verbose = FALSE + gccutoff = 5, global_vgene = TRUE, BPPARAM = BiocParallel::SerialParam(), verbose = FALSE ) result_diff <- immGLIPH:::.global_cutoff_stringdist( seqs = seqs, motif_region = motif_region, sequences = sequences_diff, - gccutoff = 5, global_vgene = TRUE, no_cores = 1, verbose = FALSE + gccutoff = 5, global_vgene = TRUE, BPPARAM = BiocParallel::SerialParam(), verbose = FALSE ) # Same V-gene should find the edge; different V-genes should not @@ -217,7 +214,7 @@ test_that(".global_cutoff not_in_global_ids tracks isolated sequences", { result <- immGLIPH:::.global_cutoff_stringdist( seqs = seqs, motif_region = motif_region, sequences = sequences, - gccutoff = 0, global_vgene = FALSE, no_cores = 1, verbose = FALSE + gccutoff = 0, global_vgene = FALSE, BPPARAM = BiocParallel::SerialParam(), verbose = FALSE ) # not_in_global_ids should contain indices of sequences without global edges @@ -239,7 +236,7 @@ test_that(".global_cutoff_stringdist prints messages when verbose is TRUE", { expect_message( immGLIPH:::.global_cutoff_stringdist( seqs = seqs, motif_region = motif_region, sequences = sequences, - gccutoff = 5, global_vgene = FALSE, no_cores = 1, verbose = TRUE + gccutoff = 5, global_vgene = FALSE, BPPARAM = BiocParallel::SerialParam(), verbose = TRUE ), "global" ) @@ -257,7 +254,7 @@ test_that(".global_cutoff prints verbose dispatch message", { expect_message( immGLIPH:::.global_cutoff( seqs = seqs, motif_region = motif_region, sequences = sequences, - gccutoff = 5, global_vgene = FALSE, no_cores = 1, verbose = TRUE + gccutoff = 5, global_vgene = FALSE, BPPARAM = BiocParallel::SerialParam(), verbose = TRUE ), "global" ) @@ -333,7 +330,7 @@ test_that(".global_cutoff_stringdist finds edges with high cutoff", { result <- immGLIPH:::.global_cutoff_stringdist( seqs = seqs, motif_region = motif_region, sequences = sequences, - gccutoff = 10, global_vgene = FALSE, no_cores = 1, verbose = FALSE + gccutoff = 10, global_vgene = FALSE, BPPARAM = BiocParallel::SerialParam(), verbose = FALSE ) expect_true(nrow(result$edges) > 0) diff --git a/tests/testthat/test-local-fisher.R b/tests/testthat/test-local-fisher.R index 61d1f57..e338b68 100644 --- a/tests/testthat/test-local-fisher.R +++ b/tests/testthat/test-local-fisher.R @@ -1,7 +1,5 @@ # Tests for .local_fisher() -# Register sequential backend for %dopar% -foreach::registerDoSEQ() # ---- Helper: small synthetic data ------------------------------------------ @@ -55,7 +53,7 @@ test_that(".local_fisher returns list with selected_motifs and all_motifs", { lcminove = c(1, 1), discontinuous_motifs = FALSE, motif_distance_cutoff = 1, - no_cores = 1, + BPPARAM = BiocParallel::SerialParam(), verbose = FALSE ) @@ -78,7 +76,7 @@ test_that(".local_fisher all_motifs has expected columns", { lcminove = c(0, 0), discontinuous_motifs = FALSE, motif_distance_cutoff = 1, - no_cores = 1, + BPPARAM = BiocParallel::SerialParam(), verbose = FALSE ) @@ -104,7 +102,7 @@ test_that(".local_fisher p-values are numeric and in [0, 1]", { lcminove = c(0, 0), discontinuous_motifs = FALSE, motif_distance_cutoff = 1, - no_cores = 1, + BPPARAM = BiocParallel::SerialParam(), verbose = FALSE ) @@ -127,7 +125,7 @@ test_that(".local_fisher OvE is numeric", { lcminove = c(0, 0), discontinuous_motifs = FALSE, motif_distance_cutoff = 1, - no_cores = 1, + BPPARAM = BiocParallel::SerialParam(), verbose = FALSE ) @@ -144,14 +142,14 @@ test_that(".local_fisher kmer_mindepth filters out rare motifs", { seqs = d$sample_seqs, refseqs = d$ref_seqs, sequences = d$sequences, motif_length = 2, kmer_mindepth = 1, lcminp = 1.0, lcminove = 0, discontinuous_motifs = FALSE, motif_distance_cutoff = 1, - no_cores = 1, verbose = FALSE + BPPARAM = BiocParallel::SerialParam(), verbose = FALSE ) result_high <- immGLIPH:::.local_fisher( motif_region = d$motif_region, refseqs_motif_region = d$ref_motif_region, seqs = d$sample_seqs, refseqs = d$ref_seqs, sequences = d$sequences, motif_length = 2, kmer_mindepth = 5, lcminp = 1.0, lcminove = 0, discontinuous_motifs = FALSE, motif_distance_cutoff = 1, - no_cores = 1, verbose = FALSE + BPPARAM = BiocParallel::SerialParam(), verbose = FALSE ) # Higher mindepth should yield fewer or equal selected motifs @@ -171,14 +169,14 @@ test_that(".local_fisher lcminp filters out high p-value motifs", { seqs = d$sample_seqs, refseqs = d$ref_seqs, sequences = d$sequences, motif_length = 2, kmer_mindepth = 1, lcminp = 1e-10, lcminove = 0, discontinuous_motifs = FALSE, motif_distance_cutoff = 1, - no_cores = 1, verbose = FALSE + BPPARAM = BiocParallel::SerialParam(), verbose = FALSE ) result_lenient <- immGLIPH:::.local_fisher( motif_region = d$motif_region, refseqs_motif_region = d$ref_motif_region, seqs = d$sample_seqs, refseqs = d$ref_seqs, sequences = d$sequences, motif_length = 2, kmer_mindepth = 1, lcminp = 1.0, lcminove = 0, discontinuous_motifs = FALSE, motif_distance_cutoff = 1, - no_cores = 1, verbose = FALSE + BPPARAM = BiocParallel::SerialParam(), verbose = FALSE ) expect_true(nrow(result_strict$selected_motifs) <= @@ -195,7 +193,7 @@ test_that(".local_fisher lcminove filters by fold change per motif length", { motif_length = c(2, 3), kmer_mindepth = 1, lcminp = 1.0, lcminove = c(1e6, 1e6), discontinuous_motifs = FALSE, motif_distance_cutoff = 1, - no_cores = 1, verbose = FALSE + BPPARAM = BiocParallel::SerialParam(), verbose = FALSE ) expect_equal(nrow(result$selected_motifs), 0) @@ -210,7 +208,7 @@ test_that(".local_fisher includes discontinuous motifs when enabled", { seqs = d$sample_seqs, refseqs = d$ref_seqs, sequences = d$sequences, motif_length = 2, kmer_mindepth = 1, lcminp = 1.0, lcminove = 0, discontinuous_motifs = TRUE, motif_distance_cutoff = 1, - no_cores = 1, verbose = FALSE + BPPARAM = BiocParallel::SerialParam(), verbose = FALSE ) disc <- result$all_motifs[grep("\\.", result$all_motifs$motif), ] @@ -237,7 +235,7 @@ test_that(".local_fisher works via immApex when available", { lcminove = c(1, 1), discontinuous_motifs = FALSE, motif_distance_cutoff = 1, - no_cores = 1, + BPPARAM = BiocParallel::SerialParam(), verbose = FALSE ) @@ -266,7 +264,7 @@ test_that(".local_fisher immApex path with discontinuous motifs", { lcminove = c(0, 0), discontinuous_motifs = TRUE, motif_distance_cutoff = 1, - no_cores = 1, + BPPARAM = BiocParallel::SerialParam(), verbose = FALSE ) @@ -291,7 +289,7 @@ test_that(".local_fisher prints messages when verbose is TRUE", { lcminove = c(1, 1), discontinuous_motifs = FALSE, motif_distance_cutoff = 1, - no_cores = 1, + BPPARAM = BiocParallel::SerialParam(), verbose = TRUE ), "motif" @@ -314,7 +312,7 @@ test_that(".local_fisher works with single lcminove value for multiple motif_len lcminove = 1, discontinuous_motifs = FALSE, motif_distance_cutoff = 1, - no_cores = 1, + BPPARAM = BiocParallel::SerialParam(), verbose = FALSE ) @@ -338,7 +336,7 @@ test_that(".local_fisher avgRef is normalized to sample set size", { lcminove = c(0, 0), discontinuous_motifs = FALSE, motif_distance_cutoff = 1, - no_cores = 1, + BPPARAM = BiocParallel::SerialParam(), verbose = FALSE ) @@ -362,7 +360,7 @@ test_that(".local_fisher topRef column is numeric", { lcminove = c(0, 0), discontinuous_motifs = FALSE, motif_distance_cutoff = 1, - no_cores = 1, + BPPARAM = BiocParallel::SerialParam(), verbose = FALSE ) @@ -385,7 +383,7 @@ test_that(".local_fisher returns empty selected_motifs when all below kmer_minde lcminove = c(0, 0), discontinuous_motifs = FALSE, motif_distance_cutoff = 1, - no_cores = 1, + BPPARAM = BiocParallel::SerialParam(), verbose = FALSE ) @@ -412,7 +410,7 @@ test_that(".local_fisher correctly applies per-length lcminove thresholds", { lcminove = c(1e6, 0), discontinuous_motifs = FALSE, motif_distance_cutoff = 1, - no_cores = 1, + BPPARAM = BiocParallel::SerialParam(), verbose = FALSE ) @@ -439,7 +437,7 @@ test_that(".local_fisher works with motif_length = 4", { lcminove = 0, discontinuous_motifs = FALSE, motif_distance_cutoff = 1, - no_cores = 1, + BPPARAM = BiocParallel::SerialParam(), verbose = FALSE ) @@ -466,7 +464,7 @@ test_that(".local_fisher all_motifs counts are positive", { lcminove = c(0, 0), discontinuous_motifs = FALSE, motif_distance_cutoff = 1, - no_cores = 1, + BPPARAM = BiocParallel::SerialParam(), verbose = FALSE ) @@ -489,7 +487,7 @@ test_that(".local_fisher num_in_ref is non-negative", { lcminove = c(0, 0), discontinuous_motifs = FALSE, motif_distance_cutoff = 1, - no_cores = 1, + BPPARAM = BiocParallel::SerialParam(), verbose = FALSE ) diff --git a/tests/testthat/test-local-rrs.R b/tests/testthat/test-local-rrs.R index 1617c39..b5e20bd 100644 --- a/tests/testthat/test-local-rrs.R +++ b/tests/testthat/test-local-rrs.R @@ -1,8 +1,5 @@ # Tests for .local_rrs() -# Register sequential backend for %dopar% -foreach::registerDoSEQ() - # ---- Helper: small synthetic data ------------------------------------------ .make_rrs_data <- function() { @@ -54,7 +51,7 @@ test_that(".local_rrs returns list with sample_log, selected_motifs, all_motifs" discontinuous_motifs = FALSE, cdr3_len_stratify = FALSE, vgene_stratify = FALSE, - no_cores = 1, + BPPARAM = BiocParallel::SerialParam(), verbose = FALSE, motif_lengths_list = list(), ref_motif_lengths_id_list = list(), @@ -88,7 +85,7 @@ test_that(".local_rrs sample_log has sim_depth + 1 rows", { discontinuous_motifs = FALSE, cdr3_len_stratify = FALSE, vgene_stratify = FALSE, - no_cores = 1, + BPPARAM = BiocParallel::SerialParam(), verbose = FALSE, motif_lengths_list = list(), ref_motif_lengths_id_list = list(), @@ -119,7 +116,7 @@ test_that(".local_rrs all_motifs has expected columns", { discontinuous_motifs = FALSE, cdr3_len_stratify = FALSE, vgene_stratify = FALSE, - no_cores = 1, + BPPARAM = BiocParallel::SerialParam(), verbose = FALSE, motif_lengths_list = list(), ref_motif_lengths_id_list = list(), @@ -151,7 +148,7 @@ test_that(".local_rrs p-values are in (0, 1]", { discontinuous_motifs = FALSE, cdr3_len_stratify = FALSE, vgene_stratify = FALSE, - no_cores = 1, + BPPARAM = BiocParallel::SerialParam(), verbose = FALSE, motif_lengths_list = list(), ref_motif_lengths_id_list = list(), @@ -177,7 +174,7 @@ test_that(".local_rrs kmer_mindepth filters correctly", { motif_length = 2, sim_depth = 5, kmer_mindepth = 5, lcminp = 1.0, lcminove = 0, discontinuous_motifs = FALSE, cdr3_len_stratify = FALSE, vgene_stratify = FALSE, - no_cores = 1, verbose = FALSE, + BPPARAM = BiocParallel::SerialParam(), verbose = FALSE, motif_lengths_list = list(), ref_motif_lengths_id_list = list(), motif_region_vgenes_list = list(), ref_motif_vgenes_id_list = list(), lengths_vgenes_list = list(), ref_lengths_vgenes_list = list() @@ -198,7 +195,7 @@ test_that(".local_rrs strict lcminp yields fewer selected motifs", { motif_length = 2, sim_depth = 5, kmer_mindepth = 1, lcminp = lcminp, lcminove = 0, discontinuous_motifs = FALSE, cdr3_len_stratify = FALSE, vgene_stratify = FALSE, - no_cores = 1, verbose = FALSE, + BPPARAM = BiocParallel::SerialParam(), verbose = FALSE, motif_lengths_list = list(), ref_motif_lengths_id_list = list(), motif_region_vgenes_list = list(), ref_motif_vgenes_id_list = list(), lengths_vgenes_list = list(), ref_lengths_vgenes_list = list() diff --git a/tests/testthat/test-utils-parallel.R b/tests/testthat/test-utils-parallel.R index b0a8a24..23aea87 100644 --- a/tests/testthat/test-utils-parallel.R +++ b/tests/testthat/test-utils-parallel.R @@ -1,56 +1,38 @@ -# Tests for .setup_parallel() and .stop_parallel() +# Tests for .setup_parallel() -# ---- .setup_parallel with n_cores = 1 ---------------------------------------- - -test_that(".setup_parallel with 1 core returns 1", { +test_that(".setup_parallel with 1 core returns a SerialParam", { result <- immGLIPH:::.setup_parallel(1) - expect_equal(result, 1L) + expect_s4_class(result, "SerialParam") }) -# ---- .setup_parallel with NULL auto-detects ----------------------------------- - -test_that(".setup_parallel with NULL returns at least 1", { +test_that(".setup_parallel with NULL returns a valid BiocParallelParam", { result <- immGLIPH:::.setup_parallel(NULL) - expect_true(result >= 1L) - expect_true(result <= parallel::detectCores()) - # Clean up - immGLIPH:::.stop_parallel() + expect_true(is(result, "BiocParallelParam")) }) -# ---- .setup_parallel clamps to valid range ------------------------------------ +test_that(".setup_parallel returns MulticoreParam on non-Windows with multiple cores", { + skip_on_os("windows") + skip_if(parallel::detectCores() < 3, + "Need at least 3 cores to test MulticoreParam") + # Respect _R_CHECK_LIMIT_CORES_ — only expect MulticoreParam when allowed + chk <- tolower(Sys.getenv("_R_CHECK_LIMIT_CORES_", "")) + skip_if(nzchar(chk) && chk != "false", + "_R_CHECK_LIMIT_CORES_ is set; MulticoreParam may be clamped") + result <- immGLIPH:::.setup_parallel(2) + expect_s4_class(result, "MulticoreParam") +}) test_that(".setup_parallel clamps excessive core count", { - max_cores <- parallel::detectCores() - result <- immGLIPH:::.setup_parallel(max_cores + 100) - expect_true(result <= max_cores) - expect_true(result >= 1L) - # Clean up - immGLIPH:::.stop_parallel() + result <- immGLIPH:::.setup_parallel(9999) + expect_true(is(result, "BiocParallelParam")) }) -test_that(".setup_parallel clamps negative core count to 1", { +test_that(".setup_parallel clamps negative core count to SerialParam", { result <- immGLIPH:::.setup_parallel(-5) - expect_equal(result, 1L) + expect_s4_class(result, "SerialParam") }) -test_that(".setup_parallel clamps zero to 1", { +test_that(".setup_parallel clamps zero to SerialParam", { result <- immGLIPH:::.setup_parallel(0) - expect_equal(result, 1L) -}) - -# ---- .stop_parallel runs without error ---------------------------------------- - -test_that(".stop_parallel executes without error", { - immGLIPH:::.setup_parallel(1) - expect_no_error(immGLIPH:::.stop_parallel()) -}) - -# ---- Sequential fallback on Windows or 1 core --------------------------------- - -test_that(".setup_parallel registers sequential backend for single core", { - result <- immGLIPH:::.setup_parallel(1) - expect_equal(result, 1L) - # After registerDoSEQ, foreach should still work - res <- foreach::foreach(i = 1:3, .combine = c) %dopar% { i * 2 } - expect_equal(res, c(2, 4, 6)) + expect_s4_class(result, "SerialParam") }) diff --git a/vignettes/immGLIPH.Rmd b/vignettes/immGLIPH.Rmd index 7330e46..fc5d89a 100644 --- a/vignettes/immGLIPH.Rmd +++ b/vignettes/immGLIPH.Rmd @@ -16,9 +16,11 @@ vignette: > --- ```{r setup, include=FALSE} +library(BiocStyle) knitr::opts_chunk$set( echo = TRUE, message = FALSE, + tidy = FALSE, warning = FALSE, fig.width = 7, fig.height = 5 @@ -51,10 +53,10 @@ the original publications:** ## Installation -immGLIPH can be installed from GitHub: +immGLIPH can be installed from Bioconductor: ```{r eval=FALSE} -devtools::install_github("BorchLab/immGLIPH") +BiocManager::install("immGLIPH") ``` The reference repertoire data (~19 MB) is downloaded automatically the first @@ -67,6 +69,8 @@ BiocManager::install("BiocFileCache") ref <- getGLIPHreference() ``` +### Loading Package + ```{r} library(immGLIPH) ``` @@ -74,11 +78,11 @@ library(immGLIPH) ## Integration with the scRepertoire Ecosystem immGLIPH integrates with the -[scRepertoire](https://github.com/BorchLab/scRepertoire) ecosystem through -[immApex](https://github.com/BorchLab/immApex). This means `runGLIPH()` can -directly accept: +[scRepertoire](https://bioconductor.org/packages/scRepertoire/) ecosystem +through [immApex](https://bioconductor.org/packages/immApex/). Both +scRepertoire and immApex are Bioconductor packages and can be installed via +`BiocManager::install()`. This means `runGLIPH()` can directly accept: -- **Seurat** objects with TCR information - **SingleCellExperiment** objects with TCR information - **combineTCR()** output lists from scRepertoire - Standard data frames or character vectors @@ -125,25 +129,27 @@ scRepertoire to prepare your data and pass the output directly to immGLIPH. library(scRepertoire) # After processing with cellranger/etc, combine contigs -combined <- combineTCR( - contig_list, - samples = c("P1", "P2"), - cells = "T-AB" -) +combined <- combineTCR(contig_list[1:2], + samples = c("P1", "P2")) # Pass scRepertoire output directly to runGLIPH -results <- runGLIPH(combined, method = "gliph2") +results <- runGLIPH(combined, + method = "gliph2") ``` -For **Seurat** or **SingleCellExperiment** objects that already contain TCR -metadata (e.g., added via `scRepertoire::combineExpression()`), immGLIPH -extracts the receptor data automatically using `immApex::getIR()`: +For **SingleCellExperiment** objects that already contain TCR metadata (e.g., +added via `scRepertoire::combineExpression()`), immGLIPH extracts the receptor +data automatically using `immApex::getIR()`. Here is an example using the +bundled `gliph_sce` dataset: ```{r eval=FALSE} -library(Seurat) +library(SingleCellExperiment) +data("gliph_sce") -# Seurat object with TCR info in metadata -results <- runGLIPH(seurat_obj, method = "gliph2", chains = "TRB") +# SingleCellExperiment object with TCR info in colData +results <- runGLIPH(gliph_sce, + method = "gliph2", + chains = "TRB") ``` # The `runGLIPH()` Function @@ -152,7 +158,7 @@ results <- runGLIPH(seurat_obj, method = "gliph2", chains = "TRB") | Argument | Default | Description | |:---------|:--------|:------------| -| `cdr3_sequences` | -- | Input data (data frame, vector, Seurat, SCE, or list) | +| `cdr3_sequences` | -- | Input data (data frame, vector, SCE, or list) | | `method` | `"gliph2"` | Algorithm preset: `"gliph1"`, `"gliph2"`, or `"custom"` | | `sim_depth` | 1000 | Simulation depth (higher = more reproducible, slower) | | `n_cores` | 1 | Number of parallel cores | @@ -377,18 +383,19 @@ data("gliph_input_data") sample_seqs <- as.character(gliph_input_data$CDR3b[seq_len(100)]) # Find all 3-mers appearing at least 5 times -motifs <- findMotifs(seqs = sample_seqs, q = 3, kmer_mindepth = 5) +motifs <- findMotifs(seqs = sample_seqs, + q = 3, + kmer_mindepth = 5) head(motifs[order(motifs$V1, decreasing = TRUE), ]) ``` Including discontinuous motifs (e.g., `C.S` where `.` is any amino acid): ```{r} -disc_motifs <- findMotifs( - seqs = sample_seqs, - q = 2, - kmer_mindepth = 5, - discontinuous = TRUE +disc_motifs <- findMotifs(seqs = sample_seqs, + q = 2, + kmer_mindepth = 5, + discontinuous = TRUE ) # Show discontinuous motifs (those containing a dot) disc_only <- disc_motifs[grep("\\.", disc_motifs$motif), ] @@ -428,17 +435,18 @@ data): ## Example -```{r eval=FALSE} -# Re-score with GLIPH2 formula and higher simulation depth -rescored <- clusterScoring( - cluster_list = res_gliph1$cluster_list, - cdr3_sequences = gliph_input_data[seq_len(200), ], - refdb_beta = "human_v2.0_CD48", - gliph_version = 2, - sim_depth = 500, - n_cores = 1 -) -head(rescored) +```{r} +# Re-score with GLIPH2 formula +if (length(res_gliph1$cluster_list) > 0) { + rescored <- clusterScoring( + cluster_list = res_gliph1$cluster_list, + cdr3_sequences = gliph_input_data[seq_len(200), ], + refdb_beta = ref_df, + gliph_version = 2, + sim_depth = 100, + n_cores = 1) + head(rescored) +} ``` # De Novo TCR Generation with `deNovoTCRs()` @@ -462,22 +470,25 @@ characteristics. ## Example -```{r eval=FALSE} -# Generate de novo TCRs for the top convergence group -de_novo <- deNovoTCRs( - convergence_group_tag = res_gliph1$cluster_properties$tag[1], - clustering_output = res_gliph1, - sims = 10000, - num_tops = 100, - make_figure = TRUE, - n_cores = 1 -) - -# Top predicted sequences -head(de_novo$de_novo_sequences) - -# Positional weight matrix used for generation -head(de_novo$PWM_Scoring) +```{r} +# Generate de novo TCRs for the first convergence group (if any found) +if (length(res_gliph1$cluster_list) > 0) { + de_novo <- deNovoTCRs( + convergence_group_tag = names(res_gliph1$cluster_list)[1], + clustering_output = res_gliph1, + refdb_beta = ref_df, + sims = 10000, + num_tops = 100, + make_figure = FALSE, + n_cores = 1 + ) + + # Top predicted sequences + head(de_novo$de_novo_sequences) + + # Positional weight matrix used for generation + head(de_novo$PWM_Scoring) +} ``` # Network Visualization with `plotNetwork()` @@ -499,24 +510,16 @@ the convergence groups using the visNetwork package. ## Example -```{r eval=FALSE} -plotNetwork( - clustering_output = res_gliph2, - color_info = "total.score", - cluster_min_size = 3, - n_cores = 1 -) -``` - -You can also color nodes by donor/patient: - -```{r eval=FALSE} -plotNetwork( - clustering_output = res_gliph2, - color_info = "patient", - cluster_min_size = 3, - n_cores = 1 -) +```{r} +if (!is.null(res_gliph1$cluster_properties) && + nrow(res_gliph1$cluster_properties) > 0) { + plotNetwork( + clustering_output = res_gliph1, + color_info = "total.score", + cluster_min_size = 2, + n_cores = 1 + ) +} ``` # Loading Saved Results with `loadGLIPH()` @@ -553,9 +556,9 @@ When `result_folder` is specified, `runGLIPH()` writes several output files: ## Accelerated Computation with immApex -When [immApex](https://github.com/BorchLab/immApex) is installed, immGLIPH -automatically uses its C++-accelerated backends for two computationally -intensive steps: +When [immApex](https://bioconductor.org/packages/immApex/) is installed, +immGLIPH automatically uses its C++-accelerated backends for two +computationally intensive steps: 1. **Motif enumeration** (`findMotifs()`): Uses `immApex::calculateMotif()` with OpenMP multithreading instead of the pure-R `stringdist::qgrams()` @@ -563,15 +566,14 @@ intensive steps: 2. **Global Hamming distance network** (GLIPH1 method): Uses `immApex::buildNetwork()` to compute pairwise distances in a single C++ - call, replacing the `foreach`-based parallel loop over - `stringdist::stringdist()`. + call, replacing the parallel loop over `stringdist::stringdist()`. If immApex is not installed, immGLIPH falls back to the original pure-R -implementations transparently---no code changes are needed. +implementations transparently, no code changes are needed. ```{r eval=FALSE} # Install immApex for performance acceleration -devtools::install_github("BorchLab/immApex") +BiocManager::install("BorchLab/immApex") ``` # Tips and Best Practices @@ -589,7 +591,7 @@ devtools::install_github("BorchLab/immApex") `sim_depth >= 1000`. For exploratory analysis, `sim_depth = 100` is faster. 5. **Parallelization**: For large datasets (>5,000 sequences), set - `n_cores > 1` to use parallel processing. + `n_cores > 1` to use parallel processing via BiocParallel. 6. **Install immApex**: For best performance, install immApex to enable C++-accelerated motif enumeration and network construction (see