diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..7b732e7 --- /dev/null +++ b/.gitignore @@ -0,0 +1,5 @@ +.Rproj.user +.Rhistory +.RData +.Ruserdata +.DS_Store diff --git a/DESCRIPTION b/DESCRIPTION index c603411..0f008b4 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,20 +1,32 @@ Package: FastGxC Title: Efficient and Powerful Software for Simulating Data and Detecting Context-specific eQTL Effects in Multi-context Genomic Studies -Version: 0.0.0.9000 +Version: 0.0.1 Authors@R: person("Lena", "Krockenberger", , "lkrocken@g.ucla.edu", role = c("aut", "cre"), comment = c(ORCID = "0009-0002-5987-5496")) -Description: Functions to run FastGxC decomposition and subsequent context-specific eQTL mapping with multiple testing adjustment. This software also can be used to simulate data and run eQTL mapping on toy examples. -Suggests: +Description: Functions to run FastGxC decomposition and subsequent context-specific eQTL mapping with multiple testing adjustment. This software can also simulate data and run eQTL mapping on toy examples. +License: MIT + file LICENSE +Encoding: UTF-8 +Roxygen: list(markdown = TRUE) +RoxygenNote: 7.3.3 +Depends: R (>= 4.2.0) +Imports: + devtools, dplyr, data.table, MatrixEQTL, mvtnorm, reshape2, magrittr, - TreeQTL -License: `use_mit_license()`, `use_gpl3_license()` or friends to pick a - license -Encoding: UTF-8 -Roxygen: list(markdown = TRUE) -RoxygenNote: 7.3.2 + reticulate, + Rcpp +LinkingTo: + Rcpp +Suggests: + qvalue, + TreeQTL, + TreeBH, + knitr, + rmarkdown, + testthat (>= 3.0.0) +Config/testthat/edition: 3 diff --git a/NAMESPACE b/NAMESPACE index d75a3a3..2153438 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -6,5 +6,18 @@ export(eQTL_mapping_step) export(get_eGenes_multi_tissue_mod) export(get_eSNPs_multi_tissue_mod) export(get_pvals_and_fam_p_mod) +export(install_deps) export(simulate_data) +export(to_TreeBH_input) +export(treeBH_step) export(treeQTL_step) +importFrom(Rcpp,sourceCpp) +importFrom(data.table,fread) +importFrom(data.table,getDTthreads) +importFrom(data.table,setDTthreads) +importFrom(dplyr,distinct) +importFrom(dplyr,group_by) +importFrom(dplyr,mutate) +importFrom(dplyr,select) +importFrom(magrittr,"%>%") +useDynLib(FastGxC, .registration = TRUE) diff --git a/R/RcppExports.R b/R/RcppExports.R new file mode 100644 index 0000000..9cdc23e --- /dev/null +++ b/R/RcppExports.R @@ -0,0 +1,31 @@ +# Generated by using Rcpp::compileAttributes() -> do not edit by hand +# Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393 + +#' Check if groups are properly nested in hierarchy +checkNested <- function(groups, L) { + .Call(`_FastGxC_checkNested`, groups, L) +} + +#' Hierarchical Group P-value Calculation +#' The getGroupPvalues function computes aggregated p-values for each group in a hierarchical +#' structure by combining p-values from lower levels using specified aggregation methods. +#' @param pvals NumericVector containing p-values for individual hypotheses +#' @param groups CharacterMatrix defining the group hierarchy structure +#' @param N Integer specifying the number of hypotheses +#' @param L Integer specifying the number of levels in the hierarchy +#' @param test CharacterVector specifying the aggregation method(s) +getGroupPvalues <- function(pvals, groups, N, L, test) { + .Call(`_FastGxC_getGroupPvalues`, pvals, groups, N, L, test) +} + +#' Hierarchical Group Selection for Multiple Testing +#' The getGroupSelections function applies hierarchical multiple testing correction. +#' @param group_pvals NumericMatrix containing aggregated p-values +#' @param groups CharacterMatrix defining the group hierarchy structure +#' @param q NumericVector containing the q-value thresholds for each level +#' @param N Integer specifying the number of hypotheses +#' @param L Integer specifying the number of levels in the hierarchy +getGroupSelections <- function(group_pvals, groups, q, N, L) { + .Call(`_FastGxC_getGroupSelections`, group_pvals, groups, q, N, L) +} + diff --git a/R/deps.R b/R/deps.R new file mode 100644 index 0000000..f21c55f --- /dev/null +++ b/R/deps.R @@ -0,0 +1,46 @@ +# R/deps.R + +#' Install FastGxC dependencies +#' +#' Installs CRAN, Bioconductor, and source packages required by FastGxC. +#' This includes optional packages like qvalue, TreeQTL, and TreeBH. +#' +#' @param ask Logical, whether to ask before installing (default: TRUE if interactive). +#' @export +install_deps <- function(ask = interactive()) { + # --- Core CRAN packages (should already install via Imports:) --- + cran_pkgs <- c("devtools","dplyr","data.table","MatrixEQTL", + "mvtnorm","reshape2","magrittr","reticulate") + to_install <- cran_pkgs[!cran_pkgs %in% rownames(installed.packages())] + if (length(to_install)) { + msg <- paste("Installing missing CRAN packages:", paste(to_install, collapse = ", ")) + if (!ask || utils::menu(c("Yes","No"), title = msg) == 1) { + install.packages(to_install) + } + } + + # --- Bioconductor package: qvalue --- + if (!requireNamespace("qvalue", quietly = TRUE)) { + message("Installing Bioconductor package: qvalue") + if (!requireNamespace("BiocManager", quietly = TRUE)) { + install.packages("BiocManager") + } + BiocManager::install("qvalue") + } + + # --- Source-only package: TreeQTL --- + if (!requireNamespace("TreeQTL", quietly = TRUE)) { + message("Installing TreeQTL from source tarball") + utils::install.packages("http://bioinformatics.org/treeqtl/TreeQTL_2.0.tar.gz", + repos = NULL, type = "source") + } + + # --- Source-only package: TreeBH --- + if (!requireNamespace("TreeBH", quietly = TRUE)) { + message("Installing TreeBH from source tarball") + utils::install.packages("https://github.com/user-attachments/files/21328935/TreeBH_1.0.tar.gz", + repos = NULL, type = "source") + } + + message("All FastGxC dependencies are installed or already present.") +} diff --git a/R/eQTL_mapping.R b/R/eQTL_mapping.R index ce70ad8..f898e39 100644 --- a/R/eQTL_mapping.R +++ b/R/eQTL_mapping.R @@ -1,96 +1,183 @@ #' eQTL Mapping Step #' -#' Function to map cis eQTLs - cis window is defined as 1Mb +#' Function to map cis eQTLs using either Matrix eQTL or TensorQTL. #' -#' @param SNP_file_name - full file path with genotypes of all individuals -#' @param snps_location_file_name - full file path with snp ids, start, and end positions -#' @param expression_file_name - full file path with expression matrix of individuals per gene -#' @param gene_location_file_name - full file path with gene ids, start, and end positions -#' @param context - name of context for output file naming purposes -#' @param shared_specific - either "shared" if mapping eQTLs with shared component or "specific" if mapping eQTLs with specific component -#' @param out_dir - full file path of output directory where eQTL file will be written out -#' @return outputs one file of mapped cis eQTLs +#' @param SNP_file_name Full path to SNP genotype matrix. +#' @param snps_location_file_name Full path to SNP location file. +#' @param expression_file_name Full path to expression matrix. +#' @param gene_location_file_name Full path to gene location file. +#' @param context Context name for labeling output. +#' @param shared_specific Context specific or context shared +#' @param out_dir Output directory. +#' @param output_file_name_cis Path to write cis-eQTL output. +#' @param output_file_name_tra Path to write trans-eQTL output. +#' @param method Either "MatrixEQTL" or "tensorQTL". +#' @param use_model MatrixEQTL model (default: modelLINEAR). +#' @param cis_dist Distance threshold for cis-eQTLs. +#' @param pv_threshold_cis P-value threshold for cis. +#' @param pv_threshold_tra P-value threshold for trans. +#' @param error_covariance Covariance matrix (or numeric()). #' +#' @return Writes cis-eQTLs and trans-eQTLs (optional) to disk. +#' @importFrom data.table fread setDTthreads #' @export -eQTL_mapping_step = function(SNP_file_name, snps_location_file_name, expression_file_name, gene_location_file_name, context, shared_specific, out_dir){ - -# Output file name -output_file_name_cis = paste0(out_dir, context, "_", shared_specific, ".all_pairs.txt"); -output_file_name_tra = tempfile(); - -setDTthreads(1) -print(paste0("data.table getDTthreads(): ",getDTthreads())) - -string1 = sprintf("Running analysis for %s \n", expression_file_name) -cat(string1) - -#%%%%%%%%%%%%%%%%%%%%%%%% -#%%%%%%%%%%%%%%%%%%%%%%%% MatrixEQTL parameters -#%%%%%%%%%%%%%%%%%%%%%%%% - -# Linear model to use -useModel = modelLINEAR; - -# Only associations significant at this level will be saved -pvOutputThreshold_cis = 1; -pvOutputThreshold_tra = 0; - -# Error covariance matrix -errorCovariance = numeric(); - -# Distance for local gene-SNP pairs -cisDist = 1e6; - -#%%%%%%%%%%%%%%%%%%%%%%%% -#%%%%%%%%%%%%%%%%%%%%%%%% Read files -#%%%%%%%%%%%%%%%%%%%%%%%% - -## Raw gene expression data with gene position -expression_mat=as.matrix(data.frame(data.table::fread(input = expression_file_name, header = T),row.names = 1, check.names = F)) -genepos = read.table(file = gene_location_file_name, header = TRUE, stringsAsFactors = FALSE)[,1:4]; - -## Genotype data with snp position -snps = MatrixEQTL::SlicedData$new(); -snps$fileDelimiter = "\t"; # the TAB character -snps$fileOmitCharacters = "NA"; # denote missing values; -snps$fileSkipRows = 1; # one row of column labels -snps$fileSkipColumns = 1; # one column of row labels -snps$fileSliceSize = 2000; # read file in slices of 2,000 rows -snps$LoadFile(SNP_file_name); - -snpspos = read.table(file = snps_location_file_name, header = TRUE, stringsAsFactors = FALSE)[,1:3]; - -print(dim(snps)) -print(snps$columnNames) - -## Make sure order of individuals is the same in gene expression and genotype matrices -snps$ColumnSubsample(which(snps$columnNames %in% colnames(expression_mat))) # Match SNP and expression individuals -expression_mat=expression_mat[,snps$columnNames] -gene = SlicedData$new(); -gene$CreateFromMatrix(expression_mat) - -#%%%%%%%%%%%%%%%%%%%%%%%% -#%%%%%%%%%%%%%%%%%%%%%%%% Run the analysis -#%%%%%%%%%%%%%%%%%%%%%%%% - -me = Matrix_eQTL_main( - snps = snps, - gene = gene, - cvrt = SlicedData$new(), - output_file_name = output_file_name_tra, - pvOutputThreshold = pvOutputThreshold_tra, - useModel = useModel, - errorCovariance = errorCovariance, - verbose = TRUE, - output_file_name.cis = output_file_name_cis, - pvOutputThreshold.cis = pvOutputThreshold_cis, - snpspos = snpspos, - genepos = genepos, - cisDist = cisDist, - pvalue.hist = FALSE, - min.pv.by.genesnp = FALSE, - noFDRsaveMemory = FALSE); - -## Results: -cat('Analysis finished in: ', me$time.in.sec, ' seconds', '\n') +eQTL_mapping_step = function( + SNP_file_name, + snps_location_file_name, + expression_file_name, + gene_location_file_name, + context, + shared_specific, + out_dir, + output_file_name_cis = file.path(out_dir, paste0(context, "_", shared_specific, ".cis_pairs.txt")), + output_file_name_tra = file.path(out_dir, paste0(context, "_", shared_specific, ".trans_pairs.txt")), + method = "MatrixEQTL", + use_model = modelLINEAR, + cis_dist = 1e6, + pv_threshold_cis = 1, + pv_threshold_tra = 0, + error_covariance = numeric() +) { + if (method == "MatrixEQTL") { + setDTthreads(1) + + expression_mat <- as.matrix(data.frame(fread(expression_file_name, header = TRUE), row.names = 1, check.names = FALSE)) + genepos <- data.table::fread(gene_location_file_name, sep = '\t', header = TRUE, data.table = FALSE) + names(genepos) <- tolower(names(genepos)) + + genepos <- genepos |> + dplyr::rename( + geneid = dplyr::coalesce(names(genepos)[grepl("gene", names(genepos))][1], "geneid"), + s1 = dplyr::coalesce(names(genepos)[grepl("start|s1", names(genepos))][1], "s1"), + s2 = dplyr::coalesce(names(genepos)[grepl("end|s2", names(genepos))][1], "s2") + ) |> + dplyr::select(geneid, chr, s1, s2) + + snps <- SlicedData$new() + snps$fileDelimiter <- "\t" + snps$fileOmitCharacters <- "NA" + snps$fileSkipRows <- 1 + snps$fileSkipColumns <- 1 + snps$fileSliceSize <- 2000 + snps$LoadFile(SNP_file_name) + + snpspos <- read.table(snps_location_file_name, header = TRUE)[, c("snpid", "chr", "pos")] + snps$ColumnSubsample(which(snps$columnNames %in% colnames(expression_mat))) + expression_mat <- expression_mat[, snps$columnNames] + + gene <- SlicedData$new() + gene$CreateFromMatrix(expression_mat) + + Matrix_eQTL_main( + snps = snps, + gene = gene, + cvrt = SlicedData$new(), + output_file_name = output_file_name_tra, + pvOutputThreshold = pv_threshold_tra, + useModel = use_model, + errorCovariance = error_covariance, + verbose = TRUE, + output_file_name.cis = output_file_name_cis, + pvOutputThreshold.cis = pv_threshold_cis, + snpspos = snpspos, + genepos = genepos, + cisDist = cis_dist, + pvalue.hist = FALSE, + min.pv.by.genesnp = FALSE, + noFDRsaveMemory = FALSE + ) + + } else if (method == "tensorQTL") { + SNP_file_name <- path.expand(SNP_file_name) + snps_location_file_name <- path.expand(snps_location_file_name) + gene_location_file_name <- path.expand(gene_location_file_name) + expression_file_name <- path.expand(expression_file_name) + out_dir <- path.expand(out_dir) + suppressWarnings({ + py_run_string(" +import os +import torch +import pandas as pd +import numpy as np +from tensorqtl import cis +import builtins + +os.environ['CUDA_VISIBLE_DEVICES'] = '' +torch.set_num_threads(1) + +def run_tensorqtl(phenotype_df, phenotype_pos_df, genotype_df, variant_df, prefix=''): + if 's1' in phenotype_pos_df.columns and 's2' in phenotype_pos_df.columns: + phenotype_pos_df.rename(columns={'s1': 'start', 's2': 'end'}, inplace=True) + if 'chr' in variant_df.columns: + variant_df.rename(columns={'chr': 'chrom'}, inplace=True) + cis.map_nominal(genotype_df, variant_df, phenotype_df, phenotype_pos_df, prefix=prefix) + +builtins.run_tensorqtl = run_tensorqtl +")}) + + expr <- read.table(expression_file_name, header = TRUE, sep = "\t", check.names = FALSE) + rownames(expr) <- expr[, 1] + expr <- expr[, -1, drop = FALSE] + + expr <- t(apply(expr, 1, function(x) { + x[is.na(x)] <- mean(x, na.rm = TRUE) + x + })) + + write.table( + expr, + file = expression_file_name, + sep = "\t", + row.names = TRUE, + col.names = NA, + quote = FALSE + ) + + phenotype_df <- suppressWarnings(read.table(expression_file_name, header = TRUE, row.names = 1)) + phenotype_pos_df <- suppressWarnings(read.table(gene_location_file_name, header = TRUE, row.names = 1)) + genotype_df <- suppressWarnings(read.table(SNP_file_name, header = TRUE, row.names = 1)) + variant_df <- suppressWarnings(read.table(snps_location_file_name, header = TRUE, row.names = 1)) + + py$phenotype_df <- phenotype_df + py$phenotype_pos_df <- phenotype_pos_df + py$genotype_df <- genotype_df + py$variant_df <- variant_df + tensorqtl_prefix <- file.path(out_dir, paste0(context, "_", shared_specific)) + py$prefix <- tensorqtl_prefix + + tryCatch({ + py$run_tensorqtl( + py$phenotype_df, + py$phenotype_pos_df, + py$genotype_df, + py$variant_df, + prefix = py$prefix + ) + }, error = function(e) { + message("TensorQTL failed for context: ", context) + message(e$message) + }) + + parquet_path <- paste0(py$prefix, ".cis_qtl_pairs.chr1.parquet") + if (!file.exists(parquet_path)) stop("TensorQTL failed: .parquet output not found") + + pd <- import("pandas", convert = FALSE) + pq <- pd$read_parquet(parquet_path) + py$pq <- pq + + pq$rename(columns = dict( + phenotype_id = "gene", + variant_id = "SNP", + slope = "beta", + pval_nominal = "p-value" + ), inplace = TRUE) + + pq_df <- py_eval("pq.loc[:, ['SNP', 'gene', 'beta', 'p-value']]", convert = TRUE) + pq_df$FDR <- p.adjust(pq_df$`p-value`, method = "BH") + pq_df <- pq_df[order(pq_df$`p-value`), ] + + write.table(pq_df, file = output_file_name_cis, sep = '\t', row.names = FALSE, quote = FALSE) + } else { + stop("Invalid method. Use 'MatrixEQTL' or 'tensorQTL'") + } } diff --git a/R/functions.R b/R/functions.R index ce50fdc..8dde035 100644 --- a/R/functions.R +++ b/R/functions.R @@ -16,6 +16,9 @@ decompose=function(X,design){ return(list(Xw=Xw,Xb=X.mean.indiv)) } +#' @importFrom data.table fread +#' @importFrom dplyr select group_by mutate distinct +#' @importFrom magrittr %>% #' @export get_eGenes_multi_tissue_mod = function (m_eqtl_out_dir, treeQTL_dir, tissue_names, level1 = 0.05, level2 = 0.05, level3 = 0.05, exp_suffix, four_level = F, shared_n_tests_per_gene) { pattern=paste0(exp_suffix,".all_pairs.txt") @@ -224,6 +227,9 @@ get_eGenes_multi_tissue_mod = function (m_eqtl_out_dir, treeQTL_dir, tissue_name # eGene_xT_sel #} +#' @importFrom data.table fread +#' @importFrom dplyr mutate select +#' @importFrom magrittr %>% #' @export get_pvals_and_fam_p_mod = function(genes_by_tissue, snps_by_tissue, m_eqtl_out_dir, tissue_names, exp_suffix) { pattern=paste0(exp_suffix,".all_pairs.txt") diff --git a/R/simulate_data.R b/R/simulate_data.R index 654a75c..050bd73 100644 --- a/R/simulate_data.R +++ b/R/simulate_data.R @@ -1,137 +1,87 @@ -#' Simulations +#' Simulate Gene Expression and Genotype Data #' -#' Function to generate simulated SNP and expression data +#' Generates simulated SNP genotypes, gene expression data across multiple contexts (e.g., tissues or cell types) #' -#' @param data_dir - full filepath of the directory where simulated data will be written to -#' @param N - number of individuals simulated (must be a whole number) -#' @param n_genes - number of genes simulated (must be a whole number) -#' @param n_snps_per_gene - number of cis-SNPs per gene (must be a whole number) -#' @param n_contexts - number of contexts to simulate (e.g. tissues or cell types) (must be a whole number) -#' @param maf - minor allele frequency for genotypes -#' @param w_corr - error covariance between contexts -#' @param v_e - error variance in each context (maybe take this out and set it to 1) -#' @param missing - decimal value signifying percentage of missingness in simulated expression data (e.g. parameter value of 0.3 would indicate 30% missing values in outputted expression matrix) -#' @param seed - can set seed for reproducibility -#' @param sim_scenario - must be either "null" or "single_context_het" to signify simulations under the null case (no genetic effects in any context) or the case of single context heterogeneity (one context drives the genetic effect heterogeneity) -#' @return outputs an expression matrix file, a genotype matrix file, a SNP location file, and a gene location file all in the format needed for FastGxC's decomposition step and then subsequent eQTL mapping step with Matrix eQTL. +#' @param data_dir String. Full file path to the directory where simulated data will be saved. +#' @param N Integer. Number of individuals to simulate. +#' @param n_genes Integer. Number of genes to simulate. +#' @param n_snps_per_gene Integer. Number of cis-SNPs per gene. +#' @param n_contexts Integer. Number of contexts to simulate (e.g., tissues or cell types). +#' @param maf Numeric. Minor allele frequency for simulated SNPs (value between 0 and 1). +#' @param w_corr Numeric. Off-diagonal values for the error covariance matrix (correlation between contexts). +#' @param v_e Numeric. Error variance for expression in each context. +#' @param missing Numeric. Proportion of missing values in the simulated expression data (e.g., 0.3 = 30% missing). +#' @param hsq Numeric vector of length `n_contexts`. Heritability values in each context (defaults to 0 = null model). +#' @param mus Numeric vector of length `n_contexts`. Mean expression in each context. +#' +#' @return Writes the following files to `data_dir`: +#' \itemize{ +#' \item \code{SNPs.txt} – genotype matrix (individuals × SNPs) +#' \item \code{snpsloc.txt} – SNP location file for cis-window definitions +#' \item \code{geneloc.txt} – gene location file for cis-window definitions +#' \item \code{simulated_expression.txt} – simulated gene expression matrix with optional missingness +#' } #' #' @export -simulate_data = function(data_dir, N = 300, n_genes=100, n_snps_per_gene=1000, n_contexts=10, maf=0.2, w_corr=0.2, v_e=1, missing = 0, seed = 1, sim_scenario = "single_context_het"){ - -if(!dir.exists(data_dir)) dir.create(data_dir) - -mus = c(rep(0, n_contexts-1),0) # gene expression mean in each context for genotype = 0 - -# Error variance-covariance matrix -sigma = matrix(w_corr,nrow=n_contexts,ncol=n_contexts) # Error variance-covariance matrix -diag(sigma) = v_e - -# Simulate genotypes -genos = sapply(X = 1:(n_snps_per_gene*n_genes), FUN = function(X) rbinom(N, 2, maf)) -colnames(genos) = paste0("snp",1:(n_snps_per_gene*n_genes)) -rownames(genos) = paste0("ind",1:N) -write.table(x = t(data.table(genos, keep.rownames = T) %>% rename(snpid=rn)), file = paste0(data_dir,sim_scenario, "_SNPs.txt"), quote = F, sep = '\t', row.names = T, col.names = F) - -print("Finished simulating and saving genotypes") - -# Save SNP location file -# Data frame with 3 initial columns (name, chrom, and position) that match standard SNP map file, followed by 1 column for each context with a 0/1 indicator of whether the given SNP passed QC in that tissue. -snp_loc=data.frame(snpid=colnames(genos), chr="chr1",pos=rep(x = seq(1,n_genes*1e9, by = 1e9), each=n_snps_per_gene), - matrix(data = 1, nrow = ncol(genos), ncol = n_contexts, dimnames = list(NULL, paste0("context",1:n_contexts)))) -write.table(x = snp_loc, file = paste0(data_dir,sim_scenario,"_snpsloc.txt"), quote = F, sep = '\t', row.names = F, col.names = T) - -print("Finished saving snp location file") - -# Save gene location file -# data frame with 4 initial columns (name, chrom, and start and end position) that match standard gene map file, followed by 1 column for each context with a 0/1 indicator of whether the given gene passed QC in that context -gene_loc=data.frame(geneid=paste0("gene",1:n_genes), - chr="chr1", - s1=seq(1,n_genes*1e9, by = 1e9), - s2=seq(1,n_genes*1e9, by = 1e9)+ 1000, - matrix(data = 1, nrow = n_genes, ncol = n_contexts, dimnames = list(NULL, paste0("context",1:n_contexts)))) -write.table(x = gene_loc, file = paste0(data_dir,sim_scenario,"_geneloc.txt"), quote = F, sep = '\t', row.names = F, col.names = T) - -print("Finished saving gene location file") - -# Use only one snp per gene to generate expression -genos_with_effect = genos[,seq(from = 1, to = (n_snps_per_gene*n_genes), by = n_snps_per_gene)] - -# Generate expression matrix -exp_mat=expand.grid(iid=paste0("ind",1:N),context=paste0("context",1:n_contexts)) - -which_context=rep_len(x = 1:n_contexts, length.out = n_genes) - -if(sim_scenario == "null"){ -which_context=rep_len(x = 1:n_contexts, length.out = n_genes) - -for(i in 1:n_genes){ - - # Genotypic effect in each context for assumed heritability - hsq=rep(NA, n_contexts) # expression heritability, i.e. proportion of gene expression variance explained by genetics, in each context - hsq[which_context[i]]= 0 - hsq[-which_context[i]]= rep(0, n_contexts-1) - betas=sqrt((hsq*v_e)/((1-hsq)*var(genos_with_effect[,i]))) - - # expression of gene per context without noise - Y = matrix(0,nrow=N,ncol=n_contexts, dimnames = list(paste0("ind",1:N), paste0("context",1:n_contexts))) - for (j in 1:n_contexts) Y[,j] = mus[j] + genos_with_effect[,i]*betas[j] - - # get noise per individual - for(j in 1:N) Y[j,] = Y[j,] + rmvnorm(1, rep(0,n_contexts), sigma) - - data_mat=melt(data = data.table(Y,keep.rownames = T), id.vars = "rn") - colnames(data_mat) = c("iid", "context", paste0("gene",i)) - exp_mat = merge(x = exp_mat, y = data_mat) - -} -} - -if(sim_scenario == "single_context_het"){ -which_context=rep_len(x = 1:n_contexts, length.out = n_genes) - -for(i in 1:n_genes){ - - # Genotypic effect in each context for assumed heritability - hsq=rep(NA, n_contexts) # expression heritability, i.e. proportion of gene expression variance explained by genetics, in each context - hsq[which_context[i]]= 0.2 - hsq[-which_context[i]]= rep(0.1, n_contexts-1) - betas=sqrt((hsq*v_e)/((1-hsq)*var(genos_with_effect[,i]))) - - # expression of gene per context without noise - Y = matrix(0,nrow=N,ncol=n_contexts, dimnames = list(paste0("ind",1:N), paste0("context",1:n_contexts))) - for (j in 1:n_contexts) Y[,j] = mus[j] + genos_with_effect[,i]*betas[j] - - # get noise per individual - for(j in 1:N) Y[j,] = Y[j,] + rmvnorm(1, rep(0,n_contexts), sigma) - - data_mat=melt(data = data.table(Y,keep.rownames = T), id.vars = "rn") - colnames(data_mat) = c("iid", "context", paste0("gene",i)) - exp_mat = merge(x = exp_mat, y = data_mat) - -} -} - -rownames(exp_mat) = paste(exp_mat$iid,exp_mat$context, sep = " - ") -exp_mat = cbind(data.frame(design = paste(exp_mat$iid,exp_mat$context, sep = " - ")), exp_mat) - -## add missing values -total_elements = prod(dim(exp_mat[,-c(1,2,3)])) -missing_elements = floor(missing * total_elements) - -missing_indices = sample(1:total_elements, missing_elements) - -# Convert the dataframe to a vector, set the selected elements to NA, and then convert it back to a dataframe -identifiers = exp_mat[,c(1,2,3)] -exp_mat_missing = as.vector(as.matrix(exp_mat[,-c(1,2,3)])) -exp_mat_missing[missing_indices] = NA # Set selected elements to NA - -# Convert the vector back to a dataframe with the original dimensions -exp_mat_missing = matrix(exp_mat_missing, nrow = nrow(exp_mat), ncol = ncol(exp_mat)-3) -exp_mat_missing = as.data.frame(exp_mat_missing) -exp_mat_missing = cbind(identifiers, exp_mat_missing) -colnames(exp_mat_missing) = colnames(exp_mat) - -write.table(x = exp_mat_missing[,-c(2:3)], file = paste0(data_dir,sim_scenario, "_simulated_expression.txt"), quote = F, sep = '\t', row.names = F, col.names = T) - -print("Finished simulating and saving expression file") +simulate_data = function(data_dir, N = 300, n_genes = 100, n_snps_per_gene = 1000, + n_contexts, maf = 0.2, w_corr = 0.2, + v_e = 1, missing = 0, + hsq = rep(0.2, n_contexts), + mus = rep(0, n_contexts)) { + if (!dir.exists(data_dir)) dir.create(data_dir) + sigma = matrix(w_corr, nrow = n_contexts, ncol = n_contexts) + diag(sigma) = v_e + genos = sapply(X = 1:(n_snps_per_gene * n_genes), FUN = function(X) rbinom(N, 2, maf)) + colnames(genos) = paste0("snp", 1:(n_snps_per_gene * n_genes)) + rownames(genos) = paste0("ind", 1:N) + write.table(x = t(data.table(genos, keep.rownames = T) %>% + rename(snpid = rn)), file = paste0(data_dir, "SNPs.txt"), quote = F, sep = "\t", row.names = T, col.names = F) + print("Finished simulating and saving genotypes") + snp_loc = data.frame(snpid = colnames(genos), chr = "chr1", + pos = rep(x = seq(1, n_genes * 1e+09, by = 1e+09), each = n_snps_per_gene), + matrix(data = 1, nrow = ncol(genos), ncol = n_contexts, + dimnames = list(NULL, paste0("context", 1:n_contexts)))) + write.table(x = snp_loc, file = paste0(data_dir,"snpsloc.txt"), quote = F, sep = "\t", row.names = F, + col.names = T) + print("Finished saving snp location file") + gene_loc = data.frame(geneid = paste0("gene", 1:n_genes), + chr = "chr1", s1 = seq(1, n_genes * 1e+09, by = 1e+09), + s2 = seq(1, n_genes * 1e+09, by = 1e+09) + 1000, matrix(data = 1, + nrow = n_genes, + ncol = n_contexts, + dimnames = list(NULL, + paste0("context", 1:n_contexts)))) + write.table(x = gene_loc, file = paste0(data_dir, "geneloc.txt"), quote = F, sep = "\t", row.names = F, + col.names = T) + print("Finished saving gene location file") + genos_with_effect = genos[, seq(from = 1, to = (n_snps_per_gene * + n_genes), by = n_snps_per_gene)] + exp_mat = expand.grid(iid = paste0("ind", 1:N), context = paste0("context", + 1:n_contexts)) + which_context = rep_len(x = 1:n_contexts, length.out = n_genes) + for (i in 1:n_genes) { + betas = sqrt((hsq * v_e)/((1 - hsq) * var(genos_with_effect[, i]))) + Y = matrix(0, nrow = N, ncol = n_contexts, dimnames = list(paste0("ind", 1:N), paste0("context", 1:n_contexts))) + for (j in 1:n_contexts) Y[, j] = mus[j] + genos_with_effect[, i] * betas[j] + for (j in 1:N) Y[j, ] = Y[j, ] + rmvnorm(1, rep(0, n_contexts), sigma) + data_mat = melt(data = data.table(Y, keep.rownames = T), + id.vars = "rn") + colnames(data_mat) = c("iid", "context", paste0("gene", i)) + exp_mat = merge(x = exp_mat, y = data_mat) + } + rownames(exp_mat) = paste(exp_mat$iid, exp_mat$context, sep = " - ") + exp_mat = cbind(data.frame(design = paste(exp_mat$iid, exp_mat$context, sep = " - ")), exp_mat) + total_elements = prod(dim(exp_mat[, -c(1:3)])) + missing_elements = floor(missing * total_elements) + missing_indices = sample(1:total_elements, missing_elements) + identifiers = exp_mat[, 1:3] + exp_mat_missing = as.vector(as.matrix(exp_mat[, -c(1:3)])) + exp_mat_missing[missing_indices] = NA + exp_mat_missing = matrix(exp_mat_missing, nrow = nrow(exp_mat), ncol = ncol(exp_mat) - 3) + exp_mat_missing = as.data.frame(exp_mat_missing) + exp_mat_missing = cbind(identifiers, exp_mat_missing) + colnames(exp_mat_missing) = colnames(exp_mat) + write.table(x = exp_mat_missing[, -c(2:3)], file = paste0(data_dir, "expression.txt"), quote = F, + sep = "\t", row.names = F, col.names = T) + print("Finished simulating and saving expression file") } diff --git a/R/treeBH.R b/R/treeBH.R new file mode 100644 index 0000000..6ec560b --- /dev/null +++ b/R/treeBH.R @@ -0,0 +1,326 @@ +#' @useDynLib FastGxC, .registration = TRUE +#' @importFrom Rcpp sourceCpp +NULL + +#' Converting eQTL Mapping Output to TreeBH Input +#' +#' @param data_dir - full filepath of the directory where eQTL output files are stored. This function assumes that files are named as outputted by FastGxC's eQTL mapping function +#' @param shared_file - full filepath to the shared cis_pairs file from the output of the mapping function +#' @param context_names - vector of all context names in the format c("tissue1", "tissue2", ..., etc.) +#' @param out_dir - full filepath of the output directory where the input of TreeBH can be stored +#' @return A data.frame with columns: +#' - gene_name : gene name +#' - snp_name : snp name +#' - context_name : context name (or "shared") +#' - gene_snp : concatenated "gene_SNP" +#' - component_tag : "gene_SNP_specific" or "gene_SNP_shared" +#' - context_tag : "gene_SNP_component_context" +#' - p_value : numeric p-value +#' @export +to_TreeBH_input <- function(data_dir, shared_file, context_names, out_dir) { + # Prepare list to hold each context-specific + shared df + num_contexts <- length(context_names) + df_list <- vector("list", num_contexts + 1) + + # Inline file-reading and p-value/column validation per context + for (i in seq_len(num_contexts)) { + context <- context_names[i] + pattern <- paste0(context, "_specific.cis_pairs.txt$") + files <- list.files(path = data_dir, pattern = pattern, full.names = TRUE) + if (length(files) == 0) stop("No file matching pattern ", pattern) + df <- readr::read_tsv(files[[1]], col_types = readr::cols()) + + # Ensure columns exist and types are correct + if (!"p-value" %in% names(df)) stop("Missing 'p-value' in ", files[[1]]) + p <- df[["p-value"]] + if (!is.numeric(p) || any(is.na(p))) stop("`p-value` must be numeric without NAs") + if (any(p < 0 | p > 1)) stop("`p-value` entries must lie between 0 and 1") + + if (!"gene" %in% names(df)) stop("Missing 'gene' in ", files[[1]]) + if (!is.character(df[["gene"]])) stop("`gene` must be character in ", files[[1]]) + + if (!"SNP" %in% names(df)) stop("Missing 'SNP' in ", files[[1]]) + if (!is.character(df[["SNP"]])) stop("`SNP` must be character in ", files[[1]]) + + df_list[[i]] <- df + } + + # Shared file: same inline checks + df_shared <- readr::read_tsv(shared_file, col_types = readr::cols()) + for (col in c("p-value","gene","SNP")) { + if (!col %in% names(df_shared)) stop("Missing '", col, "' in shared_file") + col_data <- df_shared[[col]] + if (col == "p-value") { + if (!is.numeric(col_data) || any(is.na(col_data))) stop("`p-value` invalid in shared_file") + if (any(col_data < 0 | col_data > 1)) stop("`p-value` entries must lie between 0 and 1 in shared_file") + } else { + if (!is.character(col_data)) stop("`", col, "` must be character in shared_file") + } + } + df_list[[num_contexts + 1]] <- df_shared + + # Build grouping labels + rows_per_df <- vapply(df_list, nrow, integer(1)) + context_index <- rep(seq_along(df_list), times = rows_per_df) + context_label <- c(context_names, "shared")[context_index] + component_label <- ifelse(context_label == "shared", "shared", "specific") + + # Extract columns across all data.frames + p_values <- unlist(lapply(df_list, `[[`, "p-value"), use.names = FALSE) + gene_names <- unlist(lapply(df_list, `[[`, "gene"), use.names = FALSE) + snp_names <- unlist(lapply(df_list, `[[`, "SNP"), use.names = FALSE) + + # Build hierarchical identifiers + gene_snp_id <- paste0(gene_names, "_", snp_names) + component_id <- paste0(gene_snp_id, "_", component_label) + context_id <- paste0(component_id, "_", context_label) + + # Assemble final data.frame + out_df <- data.frame( + gene_name = gene_names, + snp_name = snp_names, + context_name = context_label, + gene_snp = gene_snp_id, + component_tag = component_id, + context_tag = context_id, + p_value = p_values, + stringsAsFactors = FALSE, + check.names = FALSE + ) + + if (!dir.exists(out_dir)) dir.create(out_dir, recursive = TRUE) + + # Write file + write.table( + out_df, + file = file.path(out_dir, "treeBH_input.txt"), + sep = "\t", + row.names = FALSE, + col.names = TRUE, + quote = FALSE + ) +} + +#' Multiple Testing Correction with TreeBH +#' Function to adjust for hierarchical multiple testing correction using TreeBH. +#' @param matrix - output of to_TreeBH_Input, matrix with the groups and pvalues +#' @param fdr_thres - value between 0 and 1 that signifies what FDR threshold for multiple testing correction. The same value will be used across all hierarchical levels. +#' @param out_dir - full filepath of the output directory where the output of TreeBH can be stored +#' @param method - character string specifying which implementation to use: +#' - "original" (default): uses the TreeBH package implementation +#' - "datatable": uses data.table optimized R implementation +#' - "cpp": uses Rcpp C++ implementation for maximum performance +#' @param test - character string specifying the aggregation method for p-values: +#' - "simes" (default): uses Simes' method +#' - "fisher": uses Fisher's method +#' @return A matrix with identification columns and output columns: +#' - gene : gene name +#' - snp : snp name +#' - context : context name or "specific" +#' - treeBH Output Columns : eGene, eQTL, Component, Context Specific +#' @export +treeBH_step <- function(matrix, fdr_thres, out_dir, method = "original", test = "simes") { + + # Validate method parameter + method <- match.arg(method, c("original", "datatable", "cpp")) + + # Extract components from input matrix + id_mat <- matrix[, 1:3] + groups <- matrix[, c(1, 4, 5, 6)] + p_values <- matrix[, 7] |> as.vector() |> as.numeric() + + num_levels <- 4 + + # Call appropriate implementation based on method parameter + treeBH_results <- switch(method, + original = { + # Original TreeBH package implementation + TreeBH::get_TreeBH_selections( + pvals = p_values, + groups = groups, + q = rep(fdr_thres, num_levels) + ) + }, + datatable = { + # Optimized R implementation using data.table + get_TreeBH_selections_datatable( + pvals = p_values, + groups = groups, + q = rep(fdr_thres, num_levels), + test = test + ) + }, + cpp = { + # C++ implementation using Rcpp + get_TreeBH_selections_cpp( + pvals = p_values, + groups = groups, + q = rep(fdr_thres, num_levels), + test = test + ) + } + ) + + colnames(treeBH_results) <- c("eGene", "eQTL", "Component", "ContextSpecific") + + results <- cbind(id_mat, treeBH_results) + + write.table( + results, + file = file.path(out_dir, "treeBH_output.txt"), + sep = "\t", + row.names = FALSE, + col.names = TRUE, + quote = FALSE + ) +} + +#' Optimized TreeBH Implementation using data.table +#' @keywords internal +get_TreeBH_selections_datatable <- function(pvals, groups, q, test = "simes") { + # Checks + N <- length(pvals) + L <- ncol(groups) + if (length(q) != L) { + stop("Must specify target q for each level of hierarchy") + } + if (min(q) < 0 || max(q) > 1) { + stop("Target q must be in the range 0 - 1") + } + if (min(pvals, na.rm = TRUE) < 0 || max(pvals, na.rm = TRUE) > 1) { + stop("P-values must be in the range 0 - 1") + } + if (length(pvals) != nrow(groups)) { + stop("Dimension mismatch between pvals and groups") + } + + # Convert to data.table for faster group operations + dt_groups <- data.table::as.data.table(as.data.frame(groups)) + data.table::setnames(dt_groups, paste0("Level", 1:L)) + + # Step 1: Check hierarchy nesting + for (cur_level in 2:L) { + cur_col <- paste0("Level", cur_level) + parent_col <- paste0("Level", cur_level - 1) + + check <- dt_groups[, .(parent_count = data.table::uniqueN(get(parent_col))), by = get(cur_col)] + + if (any(check$parent_count != 1)) { + stop("Groups must be nested within hierarchy") + } + } + + if (data.table::uniqueN(groups[, L]) != nrow(groups)) { + stop("Assumption is that lowest level in tree corresponds to individual hypotheses") + } + + # Initialization + sel <- matrix(0, nrow = N, ncol = L) + if (length(test) == 1) { + test <- rep(test, L - 1) + } + q_adj <- matrix(NA, nrow = N, ncol = L) + q_adj[, 1] <- q[1] + group_pvals <- matrix(NA, nrow = N, ncol = L) + group_pvals[, L] <- pvals + + # Step 3: Aggregate group-level p-values + for (cur_level in (L - 1):1) { + cur_groups <- unique(groups[, cur_level]) + for (cur_group in cur_groups) { + cur_inds <- which(groups[, cur_level] == cur_group) + + # Gets the children of all pvalues in the current group + child_pvals <- group_pvals[cur_inds, cur_level + 1] + agg_p <- switch(test[cur_level], + simes = get_simes_p(child_pvals), + fisher = get_fisher_p(child_pvals), + stop("Options for parameter test are 'simes' and 'fisher'")) + group_pvals[cur_inds[1], cur_level] <- agg_p + } + } + + for (cur_level in 1:L) { + if (cur_level > 1 && sum(sel[, cur_level - 1]) == 0) { + break + } + if (cur_level == 1) { + sel_prev <- 1 + } + else { + sel_prev <- which(sel[, cur_level - 1] == 1) + } + for (parent_sel in sel_prev) { + if (cur_level == 1) { + parent_group_ind <- 1 + child_inds <- which(!is.na(group_pvals[, cur_level])) + } + else { + parent_group_num <- groups[parent_sel, cur_level - 1] + parent_group_ind <- min(which(groups[, cur_level - 1] == parent_group_num)) + child_inds <- which(groups[, cur_level - 1] == parent_group_num & + !is.na(group_pvals[, cur_level])) + } + if (length(child_inds) > 1) { + sel_ind_within_group <- which(qvalue::qvalue(group_pvals[child_inds, cur_level], + lambda = 0)$qvalue <= + q_adj[parent_group_ind, cur_level]) + } + else { + sel_ind_within_group <- which(group_pvals[child_inds, cur_level] <= + q_adj[parent_group_ind, cur_level]) + } + sel[child_inds[sel_ind_within_group], cur_level] <- 1 + if (cur_level < L) { + R_parent_sel <- length(sel_ind_within_group) + n_parent_sel <- length(child_inds) + q_adj[child_inds[sel_ind_within_group], cur_level + 1] <- + q[cur_level + 1] * q_adj[parent_sel, cur_level]/q[cur_level] * + R_parent_sel/n_parent_sel + } + } + } + sel +} + +#' C++ TreeBH Implementation using Rcpp +#' @keywords internal +get_TreeBH_selections_cpp <- function(pvals, groups, q, test = "simes") { + N <- length(pvals) + L <- ncol(groups) + if (length(q) != L) { + stop("Must specify target q for each level of hierarchy") + } + if (min(q) < 0 || max(q) > 1) { + stop("Target q must be in the range 0 - 1") + } + if (min(pvals, na.rm = TRUE) < 0 || max(pvals, na.rm = TRUE) > 1) { + stop("P-values must be in the range 0 - 1") + } + if (length(pvals) != nrow(groups)) { + stop("Dimension mismatch between pvals and groups") + } + + # Convert groups to character matrix if it's not already + # This ensures compatibility with C++ CharacterMatrix type + if (!is.matrix(groups)) { + groups <- as.matrix(groups) + } + if (mode(groups) != "character") { + mode(groups) <- "character" + } + + # Check hierarchy structure using C++ function + error <- checkNested(groups = groups, L = L) + if(error != "") { + stop(error) + } + + # Get group p-values using C++ function + group_pvals <- getGroupPvalues(pvals, groups, N, L, test) + + # Get selections using C++ function + selections <- getGroupSelections(group_pvals, groups, q, N, L) + + return(selections) +} \ No newline at end of file diff --git a/R/treeBH_helpers.R b/R/treeBH_helpers.R new file mode 100644 index 0000000..d54454d --- /dev/null +++ b/R/treeBH_helpers.R @@ -0,0 +1,47 @@ +# Helper functions for TreeBH +# These functions are used internally by the different TreeBH implementations + +#' Calculate Simes p-value +#' @param pvals Numeric vector of p-values +#' @return Simes combined p-value +#' @noRd +get_simes_p <- function(pvals) { + # Remove NA values + clean_pvals <- pvals[!is.na(pvals)] + + # If no valid p-values, return NA + if (length(clean_pvals) == 0) { + return(NA_real_) + } + + # Sort p-values + sorted_pvals <- sort(clean_pvals) + n <- length(sorted_pvals) + + # Calculate Simes p-value: min_i( n * p_(i) / i ) + adjusted <- sorted_pvals * n / seq_along(sorted_pvals) + min_val <- min(adjusted) + + return(min_val) +} + +#' Calculate Fisher's combined p-value +#' @param pvals Numeric vector of p-values +#' @return Fisher's combined p-value +#' @noRd +get_fisher_p <- function(pvals) { + # Remove NA values + clean_pvals <- pvals[!is.na(pvals)] + + # If no valid p-values, return NA + if (length(clean_pvals) == 0) { + return(NA_real_) + } + + # Calculate Fisher's statistic: -2 * sum(log(p)) + statistic <- -2 * sum(log(clean_pvals)) + + # Return p-value from chi-squared distribution with 2k degrees of freedom + return(pchisq(statistic, df = 2 * length(clean_pvals), lower.tail = FALSE)) +} + diff --git a/R/treeQTL.R b/R/treeQTL.R index a69e579..3330ad2 100644 --- a/R/treeQTL.R +++ b/R/treeQTL.R @@ -6,85 +6,274 @@ #' @param snps_location_file_name - full filepath of the snpsloc file used in the eQTL mapping step #' @param gene_location_file_name - full filepath of the geneloc file used in the eQTL mapping step #' @param out_dir - full filepath of the output directory that FDR adjusted eQTLs should be written out to. +#' @param cisDist - numeric value of defined cis window. Note: this should match the cisDist set in the eQTL mapping step. #' @param context_names - vector of all context names in the format c("tissue1", "tissue2", ..., etc.) #' @param fdr_thresh - value between 0 and 1 that signifies what FDR threshold for multiple testing correction. The same value will be used across all hierarchical levels. #' @param four_level - boolean value (T or F) that signifies whether to use the 4-level hierarchy (set this parameter to R and test for a global eQTL across shared and specific components) or a 3-level hierarchy (this parameter is default set to F to test for shared vs specific eQTLs) +#' @param qtl_type - string value "cis" or "trans" denoting the type of eQTL mapped. Default is set to "cis" +#' @param treeBH_method - character string specifying which TreeBH implementation to use when four_level = TRUE. Options: "original" (TreeBH package), "datatable" (optimized R), "cpp" (fast C++ implementation, default). Ignored when four_level = FALSE. +#' @param treeBH_test - character string specifying the p-value aggregation method for TreeBH when four_level = TRUE. Options: "simes" (Simes' method, default), "fisher" (Fisher's method). Ignored when four_level = FALSE. #' @return outputs one file of specific eGenes across all contexts and one file of shared eGenes. Outputs an eAssociation file for each context and one for shared eQTLs with snp-gene pairs and FDR adjusted p-values. #' +#' @importFrom data.table fread getDTthreads setDTthreads +#' @importFrom dplyr select group_by mutate distinct +#' @importFrom magrittr %>% #' @export -treeQTL_step = function(data_dir, snps_location_file_name, gene_location_file_name, context_names, out_dir, fdr_thresh = 0.05, four_level = F){ +treeQTL_step = function(data_dir, snps_location_file_name, gene_location_file_name, context_names, out_dir, fdr_thresh = 0.05, four_level = FALSE, cisDist = 1e6, qtl_type = "cis", treeBH_method = "cpp", treeBH_test = "simes"){ -# use a single thread -print(paste0("data.table getDTthreads(): ",getDTthreads())) -setDTthreads(1) -print(paste0("after setting as 1 thread; data.table getDTthreads(): ",getDTthreads())) + print(paste0("data.table getDTthreads(): ", getDTthreads())) + setDTthreads(1) + print(paste0("after setting as 1 thread; data.table getDTthreads(): ", getDTthreads())) + + options(warn=1) + + level1 = fdr_thresh + level2 = fdr_thresh + level3 = fdr_thresh + + # Distance for local gene-SNP pairs + nearby = TRUE + if(qtl_type != "cis"){ + nearby = FALSE + print(paste("Performing multiple testing adjustment for FastGxC trans-eQTLs.")) + }else{ + print(paste("Performing multiple testing adjustment for FastGxC cis-eQTLs.")) + } + + snpspos = fread(file = snps_location_file_name); + genepos = fread(file = gene_location_file_name); + + ## get context names + if (is.vector(context_names)) { + print(paste("Input for context names is a valid vector.")) + }else{ + stop(print(paste0("No valid input for context names."))) + } + # Use treeQTL to perform hierarchical FDR and get specific_eGenes, i.e. genes with at least one context-specific eQTL, and shared_eGenes, i.e. genes with at least one context-shared eQTL + + + names(genepos) <- tolower(names(genepos)) + names(snpspos) <- tolower(names(snpspos)) + + genepos <- genepos |> + dplyr::rename( + geneid = dplyr::coalesce(names(genepos)[grepl("gene", names(genepos))][1], "geneid"), + s1 = dplyr::coalesce(names(genepos)[grepl("start|s1", names(genepos))][1], "s1"), + s2 = dplyr::coalesce(names(genepos)[grepl("end|s2", names(genepos))][1], "s2") + ) + snpspos <- snpspos |> + dplyr::rename( + snpid = dplyr::coalesce(names(snpspos)[grepl("SNP|snp", names(snpspos))][1], "snpid"), + pos = dplyr::coalesce(names(snpspos)[grepl("position|pos", names(snpspos))][1], "pos") + ) -# Display all warnings as they occur -options(warn=1) - -# FDR thresholds -level1=fdr_thresh -level2=fdr_thresh -level3=fdr_thresh - -# Distance for local gene-SNP pairs -cisDist = 1e6; - -snpspos = fread(file = snps_location_file_name); -genepos = fread(file = gene_location_file_name); - -## get context names -if (is.vector(context_names)) { - print(paste("Input for context names is a valid vector.")) -}else{ - stop(print(paste0("No valid input for context names."))) -} -# Use treeQTL to perform hierarchical FDR and get specific_eGenes, i.e. genes with at least one context-specific eQTL, and shared_eGenes, i.e. genes with at least one context-shared eQTL - -shared_n_tests_per_gene = get_n_tests_per_gene(snp_map = snpspos[,1:3], gene_map = genepos[,1:4], - nearby = TRUE, dist = cisDist) -shared_n_tests_per_gene = data.frame(shared_n_tests_per_gene) -shared_n_tests_per_gene = data.frame(gene = rownames(shared_n_tests_per_gene), shared_n_tests_per_gene) -names(shared_n_tests_per_gene) = c("family", "n_tests") - - -if(four_level){ - specific_eGenes=get_eGenes_multi_tissue_mod( - m_eqtl_out_dir = data_dir, - treeQTL_dir = out_dir, - tissue_names = context_names, - level1 = level1, level2 = level2, level3 = level3, - exp_suffix = "specific", - four_level = four_level, - shared_n_tests_per_gene = shared_n_tests_per_gene) - write.table(x = specific_eGenes, file = paste0(out_dir,"specific_eGenes.txt"), quote = F, row.names = F, col.names = T, sep = '\t') -}else{ - specific_eGenes=get_eGenes_multi_tissue_mod( - m_eqtl_out_dir = data_dir, - treeQTL_dir = out_dir, - tissue_names = context_names, - level1 = level1, level2 = level2, level3 = level3, - exp_suffix = "specific", - four_level = four_level) - write.table(x = specific_eGenes, file = paste0(out_dir,"specific_eGenes.txt"), quote = F, row.names = F, col.names = T, sep = '\t') - - pattern=("shared.all_pairs.txt") - shared_eGenes = get_eGenes(n_tests_per_gene = shared_n_tests_per_gene, - m_eqtl_out = list.files(data_dir, pattern = pattern,full.names = T), - method = "BH", - level1 = level1, level2 = level2, - slice_size = 1e+05, - silent = FALSE) - - write.table(x = shared_eGenes, file = paste0(out_dir,"shared_eGenes.txt"), quote = F, row.names = F, col.names = T, sep = '\t') - - - eAssociations = get_eAssociations(eDiscoveries = shared_eGenes, n_tests = n_tests_per_gene, - m_eqtl_out = list.files(data_dir, pattern = pattern,full.names = T), - out_file = paste0(out_dir,"eAssoc_by_gene.context_shared.txt"), - by_snp = F, slice_size = 1e+05, - silent = FALSE) -} - - + expected_files <- paste0(context_names, "_specific.cis_pairs.txt") + missing_files <- setdiff(expected_files, list.files(data_dir)) + + if (length(missing_files) > 0) { + stop(paste0( + "Missing FastGxC output files for contexts: ", + paste(gsub("_specific.cis_pairs.txt", "", missing_files), collapse = ", ") + )) + } else { + message("All expected FastGxC eQTL mapping output files are present.") + } + + shared_n_tests_per_gene = get_n_tests_per_gene(snp_map = snpspos %>% dplyr::select(snpid, chr, pos), gene_map = genepos %>% dplyr::select(geneid, chr, s1, s2), + nearby = nearby, dist = cisDist) + shared_n_tests_per_gene = data.frame(shared_n_tests_per_gene) + + if(ncol(shared_n_tests_per_gene) < 2){ + shared_n_tests_per_gene = data.frame( + gene = rownames(shared_n_tests_per_gene), + shared_n_tests_per_gene + ) + } + names(shared_n_tests_per_gene) = c("family", "n_tests") # sanity check + + # get rid of this later + get_eGenes_multi_tissue_mod <- function(m_eqtl_out_dir, treeQTL_dir, tissue_names, level1 = level1, level2 = level2, level3 = level3, exp_suffix, four_level = FALSE, shared_n_tests_per_gene) { + pattern <- paste0(tissue_names, "_", exp_suffix, ".cis_pairs.txt") + m_eqtl_outfiles <- file.path(m_eqtl_out_dir, pattern) + print("Constructed MatrixEQTL output files:") + print(m_eqtl_outfiles) + + if (!all(file.exists(m_eqtl_outfiles))) { + stop("Some expected MatrixEQTL files do not exist.") + } + + n_tissue <- length(tissue_names) + for (i in 1:n_tissue) { + cur_tissue_name <- tissue_names[i] + m_eqtl_out <- m_eqtl_outfiles[i] + print(paste("Computing summary statistics for tissue", cur_tissue_name)) + n_SNPs_per_gene_this_tissue <- fread(m_eqtl_out) %>% select(gene) %>% group_by(gene) %>% mutate(n = n()) %>% distinct() + colnames(n_SNPs_per_gene_this_tissue) <- c("family", "n_tests") + gene_simes_cur_tissue <- get_eGenes(n_tests_per_gene = n_SNPs_per_gene_this_tissue, m_eqtl_out = m_eqtl_out, method = "BH", level1 = 1, level2 = 1, silent = TRUE) + gene_simes_cur_tissue <- merge(gene_simes_cur_tissue, n_SNPs_per_gene_this_tissue, by = "family", all = TRUE) + gene_simes_cur_tissue$fam_p[which(is.na(gene_simes_cur_tissue$fam_p))] <- 1 + gene_simes_cur_tissue$eGene <- as.integer(gene_simes_cur_tissue$fam_p <= level1) + + # Output eAssociations for specific context + context_eGenes <- gene_simes_cur_tissue[gene_simes_cur_tissue$eGene == 1, , drop = FALSE] + if (nrow(context_eGenes) > 0) { + colnames(context_eGenes)[1] <- "family" + context_eGenes$pval <- 0 + context_eGenes$n_sel <- 1 + dir.create(treeQTL_dir, recursive = TRUE, showWarnings = FALSE) + + eAssoc_file <- paste0(treeQTL_dir, "/eAssoc_by_gene.context_", cur_tissue_name, ".txt") + get_eAssociations( + eDiscoveries = context_eGenes, + n_tests = n_SNPs_per_gene_this_tissue, + m_eqtl_out = m_eqtl_out, + out_file = eAssoc_file, + by_snp = FALSE, + slice_size = 1e+05, + silent = FALSE + ) + } + + if (i == 1) { + eGene_pvals <- gene_simes_cur_tissue[, c("family", "fam_p")] + n_SNPs_per_gene_xT <- n_SNPs_per_gene_this_tissue + } else { + eGene_pvals <- merge(eGene_pvals, gene_simes_cur_tissue[, c("family", "fam_p")], by = "family", all = TRUE) + n_SNPs_per_gene_xT <- merge(n_SNPs_per_gene_xT, n_SNPs_per_gene_this_tissue, by = "family", all = TRUE) + } + names(eGene_pvals)[i + 1] <- cur_tissue_name + names(n_SNPs_per_gene_xT)[i + 1] <- cur_tissue_name + } + names(eGene_pvals)[1] <- "gene" + col_ind_pvals <- 2:(n_tissue + 1) + eGene_pvals$simes_p <- apply(eGene_pvals[, col_ind_pvals], 1, TreeQTL:::get_simes_p) + eGene_xT_qvals <- qvalue(eGene_pvals$simes_p, lambda = 0)$qvalue + R_G <- sum(eGene_xT_qvals <= level1) + print(paste("Number of cross-tissue eGenes = ", R_G)) + + if (R_G == 0) { + warning("No eGenes selected at level1; skipping lower-level selections.") + return(list( + qvalue_table = data.frame(), + raw_pval_table = eGene_pvals, + qvals = eGene_xT_qvals + )) + } + + q2_adj <- R_G * level2 / nrow(eGene_pvals) + ind_sel_simes <- which(eGene_xT_qvals <= level1) + sel_eGenes_simes <- eGene_pvals[ind_sel_simes, ] + rej_simes <- t(1 * apply(sel_eGenes_simes[, col_ind_pvals], 1, TreeQTL:::qsel_by_fam, q2_adj)) + colnames(rej_simes) <- tissue_names + sel_eGenes_simes$n_sel_tissues <- rowSums(rej_simes) + sel_eGenes_simes$n_tested_tissues <- rowSums(!is.na(sel_eGenes_simes[, col_ind_pvals])) + eGene_xT_sel <- data.frame(gene = sel_eGenes_simes$gene) + eGene_xT_sel <- cbind(eGene_xT_sel, rej_simes) + names(eGene_xT_sel)[2:(n_tissue + 1)] <- tissue_names + + return(list( + qvalue_table = eGene_xT_sel, + raw_pval_table = eGene_pvals, + qvals = eGene_xT_qvals + )) + } + + if (four_level) { + shared_file <- list.files(path = data_dir, pattern = "shared_shared.cis_pairs.txt", full.names = TRUE) + if (length(shared_file) == 0) stop("No shared_shared.cis_pairs.txt file found.") + + to_TreeBH_input(data_dir = data_dir, shared_file = shared_file, context_names = context_names, out_dir = out_dir) + df <- read.table(file.path(out_dir, "treeBH_input.txt"), header = TRUE) + treeBH_step(matrix = df, fdr_thres = fdr_thresh, out_dir = out_dir, method = treeBH_method, test = treeBH_test) + + message("TreeBH multiple testing finished. Output written to ", out_dir) + return(invisible("TreeBH done")) + } else { + res = get_eGenes_multi_tissue_mod( + m_eqtl_out_dir = data_dir, + treeQTL_dir = out_dir, + tissue_names = context_names, + level1 = level1, level2 = level2, level3 = level3, + exp_suffix = "specific", + four_level = four_level, + qtl_type = qtl_type, + shared_n_tests_per_gene = shared_n_tests_per_gene + ) + + specific_eGenes = res$qvalue_table + + if (nrow(specific_eGenes) == 0) { + message("No specific eGenes discovered.") + write.table( + data.frame(gene = character(), n_sel_tissues = numeric()), + file = file.path(out_dir, "specific_eGenes.txt"), + quote = FALSE, row.names = FALSE, sep = '\t' + ) + } + else { + write.table( + x = specific_eGenes, + file = file.path(out_dir, "specific_eGenes.txt"), + quote = FALSE, + row.names = FALSE, + col.names = TRUE, + sep = '\t' + ) + } + + pattern <- (paste0("shared.", qtl_type, "_pairs.txt")) + shared_file <- list.files(path = data_dir, pattern = pattern, full.names = TRUE) + + if (length(shared_file) != 1) { + stop("Expected one shared file but found: ", length(shared_file)) + } + + if (file.size(shared_file) == 0) { + message("Shared cis-pairs file is empty. Skipping shared eGene discovery.") + shared_eGenes <- data.frame() + } else { + message("Running TreeQTL correction for shared eGenes...") + shared_eGenes <- get_eGenes( + n_tests_per_gene = shared_n_tests_per_gene, + m_eqtl_out = shared_file, + method = "BH", + level1 = level1, + level2 = level2, + slice_size = 1e+05, + silent = FALSE + ) + } + + if (nrow(shared_eGenes) == 0) { + message("No shared eGenes discovered. Skipping shared association analysis.") + write.table( + data.frame(), + file = file.path(out_dir, "shared_eGenes.txt"), + quote = FALSE, + row.names = FALSE, + col.names = TRUE, + sep = '\t' + ) + return(invisible(NULL)) # Skipping eAssociations + } else { + write.table( + x = shared_eGenes, + file = file.path(out_dir, "shared_eGenes.txt"), + quote = FALSE, + row.names = FALSE, + col.names = TRUE, + sep = '\t' + ) + } + + eAssociations = get_eAssociations( + eDiscoveries = shared_eGenes, + n_tests = shared_n_tests_per_gene, + m_eqtl_out = shared_file, + out_file = file.path(out_dir, "eAssoc_by_gene.context_shared.txt"), + by_snp = FALSE, + slice_size = 1e+05, + silent = FALSE + ) + } } diff --git a/README.md b/README.md index 47864a6..3009dcf 100644 --- a/README.md +++ b/README.md @@ -1,162 +1,32 @@ -# FastGxC: A powerful and computationally efficient software for context-specific eQTL mapping in single-cell omics data +# FastGxC: A powerful and computationally efficient software for context-specific eQTL mapping in single-cell genomics data -FastGxC was originally developed for single-cell data, where each individual contributes gene expression measurements across multiple cell types. +FastGxC is a computationally efficient R package for mapping context-specific eQTLs across multiple biological conditions (e.g., tissues or cell types). It was originally designed for single-cell RNA-seq data, but also supports bulk RNA-seq from multi-context samples. -However, it can also be applied to bulk RNA-seq data when the same individuals are profiled across multiple tissues or conditions. +# Installation and Setup -In both settings, FastGxC models **repeated samples** from each individual, removing shared noise and enabling more accurate detection of context-specific genetic effects. +Before using FastGxC, make sure all required R packages are installed. -FastGxC is also **robust to missing data** —for example, when certain individuals or genes are missing in some cell types or tissues. +To install FastGxC: -Please read the [BioRxiv](https://www.biorxiv.org/content/10.1101/2021.06.17.448889v2) preprint for more details. - - - -# Package Installation and Dependencies -FastGxC is an R package that can be loaded and used in any R environment. -In order for FastGxC to run properly, the following packages are required and should be installed in R prior to using any FastGxC functions. -``` -library(devtools) -library(dplyr) -library(data.table) -library(MatrixEQTL) -library(mvtnorm) -library(reshape2) -library(magrittr) -library(TreeQTL) -``` -** Note ** : to install TreeQTL, qvalue must be installed first. - -Once all dependencies are installed and loaded you can install FastGxC using: -``` - devtools::install_github("BalliuLab/FastGxC") -``` -Once FastGxC is installed, load all functions using -``` +``` +devtools::install_github("BalliuLab/FastGxC") library(FastGxC) ``` -# Simulate toy data - -To run a toy example, generate simulated data by running the following code in R: -``` - data_dir_sim = "~/simulations/" - sim_scenario = "single_context_het" - simulate_data(data_dir = data_dir_sim, sim_scenario = sim_scenario) -``` - -** Note: running the code above simulates data with default parameters (300 individuals, 10,000 SNPs, 100 genes, and 50 contexts without missing data), but this function can be run with any combination of parameter values. See all possible parameters for ```simulate_data()``` by running ```?simulate_data``` in R. - -Running the above code will generate and save the following files in the data_dir: -(1) {sim_scenario}_SNPs.txt: SNP genotype data for 10,000 SNPs and 300 individuals (individual IDs as columns and SNP IDs as rows)\ - -(2) {sim_scenario}_snpsloc.txt: location information of the 10,000 simulated SNPs (MatrixEQTL input format)\ - -(3) {sim_scenario}_geneloc.txt: location information of the 100 simulated genes (MatrixEQTL input format)\ - -(4) {sun_scenario}_simulated_expression.txt: gene expression data for the 300 simulated individuals across 100 genes and 50 contexts \ - -# Running FastGxC - -FastGxC works in two steps. In the first step, expression is decomposed into shared and context-specific components. In the second step, eQTLs are separately mapped on these components. - -*Step 1 - Decomposition:* For each individual, decompose the phenotype of interest (e.g. gene expression) across C contexts (e.g. tissues or cell-types) into one context-shared and C context-specific components using the ```decomposition_step()``` function. -This function takes as imput a file with gene expression data for all individuals, genes, and contexts (see output of ```simulate_data()``` for the right format) and outputs one file with context-shared expression (context_shared_expression.txt) and C files with expression specific to each context (CONTEXT_NAME_specific_expression.txt). - -The following code example demonstrates how to use this function with the data we just simulated above. - - ``` - exp_mat_filename = "~/simulations/single_context_het_simulated_expression.txt" - data_dir_decomp = "~/example_output_single_context_het/" - decomposition_step(exp_mat_filename, data_dir_decomp) - ``` - -*Step 2 - eQTL mapping:* FastGxC estimates genetic effects on the context-shared component and each of the C context-specific components separately using simple linear models. Note: Here we use the R package [MatrixEQTL](http://www.bios.unc.edu/research/genomic_software/Matrix_eQTL/) but any other software that can perform quick linear regression can be used (e.g. [FastQTL](http://fastqtl.sourceforge.net/) or [tensorqtl](https://github.com/broadinstitute/tensorqtl)). FastGxC implements eQTL mapping using its ```eQTL_mapping_step()``` function. - -This function take as input data needed to run MatrixEQTL and outputs eQTL summary statistics in the MatrixEQTL format. In the end, you should have one file with summary statistics for shared eQTL and C files with summary statistics for each context C. - -Below is a code example to map context-specific eQTLs and shared eQTLs using the decomposed simulated data from above. -``` -out_dir = "~/example_output_single_context_het/" -input_dir = "~/simulations/" -context_names = paste0("context", seq(1,50)) - -SNP_file_name = paste0(input_dir, "single_context_het_SNPs.txt") -snps_location_file_name = paste0(input_dir, "single_context_het_snpsloc.txt") -gene_location_file_name = paste0(input_dir, "single_context_het_geneloc.txt") -## map context specific eQTLs: -for(context in context_names){ - expression_file_name = paste0(out_dir, context, "_specific_expression.txt") - context = context - shared_specific = "specific" - out_dir = out_dir - - eQTL_mapping_step(SNP_file_name = SNP_file_name, - snps_location_file_name = snps_location_file_name, - expression_file_name = expression_file_name, - gene_location_file_name = gene_location_file_name, - context = context, - shared_specific = shared_specific, - out_dir = out_dir) -} - -## map shared eQTLs: - expression_file_name = paste0(input_dir, "context_shared_expression.txt") - shared_specific = "shared" - - eQTL_mapping_step(SNP_file_name = SNP_file_name, - snps_location_file_name = snps_location_file_name, - expression_file_name = expression_file_name, - gene_location_file_name = gene_location_file_name, - context = "shared", - shared_specific = shared_specific, - out_dir = out_dir) - -``` - -# Multiple testing adjustment +If you need help installing dependencies or Python integration, refer to the Wiki's [Installation and Dependences Guide](https://github.com/BalliuLab/FastGxC/wiki/1.-Installation-and-Dependencies) -To adjust for multiple testing across all contexts, genes, and genetic variants tested, FastGxC uses the hierarchical FDR procedures implemented in the R package [TreeQTL](http://bioinformatics.org/treeqtl/) via the ```treeQTL_step()``` function. +# Installation and Setup -This function requires that you run MatrixEQTL to do eQTL mapping (see step 2 above). If you used another eQTL mapping softwares, please make sure the output is in the format required by TreeQTL. You can also replace TreeQTL with other methods, e.g. [mashR](https://github.com/stephenslab/mashr), which can also lead to a considerable increase in power. +This repository is supported by a full [Wiki](https://github.com/BalliuLab/FastGxC/wiki) that includes the following: -The following code example demonstrates how to use this function with the data outputted from the eQTL mapping we just ran. This script take as input data needed to run TreeQTL and outputs shared and specific eGenes (two files) and eAssociation (C+1 files) summary statistics in the TreeQTL format. - -Map specific-eGenes, i.e., genes with at least one context-specific eQTL -``` -## directory with all matrixeQTL files for all contexts -## (note that this function expects files to be named in the same was as the output files from FastGxC's eQTL mapping function) -data_dir = "~/example_output_single_context_het/" -snps_location_file_name = "~/simulations/single_context_het_snpsloc.txt" -gene_location_file_name = "~/simulations/single_context_het_geneloc.txt" -out_dir = "~/example_output_single_context_het/" -fdr_thresh = 0.05 - -## Run multiple testing correction without the four level hierarchy: -## this will only output specfic eGenes and eAssociations -treeQTL_step( - data_dir, - snps_location_file_name, - gene_location_file_name, - context_names, - out_dir, - fdr_thresh = fdr_thresh - ) - -## Run multiple testing correction with the four level hierarchy: -## this will output specific and shared eGenes and eAssociations as well as global eGenes -treeQTL_step( - data_dir, - snps_location_file_name, - gene_location_file_name, - context_names, - out_dir, - fdr_thresh = fdr_thresh, - four_level = T - ) -``` +1. [Installation and Dependences](https://github.com/BalliuLab/FastGxC/wiki/1.-Installation-and-Dependencies) +2. [Simulate Data](https://github.com/BalliuLab/FastGxC/wiki/2.-Simulate-Data) +3. [Decomposition Step](https://github.com/BalliuLab/FastGxC/wiki/3.-Decomposition-Step) +4. [eQTL Mapping](https://github.com/BalliuLab/FastGxC/wiki/4.-eQTL-Mapping) +5. [Multiple Testing Correction](https://github.com/BalliuLab/FastGxC/wiki/5.-Multiple-Testing-Correction) +If you're unsure where to start or encounter issues, please check the [Wiki](https://github.com/BalliuLab/FastGxC/wiki) first — it covers all major steps with examples and explanations. diff --git a/man/checkNested.Rd b/man/checkNested.Rd new file mode 100644 index 0000000..8889553 --- /dev/null +++ b/man/checkNested.Rd @@ -0,0 +1,11 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/RcppExports.R +\name{checkNested} +\alias{checkNested} +\title{Check if groups are properly nested in hierarchy} +\usage{ +checkNested(groups, L) +} +\description{ +Check if groups are properly nested in hierarchy +} diff --git a/man/eQTL_mapping_step.Rd b/man/eQTL_mapping_step.Rd index 52f4195..e50267e 100644 --- a/man/eQTL_mapping_step.Rd +++ b/man/eQTL_mapping_step.Rd @@ -11,41 +11,53 @@ eQTL_mapping_step( gene_location_file_name, context, shared_specific, - out_dir + out_dir, + output_file_name_cis = file.path(out_dir, paste0(context, "_", shared_specific, + ".cis_pairs.txt")), + output_file_name_tra = file.path(out_dir, paste0(context, "_", shared_specific, + ".trans_pairs.txt")), + method = "MatrixEQTL", + use_model = modelLINEAR, + cis_dist = 1e+06, + pv_threshold_cis = 1, + pv_threshold_tra = 0, + error_covariance = numeric() ) } \arguments{ -\item{SNP_file_name}{\itemize{ -\item full file path with genotypes of all individuals -}} +\item{SNP_file_name}{Full path to SNP genotype matrix.} -\item{snps_location_file_name}{\itemize{ -\item full file path with snp ids, start, and end positions -}} +\item{snps_location_file_name}{Full path to SNP location file.} -\item{expression_file_name}{\itemize{ -\item full file path with expression matrix of individuals per gene -}} +\item{expression_file_name}{Full path to expression matrix.} -\item{gene_location_file_name}{\itemize{ -\item full file path with gene ids, start, and end positions -}} +\item{gene_location_file_name}{Full path to gene location file.} -\item{context}{\itemize{ -\item name of context for output file naming purposes -}} +\item{context}{Context name for labeling output.} -\item{shared_specific}{\itemize{ -\item either "shared" if mapping eQTLs with shared component or "specific" if mapping eQTLs with specific component -}} +\item{shared_specific}{Context specific or context shared} -\item{out_dir}{\itemize{ -\item full file path of output directory where eQTL file will be written out -}} +\item{out_dir}{Output directory.} + +\item{output_file_name_cis}{Path to write cis-eQTL output.} + +\item{output_file_name_tra}{Path to write trans-eQTL output.} + +\item{method}{Either "MatrixEQTL" or "tensorQTL".} + +\item{use_model}{MatrixEQTL model (default: modelLINEAR).} + +\item{cis_dist}{Distance threshold for cis-eQTLs.} + +\item{pv_threshold_cis}{P-value threshold for cis.} + +\item{pv_threshold_tra}{P-value threshold for trans.} + +\item{error_covariance}{Covariance matrix (or numeric()).} } \value{ -outputs one file of mapped cis eQTLs +Writes cis-eQTLs and trans-eQTLs (optional) to disk. } \description{ -Function to map cis eQTLs - cis window is defined as 1Mb +Function to map cis eQTLs using either Matrix eQTL or TensorQTL. } diff --git a/man/getGroupPvalues.Rd b/man/getGroupPvalues.Rd new file mode 100644 index 0000000..30132da --- /dev/null +++ b/man/getGroupPvalues.Rd @@ -0,0 +1,26 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/RcppExports.R +\name{getGroupPvalues} +\alias{getGroupPvalues} +\title{Hierarchical Group P-value Calculation +The getGroupPvalues function computes aggregated p-values for each group in a hierarchical +structure by combining p-values from lower levels using specified aggregation methods.} +\usage{ +getGroupPvalues(pvals, groups, N, L, test) +} +\arguments{ +\item{pvals}{NumericVector containing p-values for individual hypotheses} + +\item{groups}{CharacterMatrix defining the group hierarchy structure} + +\item{N}{Integer specifying the number of hypotheses} + +\item{L}{Integer specifying the number of levels in the hierarchy} + +\item{test}{CharacterVector specifying the aggregation method(s)} +} +\description{ +Hierarchical Group P-value Calculation +The getGroupPvalues function computes aggregated p-values for each group in a hierarchical +structure by combining p-values from lower levels using specified aggregation methods. +} diff --git a/man/getGroupSelections.Rd b/man/getGroupSelections.Rd new file mode 100644 index 0000000..b352920 --- /dev/null +++ b/man/getGroupSelections.Rd @@ -0,0 +1,24 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/RcppExports.R +\name{getGroupSelections} +\alias{getGroupSelections} +\title{Hierarchical Group Selection for Multiple Testing +The getGroupSelections function applies hierarchical multiple testing correction.} +\usage{ +getGroupSelections(group_pvals, groups, q, N, L) +} +\arguments{ +\item{group_pvals}{NumericMatrix containing aggregated p-values} + +\item{groups}{CharacterMatrix defining the group hierarchy structure} + +\item{q}{NumericVector containing the q-value thresholds for each level} + +\item{N}{Integer specifying the number of hypotheses} + +\item{L}{Integer specifying the number of levels in the hierarchy} +} +\description{ +Hierarchical Group Selection for Multiple Testing +The getGroupSelections function applies hierarchical multiple testing correction. +} diff --git a/man/get_TreeBH_selections_cpp.Rd b/man/get_TreeBH_selections_cpp.Rd new file mode 100644 index 0000000..58256cc --- /dev/null +++ b/man/get_TreeBH_selections_cpp.Rd @@ -0,0 +1,12 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/treeBH.R +\name{get_TreeBH_selections_cpp} +\alias{get_TreeBH_selections_cpp} +\title{C++ TreeBH Implementation using Rcpp} +\usage{ +get_TreeBH_selections_cpp(pvals, groups, q, test = "simes") +} +\description{ +C++ TreeBH Implementation using Rcpp +} +\keyword{internal} diff --git a/man/get_TreeBH_selections_datatable.Rd b/man/get_TreeBH_selections_datatable.Rd new file mode 100644 index 0000000..446b1ea --- /dev/null +++ b/man/get_TreeBH_selections_datatable.Rd @@ -0,0 +1,12 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/treeBH.R +\name{get_TreeBH_selections_datatable} +\alias{get_TreeBH_selections_datatable} +\title{Optimized TreeBH Implementation using data.table} +\usage{ +get_TreeBH_selections_datatable(pvals, groups, q, test = "simes") +} +\description{ +Optimized TreeBH Implementation using data.table +} +\keyword{internal} diff --git a/man/install_deps.Rd b/man/install_deps.Rd new file mode 100644 index 0000000..5331481 --- /dev/null +++ b/man/install_deps.Rd @@ -0,0 +1,15 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/deps.R +\name{install_deps} +\alias{install_deps} +\title{Install FastGxC dependencies} +\usage{ +install_deps(ask = interactive()) +} +\arguments{ +\item{ask}{Logical, whether to ask before installing (default: TRUE if interactive).} +} +\description{ +Installs CRAN, Bioconductor, and source packages required by FastGxC. +This includes optional packages like qvalue, TreeQTL, and TreeBH. +} diff --git a/man/simulate_data.Rd b/man/simulate_data.Rd index 2f46d3a..2b5846a 100644 --- a/man/simulate_data.Rd +++ b/man/simulate_data.Rd @@ -2,70 +2,54 @@ % Please edit documentation in R/simulate_data.R \name{simulate_data} \alias{simulate_data} -\title{Simulations} +\title{Simulate Gene Expression and Genotype Data} \usage{ simulate_data( data_dir, N = 300, n_genes = 100, n_snps_per_gene = 1000, - n_contexts = 10, + n_contexts, maf = 0.2, w_corr = 0.2, v_e = 1, missing = 0, - seed = 1, - sim_scenario = "single_context_het" + hsq = rep(0.2, n_contexts), + mus = rep(0, n_contexts) ) } \arguments{ -\item{data_dir}{\itemize{ -\item full filepath of the directory where simulated data will be written to -}} +\item{data_dir}{String. Full file path to the directory where simulated data will be saved.} -\item{N}{\itemize{ -\item number of individuals simulated (must be a whole number) -}} +\item{N}{Integer. Number of individuals to simulate.} -\item{n_genes}{\itemize{ -\item number of genes simulated (must be a whole number) -}} +\item{n_genes}{Integer. Number of genes to simulate.} -\item{n_snps_per_gene}{\itemize{ -\item number of cis-SNPs per gene (must be a whole number) -}} +\item{n_snps_per_gene}{Integer. Number of cis-SNPs per gene.} -\item{n_contexts}{\itemize{ -\item number of contexts to simulate (e.g. tissues or cell types) (must be a whole number) -}} +\item{n_contexts}{Integer. Number of contexts to simulate (e.g., tissues or cell types).} -\item{maf}{\itemize{ -\item minor allele frequency for genotypes -}} +\item{maf}{Numeric. Minor allele frequency for simulated SNPs (value between 0 and 1).} -\item{w_corr}{\itemize{ -\item error covariance between contexts -}} +\item{w_corr}{Numeric. Off-diagonal values for the error covariance matrix (correlation between contexts).} -\item{v_e}{\itemize{ -\item error variance in each context (maybe take this out and set it to 1) -}} +\item{v_e}{Numeric. Error variance for expression in each context.} -\item{missing}{\itemize{ -\item decimal value signifying percentage of missingness in simulated expression data (e.g. parameter value of 0.3 would indicate 30\% missing values in outputted expression matrix) -}} +\item{missing}{Numeric. Proportion of missing values in the simulated expression data (e.g., 0.3 = 30\% missing).} -\item{seed}{\itemize{ -\item can set seed for reproducibility -}} +\item{hsq}{Numeric vector of length \code{n_contexts}. Heritability values in each context (defaults to 0 = null model).} -\item{sim_scenario}{\itemize{ -\item must be either "null" or "single_context_het" to signify simulations under the null case (no genetic effects in any context) or the case of single context heterogeneity (one context drives the genetic effect heterogeneity) -}} +\item{mus}{Numeric vector of length \code{n_contexts}. Mean expression in each context.} } \value{ -outputs an expression matrix file, a genotype matrix file, a SNP location file, and a gene location file all in the format needed for FastGxC's decomposition step and then subsequent eQTL mapping step with Matrix eQTL. +Writes the following files to \code{data_dir}: +\itemize{ +\item \code{SNPs.txt} – genotype matrix (individuals × SNPs) +\item \code{snpsloc.txt} – SNP location file for cis-window definitions +\item \code{geneloc.txt} – gene location file for cis-window definitions +\item \code{simulated_expression.txt} – simulated gene expression matrix with optional missingness +} } \description{ -Function to generate simulated SNP and expression data +Generates simulated SNP genotypes, gene expression data across multiple contexts (e.g., tissues or cell types) } diff --git a/man/to_TreeBH_input.Rd b/man/to_TreeBH_input.Rd new file mode 100644 index 0000000..4cdff3c --- /dev/null +++ b/man/to_TreeBH_input.Rd @@ -0,0 +1,40 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/treeBH.R +\name{to_TreeBH_input} +\alias{to_TreeBH_input} +\title{Converting eQTL Mapping Output to TreeBH Input} +\usage{ +to_TreeBH_input(data_dir, shared_file, context_names, out_dir) +} +\arguments{ +\item{data_dir}{\itemize{ +\item full filepath of the directory where eQTL output files are stored. This function assumes that files are named as outputted by FastGxC's eQTL mapping function +}} + +\item{shared_file}{\itemize{ +\item full filepath to the shared cis_pairs file from the output of the mapping function +}} + +\item{context_names}{\itemize{ +\item vector of all context names in the format c("tissue1", "tissue2", ..., etc.) +}} + +\item{out_dir}{\itemize{ +\item full filepath of the output directory where the input of TreeBH can be stored +}} +} +\value{ +A data.frame with columns: +\itemize{ +\item gene_name : gene name +\item snp_name : snp name +\item context_name : context name (or "shared") +\item gene_snp : concatenated "gene_SNP" +\item component_tag : "gene_SNP_specific" or "gene_SNP_shared" +\item context_tag : "gene_SNP_component_context" +\item p_value : numeric p-value +} +} +\description{ +Converting eQTL Mapping Output to TreeBH Input +} diff --git a/man/treeBH_step.Rd b/man/treeBH_step.Rd new file mode 100644 index 0000000..211f2b8 --- /dev/null +++ b/man/treeBH_step.Rd @@ -0,0 +1,52 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/treeBH.R +\name{treeBH_step} +\alias{treeBH_step} +\title{Multiple Testing Correction with TreeBH +Function to adjust for hierarchical multiple testing correction using TreeBH.} +\usage{ +treeBH_step(matrix, fdr_thres, out_dir, method = "original", test = "simes") +} +\arguments{ +\item{matrix}{\itemize{ +\item output of to_TreeBH_Input, matrix with the groups and pvalues +}} + +\item{fdr_thres}{\itemize{ +\item value between 0 and 1 that signifies what FDR threshold for multiple testing correction. The same value will be used across all hierarchical levels. +}} + +\item{out_dir}{\itemize{ +\item full filepath of the output directory where the output of TreeBH can be stored +}} + +\item{method}{\itemize{ +\item character string specifying which implementation to use: +\itemize{ +\item "original" (default): uses the TreeBH package implementation +\item "datatable": uses data.table optimized R implementation +\item "cpp": uses Rcpp C++ implementation for maximum performance +} +}} + +\item{test}{\itemize{ +\item character string specifying the aggregation method for p-values: +\itemize{ +\item "simes" (default): uses Simes' method +\item "fisher": uses Fisher's method +} +}} +} +\value{ +A matrix with identification columns and output columns: +\itemize{ +\item gene : gene name +\item snp : snp name +\item context : context name or "specific" +\item treeBH Output Columns : eGene, eQTL, Component, Context Specific +} +} +\description{ +Multiple Testing Correction with TreeBH +Function to adjust for hierarchical multiple testing correction using TreeBH. +} diff --git a/man/treeQTL_step.Rd b/man/treeQTL_step.Rd index 7c95d47..180575f 100644 --- a/man/treeQTL_step.Rd +++ b/man/treeQTL_step.Rd @@ -11,7 +11,9 @@ treeQTL_step( context_names, out_dir, fdr_thresh = 0.05, - four_level = F + four_level = FALSE, + treeBH_method = "cpp", + treeBH_test = "simes" ) } \arguments{ @@ -42,6 +44,14 @@ treeQTL_step( \item{four_level}{\itemize{ \item boolean value (T or F) that signifies whether to use the 4-level hierarchy (set this parameter to R and test for a global eQTL across shared and specific components) or a 3-level hierarchy (this parameter is default set to F to test for shared vs specific eQTLs) }} + +\item{treeBH_method}{\itemize{ +\item character string specifying which TreeBH implementation to use when four_level = TRUE. Options: "original" (TreeBH package), "datatable" (optimized R), "cpp" (fast C++ implementation, default). Ignored when four_level = FALSE. +}} + +\item{treeBH_test}{\itemize{ +\item character string specifying the p-value aggregation method for TreeBH when four_level = TRUE. Options: "simes" (Simes' method, default), "fisher" (Fisher's method). Ignored when four_level = FALSE. +}} } \value{ outputs one file of specific eGenes across all contexts and one file of shared eGenes. Outputs an eAssociation file for each context and one for shared eQTLs with snp-gene pairs and FDR adjusted p-values. diff --git a/src/FastGxC.so b/src/FastGxC.so new file mode 100755 index 0000000..7f974db Binary files /dev/null and b/src/FastGxC.so differ diff --git a/src/RcppExports.cpp b/src/RcppExports.cpp new file mode 100644 index 0000000..6d2d1b9 --- /dev/null +++ b/src/RcppExports.cpp @@ -0,0 +1,66 @@ +// Generated by using Rcpp::compileAttributes() -> do not edit by hand +// Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393 + +#include + +using namespace Rcpp; + +#ifdef RCPP_USE_GLOBAL_ROSTREAM +Rcpp::Rostream& Rcpp::Rcout = Rcpp::Rcpp_cout_get(); +Rcpp::Rostream& Rcpp::Rcerr = Rcpp::Rcpp_cerr_get(); +#endif + +// checkNested +std::string checkNested(const CharacterMatrix& groups, int L); +RcppExport SEXP _FastGxC_checkNested(SEXP groupsSEXP, SEXP LSEXP) { +BEGIN_RCPP + Rcpp::RObject rcpp_result_gen; + Rcpp::RNGScope rcpp_rngScope_gen; + Rcpp::traits::input_parameter< const CharacterMatrix& >::type groups(groupsSEXP); + Rcpp::traits::input_parameter< int >::type L(LSEXP); + rcpp_result_gen = Rcpp::wrap(checkNested(groups, L)); + return rcpp_result_gen; +END_RCPP +} +// getGroupPvalues +NumericMatrix getGroupPvalues(const NumericVector& pvals, const CharacterMatrix& groups, int N, int L, CharacterVector test); +RcppExport SEXP _FastGxC_getGroupPvalues(SEXP pvalsSEXP, SEXP groupsSEXP, SEXP NSEXP, SEXP LSEXP, SEXP testSEXP) { +BEGIN_RCPP + Rcpp::RObject rcpp_result_gen; + Rcpp::RNGScope rcpp_rngScope_gen; + Rcpp::traits::input_parameter< const NumericVector& >::type pvals(pvalsSEXP); + Rcpp::traits::input_parameter< const CharacterMatrix& >::type groups(groupsSEXP); + Rcpp::traits::input_parameter< int >::type N(NSEXP); + Rcpp::traits::input_parameter< int >::type L(LSEXP); + Rcpp::traits::input_parameter< CharacterVector >::type test(testSEXP); + rcpp_result_gen = Rcpp::wrap(getGroupPvalues(pvals, groups, N, L, test)); + return rcpp_result_gen; +END_RCPP +} +// getGroupSelections +NumericMatrix getGroupSelections(const NumericMatrix& group_pvals, const CharacterMatrix& groups, const NumericVector& q, int N, int L); +RcppExport SEXP _FastGxC_getGroupSelections(SEXP group_pvalsSEXP, SEXP groupsSEXP, SEXP qSEXP, SEXP NSEXP, SEXP LSEXP) { +BEGIN_RCPP + Rcpp::RObject rcpp_result_gen; + Rcpp::RNGScope rcpp_rngScope_gen; + Rcpp::traits::input_parameter< const NumericMatrix& >::type group_pvals(group_pvalsSEXP); + Rcpp::traits::input_parameter< const CharacterMatrix& >::type groups(groupsSEXP); + Rcpp::traits::input_parameter< const NumericVector& >::type q(qSEXP); + Rcpp::traits::input_parameter< int >::type N(NSEXP); + Rcpp::traits::input_parameter< int >::type L(LSEXP); + rcpp_result_gen = Rcpp::wrap(getGroupSelections(group_pvals, groups, q, N, L)); + return rcpp_result_gen; +END_RCPP +} + +static const R_CallMethodDef CallEntries[] = { + {"_FastGxC_checkNested", (DL_FUNC) &_FastGxC_checkNested, 2}, + {"_FastGxC_getGroupPvalues", (DL_FUNC) &_FastGxC_getGroupPvalues, 5}, + {"_FastGxC_getGroupSelections", (DL_FUNC) &_FastGxC_getGroupSelections, 5}, + {NULL, NULL, 0} +}; + +RcppExport void R_init_FastGxC(DllInfo *dll) { + R_registerRoutines(dll, NULL, CallEntries, NULL, NULL); + R_useDynamicSymbols(dll, FALSE); +} diff --git a/src/RcppExports.o b/src/RcppExports.o new file mode 100644 index 0000000..4d30bb0 Binary files /dev/null and b/src/RcppExports.o differ diff --git a/src/treeBH.cpp b/src/treeBH.cpp new file mode 100644 index 0000000..ef449fb --- /dev/null +++ b/src/treeBH.cpp @@ -0,0 +1,352 @@ +#include +#include +#include +#include +#include +using namespace Rcpp; +using namespace std; + +//' Check if groups are properly nested in hierarchy +// [[Rcpp::export]] +std::string checkNested(const CharacterMatrix& groups, int L) { + int nrow = groups.nrow(); + + // Step 1: Check hierarchy nesting + for (int cur_level = 1; cur_level < L; cur_level++) { + // Map to track parent to child relationships + map> parent_to_children; + + // Create the mapping from child to parent + for (int i = 0; i < nrow; i++) { + string child = as(groups(i, cur_level)); + string parent = as(groups(i, cur_level - 1)); + + // Add child to parent's set of children + parent_to_children[parent].insert(child); + } + + // Check if any child has multiple parents + map> child_to_parents; + for (int i = 0; i < nrow; i++) { + string child = as(groups(i, cur_level)); + string parent = as(groups(i, cur_level - 1)); + + child_to_parents[child].insert(parent); + + // If child has more than one parent, return error + if (child_to_parents[child].size() > 1) { + return "Groups must be nested within hierarchy"; + } + } + } + + // Step 2: Check if lowest level corresponds to individual hypotheses + set lowest_level_values; + for (int i = 0; i < nrow; i++) { + lowest_level_values.insert(as(groups(i, L - 1))); + } + + if (lowest_level_values.size() != nrow) { + return "Assumption is that lowest level in tree corresponds to individual hypotheses"; + } + + // All checks passed + return ""; +} + +// Calculate Simes p-value +double simesPvalue(const NumericVector& pvals) { + // Remove NA values + NumericVector clean_pvals; + for (int i = 0; i < pvals.size(); i++) { + if (!NumericVector::is_na(pvals[i])) { + clean_pvals.push_back(pvals[i]); + } + } + + // If no valid p-values, return NA + if (clean_pvals.size() == 0) { + return NA_REAL; + } + + // Sort p-values + std::sort(clean_pvals.begin(), clean_pvals.end()); + + // Calculate Simes p-value + double min_val = R_PosInf; + int n = clean_pvals.size(); + + for (int i = 0; i < n; i++) { + double adjusted = clean_pvals[i] / (i + 1); + if (adjusted < min_val) { + min_val = adjusted; + } + } + + return n * min_val; +} + +// Calculate Fisher's pvalue +double fisherPvalue(const NumericVector& pvals) { + // Remove NA values + NumericVector clean_pvals; + for (int i = 0; i < pvals.size(); i++) { + if (!NumericVector::is_na(pvals[i])) { + clean_pvals.push_back(pvals[i]); + } + } + + // If no valid p-values, return NA + if (clean_pvals.size() == 0) { + return NA_REAL; + } + + // Calculate Fisher's statistic: -2 * sum(log(p)) + double sum_log = 0.0; + for (int i = 0; i < clean_pvals.size(); i++) { + sum_log += log(clean_pvals[i]); + } + double statistic = -2 * sum_log; + + // Return p-value from chi-squared distribution with 2k degrees of freedom + // Using R's built-in function + return R::pchisq(statistic, 2 * clean_pvals.size(), false, false); +} + +//' Hierarchical Group P-value Calculation +//' The getGroupPvalues function computes aggregated p-values for each group in a hierarchical +//' structure by combining p-values from lower levels using specified aggregation methods. +//' @param pvals NumericVector containing p-values for individual hypotheses +//' @param groups CharacterMatrix defining the group hierarchy structure +//' @param N Integer specifying the number of hypotheses +//' @param L Integer specifying the number of levels in the hierarchy +//' @param test CharacterVector specifying the aggregation method(s) +// [[Rcpp::export]] +NumericMatrix getGroupPvalues(const NumericVector& pvals, const CharacterMatrix& groups, + int N, int L, CharacterVector test) { + // Initialize group_pvals matrix + NumericMatrix group_pvals(N, L); + + // Fill in initial values + std::fill(group_pvals.begin(), group_pvals.end(), NA_REAL); + + // Set initial values for group_pvals at the lowest level + for (int i = 0; i < N; i++) { + group_pvals(i, L-1) = pvals[i]; + } + + // Ensure test vector is of correct length + CharacterVector test_methods = test; + if (test_methods.size() == 1) { + test_methods = CharacterVector(L-1, test[0]); + } + + // Pre-compute group indices for each level + std::vector>> level_groups(L); + + // Build group index maps for each level + for (int level = 0; level < L; level++) { + for (int i = 0; i < N; i++) { + string group = as(groups(i, level)); + level_groups[level][group].push_back(i); + } + } + + // Step 3: Aggregate group-level p-values + for (int cur_level = L - 2; cur_level >= 0; cur_level--) { + // Get unique groups at the current level + const auto& groups_at_level = level_groups[cur_level]; + + // Cache test method for this level + string method = as(test_methods[cur_level]); + + // Process each group + for (const auto& group_entry : groups_at_level) { + const std::vector& cur_inds = group_entry.second; + + // Skip empty groups (shouldn't happen but check anyway) + if (cur_inds.empty()) continue; + + // Get child p-values + NumericVector child_pvals(cur_inds.size()); + for (size_t i = 0; i < cur_inds.size(); i++) { + child_pvals[i] = group_pvals(cur_inds[i], cur_level + 1); + } + + // Calculate aggregated p-value + double agg_p; + if (method == "simes") { + agg_p = simesPvalue(child_pvals); + } else if (method == "fisher") { + agg_p = fisherPvalue(child_pvals); + } else { + stop("Options for parameter test are 'simes' and 'fisher'"); + } + + // Set the aggregated p-value ONLY for the first index in the group + group_pvals(cur_inds[0], cur_level) = agg_p; + } + } + + // Return only the group_pvals matrix + return group_pvals; +} + +// Function to compute qvalues (simplified version using BH method) +NumericVector computeQValues(const NumericVector& pvals) { + int n = pvals.size(); + if (n == 0) return NumericVector(0); + + // Create index vector and sort by p-value + IntegerVector idx = seq(0, n-1); + std::sort(idx.begin(), idx.end(), [&pvals](int i, int j) { + return pvals[i] < pvals[j]; + }); + + // Calculate adjusted p-values + NumericVector adjusted(n); + adjusted[idx[n-1]] = std::min(1.0, pvals[idx[n-1]]); + + for (int i = n-2; i >= 0; i--) { + adjusted[idx[i]] = std::min(adjusted[idx[i+1]], pvals[idx[i]] * n / (i + 1)); + } + + return adjusted; +} + +//' Hierarchical Group Selection for Multiple Testing +//' The getGroupSelections function applies hierarchical multiple testing correction. +//' @param group_pvals NumericMatrix containing aggregated p-values +//' @param groups CharacterMatrix defining the group hierarchy structure +//' @param q NumericVector containing the q-value thresholds for each level +//' @param N Integer specifying the number of hypotheses +//' @param L Integer specifying the number of levels in the hierarchy +// [[Rcpp::export]] +NumericMatrix getGroupSelections(const NumericMatrix& group_pvals, + const CharacterMatrix& groups, + const NumericVector& q, int N, int L) { + NumericMatrix sel(N, L); + NumericMatrix q_adj(N, L); + + // Fill q_adj with NA + std::fill(q_adj.begin(), q_adj.end(), NA_REAL); + + // Set initial value for q_adj + for (int i = 0; i < N; i++) { + q_adj(i, 0) = q[0]; + } + + // Iterate through each level + for (int cur_level = 0; cur_level < L; cur_level++) { + // Check if we should break + if (cur_level > 0) { + bool any_selected = false; + for (int i = 0; i < N; i++) { + if (sel(i, cur_level - 1) == 1) { + any_selected = true; + break; + } + } + if (!any_selected) break; + } + + // Get selected parents from previous level + std::vector sel_prev; + if (cur_level == 0) { + // First level: select all + sel_prev.push_back(0); // We'll use a dummy index 0 for the root + } else { + // Later levels: get indices of selected items from previous level + for (int i = 0; i < N; i++) { + if (sel(i, cur_level - 1) == 1) { + sel_prev.push_back(i); + } + } + } + + // Process each selected parent + for (int parent_sel : sel_prev) { + std::vector child_inds; + int parent_group_ind; + + if (cur_level == 0) { + // First level: include all non-NA indices + parent_group_ind = 0; + for (int i = 0; i < N; i++) { + if (!NumericVector::is_na(group_pvals(i, cur_level))) { + child_inds.push_back(i); + } + } + } else { + // Later levels: include children of the current parent group + string parent_group_num = as(groups(parent_sel, cur_level - 1)); + + // Find the first index with this parent group + for (int i = 0; i < N; i++) { + if (as(groups(i, cur_level - 1)) == parent_group_num) { + parent_group_ind = i; + break; + } + } + + // Find all indices with this parent group and non-NA p-values + for (int i = 0; i < N; i++) { + if (as(groups(i, cur_level - 1)) == parent_group_num && + !NumericVector::is_na(group_pvals(i, cur_level))) { + child_inds.push_back(i); + } + } + } + + // Skip if no children found + if (child_inds.empty()) continue; + + // Select children based on p-values and thresholds + std::vector sel_ind_within_group; + + if (child_inds.size() > 1) { + // Multiple children: use q-value method + NumericVector child_pvals(child_inds.size()); + for (size_t i = 0; i < child_inds.size(); i++) { + child_pvals[i] = group_pvals(child_inds[i], cur_level); + } + + // Compute q-values + NumericVector qvals = computeQValues(child_pvals); + + // Select children with q-values <= threshold + for (size_t i = 0; i < child_inds.size(); i++) { + if (qvals[i] <= q_adj(parent_group_ind, cur_level)) { + sel_ind_within_group.push_back(i); + } + } + } else { + // Single child: use direct p-value comparison + if (group_pvals(child_inds[0], cur_level) <= q_adj(parent_group_ind, cur_level)) { + sel_ind_within_group.push_back(0); + } + } + + // Update selection matrix + for (int idx : sel_ind_within_group) { + sel(child_inds[idx], cur_level) = 1; + } + + // Update q_adj for the next level + if (cur_level < L - 1) { + double R_parent_sel = sel_ind_within_group.size(); + double n_parent_sel = child_inds.size(); + + if (R_parent_sel > 0) { + for (int idx : sel_ind_within_group) { + q_adj(child_inds[idx], cur_level + 1) = q[cur_level + 1] * + q_adj(parent_sel, cur_level) / q[cur_level] * + R_parent_sel / n_parent_sel; + } + } + } + } + } + + return sel; +} diff --git a/src/treeBH.o b/src/treeBH.o new file mode 100644 index 0000000..6f10ced Binary files /dev/null and b/src/treeBH.o differ