diff --git a/.github/workflows/dims_test.yml b/.github/workflows/dims_test.yml index b547d12..e8c4078 100644 --- a/.github/workflows/dims_test.yml +++ b/.github/workflows/dims_test.yml @@ -23,7 +23,7 @@ jobs: uses: actions/checkout@v4 - name: Install dependencies - run: Rscript -e "install.packages(c('testthat', 'withr', 'vdiffr'))" + run: Rscript -e "install.packages(c('testthat', 'withr', 'vdiffr', 'pdftools'))" - name: Run tests run: Rscript tests/testthat.R diff --git a/DIMS/GenerateExcel.R b/DIMS/GenerateExcel.R index 51d28cf..f33c27d 100644 --- a/DIMS/GenerateExcel.R +++ b/DIMS/GenerateExcel.R @@ -45,7 +45,7 @@ peaks_in_list <- which(rownames(outlist) %in% rlvnc$HMDB_key) outlist_subset <- outlist[peaks_in_list, ] outlist_subset$HMDB_key <- rownames(outlist_subset) outlist <- outlist_subset %>% - left_join(rlvnc %>% rename(sec_HMBD_ID_rlvnc = sec_HMDB_ID), by = "HMDB_key") + left_join(rlvnc %>% rename(sec_HMDB_ID_rlvnc = sec_HMDB_ID), by = "HMDB_key") rownames(outlist) <- outlist$HMDB_key # filter out all irrelevant HMDBs @@ -140,7 +140,7 @@ if (z_score == 1) { filter(HMDB_key %in% metab_list_helix) %>% left_join(., metab_df_helix, by = join_by(HMDB_code == HMDB_code)) %>% select( - -c(HMDB_key, sec_HMBD_ID_rlvnc, name, relevance, descr, origin, fluids, tissue, disease, pathway), + -c(HMDB_key, sec_HMDB_ID_rlvnc, name, relevance, descr, origin, fluids, tissue, disease, pathway), -all_of(control_col_idx), -all_of(patient_col_idx) ) %>% relocate(c(HMDB_code, H_Name, avg_ctrls, sd_ctrls), .after = plots) %>% diff --git a/DIMS/GenerateQCOutput.R b/DIMS/GenerateQCOutput.R index 936ef25..59dd697 100644 --- a/DIMS/GenerateQCOutput.R +++ b/DIMS/GenerateQCOutput.R @@ -12,9 +12,8 @@ cmd_args <- commandArgs(trailingOnly = TRUE) init_file <- cmd_args[1] project <- cmd_args[2] dims_matrix <- cmd_args[3] -z_score <- cmd_args[4] -sst_components_file <- cmd_args[5] -export_scripts_dir <- cmd_args[6] +sst_components_file <- cmd_args[4] +export_scripts_dir <- cmd_args[5] outdir <- "./" @@ -38,11 +37,9 @@ dir.create(paste0(outdir, "/plots"), showWarnings = FALSE) control_label <- "C" #### CHECK NUMBER OF CONTROLS #### -if (z_score == 1) { - file_name <- "Check_number_of_controls.txt" - min_num_controls <- 25 - check_number_of_controls(outlist, min_num_controls, file_name) -} +file_name <- "Check_number_of_controls.txt" +min_num_controls <- 25 +check_number_of_controls(outlist, min_num_controls, file_name) #### INTERNAL STANDARDS #### is_list <- outlist[grep("Internal standard", outlist[, "relevance"], fixed = TRUE), ] @@ -216,7 +213,7 @@ if (nrow(is_below_threshold) > 0) { row.names = FALSE, sep = "\t") } else { write.table("no internal standards are below threshold", - file = "internal_standards_below_threshold.txt" + file = "internal_standards_below_threshold.txt", row.names = FALSE, col.names = FALSE ) } @@ -276,7 +273,7 @@ patterns <- c("^(P1002\\.)[[:digit:]]+_", "^(P1003\\.)[[:digit:]]+_", "^(P1005\\ positive_controls_index <- grepl(pattern = paste(patterns, collapse = "|"), column_list) positive_control_list <- column_list[positive_controls_index] -if (z_score == 1) { +if (positive_controls_index > 0) { # find if one or more positive control samples are missing pos_contr_warning <- c() if (all(sapply(c("^P1002", "^P1003", "^P1005"), @@ -351,31 +348,38 @@ is_list_intensities <- get_is_intensities(is_list, int_cols = intensity_col_ids) is_neg_intensities <- get_is_intensities(outlist_tot_neg, is_codes = is_codes) is_pos_intensities <- get_is_intensities(outlist_tot_pos, is_codes = is_codes) -# SST components. -sst_comp <- read.csv(sst_components_file, header = TRUE, sep = "\t") -sst_list <- outlist %>% filter(HMDB_code %in% sst_comp$HMDB_ID) -sst_colnrs <- grep("P1001", colnames(sst_list)) - -if (length(sst_colnrs) > 0) { - sst_list_intensities <- sst_list[, sst_colnrs] - control_col_ids <- grep(control_label, colnames(sst_list), fixed = TRUE) - control_list_intensities <- sst_list[, control_col_ids] - control_list_cv <- calc_coefficient_of_variation(control_list_intensities) - sst_list_intensities <- cbind(sst_list_intensities, CV_controls = control_list_cv[, "CV_perc"]) - sst_list_intensities <- as.data.frame(sst_list_intensities) +# SST components +sst_components <- read.csv(sst_components_file, header = TRUE, sep = "\t") +sst_metabolites_df <- outlist %>% filter(HMDB_code %in% sst_components$HMDB_ID) +sst_sample_column_index <- grep("P1001", colnames(sst_metabolites_df)) + +# Check if SST mix sample(s) are present +if (length(sst_sample_column_index) > 0) { + # Get the SST intensities of the controls, calculate the coefficient of variation + # and add to SST mix intensities + sst_sample_intensities_df <- sst_metabolites_df[, sst_sample_column_index] + control_col_ids <- grep(control_label, colnames(sst_metabolites_df), fixed = TRUE) + control_sst_intensities_df <- sst_metabolites_df[, control_col_ids] + control_sst_metabolites_cv <- calc_coefficient_of_variation(control_sst_intensities_df) + sst_intensities_df <- cbind(sst_sample_intensities_df, CV_controls = control_sst_metabolites_cv[, "CV_perc"]) } else { - sst_list_intensities <- sst_list[, intensity_col_ids] + # Use intensities when there is not SST mix sample added + sst_intensities_df <- sst_metabolites_df[, intensity_col_ids] } -for (col_nr in seq_len(ncol(sst_list_intensities))) { - sst_list_intensities <- as.data.frame(sst_list_intensities) - sst_list_intensities[, col_nr] <- as.numeric(sst_list_intensities[, col_nr]) - if (grepl("Zscore", colnames(sst_list_intensities)[col_nr])) { - sst_list_intensities[, col_nr] <- round(sst_list_intensities[, col_nr], 2) + +sst_intensities_df <- as.data.frame(sst_intensities_df) +for (col_nr in seq_len(ncol(sst_intensities_df))) { + # Change column type to numeric + sst_intensities_df[, col_nr] <- as.numeric(sst_intensities_df[, col_nr]) + if (grepl("Zscore", colnames(sst_intensities_df)[col_nr])) { + # Round numeric value of Z-score columns to 2 decimal places + sst_intensities_df[, col_nr] <- round(sst_intensities_df[, col_nr], 2) } else { - sst_list_intensities[, col_nr] <- round(sst_list_intensities[, col_nr]) + # Round numeric value of intensity columns to an intiger + sst_intensities_df[, col_nr] <- round(sst_intensities_df[, col_nr]) } } -sst_list_intensities <- cbind(SST_comp_name = sst_list$HMDB_name, sst_list_intensities) +sst_intensities_df <- cbind(SST_comp_name = sst_metabolites_df$HMDB_name, sst_intensities_df) # Create Excel file wb <- createWorkbook("IS_SST") @@ -389,7 +393,7 @@ addWorksheet(wb, "IS neg") openxlsx::writeData(wb, sheet = 3, is_neg_intensities) setColWidths(wb, 3, cols = 1, widths = 24) addWorksheet(wb, "SST components") -openxlsx::writeData(wb, sheet = 4, sst_list_intensities) +openxlsx::writeData(wb, sheet = 4, sst_intensities_df) setColWidths(wb, 4, cols = 1:3, widths = 24) xlsx_name <- paste0(outdir, "/", project, "_IS_SST.xlsx") openxlsx::saveWorkbook(wb, xlsx_name, overwrite = TRUE) diff --git a/DIMS/GenerateQCOutput.nf b/DIMS/GenerateQCOutput.nf index 6a4f613..b39d058 100644 --- a/DIMS/GenerateQCOutput.nf +++ b/DIMS/GenerateQCOutput.nf @@ -25,7 +25,6 @@ process GenerateQCOutput { Rscript ${baseDir}/CustomModules/DIMS/GenerateQCOutput.R $init_file \ $analysis_id \ $params.matrix \ - $params.zscore \ $params.sst_components_file \ $params.export_scripts_dir """ diff --git a/DIMS/GenerateViolinPlots.R b/DIMS/GenerateViolinPlots.R index 263c720..0007881 100644 --- a/DIMS/GenerateViolinPlots.R +++ b/DIMS/GenerateViolinPlots.R @@ -1,14 +1,3 @@ -# For untargeted metabolomics, this tool calculates probability scores for -# metabolic disorders. In addition, it provides visual support with violin plots -# of the DIMS measurements for the lab specialists. -# Input needed: -# 1. Excel file in which metabolites are listed with their intensities for -# controls (with C in samplename) and patients (with P in samplename) and their -# corresponding Z-scores. -# 2. All files from github: https://github.com/UMCUGenetics/DIMS - -## adapted from 15-dIEM_violin.R - # load packages suppressPackageStartupMessages(library("dplyr")) library(reshape2) @@ -17,466 +6,101 @@ library(ggplot2) suppressPackageStartupMessages(library("gridExtra")) library(stringr) +options(digits = 16) + # define parameters cmd_args <- commandArgs(trailingOnly = TRUE) run_name <- cmd_args[1] -scripts_dir <- cmd_args[2] -z_score <- as.numeric(cmd_args[3]) -path_metabolite_groups <- cmd_args[4] -file_ratios_metabolites <- cmd_args[5] -file_expected_biomarkers_iem <- cmd_args[6] -file_explanation <- cmd_args[7] -file_isomers <- cmd_args[8] - -if (z_score == 1){ - # path: output folder for dIEM and violin plots - output_dir <- "./" - - file.copy(file_isomers, output_dir) - - # load functions - source(paste0(scripts_dir, "check_same_samplename.R")) - source(paste0(scripts_dir, "prepare_data.R")) - source(paste0(scripts_dir, "prepare_data_perpage.R")) - source(paste0(scripts_dir, "prepare_toplist.R")) - source(paste0(scripts_dir, "create_violin_plots.R")) - source(paste0(scripts_dir, "prepare_alarmvalues.R")) - source(paste0(scripts_dir, "output_helix.R")) - source(paste0(scripts_dir, "get_patient_data_to_helix.R")) - source(paste0(scripts_dir, "add_lab_id_and_onderzoeksnummer.R")) - source(paste0(scripts_dir, "is_diagnostic_patient.R")) - - # number of diseases that score highest in algorithm to plot - top_nr_iem <- 5 - # probability score cut-off for plotting the top diseases - threshold_iem <- 5 - # z-score cutoff of axis on the left for top diseases - ratios_cutoff <- -5 - # number of violin plots per page in PDF - nr_plots_perpage <- 20 - - # binary variable: run function, yes(1) or no(0) - if (z_score == 1) { - algorithm <- ratios <- violin <- 1 - } else { - algorithm <- ratios <- violin <- 0 - } - # are the sample names headers on row 1 or row 2 in the DIMS excel? (default 1) - header_row <- 1 - # column name where the data starts (default B) - col_start <- "B" - zscore_cutoff <- 5 - xaxis_cutoff <- 20 - protocol_name <- "DIMS_PL_DIAG" - - #### STEP 1: Preparation #### - # in: run_name, path_dims_file, header_row ||| out: output_dir, DIMS - - # load outlist instead of excel file - load("outlist.RData") - - # save outlist as dims_xls, will be changed during refactor - dims_xls <- outlist - rm(outlist) - - #### STEP 2: Edit DIMS data ##### - # in: dims_xls ||| out: Data, nr_contr, nr_pat - # Input: the xlsx file that comes out of the pipeline with format: - # [plots] [C] [P] [summary columns] [C_Zscore] [P_Zscore] - # Output: "_CSV.csv" file that is suited for the algorithm in shiny. - - # Determine the number of Contols and Patients in column names: - nr_contr <- length(grep("C", names(dims_xls))) / 2 - nr_pat <- length(grep("P", names(dims_xls))) / 2 - # total number of samples - nrsamples <- nr_contr + nr_pat - # check whether the number of intensity columns equals the number of Zscore columns - if (nr_contr + nr_pat != length(grep("_Zscore", names(dims_xls)))) { - cat("\n**** Error: there aren't as many intensities listed as Zscores") - } - cat(paste0("\n\n------------\n", nr_contr, " controls \n", nr_pat, " patients\n------------\n\n")) - - # Move the columns HMDB_code and HMDB_name to the beginning. - hmdb_info_cols <- c(which(colnames(dims_xls) == "HMDB_code"), which(colnames(dims_xls) == "HMDB_name")) - other_cols <- seq_along(1:ncol(dims_xls))[-hmdb_info_cols] - dims_xls_copy <- dims_xls[, c(hmdb_info_cols, other_cols)] - # Remove the columns from 'name' to 'pathway' - from_col <- which(colnames(dims_xls_copy) == "name") - to_col <- which(colnames(dims_xls_copy) == "pathway") - dims_xls_copy <- dims_xls_copy[, -c(from_col:to_col)] - # in case the excel had an empty "plots" column, remove it - if ("plots" %in% colnames(dims_xls_copy)) { - dims_xls_copy <- dims_xls_copy[, -grep("plots", colnames(dims_xls_copy))] - } - # Rename columns - names(dims_xls_copy) <- gsub("avg.ctrls", "Mean_controls", names(dims_xls_copy)) - names(dims_xls_copy) <- gsub("sd.ctrls", "SD_controls", names(dims_xls_copy)) - names(dims_xls_copy) <- gsub("HMDB_code", "HMDB.code", names(dims_xls_copy)) - names(dims_xls_copy) <- gsub("HMDB_name", "HMDB.name", names(dims_xls_copy)) - - # intensity columns and mean and standard deviation of controls - numeric_cols <- c(3:ncol(dims_xls_copy)) - # make sure all values are numeric - dims_xls_copy[, numeric_cols] <- sapply(dims_xls_copy[, numeric_cols], as.numeric) - - if (exists("dims_xls_copy") & (length(dims_xls_copy) < length(dims_xls))) { - cat("\n### Step 2 # Edit dims data is done.\n") - } else { - cat("\n**** Error: Could not execute step 2 \n") - } - - #### STEP 3: Calculate ratios of intensities for metabolites #### - # in: ratios, file_ratios_metabolites, dims_xls_copy, nr_contr, nr_pat ||| out: Zscore (+file) - # This script loads the file with Ratios (file_ratios_metabolites) and calculates - # the ratios of the intensities of the given metabolites. It also calculates - # Zs-cores based on the avg and sd of the ratios of the controls. - - # Input: dataframe with intenstities and Zscores of controls and patients: - # [HMDB.code] [HMDB.name] [C] [P] [Mean_controls] [SD_controls] [C_Zscore] [P_Zscore] - - # Output: "_CSV.csv" file that is suited for the algorithm, with format: - # "_Ratios_CSV.csv" file, same file as above, but with ratio rows added. - - if (ratios == 1) { - cat(paste0("\nloading ratios file:\n -> ", file_ratios_metabolites, "\n")) - ratio_input <- read.csv(file_ratios_metabolites, sep = ";", stringsAsFactors = FALSE) - - # Prepare empty data frame to fill with ratios - ratio_list <- setNames(data.frame(matrix( - ncol = ncol(dims_xls_copy), - nrow = nrow(ratio_input) - )), colnames(dims_xls_copy)) - ratio_list <- as.data.frame(ratio_list) - - # put HMDB info into first two columns of ratio_list - ratio_list[, 1:2] <- ratio_input[, 1:2] - - # look for intensity columns (exclude Zscore columns) - control_cols <- grep("C", colnames(ratio_list)[1:which(colnames(ratio_list) == "Mean_controls")]) - patient_cols <- grep("P", colnames(ratio_list)[1:which(colnames(ratio_list) == "Mean_controls")]) - intensity_cols <- c(control_cols, patient_cols) - # calculate each of the ratios of intensities - for (ratio_index in 1:nrow(ratio_input)) { - ratio_numerator <- ratio_input[ratio_index, "HMDB_numerator"] - ratio_numerator <- strsplit(ratio_numerator, "plus")[[1]] - ratio_denominator <- ratio_input[ratio_index, "HMDB_denominator"] - ratio_denominator <- strsplit(ratio_denominator, "plus")[[1]] - # find these HMDB IDs in dataset. Could be a sum of multiple metabolites - sel_denominator <- sel_numerator <- c() - for (numerator_index in 1:length(ratio_numerator)) { - sel_numerator <- c(sel_numerator, which(dims_xls_copy[, "HMDB.code"] == ratio_numerator[numerator_index])) - } - for (denominator_index in 1:length(ratio_denominator)) { - # special case for sum of metabolites (dividing by one) - if (ratio_denominator[denominator_index] != "one") { - sel_denominator <- c(sel_denominator, which(dims_xls_copy[, "HMDB.code"] == ratio_denominator[denominator_index])) - } - } - # calculate ratio - if (ratio_denominator[denominator_index] != "one") { - ratio_list[ratio_index, intensity_cols] <- apply(dims_xls_copy[sel_numerator, intensity_cols], 2, sum) / - apply(dims_xls_copy[sel_denominator, intensity_cols], 2, sum) - } else { - # special case for sum of metabolites (dividing by one) - ratio_list[ratio_index, intensity_cols] <- apply(dims_xls_copy[sel_numerator, intensity_cols], 2, sum) - } - # calculate log of ratio - ratio_list[ratio_index, intensity_cols] <- log2(ratio_list[ratio_index, intensity_cols]) - } - - # Calculate means and SD's of the calculated ratios for Controls - ratio_list[, "Mean_controls"] <- apply(ratio_list[, control_cols], 1, mean) - ratio_list[, "SD_controls"] <- apply(ratio_list[, control_cols], 1, sd) - - # Calc z-scores with the means and SD's of Controls - zscore_cols <- grep("Zscore", colnames(ratio_list)) - for (sample_index in 1:length(zscore_cols)) { - zscore_col <- zscore_cols[sample_index] - # matching intensity column - int_col <- intensity_cols[sample_index] - # test on column names - if (check_same_samplename(colnames(ratio_list)[int_col], colnames(ratio_list)[zscore_col])) { - # calculate Z-scores - ratio_list[, zscore_col] <- (ratio_list[, int_col] - ratio_list[, "Mean_controls"]) / ratio_list[, "SD_controls"] - } - } - - # Add rows of the ratio hmdb codes to the data of zscores from the pipeline. - dims_xls_ratios <- rbind(ratio_list, dims_xls_copy) - - # Edit the DIMS output Zscores of all patients in format: - # HMDB_code patientname1 patientname2 - names(dims_xls_ratios) <- gsub("HMDB.code", "HMDB_code", names(dims_xls_ratios)) - names(dims_xls_ratios) <- gsub("HMDB.name", "HMDB_name", names(dims_xls_ratios)) - - # for debugging: - write.table(dims_xls_ratios, file = paste0(output_dir, "/ratios.txt"), sep = "\t") - - # Select only the cols with zscores of the patients - zscore_patients <- dims_xls_ratios[, c(1, 2, zscore_cols[grep("P", colnames(dims_xls_ratios)[zscore_cols])])] - # Select only the cols with zscores of the controls - zscore_controls <- dims_xls_ratios[, c(1, 2, zscore_cols[grep("C", colnames(dims_xls_ratios)[zscore_cols])])] - - } - - #### STEP 4: Run the IEM algorithm ######### - # in: algorithm, file_expected_biomarkers_iem, zscore_patients ||| out: prob_score (+file) - # algorithm taken from DOI: 10.3390/ijms21030979 - - if (algorithm == 1) { - # Load data - cat(paste0("\nloading expected file:\n -> ", file_expected_biomarkers_iem, "\n")) - expected_biomarkers <- read.csv(file_expected_biomarkers_iem, sep = ";", stringsAsFactors = FALSE) - # modify column names - names(expected_biomarkers) <- gsub("HMDB.code", "HMDB_code", names(expected_biomarkers)) - names(expected_biomarkers) <- gsub("Metabolite", "HMDB_name", names(expected_biomarkers)) - - # prepare dataframe scaffold rank_patients - rank_patients <- zscore_patients - # Fill df rank_patients with the ranks for each patient - for (patient_index in 3:ncol(zscore_patients)) { - # number of positive zscores in patient - pos <- sum(zscore_patients[, patient_index] > 0) - # sort the column on zscore; NB: this sorts the entire object, not just one column - rank_patients <- rank_patients[order(-rank_patients[patient_index]), ] - # Rank all positive zscores highest to lowest - rank_patients[1:pos, patient_index] <- as.numeric(ordered(-rank_patients[1:pos, patient_index])) - # Rank all negative zscores lowest to highest - rank_patients[(pos + 1):nrow(rank_patients), patient_index] <- as.numeric(ordered(rank_patients[(pos + 1): - nrow(rank_patients), patient_index])) - } - - # Calculate metabolite score, using the dataframes with only values, and later add the cols without values (1&2). - expected_zscores <- merge(x = expected_biomarkers, y = zscore_patients, by.x = c("HMDB_code"), by.y = c("HMDB_code")) - expected_zscores_original <- expected_zscores - - # determine which columns contain Z-scores and which contain disease info - select_zscore_cols <- grep("_Zscore", colnames(expected_zscores)) - select_info_cols <- 1:(min(select_zscore_cols) - 1) - # set some zscores to zero - select_incr_indisp <- which(expected_zscores$Change == "Increase" & expected_zscores$Dispensability == "Indispensable") - expected_zscores[select_incr_indisp, select_zscore_cols] <- lapply(expected_zscores[select_incr_indisp, - select_zscore_cols], function(x) ifelse (x <= 1.6, 0, x)) - select_decr_indisp <- which(expected_zscores$Change == "Decrease" & expected_zscores$Dispensability == "Indispensable") - expected_zscores[select_decr_indisp, select_zscore_cols] <- lapply(expected_zscores[select_decr_indisp, - select_zscore_cols], function(x) ifelse (x >= -1.2, 0, x)) - - # calculate rank score: - expected_ranks <- merge(x = expected_biomarkers, y = rank_patients, by.x = c("HMDB_code"), by.y = c("HMDB_code")) - rank_scores <- expected_zscores[order(expected_zscores$HMDB_code), select_zscore_cols] / - (expected_ranks[order(expected_ranks$HMDB_code), select_zscore_cols] * 0.9) - # combine disease info with rank scores - expected_metabscore <- cbind(expected_ranks[order(expected_zscores$HMDB_code), select_info_cols], rank_scores) - - # multiply weight score and rank score - weight_score <- expected_zscores - weight_score[, select_zscore_cols] <- expected_metabscore$Total_Weight * expected_metabscore[, select_zscore_cols] - - # sort table on Disease and Absolute_Weight - weight_score <- weight_score[order(weight_score$Disease, weight_score$Absolute_Weight, decreasing = TRUE), ] - - # select columns to check duplicates - dup <- weight_score[, c("Disease", "M.z")] - uni <- weight_score[!duplicated(dup) | !duplicated(dup, fromLast = FALSE), ] - - # calculate probability score - prob_score <- aggregate(uni[, select_zscore_cols], uni["Disease"], sum) - - # list of all diseases that have at least one metabolite Zscore at 0 - for (patient_index in 2:ncol(prob_score)) { - patient_zscore_colname <- colnames(prob_score)[patient_index] - matching_colname_expected <- which(colnames(expected_zscores) == patient_zscore_colname) - # determine which Zscores are 0 for this patient - zscores_zero <- which(expected_zscores[, matching_colname_expected] == 0) - # get Disease for these - disease_zero <- unique(expected_zscores[zscores_zero, "Disease"]) - # set the probability score of these diseases to 0 - prob_score[which(prob_score$Disease %in% disease_zero), patient_index] <- 0 - } - - # determine disease rank per patient - disease_rank <- prob_score - # rank diseases in decreasing order - disease_rank[2:ncol(disease_rank)] <- lapply(2:ncol(disease_rank), function(x) - as.numeric(ordered(-disease_rank[1:nrow(disease_rank), x]))) - # modify column names, Zscores have now been converted to probability scores - colnames(prob_score) <- gsub("_Zscore", "_prob_score", colnames(prob_score)) - colnames(disease_rank) <- gsub("_Zscore", "", colnames(disease_rank)) - - # Create conditional formatting for output Excel sheet. Colors according to values. - wb <- createWorkbook() - addWorksheet(wb, "Probability Scores") - writeData(wb, "Probability Scores", prob_score) - conditionalFormatting(wb, "Probability Scores", cols = 2:ncol(prob_score), rows = 1:nrow(prob_score), - type = "colourScale", style = c("white", "#FFFDA2", "red"), rule = c(1, 10, 100)) - saveWorkbook(wb, file = paste0(output_dir, "/dIEM_algoritme_output_", run_name, ".xlsx"), overwrite = TRUE) - # check whether prob_score df exists and has expected dimensions. - if (exists("expected_biomarkers") & (length(disease_rank) == length(prob_score))) { - cat("\n### Step 4 # Running the IEM algorithm is done.\n\n") - } else { - cat("\n**** Error: Could not run IEM algorithm. Check if path to expected_biomarkers csv-file is correct. \n") - } - - rm(wb) - } - - #### STEP 5: Make violin plots ##### - # in: algorithm / zscore_patients, violin, nr_contr, nr_pat, Data, path_textfiles, zscore_cutoff, xaxis_cutoff, - # top_diseases, top_metab, output_dir ||| out: pdf file, Helix csv file - - if (violin == 1) { - - # preparation - zscore_patients_copy <- zscore_patients - colnames(zscore_patients) <- gsub("_Zscore", "", colnames(zscore_patients)) - colnames(zscore_controls) <- gsub("_Zscore", "", colnames(zscore_controls)) - - # Make patient list for violin plots - patient_list <- names(zscore_patients)[-c(1, 2)] - - # from table expected_biomarkers, choose selected columns - select_columns <- c("Disease", "HMDB_code", "HMDB_name") - #select_col_nrs <- which(colnames(expected_biomarkers) %in% select_columns) - expected_biomarkers_select <- expected_biomarkers %>% select(all_of(select_columns)) - # remove duplicates - expected_biomarkers_select <- expected_biomarkers_select[!duplicated(expected_biomarkers_select[, c(1, 2)]), ] - - # load file with explanatory information to be included in PDF. - explanation <- readLines(file_explanation) - - # first step: normal violin plots - # Find all text files in the given folder, which contain metabolite lists of which - # each file will be a page in the pdf with violin plots. - # Make a PDF file for each of the categories in metabolite_dirs - metabolite_dirs <- list.files(path = path_metabolite_groups, full.names = FALSE, recursive = FALSE) - for (metabolite_dir in metabolite_dirs) { - # create a directory for the output PDFs - pdf_dir <- paste(output_dir, metabolite_dir, sep = "/") - dir.create(pdf_dir, showWarnings = FALSE) - cat("making plots in category:", metabolite_dir, "\n") - - # get a list of all metabolite files - metabolite_files <- list.files(path = paste(path_metabolite_groups, metabolite_dir, sep = "/"), - pattern = "*.txt", full.names = FALSE, recursive = FALSE) - # put all metabolites into one list - metab_list_all <- list() - metab_list_names <- c() - cat("making plots from the input files:") - # open the text files and add each to a list of dataframes (metab_list_all) - for (file_index in seq_along(metabolite_files)) { - infile <- metabolite_files[file_index] - metab_list <- read.table(paste(path_metabolite_groups, metabolite_dir, infile, sep = "/"), - sep = "\t", header = TRUE, quote = "") - # put into list of all lists - metab_list_all[[file_index]] <- metab_list - metab_list_names <- c(metab_list_names, strsplit(infile, ".txt")[[1]][1]) - cat(paste0("\n", infile)) - } - # include list of classes in metabolite list - names(metab_list_all) <- metab_list_names - - # prepare list of metabolites; max nr_plots_perpage on one page - metab_interest_sorted <- prepare_data(metab_list_all, zscore_patients) - metab_interest_controls <- prepare_data(metab_list_all, zscore_controls) - metab_perpage <- prepare_data_perpage(metab_interest_sorted, metab_interest_controls, nr_plots_perpage, nr_pat, nr_contr) - - # for Diagnostics metabolites to be saved in Helix - if(grepl("Diagnost", pdf_dir)) { - # get table that combines DIMS results with stofgroepen/Helix table - dims_helix_table <- get_patient_data_to_helix(metab_interest_sorted, metab_list_all) - - # check if run contains Diagnostics patients (e.g. "P2024M"), not for research runs - if(any(is_diagnostic_patient(dims_helix_table$Patient))){ - # get output file for Helix - output_helix <- output_for_helix(protocol_name, dims_helix_table) - # write output to file - path_helixfile <- paste0(output_dir, "/output_Helix_", run_name,".csv") - write.csv(output_helix, path_helixfile, quote = F, row.names = F) - } - } - - # make violin plots per patient - for (pt_nr in 1:length(patient_list)) { - pt_name <- patient_list[pt_nr] - # for category Diagnostics, make list of metabolites that exceed alarm values for this patient - # for category Other, make list of top highest and lowest Z-scores for this patient - if (grepl("Diagnost", pdf_dir)) { - top_metab_pt <- prepare_alarmvalues(pt_name, dims_helix_table) - } else { - top_metab_pt <- prepare_toplist(pt_name, zscore_patients) - } - - # generate normal violin plots - create_violin_plots(pdf_dir, pt_name, metab_perpage, top_metab_pt) - - } - - } - - # Second step: dIEM plots in separate directory - diem_plot_dir <- paste(output_dir, "dIEM_plots", sep = "/") - dir.create(diem_plot_dir) - - # Select the metabolites that are associated with the top highest scoring IEM, for each patient - # disease_rank is from step 4: the dIEM algorithm. The lower the value, the more likely. - for (pt_nr in 1:length(patient_list)) { - pt_name <- patient_list[pt_nr] - # get top diseases for this patient - pt_colnr <- which(colnames(disease_rank) == pt_name) - pt_top_indices <- which(disease_rank[, pt_colnr] <= top_nr_iem) - pt_iems <- disease_rank[pt_top_indices, "Disease"] - pt_top_iems <- pt_prob_score_top_iems <- c() - for (single_iem in pt_iems) { - # get the probability score - prob_score_iem <- prob_score[which(prob_score$Disease == single_iem), pt_colnr] - # use only diseases for which probability score is above threshold - if (prob_score_iem >= threshold_iem) { - pt_top_iems <- c(pt_top_iems, single_iem) - pt_prob_score_top_iems <- c(pt_prob_score_top_iems, prob_score_iem) - } - } - - # prepare data for plotting dIEM violin plots - # If prob_score_top_iem is an empty list, don't make a plot - if (length(pt_top_iems) > 0) { - # Sorting from high to low, both prob_score_top_iems and pt_top_iems. - pt_prob_score_order <- order(-pt_prob_score_top_iems) - pt_prob_score_top_iems <- round(pt_prob_score_top_iems, 1) - pt_prob_score_top_iem_sorted <- pt_prob_score_top_iems[pt_prob_score_order] - pt_top_iem_sorted <- pt_top_iems[pt_prob_score_order] - # getting metabolites for each top_iem disease exactly like in metab_list_all - metab_iem_all <- list() - metab_iem_names <- c() - for (single_iem_index in 1:length(pt_top_iem_sorted)) { - single_iem <- pt_top_iem_sorted[single_iem_index] - single_prob_score <- pt_prob_score_top_iem_sorted[single_iem_index] - select_rows <- which(expected_biomarkers_select$Disease == single_iem) - metab_list <- expected_biomarkers_select[select_rows, ] - metab_iem_names <- c(metab_iem_names, paste0(single_iem, ", probability score ", single_prob_score)) - metab_list <- metab_list[, -1] - metab_iem_all[[single_iem_index]] <- metab_list - } - # put all metabolites into one list - names(metab_iem_all) <- metab_iem_names - - # get Zscore information from zscore_patients_copy, similar to normal violin plots - metab_iem_sorted <- prepare_data(metab_iem_all, zscore_patients_copy) - metab_iem_controls <- prepare_data(metab_iem_all, zscore_controls) - # make sure every page has 20 metabolites - diem_metab_perpage <- prepare_data_perpage(metab_iem_sorted, metab_iem_controls, nr_plots_perpage, nr_pat) - # add table of metabolites with increased or decreased Z-scores - top_metab_pt <- prepare_toplist(pt_name, zscore_patients) - - # generate dIEM violin plots - create_violin_plots(diem_plot_dir, pt_name, diem_metab_perpage, top_metab_pt) - - } else { - cat(paste0("\n\n**** This patient had no prob_scores higher than ", threshold_iem, ". - Therefore, this pdf was not made:\t ", pt_name, "_iem \n")) - } - - } - - } +export_scripts_dir <- cmd_args[2] +path_metabolite_groups <- cmd_args[3] +file_ratios_metabolites <- cmd_args[4] +file_expected_biomarkers_iem <- cmd_args[5] +file_explanation <- cmd_args[6] + +# load functions +source(paste0(export_scripts_dir, "generate_violin_plots_functions.R")) +# load dataframe with intensities and Z-scores for all samples +intensities_zscore_df <- get(load("outlist.RData")) +rm(outlist) +# read input files +metabolites_ratios_df <- read.csv(file_ratios_metabolites, sep = ";", stringsAsFactors = FALSE) +expected_biomarkers_df <- read.csv(file_expected_biomarkers_iem, sep = ";", stringsAsFactors = FALSE) +expected_biomarkers_df <- expected_biomarkers_df %>% + rename( + HMDB_code = HMDB.code, + HMDB_name = Metabolite + ) +explanation_violin_plot <- readLines(file_explanation) + +# Set global variables +iem_variables <- list( + top_number_iem_diseases = 5, + threshold_iem = 5 +) +top_number_iem_diseases <- 5 # number of diseases that score highest in algorithm to plot +threshold_iem <- 5 # probability score cut-off for plotting the top diseases +nr_plots_perpage <- 20 # number of violin plots per page in PDF +zscore_cutoff <- 5 +protocol_name <- "DIMS_PL_DIAG" +number_of_metabolites <- list( + highest = 20, + lowest = 10 +) + +control_ids <- get_colnames_samples(intensities_zscore_df, "C") +patient_ids <- get_colnames_samples(intensities_zscore_df, "P") +all_sample_ids <- c(control_ids, patient_ids) +number_of_samples <- list( + controls = length(control_ids), + patients = length(patient_ids) +) + +# Add Z-scores for ratios to intensities_zscore_df dataframe +intensities_zscore_ratios_df <- add_zscores_ratios_to_df(intensities_zscore_df, metabolites_ratios_df, all_sample_ids) +# for debugging: +save(intensities_zscore_ratios_df, file = "./outlist_with_ratios.RData") + +# Select only the cols with zscores of the patients +zscore_patients_df <- intensities_zscore_ratios_df %>% + select(HMDB_code, HMDB_name, any_of(paste0(patient_ids, "_Zscore"))) %>% + rename_with(~ str_remove(.x, "_Zscore"), .cols = contains("_Zscore")) +zscore_controls_df <- intensities_zscore_ratios_df %>% + select(HMDB_code, HMDB_name, any_of(paste0(control_ids, "_Zscore"))) %>% + rename_with(~ str_remove(.x, "_Zscore"), .cols = contains("_Zscore")) + +#### Make violin plots ##### +make_and_save_violin_plot_pdfs( + zscore_patients_df, + zscore_controls_df, + path_metabolite_groups, + nr_plots_perpage, + number_of_samples, + run_name, + protocol_name, + explanation_violin_plot, + number_of_metabolites +) + +#### Run the IEM algorithm ######### +diem_probability_score <- run_diem_algorithm(expected_biomarkers_df, zscore_patients_df, patient_ids) + +save_prob_scores_to_excel(diem_probability_score, run_name) + +#### Generate dIEM plots ######### +patient_no_iem <- make_and_save_diem_plots( + diem_probability_score, + patient_ids, + expected_biomarkers_df, + zscore_patients_df, + zscore_controls_df, + nr_plots_perpage, + number_of_samples, + number_of_metabolites, + iem_variables, + explanation_violin_plot +) + +if (length(patient_no_iem) > 0) { + save_patient_no_iem(iem_variables$threshold_iem, patient_no_iem) } diff --git a/DIMS/GenerateViolinPlots.nf b/DIMS/GenerateViolinPlots.nf index 1c4b532..ec65a2e 100755 --- a/DIMS/GenerateViolinPlots.nf +++ b/DIMS/GenerateViolinPlots.nf @@ -18,11 +18,10 @@ process GenerateViolinPlots { script: """ - Rscript ${baseDir}/CustomModules/DIMS/GenerateViolinPlots.R $analysis_id $params.scripts_dir $params.zscore \ + Rscript ${baseDir}/CustomModules/DIMS/GenerateViolinPlots.R $analysis_id $params.export_scripts_dir \ $params.path_metabolite_groups \ $params.file_ratios_metabolites \ $params.file_expected_biomarkers_IEM \ - $params.file_explanation \ - $params.file_isomers + $params.file_explanation """ } diff --git a/DIMS/export/generate_violin_plots_functions.R b/DIMS/export/generate_violin_plots_functions.R new file mode 100644 index 0000000..378148e --- /dev/null +++ b/DIMS/export/generate_violin_plots_functions.R @@ -0,0 +1,976 @@ +#' Preparing the intensities and Z-score dataframe. +#' Certain columns are removed, the HMDB_code and HMDB_name column are moved forward, +#' the avg_ctrls and sd_ctrls columns are renamed and the column type of all columns containing numbers +#' is changed to numeric. +#' +#' @param intensities_zscore_df: dataframe with intensities, Z-scores and metabolite information for all samples +#' +#' @returns intensities_zscore_df: a dataframe containing intensities, Z-scores, HMDB IDs, HMDB names and +#' the mean and average of all controls +prepare_intensities_zscore_df <- function(intensities_zscore_df) { + intensities_zscore_df <- intensities_zscore_df %>% + select(-c( + plots, HMDB_name_all, HMDB_ID_all, sec_HMDB_ID, HMDB_key, sec_HMDB_ID_rlvnc, name, + relevance, descr, origin, fluids, tissue, disease, pathway, nr_ctrls + )) %>% + relocate(c(HMDB_code, HMDB_name)) %>% + rename(mean_controls = avg_ctrls, sd_controls = sd_ctrls) %>% + mutate(across(!c(HMDB_name, HMDB_code), as.numeric)) + return(intensities_zscore_df) +} + +#' Get all column names containing a specific prefix. +#' Find all column names containing a specific prefix, e.g. "P", and remove the _Zscore suffix from the names +#' +#' @param dataframe: dataframe containing multiple columns with Z-scores +#' @param sample_label: a string of a prefix to be searched in the column names, e.g. "P" or "C". +#' +#' @returns sample_colnames: a vector of column names all containing the prefix. +get_colnames_samples <- function(dataframe, sample_label) { + sample_colnames <- unique(gsub("_Zscore", "", grep(paste0("^", sample_label), colnames(dataframe), value = TRUE))) + return(sample_colnames) +} + +#' Add Zscores for multiple ratios to the dataframe +#' +#' @param outlist: dataframe containing intensities and Z-scores for all controls and patients +#' @param metabolites_ratios_df: dataframe containing numerators and denominators for all ratios +#' @param all_sample_ids: vector of sample IDS, controls and patients +#' +#' @returns intensities_zscore_ratios_df: dataframe containing intensities and Z-scores for all controls and patients +#' for all metabolites and ratios +add_zscores_ratios_to_df <- function(outlist, metabolites_ratios_df, all_sample_ids) { + intensities_zscores_df <- prepare_intensities_zscore_df(outlist) + + # calculate Z-scores for the ratios + zscore_ratios_df <- calculate_zscore_ratios(metabolites_ratios_df, intensities_zscores_df, all_sample_ids) + intensities_zscore_ratios_df <- rbind(intensities_zscores_df, zscore_ratios_df) + + return(intensities_zscore_ratios_df) +} + +#' Calculate Z-scores for ratios +#' +#' @param metabolites_ratios_df: dataframe containing numerators and denominators for all ratios +#' @param intensities_zscores_df: dataframe containing intensities and Z-scores for all controls and patients +#' @param intensity_col_names: vector of sample IDS, controls and patients +#' +#' @returns zscore_ratios_df: dataframe containing Z-scores for all ratios for all samples +calculate_zscore_ratios <- function(metabolites_ratios_df, intensities_zscores_df, intensity_col_names) { + zscore_ratios_df <- data.frame(matrix( + ncol = ncol(intensities_zscores_df), + nrow = nrow(metabolites_ratios_df) + )) + colnames(zscore_ratios_df) <- colnames(intensities_zscores_df) + + # put HMDB info into first two columns of ratio_zscore_df + zscore_ratios_df$HMDB_code <- metabolites_ratios_df$HMDB.code + zscore_ratios_df$HMDB_name <- metabolites_ratios_df$Ratio_name + + intensity_cols_index <- which(colnames(zscore_ratios_df) %in% intensity_col_names) + for (row_index in seq_len(nrow(metabolites_ratios_df))) { + # Get a list of intensities for the numerator + numerator_intensities <- get_intensities_fraction_side( + metabolites_ratios_df, + row_index, + intensities_zscores_df, + "HMDB_numerator", + intensity_col_names + ) + # Get a list of intensities for the denominator + denominator_intensities <- get_intensities_fraction_side( + metabolites_ratios_df, + row_index, + intensities_zscores_df, + "HMDB_denominator", + intensity_col_names + ) + # calculate the intensity ratio for each sample + zscore_ratios_df[row_index, intensity_cols_index] <- log2(numerator_intensities / denominator_intensities) + } + + control_intensities_cols_index <- grep("^C[^_]*$", colnames(intensities_zscores_df), perl = TRUE) + # Calculate means and SD's of the calculated ratios for Controls + zscore_ratios_df[, "mean_controls"] <- apply(zscore_ratios_df[, control_intensities_cols_index], 1, mean) + zscore_ratios_df[, "sd_controls"] <- apply(zscore_ratios_df[, control_intensities_cols_index], 1, sd) + + # Calculate Zscores for the ratios + samples_zscore_columns <- get_sample_ids_with_zscores(colnames(intensities_zscores_df), intensity_col_names) + intensity_ratios_df <- zscore_ratios_df[, intensity_col_names] + mean_ratios_controls <- zscore_ratios_df[, "mean_controls"] + sd_ratios_controls <- zscore_ratios_df[, "sd_controls"] + + zscore_ratios_df[, samples_zscore_columns] <- (intensity_ratios_df - mean_ratios_controls) / sd_ratios_controls + + return(zscore_ratios_df) +} + +#' Make and save violin plots for each patient in a PDF +#' +#' @param zscore_patients_df: dataframe with Z-scores for all patient samples +#' @param zscore_controls_df: dataframe with Z-scores for all control samples +#' @param path_metabolite_groups: string containing the path for the metabolite groups directories +#' @param nr_plots_perpage: integer containing the number of metabolites on a plot per page +#' @param number_of_samples: list containing the number of patient and control samples +#' @param run_name: string containing the run name +#' @param protocol_name: string containing the protocol name +#' @param explanation_violin_plot: vector of strings containing the explanation of the violin plots +#' @param number_of_metabolites: list containing the number of metabolites for the top and lowest table +make_and_save_violin_plot_pdfs <- function( + zscore_patients_df, + zscore_controls_df, + path_metabolite_groups, + nr_plots_perpage, + number_of_samples, + run_name, + protocol_name, + explanation_violin_plot, + number_of_metabolites) { + # Get all patient IDs + patient_col_names <- get_colnames_samples(zscore_patients_df, "P") + # get all files from metabolite_groups directory + metabolite_dirs <- list.files(path = path_metabolite_groups, full.names = FALSE, recursive = FALSE) + for (metabolite_dir in metabolite_dirs) { + # create a directory for the output PDFs + pdf_dir <- paste0("./", metabolite_dir) + dir.create(pdf_dir, showWarnings = FALSE) + + metab_list_all <- get_list_dataframes_from_dir(paste(path_metabolite_groups, metabolite_dir, sep = "/")) + metab_interest_patients <- merge_metabolite_info_zscores(metab_list_all, zscore_patients_df) + metab_interest_controls <- merge_metabolite_info_zscores(metab_list_all, zscore_controls_df) + metab_perpage <- get_data_per_metabolite_class( + metab_interest_patients, + metab_interest_controls, + nr_plots_perpage, + number_of_samples$patients, + number_of_samples$controls + ) + + # for Diagnostics metabolites to be saved in Helix + if (grepl("Diagnost", pdf_dir)) { + # get table that combines DIMS results with metabolite classes/Helix table + dims_helix_table <- prepare_helix_patient_data(metab_interest_patients, metab_list_all) + # check if run contains diagnostic patients (e.g. "P2024M") + if (any(is_diagnostic_patients(dims_helix_table$Sample))) { + # transform dataframe for Helix output + output_helix <- transform_metab_df_to_helix_df(protocol_name, dims_helix_table) + # save the DIMS Helix dataframe + path_helixfile <- paste0("./output_Helix_", run_name, ".csv") + write.csv(output_helix, path_helixfile, quote = FALSE, row.names = FALSE) + } + } + + # make violin plots per patient + for (patient_id in patient_col_names) { + if (grepl("Diagnost", pdf_dir)) { + # make list of metabolites that exceed alarm values for this patient + top_metabs_patient <- get_top_metabolites_df(patient_id, dims_helix_table) + } else { + # make list of top highest and lowest Z-scores for this patient + top_metabs_patient <- prepare_toplist( + patient_id, + zscore_patients_df, + number_of_metabolites$highest, + number_of_metabolites$lowest + ) + } + # generate normal violin plots + create_pdf_violin_plots(pdf_dir, patient_id, metab_perpage, top_metabs_patient, explanation_violin_plot) + } + } +} + +#' Get a list with dataframes for all off the metabolite group in a directory +#' +#' @param dir_with_subdirs: directory containing txt files with metabolites per group (string) +#' +#' @returns list_of_dataframes: list with dataframes with info on metabolites (list of dataframes) +get_list_dataframes_from_dir <- function(dir_with_subdirs) { + # get a list of all metabolite files + txt_files_paths <- list.files(dir_with_subdirs, pattern = "*.txt", recursive = FALSE, full.names = TRUE) + # put all metabolites into one list + list_of_dataframes <- lapply(txt_files_paths, + read.table, + sep = "\t", header = TRUE, quote = "" + ) + names(list_of_dataframes) <- gsub(".txt", "", basename(txt_files_paths)) + + return(list_of_dataframes) +} + +#' Merge patient Z-scores with metabolite info +#' +#' @param list_df_metabolite_groups: list of dataframes with metabolite information for different metabolite classes (list) +#' @param zscore_df: dataframe with metabolite Z-scores for all patient +#' +#' @return list_dfs_metabs_info_zscores: list of dataframes for each metabolite class +#' containing info and zscores for all samples +merge_metabolite_info_zscores <- function(list_df_metabolite_groups, zscore_df) { + # remove HMDB_name column and "_Zscore" from column (patient) names + zscore_df <- zscore_df %>% + select(-HMDB_name) + + # put data into pages, max 20 violin plots per page in PDF + list_dfs_metabs_info_zscores <- list() + + for (metabolite_class in names(list_df_metabolite_groups)) { + # select the metabolite_class dataframe and select the HMDB_code and HMDB_name columns + metabolite_info_df <- list_df_metabolite_groups[[metabolite_class]] %>% select(HMDB_code, HMDB_name) + + # Pad or truncate the HMDB names + metabolite_info_df <- pad_truncate_hmdb_names(metabolite_info_df, 45, " ") + + # Join metabolite info with the Z-score dataframe + metabolite_zscore_df <- metabolite_info_df %>% + inner_join(zscore_df, by = "HMDB_code") %>% + select(-HMDB_code) + + # put the data frame in long format + metabolite_zscore_df_long <- reshape2::melt( + metabolite_zscore_df, + id.vars = "HMDB_name", + variable.name = "Sample", + value.name = "Z_score" + ) + # Add the dataframe sorted on HMDB_name to a list + list_dfs_metabs_info_zscores[[metabolite_class]] <- metabolite_zscore_df_long + } + + return(list_dfs_metabs_info_zscores) +} + +#' Combine patient and control data for each page of the violinplot pdf +#' +#' @param metab_interest_patients: list of dataframes with data for each metabolite and patient (list) +#' @param metab_interest_controls: list of dataframes with data for each metabolite and control (list) +#' @param number_of_plots_per_page: number of plots per page in the violinplot pdf (integer) +#' @param number_of_patients: number of patients (integer) +#' @param number_of_controls: number of controls (integer) +#' +#' @return list_metabolite_df_per_page: list of dataframes with metabolite Z-scores for each patient and control, +#' the length of list is the number of pages for the violinplot pdf (list) +get_data_per_metabolite_class <- function( + metab_interest_patients, + metab_interest_controls, + number_of_plots_per_page, + number_of_patients, + number_of_controls) { + list_metabolite_df_per_page <- list() + metabolite_categories <- c() + + for (metabolite_class in names(metab_interest_patients)) { + # Get the data for patients and controls for the metab_interest_sorted list + metabolite_class_patients_df <- metab_interest_patients[[metabolite_class]] + metabolite_class_controls_df <- metab_interest_controls[[metabolite_class]] + + # Get all metabolites and create list with HMDB names of max nr_plots_perpage long + metabolites <- unique(metabolite_class_patients_df$HMDB_name) + metabolites_in_chunks <- split(metabolites, ceiling(seq_along(metabolites) / number_of_plots_per_page)) + number_of_chunks_metabolites <- length(metabolites_in_chunks) + + # Get a list of plot data per page + page_plot_data_list <- get_list_page_plot_data( + metabolites_in_chunks, + metabolite_class_patients_df, + metabolite_class_controls_df, + number_of_plots_per_page + ) + + # Add new items to main list + list_metabolite_df_per_page <- append(list_metabolite_df_per_page, page_plot_data_list) + # create list of page headers + metabolite_categories <- c(metabolite_categories, paste(metabolite_class, seq(number_of_chunks_metabolites), sep = "_")) + } + # add page headers to list + names(list_metabolite_df_per_page) <- metabolite_categories + + return(list_metabolite_df_per_page) +} + +#' Get patient data to be uploaded to Helix +#' +#' @param list_dfs_metab_classes_zscores: list of dataframes with metabolite Z-scores for each sample/patient (list) +#' @param list_metabolite_classes: list of tables with metabolites for Helix and violin plots (list) +#' +#' @return df_zscores_to_helix: dataframe with patient data with only metabolites for Helix and violin plots +#' with Helix name, high/low Z-score cutoffs +prepare_helix_patient_data <- function(list_dfs_metab_classes_zscores, list_metabolite_classes) { + # Combine Z-scores of metab groups together + metabolite_zscore_dataframe <- bind_rows(list_dfs_metab_classes_zscores) + + # Change the Sample column to characters, trim HMDB_name and split HMDB_name in new column + metabolite_zscore_dataframe <- metabolite_zscore_dataframe %>% + mutate( + Sample = as.character(Sample), + HMDB_name = str_trim(HMDB_name, "right"), + HMDB_name_split = str_split_fixed(HMDB_name, "nitine;", 2)[, 1] + ) + + # Combine metabolite classes + dims_helix_metabolite_df <- bind_rows(list_metabolite_classes) + + # Filter for Helix metabolites and split HMDB_name column for matching with metabolite_zscore_dataframe + dims_helix_metabolite_df <- dims_helix_metabolite_df %>% + filter(Helix == "ja") %>% + mutate(HMDB_name_split = str_split_fixed(HMDB_name, "nitine;", 2)[, 1]) %>% + select(HMDB_name_split, Helix_naam, high_zscore, low_zscore) + + # Filter DIMS results for metabolites for Helix and combine Helix info + df_zscores_to_helix <- metabolite_zscore_dataframe %>% + filter(HMDB_name_split %in% dims_helix_metabolite_df$HMDB_name_split) %>% + left_join(dims_helix_metabolite_df, by = join_by(HMDB_name_split)) %>% + select(HMDB_name, Sample, Z_score, Helix_naam, high_zscore, low_zscore) + + return(df_zscores_to_helix) +} + +#' Getting the intensities for calculating ratio Z-scores +#' Retrieving a vector of intensities for a particular fraction side of the ratios for all samples. +#' +#' @param ratios_metabs_df: dataframe with HMDB codes for the ratios (dataframe) +#' @param row_index: index of the row in the ratios_metabs_df (integer) +#' @param intensities_zscore_df: dataframe with intensities for each sample (dataframe) +#' @param fraction_side: either numerator or denominator, which side of the fraction (string) +#' @param intensity_cols: names of the columns that contain the intensities (string) +#' +#' @returns fraction_side_intensity: a vector of intensities (vector of integers) +get_intensities_fraction_side <- function(ratios_metabs_df, row_index, intensities_zscore_df, fraction_side, intensity_cols) { + # get the HMDB ID(s) for the given fraction side + fraction_side_hmdb_ids <- ratios_metabs_df[row_index, fraction_side] + if (grepl("plus", fraction_side_hmdb_ids)) { + # if fraction side contains "plus", split to get both HMDB IDs + fraction_side_hmdb_id_list <- strsplit(fraction_side_hmdb_ids, "plus")[[1]] + # get intensities for both HMDB IDs for all samples + fraction_side_intensity_list <- intensities_zscore_df %>% + filter(HMDB_code %in% fraction_side_hmdb_id_list) %>% + select(any_of(intensity_cols)) + # sum intensities to 1 intensity per samples + fraction_side_intensity <- apply(fraction_side_intensity_list, 2, sum) + } else if (fraction_side_hmdb_ids == "one") { + # set intensity to 1 + fraction_side_intensity <- 1 + } else { + # get intensities of the HMDB ID for all samples + fraction_side_intensity <- intensities_zscore_df %>% + filter(HMDB_code == fraction_side_hmdb_ids) %>% + select(any_of(intensity_cols)) + } + # vector of intensities for all samples + fraction_side_intensity <- as.numeric(fraction_side_intensity) + return(fraction_side_intensity) +} + +#' Get the sample IDs for columns that have Z-score and intensities +#' +#' @param colnames_zscore_cols: vector of sample IDs from the dataframe containing Z-scores (vector of strings) +#' @param colnames_intensity_cols: vector of sample IDs form the dataframe containing intensities (vector of strings) +#' +#' @returns colnames_intersect: vector of sample IDs that are in both input vectors, ending on "_Zscore" (vector of strings) +get_sample_ids_with_zscores <- function(colnames_zscore_cols, colnames_intensity_cols) { + colnames_intersect <- intersect( + paste0(colnames_intensity_cols, "_Zscore"), + grep("_Zscore", colnames_zscore_cols, value = TRUE) + ) + return(colnames_intersect) +} + +#' Pad or truncate HMDB names to a fixed width +#' Add spaces or remove HMDB name characters till the length of the name equals the 'width' +#' +#' @param metabolite_info_df: A dataframe containing a column `HMDB_name` (character). +#' @param width: Integer target width for the display names. Default is 45. +#' @param pad_character: Single character used for padding. Default is a space `" "`. +#' +#' @return metabolite_info_df: A dataframe where the HMDB names are transformed +pad_truncate_hmdb_names <- function(metabolite_info_df, width, pad_character) { + # Change the HMDB_name column so all names have 45 characters + # remove characters if name is longer and add "..." + # add empty spaces till 45 charachters if name is shorter + # keep the name if name is exactly 45 characters + keep_lenght <- width - 3 + metabolite_info_df <- metabolite_info_df %>% mutate(HMDB_name = case_when( + str_length(HMDB_name) > width ~ str_c(str_sub(HMDB_name, 1, keep_lenght), "..."), + str_length(HMDB_name) < width ~ str_pad(HMDB_name, width, side = "right", pad = pad_character), + TRUE ~ HMDB_name + )) + return(metabolite_info_df) +} + +#' Get a list of dataframes for each chunk +#' For each chunk, get a dataframe containing the metabolites in that chunk and add it to the list +#' +#' @param metabolites_in_chunks: list of vectors, each containing metabolites +#' @param metabolite_class_patients_df: dataframe of Z-scores for all patient +#' @param metabolite_class_controls_df: dataframe of Z-scores for all control +#' @param number_of_plots_per_page: integer containing the number of metabolites per plot per page +#' +#' @returns page_plot_data_list: a list of dataframes containing Z-scores +get_list_page_plot_data <- function( + metabolites_in_chunks, + metabolite_class_patients_df, + metabolite_class_controls_df, + number_of_plots_per_page) { + # For each chunk, get a dataframe containing the metabolites and add to a list + page_plot_data_list <- lapply(metabolites_in_chunks, function(metabolite_names_chunk) { + patients_df_chunk <- metabolite_class_patients_df %>% filter(HMDB_name %in% metabolite_names_chunk) + controls_df_chunk <- metabolite_class_controls_df %>% filter(HMDB_name %in% metabolite_names_chunk) + + # Combine both dataframes + patients_controls_df_chunk <- rbind(patients_df_chunk, controls_df_chunk) + metabolite_order <- make_metabolite_order(number_of_plots_per_page, metabolite_names_chunk) + + # Set the order of the metabolites for the violin plots + attr(patients_controls_df_chunk, "y_order") <- rev(metabolite_order) + + return(patients_controls_df_chunk) + }) +} + +#' Make the order of metabolites for the violin plots +#' Create the order of metabolites and add empty strings if the number of metabolites is lower than +#' the number of plots per page. +#' +#' @param number_of_plots_per_page: integer containing the number of metabolites per plot per page +#' @param metabolite_names_chunk: list of vectors, each containing metabolites +#' +#' @returns metabolite_order: a vector containing all metabolites and possibly empty strings +make_metabolite_order <- function(number_of_plots_per_page, metabolite_names_chunk) { + # Add empty dummy's to extend the number of metabs to the nr_plots_perpage + number_of_plots_missing <- number_of_plots_per_page - length(metabolite_names_chunk) + if (number_of_plots_missing > 0) { + dummy_names <- paste0(" ", strrep(" ", seq_len(number_of_plots_missing))) + metabolite_order <- c(metabolite_names_chunk, dummy_names) + } else { + metabolite_order <- metabolite_names_chunk + } + return(metabolite_order) +} + +#' Check for Diagnostics patients with correct patient number (e.g. starting with "P2024M") +#' +#' @param patient_column: a column from dataframe with IDs (character vector) +#' +#' @return: a logical vector with TRUE or FALSE for each element (vector) +is_diagnostic_patients <- function(patient_column) { + diagnostic_patients <- grepl("^P[0-9]{4}M", patient_column) + + return(diagnostic_patients) +} + +#' Get the output dataframe for Helix +#' +#' @param protocol_name: protocol name (string) +#' @param df_metabs_helix: dataframe with metabolite Z-scores for patients (dataframe) +#' +#' @return: dataframe with patient metabolite Z-scores in correct format for Helix +transform_metab_df_to_helix_df <- function(protocol_name, df_metabs_helix) { + # Remove positive controls + df_metabs_helix <- df_metabs_helix %>% filter(is_diagnostic_patients(Sample)) + + # Add 'Vial' column, each patient has unique ID + df_metabs_helix <- df_metabs_helix %>% + group_by(Sample) %>% + mutate(Vial = cur_group_id()) %>% + ungroup() + + # Split patient number into labnummer and Onderzoeksnummer + df_metabs_helix <- add_lab_id_and_onderzoeksnr(df_metabs_helix) + + # Add column with protocol name + df_metabs_helix$Protocol <- protocol_name + + # Change name Z_score and Helix_naam columns to Amount and Name + change_columns <- c(Amount = "Z_score", Name = "Helix_naam") + df_metabs_helix <- df_metabs_helix %>% rename(all_of(change_columns)) + + # Select only necessary columns and set them in correct order + df_metabs_helix <- df_metabs_helix %>% + select(c(Vial, labnummer, Onderzoeksnummer, Protocol, Name, Amount)) + + # Remove duplicate patient-metabolite combinations ("leucine + isoleucine + allo-isoleucin_Z-score" is added 3 times) + df_metabs_helix <- df_metabs_helix %>% + group_by(Onderzoeksnummer, Name) %>% + distinct() %>% + ungroup() + + return(df_metabs_helix) +} + +#' Adding labnummer and Onderzoeksnummer to a dataframe +#' +#' @param df_metabs_helix: dataframe with patient data to be uploaded to Helix +#' +#' @return: dataframe with added labnummer and Onderzoeksnummer columns +add_lab_id_and_onderzoeksnr <- function(df_metabs_helix) { + # Split patient number into labnummer and Onderzoeksnummer + for (row in seq_len(nrow(df_metabs_helix))) { + df_metabs_helix[row, "labnummer"] <- gsub("^P|\\.[0-9]*", "", df_metabs_helix[row, "Sample"]) + labnummer_split <- strsplit(as.character(df_metabs_helix[row, "labnummer"]), "M")[[1]] + df_metabs_helix[row, "Onderzoeksnummer"] <- paste0("MB", labnummer_split[1], "/", labnummer_split[2]) + } + return(df_metabs_helix) +} + +#' Create a dataframe with all metabolites that exceed the min and max Z-score cutoffs +#' +#' @param patient_name: patient code (string) +#' @param dims_helix_table: dataframe with metabolite Z-scores for each patient and Helix info (dataframe) +#' +#' @return: dataframe with metabolites that exceed the min and max Z-score cutoffs for the selected patient +get_top_metabolites_df <- function(patient_name, dims_helix_table) { + # extract data for patient of interest (patient_name) + patient_metabs_helix <- dims_helix_table %>% + filter(Sample == patient_name) %>% + mutate(Z_score = round(Z_score, 2)) + + patient_high_df <- patient_metabs_helix %>% filter(Z_score > high_zscore) + patient_low_df <- patient_metabs_helix %>% filter(Z_score < low_zscore) + + if (nrow(patient_high_df) > 0 || nrow(patient_low_df) > 0) { + # sort tables on zscore + patient_high_df <- patient_high_df %>% + arrange(desc(Z_score)) %>% + select(c(HMDB_name, Z_score)) + patient_low_df <- patient_low_df %>% + arrange(Z_score) %>% + select(c(HMDB_name, Z_score)) + } + # add lines for increased, decreased + line_increased <- c("Increased", "") + line_decreased <- c("Decreased", "") + + # combine the two lists + top_metab_patient <- rbind(line_increased, patient_high_df, line_decreased, patient_low_df) + + # remove row names + rownames(top_metab_patient) <- NULL + # change column names for display + colnames(top_metab_patient) <- c("Metabolite", "Z-score") + + return(top_metab_patient) +} + +#' Create a dataframe with the top 20 highest and top 10 lowest metabolites per patient +#' +#' @param pt_name: patient code (string) +#' @param zscore_patients: dataframe with metabolite Z-scores per patient (dataframe) +#' @param top_highest: the number of metabolites with the highest Z-score to display in the table (numeric) +#' @param top_lowest: the number of metabolites with the lowest Z-score to display in the table (numeric) +#' +#' @return: dataframe with 30 metabolites and Z-scores (dataframe) +prepare_toplist <- function(patient_id, zscore_patients, num_of_highest_metabolites, num_of_lowest_metabolites) { + patient_df <- zscore_patients %>% + select(HMDB_code, HMDB_name, !!sym(patient_id)) %>% + arrange(!!sym(patient_id)) + + # Get lowest Zscores + patient_df_low <- patient_df[1:num_of_lowest_metabolites, ] + patient_df_low <- patient_df_low %>% mutate(across(!!sym(patient_id), ~ round(.x, 2))) + + # Get highest Zscores + patient_df_high <- patient_df[nrow(patient_df):(nrow(patient_df) - num_of_highest_metabolites + 1), ] + patient_df_high <- patient_df_high %>% mutate(across(!!sym(patient_id), ~ round(.x, 2))) + + # add lines for increased, decreased + extra_line1 <- c("Increased", "", "") + extra_line2 <- c("Decreased", "", "") + top_metab_pt <- rbind(extra_line1, patient_df_high, extra_line2, patient_df_low) + # remove row names + rownames(top_metab_pt) <- NULL + + # change column names for display + colnames(top_metab_pt) <- c("HMDB_ID", "Metabolite", "Z-score") + + return(top_metab_pt) +} + +#' Create a pdf with table with metabolites and violin plots +#' +#' @param pdf_dir: location where to save the pdf file (string) +#' @param patient_id: patient id (string) +#' @param metab_perpage: list of dataframes, each dataframe contains data for a page in de pdf (list) +#' @param top_metab_pt: dataframe with increased and decreased metabolites for this patient (dataframe) +#' @param explanation: text that explains the violin plots and the pipeline version (string) +create_pdf_violin_plots <- function(pdf_dir, patient_id, metab_perpage, top_metab_pt, explanation) { + # set parameters for plots + plot_height <- 9.6 + plot_width <- 6 + + # get the names and numbers in the table aligned + table_theme <- ttheme_default( + core = list(fg_params = list(hjust = 0, x = 0.05, fontsize = 6)), + colhead = list(fg_params = list(fontsize = 8, fontface = "bold")) + ) + + # patient plots, create the PDF device + patient_id_sub <- patient_id + suffix <- "" + if (grepl("Diagnostics", pdf_dir) && is_diagnostic_patients(patient_id)) { + prefix <- "MB" + suffix <- "_DIMS_PL_DIAG" + # substitute P and M in P2020M00001 into right format for Helix + patient_id_sub <- gsub("[PM]", "", patient_id) + patient_id_sub <- gsub("\\..*", "", patient_id_sub) + } else if (grepl("Diagnostics", pdf_dir)) { + prefix <- "Dx_" + } else if (grepl("IEM", pdf_dir)) { + prefix <- "IEM_" + } else { + prefix <- "R_" + } + + pdf(paste0(pdf_dir, "/", prefix, patient_id_sub, suffix, ".pdf"), + onefile = TRUE, + width = plot_width, + height = plot_height + ) + + # page headers: + page_headers <- names(metab_perpage) + + # put table into PDF file, if not empty + if (!is.null(dim(top_metab_pt))) { + max_rows_per_page <- 35 + total_rows <- nrow(top_metab_pt) + number_of_pages <- ceiling(total_rows / max_rows_per_page) + + # get the names and numbers in the table aligned + table_theme <- ttheme_default( + core = list(fg_params = list(hjust = 0, x = 0.05, fontsize = 6)), + colhead = list(fg_params = list(fontsize = 8, fontface = "bold")) + ) + + for (page in seq(number_of_pages)) { + start_row <- (page - 1) * max_rows_per_page + 1 + end_row <- min(page * max_rows_per_page, total_rows) + page_data <- top_metab_pt[start_row:end_row, ] + + table_grob <- tableGrob(page_data, theme = table_theme, rows = NULL) + + grid.arrange( + table_grob, + top = paste0("Top deviating metabolites for patient: ", patient_id) + ) + } + } + + # violin plots + for (metab_class in names(metab_perpage)) { + # extract list of metabolites to plot on a page + metab_zscores_df <- metab_perpage[[metab_class]] + # extract original data for patient of interest (pt_name) before cut-offs + patient_zscore_df <- metab_zscores_df %>% filter(Sample == patient_id) + + # Remove patient column and change Z-score. If under -5 to -5 and if above 20 to 20. + metab_zscores_df <- metab_zscores_df %>% + filter(Sample != patient_id) %>% + mutate(Z_score = pmin(pmax(Z_score, -5), 20)) + + # subtitle per page + sub_perpage <- gsub("_", " ", metab_class) + # for IEM plots, put subtitle on two lines + sub_perpage <- gsub("probability", "\nprobability", sub_perpage) + + # draw violin plot. + ggplot_object <- create_violin_plot(metab_zscores_df, patient_zscore_df, sub_perpage, patient_id) + + suppressWarnings(print(ggplot_object)) + } + + # add explanation of violin plots, version number etc. + plot(NA, xlim = c(0, 5), ylim = c(0, 5), bty = "n", xaxt = "n", yaxt = "n", xlab = "", ylab = "") + if (length(explanation) > 0) { + text(0.2, 5, explanation[1], pos = 4, cex = 0.8) + for (line_index in 2:length(explanation)) { + text_y_position <- 5 - (line_index * 0.2) + text(-0.2, text_y_position, explanation[line_index], pos = 4, cex = 0.5) + } + } + + # close the PDF file + dev.off() +} + +#' Create violin plots +#' +#' @param metab_zscores_df: dataframe with Z-scores for all samples (dataframe) +#' @param patient_zscore_df: dataframe with Z-scores for the specified patient (dataframe) +#' @param sub_perpage: subtitle of the page (string) +#' @param patient_id: the patient id of the selected patient (string) +#' +#' @returns ggpplot_object: a violin plot of metabolites that highlights the selected patient (ggplot object) +create_violin_plot <- function(metab_zscores_df, patient_zscore_df, sub_perpage, patient_id) { + fontsize <- 1 + circlesize <- 0.8 + # Set colors for the violinplot: green, blue, blue/purple, purple, orange, red + colors_plot <- c("#22E4AC", "#00B0F0", "#504FFF", "#A704FD", "#F36265", "#DA0641") + + y_order <- attr(metab_zscores_df, "y_order") + metab_zscores_df$HMDB_name <- rev(factor(metab_zscores_df$HMDB_name, levels = rev(y_order))) + patient_zscore_df$HMDB_name <- rev(factor(patient_zscore_df$HMDB_name, levels = rev(y_order))) + + ggplot_object <- ggplot(metab_zscores_df, aes(x = Z_score, y = HMDB_name)) + + # Make violin plots + geom_violin(scale = "width", na.rm = TRUE) + + # Add Z-score for the selected patient, shape=22 gives square for patient of interest + geom_point( + data = patient_zscore_df, aes(color = Z_score), + size = 3.5 * circlesize, shape = 22, fill = "white", na.rm = TRUE + ) + + # Add the Z-score at the right side of the plot + geom_text( + data = patient_zscore_df, + aes(16, label = paste0("Z=", round(Z_score, 2))), + hjust = "left", vjust = +0.2, size = 3, na.rm = TRUE + ) + + # Set colour for the Z-score of the selected patient + scale_fill_gradientn( + colors = colors_plot, values = NULL, space = "Lab", + na.value = "grey50", guide = "colourbar", aesthetics = "colour" + ) + + # Add labels to the axis + labs(x = "Z-scores", y = "Metabolites", subtitle = sub_perpage, color = "z-score") + + # Add a title to the page + ggtitle(label = paste0("Results for patient ", patient_id)) + + # Set theme: size and font type of y-axis labels, remove legend and make the + theme( + axis.text.y = element_text(family = "Courier", size = 6), + legend.position = "none", + plot.caption = element_text(size = rel(fontsize)) + ) + + # Set y-axis to set order + scale_y_discrete(limits = y_order) + + # Limit the x-axis to between -5 and 20 + xlim(-5, 20) + + # Set grey vertical lines at -2 and 2 + geom_vline(xintercept = c(-2, 2), col = "grey", lwd = 0.5, lty = 2) + + + return(ggplot_object) +} + +#' Run the dIEM algorithm (DOI: 10.3390/ijms21030979) +#' +#' @param expected_biomarkers_df: dataframe with information for HMDB codes about IEMs (dataframe) +#' @param zscore_patients: dataframe containing Z-scores for patient (dataframe) +#' +#' @returns probability_score: a dataframe with probability scores for IEMs for each patient (dataframe) +run_diem_algorithm <- function(expected_biomarkers_df, zscore_patients_df, sample_cols) { + # Rank the metabolites for each patient individually + ranking_patients <- zscore_patients_df %>% + mutate(across(-c(HMDB_code, HMDB_name), rank_patient_zscores)) + + ranking_patients <- merge( + x = expected_biomarkers_df, y = ranking_patients, + by.x = c("HMDB_code"), by.y = c("HMDB_code") + ) + + zscore_expected_df <- merge( + x = expected_biomarkers_df, y = zscore_patients_df, + by.x = c("HMDB_code"), by.y = c("HMDB_code") + ) + + # Change Z-score to zero for specific cases + zscore_expected_df <- zscore_expected_df %>% mutate(across( + all_of(sample_cols), + ~ case_when( + Change == "Increase" & Dispensability == "Indispensable" & .x <= 1.6 ~ 0, + Change == "Decrease" & Dispensability == "Indispensable" & .x >= -1.2 ~ 0, + TRUE ~ .x + ) + )) + + # Sort both dataframes on HMDB_code for calculating the metabolite score + zscore_expected_df <- zscore_expected_df[order(zscore_expected_df$HMDB_code), ] + ranking_patients <- ranking_patients[order(ranking_patients$HMDB_code), ] + + # Set up dataframe for the metabolite score, copy zscore_expected_df for biomarker info + metabolite_score_info <- zscore_expected_df + # Calculate metabolite score: Z-score/(Rank * 0.9) + metabolite_score_info[sample_cols] <- zscore_expected_df[sample_cols] / (ranking_patients[sample_cols] * 0.9) + + # Calculate the weighted score: metabolite_score * Total_Weight + metabolite_weight_score <- metabolite_score_info %>% + mutate(across( + all_of(sample_cols), + ~ .x * Total_Weight + )) %>% + arrange(desc(Disease), desc(Absolute_Weight)) + + # Calculate the probability score for each disease - Mz combination + probability_score <- metabolite_weight_score %>% + filter(!duplicated(select(., Disease, M.z)) | + !duplicated(select(., Disease, M.z), fromLast = FALSE)) %>% + group_by(Disease) %>% + summarise(across(all_of(sample_cols), sum), .groups = "drop") + + # Set probability score to 0 for Z-scores == 0 + for (sample_col in sample_cols) { + # Get indexes of Zscore that equal 0 + zscores_zero_idx <- which(zscore_expected_df[[sample_col]] == 0) + # Get diseases that have a Zscore of 0 + diseases_zero <- unique(zscore_expected_df[zscores_zero_idx, "Disease"]) + # Set probabilty of these diseases to 0 + probability_score[probability_score$Disease %in% diseases_zero, sample_col] <- 0 + } + + colnames(probability_score) <- gsub("_Zscore", "_prob_score", colnames(probability_score)) + + return(probability_score) +} + +#' Ranking Z-scores for a patient, separate for positive and negative Z-scores +#' +#' @param zscore_col: vector with Z-scores for a single patient (vector of integers) +#' +#' @returns ranking: a vector of the ranking of the Z-scores (vector of integers) +rank_patient_zscores <- function(zscore_col) { + # Create ranking column with default NA values + ranking <- rep(NA_real_, length(zscore_col)) + + # Get indexes for negative and positive rows + neg_indexes <- which(zscore_col <= 0) + pos_indexes <- which(zscore_col > 0) + + # Rank the negative and positive Zscores + ranking[neg_indexes] <- dense_rank(zscore_col[neg_indexes]) + ranking[pos_indexes] <- dense_rank(-zscore_col[pos_indexes]) + + return(ranking) +} + +#' Save the probability score dataframe as an Excel file +#' +#' @param probability_score: a dataframe containing probability scores for each patient (dataframe) +#' @param run_name: name of the run, for the file name (string) +save_prob_scores_to_excel <- function(probability_score, run_name) { + # Create conditional formatting for output Excel sheet. Colors according to values. + wb <- createWorkbook() + addWorksheet(wb, "Probability Scores") + writeData(wb, "Probability Scores", probability_score) + conditionalFormatting(wb, "Probability Scores", + cols = 2:ncol(probability_score), rows = seq_len(nrow(probability_score)), + type = "colourScale", style = c("white", "#FFFDA2", "red"), rule = c(1, 10, 100) + ) + saveWorkbook(wb, file = paste0("./dIEM_algoritme_output_", run_name, ".xlsx"), overwrite = TRUE) + rm(wb) +} + +#' Make and save dIEM plots +#' +#' @param diem_probability_score: dataframe with dIEM probability scores +#' @param patient_col_names: vector containing all patient column names +#' @param expected_biomarkers_df: dataframe with information for HMDB codes about IEMs +#' @param zscore_patients_df: dataframe containing Z-scores for all patients +#' @param zscore_controls_df: dataframe containing Z-scores for all controls +#' @param nr_plots_perpage: integer containing the number of metabolites per page +#' @param number_of_samples: list containing the number of patients and controls +#' @param number_of_metabolites: list containing the number of metabolites for the top and lowest table +#' +#' @returns patient_no_iem: vector of patient IDs that have no IEMs +make_and_save_diem_plots <- function( + diem_probability_score, + patient_col_names, + expected_biomarkers_df, + zscore_patients_df, + zscore_controls_df, + nr_plots_perpage, + number_of_samples, + number_of_metabolites, + iem_variables, + explanation_violin_plot) { + diem_plot_dir <- paste("./dIEM_plots", sep = "/") + dir.create(diem_plot_dir) + + patient_no_iem <- c() + + for (patient_id in patient_col_names) { + # Select the top IEMs and filter on the IEM threshold + patient_top_iems_probs <- diem_probability_score %>% + select(c(Disease, !!sym(patient_id))) %>% + arrange(desc(!!sym(patient_id))) %>% + slice(1:iem_variables$top_number_iem_diseases) %>% + filter(!!sym(patient_id) >= iem_variables$threshold_iem) + + if (nrow(patient_top_iems_probs) > 0) { + list_metabolites_top_iems <- get_probabilities_top_iems(patient_top_iems_probs, expected_biomarkers_df, patient_id) + + # Get the Z-scores with metabolite information + metabolites_iem_sorted <- merge_metabolite_info_zscores(list_metabolites_top_iems, zscore_patients_df) + metabolites_iem_controls <- merge_metabolite_info_zscores(list_metabolites_top_iems, zscore_controls_df) + + # Get a list of dataframes for each IEM + diem_metabolites_perpage <- get_data_per_metabolite_class( + metabolites_iem_sorted, + metabolites_iem_controls, + nr_plots_perpage, + number_of_samples$patients, + number_of_samples$controls + ) + # Get a dataframe of the top metabolites + top_metabolites_patient <- prepare_toplist( + patient_id, + zscore_patients_df, + number_of_metabolites$highest, + number_of_metabolites$lowest + ) + + # Generate and save dIEM violin plots + create_pdf_violin_plots( + diem_plot_dir, + patient_id, + diem_metabolites_perpage, + top_metabolites_patient, + explanation_violin_plot + ) + } else { + patient_no_iem <- c(patient_no_iem, patient_id) + } + } + return(patient_no_iem) +} + +#' Get the IEM probabilities for a patient for all diseases +#' +#' @param patient_top_iems_probs: dataframe containing the probability scores for diseases for a patient +#' @param expected_biomarkers_df: dataframe with information for HMDB codes about IEMs +#' @param patient_id: string containing the patien ID +#' +#' @returns list_metabolites_iems: list of dataframes containing the HMDB codes and names for all diseases +get_probabilities_top_iems <- function(patient_top_iems_probs, expected_biomarkers_df, patient_id) { + # Get the metabolites for each IEM and their probability + list_metabolites_iems <- list() + metabolites_iems_names <- c() + + for (iem in patient_top_iems_probs$Disease) { + # get the IEM probabilities for the selected patient + iem_probablity <- patient_top_iems_probs %>% + filter(Disease == iem) %>% + pull(!!sym(patient_id)) + metabolites_iems_names <- c(metabolites_iems_names, paste0(iem, ", probability score ", iem_probablity)) + # get the HMDB codes and names for the IEM of the selected patient + metabolites_iem_df <- expected_biomarkers_df %>% + filter(Disease == iem) %>% + select(HMDB_code, HMDB_name) + # Add for each IEM the HMDB codes and names to a list + list_metabolites_iems[[iem]] <- metabolites_iem_df + } + names(list_metabolites_iems) <- metabolites_iems_names + + return(list_metabolites_iems) +} + +#' Save a list of patient IDs to a text file +#' +#' @param threshold_iem: integer containing the IEM threshold +#' @param patient_no_iem: vector containing patient IDs +save_patient_no_iem <- function(threshold_iem, patient_no_iem) { + patient_no_iem <- c( + paste0( + "The following patient(s) did not have dIEM probability scores higher than ", + threshold_iem, " :" + ), + patient_no_iem + ) + write(file = paste0("./missing_probability_scores.txt"), patient_no_iem) +} diff --git a/DIMS/tests/testthat.R b/DIMS/tests/testthat.R index 3f13a06..6e5a496 100644 --- a/DIMS/tests/testthat.R +++ b/DIMS/tests/testthat.R @@ -1,6 +1,8 @@ # Run all unit tests library(testthat) library(withr) +library(vdiffr) +library(pdftools) # enable snapshots local_edition(3) diff --git a/DIMS/tests/testthat/_snaps/generate_violin_plots.md b/DIMS/tests/testthat/_snaps/generate_violin_plots.md new file mode 100644 index 0000000..4930649 --- /dev/null +++ b/DIMS/tests/testthat/_snaps/generate_violin_plots.md @@ -0,0 +1,21 @@ +# create_pdf_violin_plots: Create a pdf with a table of top metabolites and violin plots + + Code + content_pdf_violinplots + Output + [1] "Top deviating metabolites for patient: P2025M1\n Metabolite Z.score\n Increased\n metab1 2.45\n Decreased\n metab11 −1.51\n" + [2] " Results for patient P2025M1\n test acyl carnitines\n metab1 Z=2.34\nMetabolites\n metab3 Z=0.31\n −5 0 5 10 15 20\n Z−scores\n" + [3] " Results for patient P2025M1\n test crea gua\n metab4 Z=−0.46\nMetabolites\n metab11 Z=0.84\n −5 0 5 10 15 20\n Z−scores\n" + [4] " Unit test Generate Violin Plots\nUnit test Generate Violin Plots\n" + +# save_prob_scores_to_excel: Saving the probability score dataframe as an Excel file + + Disease P2025M1 P2025M2 P2025M3 P2025M4 + 1 Disease A 10.900 -10.9 49.90 -49.9 + 2 Disease B 0.953 0.0 2.29 0.0 + 3 Disease C 12.100 0.0 0.00 12.1 + 4 Disease D 0.000 -12.5 0.00 18.2 + 5 Disease E 44.300 0.0 0.00 28.1 + 6 Disease F 0.000 -77.4 -77.40 0.0 + 7 Disease G -38.700 38.7 38.70 -38.7 + diff --git a/DIMS/tests/testthat/_snaps/generate_violin_plots/missing_probability_scores.txt b/DIMS/tests/testthat/_snaps/generate_violin_plots/missing_probability_scores.txt new file mode 100644 index 0000000..f7e3357 --- /dev/null +++ b/DIMS/tests/testthat/_snaps/generate_violin_plots/missing_probability_scores.txt @@ -0,0 +1,3 @@ +The following patient(s) did not have dIEM probability scores higher than 5 : +Patient1 +Patient2 diff --git a/DIMS/tests/testthat/_snaps/generate_violin_plots/violin-plot-p2025m1.svg b/DIMS/tests/testthat/_snaps/generate_violin_plots/violin-plot-p2025m1.svg new file mode 100644 index 0000000..fee3d1d --- /dev/null +++ b/DIMS/tests/testthat/_snaps/generate_violin_plots/violin-plot-p2025m1.svg @@ -0,0 +1,74 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +Z=0.31 +Z=2.34 + + + + +metab3 +metab1 + + + + + + + + +-5 +0 +5 +10 +15 +20 +Z-scores +Metabolites +test acyl carnitines +Results for patient P2025M1 + + diff --git a/DIMS/tests/testthat/fixtures/GenerateViolinPlots/test_metabolite_class_controls_df.txt b/DIMS/tests/testthat/fixtures/GenerateViolinPlots/test_metabolite_class_controls_df.txt new file mode 100644 index 0000000..9b04311 --- /dev/null +++ b/DIMS/tests/testthat/fixtures/GenerateViolinPlots/test_metabolite_class_controls_df.txt @@ -0,0 +1,28 @@ +HMDB_name Sample Z_score +HMDB001 C101.1 0.45 +HMDB002 C101.1 1.67 +HMDB003 C101.1 -1.86 +HMDB004 C101.1 0.58 +HMDB011 C101.1 2.47 +HMDB012 C101.1 -0.56 +HMDB000TT1 C101.1 -0.58 +HMDB000TT2 C101.1 0.48 +HMDB000TT3 C101.1 -0.45 +HMDB001 C102.1 2.89 +HMDB002 C102.1 0.79 +HMDB003 C102.1 -1.88 +HMDB004 C102.1 5.46 +HMDB011 C102.1 -0.68 +HMDB012 C102.1 1.65 +HMDB000TT1 C102.1 -0.71 +HMDB000TT2 C102.1 0.39 +HMDB000TT3 C102.1 -0.54 +HMDB001 C103.1 0.54 +HMDB002 C103.1 -0.85 +HMDB003 C103.1 1.58 +HMDB004 C103.1 3.84 +HMDB011 C103.1 0.84 +HMDB012 C103.1 -1.11 +HMDB000TT1 C103.1 -0.22 +HMDB000TT2 C103.1 0.38 +HMDB000TT3 C103.1 -0.48 diff --git a/DIMS/tests/testthat/fixtures/GenerateViolinPlots/test_metabolite_class_patients_df.txt b/DIMS/tests/testthat/fixtures/GenerateViolinPlots/test_metabolite_class_patients_df.txt new file mode 100644 index 0000000..63e79a6 --- /dev/null +++ b/DIMS/tests/testthat/fixtures/GenerateViolinPlots/test_metabolite_class_patients_df.txt @@ -0,0 +1,28 @@ +HMDB_name Sample Z_score +HMDB001 P2025M1 0.31 +HMDB002 P2025M1 1.84 +HMDB003 P2025M1 2.34 +HMDB004 P2025M1 0.84 +HMDB011 P2025M1 -0.46 +HMDB012 P2025M1 0.14 +HMDB000TT1 P2025M1 -0.58 +HMDB000TT2 P2025M1 0.48 +HMDB000TT3 P2025M1 -0.45 +HMDB001 P2025M2 2.45 +HMDB002 P2025M2 0.48 +HMDB003 P2025M2 1.45 +HMDB004 P2025M2 -0.15 +HMDB011 P2025M2 -1.51 +HMDB012 P2025M2 3.56 +HMDB000TT1 P2025M2 -0.71 +HMDB000TT2 P2025M2 0.39 +HMDB000TT3 P2025M2 -0.54 +HMDB001 P2025M3 2.14 +HMDB002 P2025M3 0.15 +HMDB003 P2025M3 -1.44 +HMDB004 P2025M3 -0.78 +HMDB011 P2025M3 1.68 +HMDB012 P2025M3 0.51 +HMDB000TT1 P2025M3 -0.22 +HMDB000TT2 P2025M3 0.38 +HMDB000TT3 P2025M3 -0.48 diff --git a/DIMS/tests/testthat/fixtures/GenerateViolinPlots/test_outlist_df.txt b/DIMS/tests/testthat/fixtures/GenerateViolinPlots/test_outlist_df.txt new file mode 100644 index 0000000..3441eba --- /dev/null +++ b/DIMS/tests/testthat/fixtures/GenerateViolinPlots/test_outlist_df.txt @@ -0,0 +1,7 @@ +"plots" "C101.1" "C102.1" "C103.1" "C104.1" "C105.1" "P2025M1" "P2025M2" "P2025M3" "P2025M4" "P2025M5" "HMDB_name" "HMDB_name_all" "HMDB_ID_all" "sec_HMDB_ID" "HMDB_key" "sec_HMDB_ID_rlvnc" "name" "relevance" "descr" "origin" "fluids" "tissue" "disease" "pathway" "HMDB_code" "avg_ctrls" "sd_ctrls" "nr_ctrls" "C101.1_Zscore" "C102.1_Zscore" "C103.1_Zscore" "C104.1_Zscore" "C105.1_Zscore" "P2025M1_Zscore" "P2025M2_Zscore" "P2025M3_Zscore" "P2025M4_Zscore" "P2025M5_Zscore" +"HMDB001" NA 1000 1100 1300 1650 180000 1000 1100 1300 1650 180000 "metab1" NA NA NA NA NA NA NA NA NA NA NA NA NA "HMDB001" 37010 79934.23 25 0.45 2.89 0.54 0.53 3.46 0.31 2.45 2.14 12.18 3.22 +"HMDB002" NA 1200 1700 750 925 1950 1200 1700 750 925 1950 "metab2" NA NA NA NA NA NA NA NA NA NA NA NA NA "HMDB002" 1305 508.8 26 1.67 0.79 -0.85 1.84 -1.31 1.84 0.48 0.15 2.48 0.48 +"HMDB003" NA 1300 925 1000 1600 750 1300 925 1000 1600 750 "metab3" NA NA NA NA NA NA NA NA NA NA NA NA NA "HMDB003" 1115 336.15 27 -1.86 -1.88 1.58 0.35 0.14 2.34 1.45 -1.44 -0.18 -3.18 +"HMDB004" NA 1400 1125 1220 1650 15050 1400 1125 1220 1650 15050 "metab4" NA NA NA NA NA NA NA NA NA NA NA NA NA "HMDB004" 4089 6130.65 28 0.58 5.46 3.84 -0.54 -0.15 0.84 -0.15 -0.78 0.84 0.47 +"HMDB011" NA 1500 1200 1100 1025 1100 1100 1050 975 1700 10000 "metab5" NA NA NA NA NA NA NA NA NA NA NA NA NA "HMDB011" 1185 186.75 29 2.47 -0.68 0.84 1.48 1.48 -0.46 -1.51 1.68 1.48 1.18 +"HMDB012" NA 1600 1050 1200 1150 1300 975 1175 1100 1750 1500 "metab6" NA NA NA NA NA NA NA NA NA NA NA NA NA "HMDB012" 1260 210.36 30 -0.56 1.65 -1.11 0.43 0.36 0.14 3.56 0.51 -2.45 2.14 diff --git a/DIMS/tests/testthat/fixtures/GenerateViolinPlots/test_zscore_controls_df.txt b/DIMS/tests/testthat/fixtures/GenerateViolinPlots/test_zscore_controls_df.txt new file mode 100644 index 0000000..b6bf5c1 --- /dev/null +++ b/DIMS/tests/testthat/fixtures/GenerateViolinPlots/test_zscore_controls_df.txt @@ -0,0 +1,10 @@ +HMDB_code HMDB_name C101.1 C102.1 C103.1 C104.1 C105.1 +HMDB001 metab1 0.45 2.89 0.54 0.53 3.46 +HMDB002 metab2 1.67 0.79 -0.85 1.84 -1.31 +HMDB004 metab4 0.58 5.46 3.84 -0.54 -0.15 +HMDB005 metab5 2.47 -0.68 0.84 1.48 1.48 +HMDB009 metab9 -1.86 -1.88 1.58 0.35 0.14 +HMDB012 metab12 -0.56 1.65 -1.11 0.43 0.36 +HMDB000TT1 Test_ratio1 -0.58 -0.71 -0.22 -0.21 1.74 +HMDB000TT2 Test_ratio2 0.48 0.39 0.38 0.51 -1.78 +HMDB000TT3 Test_ratio3 -0.45 -0.54 -0.48 -0.29 1.78 diff --git a/DIMS/tests/testthat/fixtures/GenerateViolinPlots/test_zscore_patients_df.txt b/DIMS/tests/testthat/fixtures/GenerateViolinPlots/test_zscore_patients_df.txt new file mode 100644 index 0000000..b457af0 --- /dev/null +++ b/DIMS/tests/testthat/fixtures/GenerateViolinPlots/test_zscore_patients_df.txt @@ -0,0 +1,10 @@ +HMDB_code HMDB_name P2025M1 P2025M2 P2025M3 P2025M4 P2025M5 +HMDB001 metab1 0.31 2.45 2.14 12.18 3.22 +HMDB002 metab2 1.84 0.48 0.15 2.48 0.48 +HMDB004 metab4 0.84 -0.15 -0.78 0.84 0.47 +HMDB005 metab5 -0.46 -1.51 1.68 1.48 1.18 +HMDB009 metab9 2.34 1.45 -1.44 -0.18 -3.18 +HMDB012 metab12 0.14 3.56 0.51 -2.45 2.14 +HMDB000TT1 Test_ratio1 -0.58 -0.71 -0.22 -0.21 1.74 +HMDB000TT2 Test_ratio2 0.48 0.39 0.38 0.51 -1.78 +HMDB000TT3 Test_ratio3 -0.45 -0.54 -0.48 -0.29 1.78 diff --git a/DIMS/tests/testthat/fixtures/make_test_data_GenerateViolinPlots.R b/DIMS/tests/testthat/fixtures/make_test_data_GenerateViolinPlots.R new file mode 100644 index 0000000..e435fa3 --- /dev/null +++ b/DIMS/tests/testthat/fixtures/make_test_data_GenerateViolinPlots.R @@ -0,0 +1,398 @@ +### Functions used to create mock dataframes used for unit testing of GenerateViolinPlots ### + +make_outlist_df <- function() { + test_outlist_df <- data.frame( + plots = NA, + C101.1 = c(1000, 1200, 1300, 1400, 1500, 1600), + C102.1 = c(1100, 1700, 925, 1125, 1200, 1050), + C103.1 = c(1300, 750, 1000, 1220, 1100, 1200), + C104.1 = c(1650, 925, 1600, 1650, 1025, 1150), + C105.1 = c(180000, 1950, 750, 15050, 1100, 1300), + P2025M1 = c(1000, 1200, 1300, 1400, 1100, 975), + P2025M2 = c(1100, 1700, 925, 1125, 1050, 1175), + P2025M3 = c(1300, 750, 1000, 1220, 975, 1100), + P2025M4 = c(1650, 925, 1600, 1650, 1700, 1750), + P2025M5 = c(180000, 1950, 750, 15050, 10000, 1500), + HMDB_name = c("metab1", "metab2", "metab3", "metab4", "metab5", "metab6"), + HMDB_name_all = NA, + HMDB_ID_all = NA, + sec_HMDB_ID = NA, + HMDB_key = NA, + sec_HMDB_ID_rlvnc = NA, + name = NA, + relevance = NA, + descr = NA, + origin = NA, + fluids = NA, + tissue = NA, + disease = NA, + pathway = NA, + HMDB_code = c("HMDB001", "HMDB002", "HMDB003", "HMDB004", "HMDB011", "HMDB012"), + avg_ctrls = c(37010, 1305, 1115, 4089, 1185, 1260), + sd_ctrls = c(79934.23, 508.80, 336.15, 6130.65, 186.75, 210.36), + nr_ctrls = c(25, 26, 27, 28, 29, 30), + C101.1_Zscore = c(0.45, 1.67, -1.86, 0.58, 2.47, -0.56), + C102.1_Zscore = c(2.89, 0.79, -1.88, 5.46, -0.68, 1.65), + C103.1_Zscore = c(0.54, -0.85, 1.58, 3.84, 0.84, -1.11), + C104.1_Zscore = c(0.53, 1.84, 0.35, -0.54, 1.48, 0.43), + C105.1_Zscore = c(3.46, -1.31, 0.14, -0.15, 1.48, 0.36), + P2025M1_Zscore = c(0.31, 1.84, 2.34, 0.84, -0.46, 0.14), + P2025M2_Zscore = c(2.45, 0.48, 1.45, -0.15, -1.51, 3.56), + P2025M3_Zscore = c(2.14, 0.15, -1.44, -0.78, 1.68, 0.51), + P2025M4_Zscore = c(12.18, 2.48, -0.18, 0.84, 1.48, -2.45), + P2025M5_Zscore = c(3.22, 0.48, -3.18, 0.47, 1.18, 2.14) + ) + rownames(test_outlist_df) <- c("HMDB001", "HMDB002", "HMDB003", "HMDB004", "HMDB011", "HMDB012") + + write.table(test_outlist_df, file = "tests/testthat/fixtures/GenerateViolinPlots/test_outlist_df.txt", sep = "\t") +} + +make_intensities_zscore_df <- function() { + test_intensities_zscore_df <- data.frame( + HMDB_code = c("HMDB001", "HMDB002", "HMDB003", "HMDB004", "HMDB011", "HMDB012"), + HMDB_name = c("metab1", "metab2", "metab3", "metab4", "metab5", "metab6"), + C101.1 = c(1000, 1200, 1300, 1400, 1500, 1600), + C102.1 = c(1100, 1700, 925, 1125, 1200, 1050), + C103.1 = c(1300, 750, 1000, 1220, 1100, 1200), + C104.1 = c(1650, 925, 1600, 1650, 1025, 1150), + C105.1 = c(180000, 1950, 750, 15050, 1100, 1300), + P2025M1 = c(1000, 1200, 1300, 1400, 1100, 975), + P2025M2 = c(1100, 1700, 925, 1125, 1050, 1175), + P2025M3 = c(1300, 750, 1000, 1220, 975, 1100), + P2025M4 = c(1650, 925, 1600, 1650, 1700, 1750), + P2025M5 = c(180000, 1950, 750, 15050, 10000, 1500), + mean_control = c(37010, 1305, 1115, 4089, 1185, 1260), + sd_control = c(79934.23, 508.80, 336.15, 6130.65, 186.75, 210.36), + C101.1_Zscore = c(0.45, 1.67, -1.86, 0.58, 2.47, -0.56), + C102.1_Zscore = c(2.89, 0.79, -1.88, 5.46, -0.68, 1.65), + C103.1_Zscore = c(0.54, -0.85, 1.58, 3.84, 0.84, -1.11), + C104.1_Zscore = c(0.53, 1.84, 0.35, -0.54, 1.48, 0.43), + C105.1_Zscore = c(3.46, -1.31, 0.14, -0.15, 1.48, 0.36), + P2025M1_Zscore = c(0.31, 1.84, 2.34, 0.84, -0.46, 0.14), + P2025M2_Zscore = c(2.45, 0.48, 1.45, -0.15, -1.51, 3.56), + P2025M3_Zscore = c(2.14, 0.15, -1.44, -0.78, 1.68, 0.51), + P2025M4_Zscore = c(12.18, 2.48, -0.18, 0.84, 1.48, -2.45), + P2025M5_Zscore = c(3.22, 0.48, -3.18, 0.47, 1.18, 2.14) + ) + rownames(test_intensities_zscore_df) <- c("HMDB001", "HMDB002", "HMDB003", "HMDB004", "HMDB011", "HMDB012") + + write.table(test_intensities_zscore_df, file = "tests/testthat/fixtures/test_intensities_zscore_df.txt", sep = "\t") +} + +make_files_metabolite_groups <- function() { + # Create new directories + dir.create("tests/testthat/fixtures/test_metabolite_groups") + + test_acyl_carnitines <- data.frame( + HMDB_code = c("HMDB001", "HMDB003", "HMDBAA1"), + HMDB_name = c("metab1", "metab3", "ratio1"), + Helix = c("ja", "nee", "ja"), + Helix_naam = c("Metab_1", "Metab_3", "Ratio_1"), + high_zscore = c(2, 2, 2), + low_zscore = c(-1.5, -1.5, -1.5) + ) + + write.table(test_acyl_carnitines, sep = "\t", quote = FALSE, row.names = FALSE, + file = "tests/testthat/fixtures/test_metabolite_groups/test_acyl_carnitines.txt") + + test_crea_gua <- data.frame( + HMDB_code = c("HMDB004", "HMDB011"), + HMDB_name = c("metab4", "metab11"), + Helix = c("ja", "ja"), + Helix_naam = c("Metab_4", "Metab_11"), + high_zscore = c(2, 2), + low_zscore = c(-1.5, -1.5) + ) + + write.table(test_crea_gua, sep = "\t", quote = FALSE, row.names = FALSE, + file = "tests/testthat/fixtures/test_metabolite_groups/test_crea_gua.txt") +} + +make_test_metab_interest_sort <- function() { + test_acyl_carnitines_patients <- data.frame( + HMDB_name = c( + "metab1 ", + "metab3 ", + "metab1 ", + "metab3 ", + "metab1 ", + "metab3 ", + "metab1 ", + "metab3 ", + "metab1 ", + "metab3 " + ), + Sample = c("P2025M1", "P2025M1", "P2025M2", "P2025M2", "P2025M3", + "P2025M3", "P2025M4", "P2025M4", "P2025M5", "P2025M5"), + Z_score = c(0.31, 2.34, 2.45, 1.45, 2.14, -1.44, 12.18, -0.18, 3.22, -3.18) + ) + + test_acyl_carnitines_controls <- data.frame( + HMDB_name = c( + "metab1 ", + "metab3 ", + "metab1 ", + "metab3 ", + "metab1 ", + "metab3 ", + "metab1 ", + "metab3 ", + "metab1 ", + "metab3 " + ), + Sample = c("C101.1", "C101.1", "C102.1", "C102.1", "C103.1", "C103.1", "C104.1", "C104.1", "C105.1", "C105.1"), + Z_score = c(0.45, -1.86, 2.89, -1.88, 0.54, 1.58, 0.53, 0.35, 3.46, 0.14) + ) + + write.table(test_acyl_carnitines_patients, sep = "\t", quote = FALSE, row.names = FALSE, + file = "tests/testthat/fixtures/test_acyl_carnitines_patients.txt") + write.table(test_acyl_carnitines_controls, sep = "\t", quote = FALSE, row.names = FALSE, + file = "tests/testthat/fixtures/test_acyl_carnitines_controls.txt") + + test_crea_gua_patients <- data.frame( + HMDB_name = c( + "metab4 ", + "metab11 ", + "metab4 ", + "metab11 ", + "metab4 ", + "metab11 ", + "metab4 ", + "metab11 ", + "metab4 ", + "metab11 " + ), + Sample = c("P2025M1", "P2025M1", "P2025M2", "P2025M2", "P2025M3", + "P2025M3", "P2025M4", "P2025M4", "P2025M5", "P2025M5"), + Z_score = c(0.84, -0.46, -0.15, -1.51, -0.78, 1.68, 0.84, 1.48, 0.47, 1.18) + ) + + test_crea_gua_controls <- data.frame( + HMDB_name = c( + "metab4 ", + "metab11 ", + "metab4 ", + "metab11 ", + "metab4 ", + "metab11 ", + "metab4 ", + "metab11 ", + "metab4 ", + "metab11 " + ), + Sample = c("C101.1", "C101.1", "C102.1", "C102.1", "C103.1", "C103.1", "C104.1", "C104.1", "C105.1", "C105.1"), + Z_score = c(0.58, 2.47, 5.46, -0.68, 3.84, 0.84, -0.54, 1.48, -0.15, 1.48) + ) + + write.table(test_crea_gua_patients, sep = "\t", quote = FALSE, row.names = FALSE, + file = "tests/testthat/fixtures/test_crea_gua_patients.txt") + write.table(test_crea_gua_controls, sep = "\t", quote = FALSE, row.names = FALSE, + file = "tests/testthat/fixtures/test_crea_gua_controls.txt") +} + +make_test_df_metabs_helix <- function() { + test_df_metabs_helix <- data.frame( + HMDB_name = c("metab1", "metab1", "metab1", "metab1", "metab1", "metab4", "metab11", "metab4", + "metab11", "metab4", "metab11", "metab4", "metab11", "metab4", "metab11"), + Sample = c("P2025M1", "P2025M2", "P2025M3", "P2025M4", "P2025M5", "P2025M1", "P2025M1", "P2025M2", "P2025M2", "P2025M3", + "P2025M3", "P2025M4", "P2025M4", "P2025M5", "P2025M5"), + Z_score = c(0.31, 2.45, 2.14, 12.18, 3.22, 0.84, -0.46, -0.15, -1.51, -0.78, 1.68, 0.84, 1.48, 0.47, 1.18), + Helix_naam = c("Metab_1", "Metab_1", "Metab_1", "Metab_1", "Metab_1", "Metab_4", "Metab_11", + "Metab_4", "Metab_11", "Metab_4", "Metab_11", "Metab_4", "Metab_11", "Metab_4", "Metab_11"), + high_zscore = rep(2, 15), + low_zscore = rep(-1.5, 15) + ) + + write.table(test_df_metabs_helix, sep = "\t", quote = FALSE, row.names = FALSE, + file = "tests/testthat/fixtures/test_df_metabs_helix.txt") +} + +make_test_zscore_patient_df <- function() { + test_zscore_patient_df <- data.frame( + HMDB_code = c("HMDB001", "HMDB002", "HMDB003", "HMDB004", "HMDB005", "HMDB006", "HMDB007", "HMDB008", "HMDB009", "HMDB010", + "HMDB011", "HMDB012", "HMDB013", "HMDB014", "HMDB015", "HMDB016", "HMDB017", "HMDB018", "HMDB019", "HMDB020", + "HMDB021", "HMDB022", "HMDB023", "HMDB024", "HMDB025", "HMDB026", "HMDB027", "HMDB028", "HMDB029", + "HMDB030"), + HMDB_name = c("metab1", "metab2", "metab3", "metab4", "metab5", "metab6", "metab7", "metab8", "metab9", "metab10", + "metab11", "metab12", "metab13", "metab14", "metab15", "metab16", "metab17", "metab18", "metab19", "metab20", + "metab21", "metab22", "metab23", "metab24", "metab25", "metab26", "metab27", "metab28", "metab29", + "metab30"), + P2025M1 = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30), + P2025M2 = c(-1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -11, -12, -13, -14, -15, + -16, -17, -18, -19, -20, -21, -22, -23, -24, -25, -26, -27, -28, -29, -30), + P2025M3 = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + -16, -17, -18, -19, -20, -21, -22, -23, -24, -25, -26, -27, -28, -29, -30), + P2025M4 = c(-1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -11, -12, -13, -14, -15, + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30) + ) + + write.table(test_zscore_patient_df, sep = "\t", quote = FALSE, row.names = FALSE, + file = "tests/testthat/fixtures/test_zscore_patient_df.txt") +} + +make_test_metab_perpage <- function() { + test_acyl_carnitines_df <- data.frame( + HMDB_name = c( + "metab1 ", + "metab3 ", + "metab1 ", + "metab3 ", + "metab1 ", + "metab3 ", + "metab1 ", + "metab3 ", + "metab1 ", + "metab3 ", + "metab1 ", + "metab3 ", + "metab1 ", + "metab3 ", + "metab1 ", + "metab3 ", + "metab1 ", + "metab3 ", + "metab1 ", + "metab3 " + ), + Sample = c("P2025M1", "P2025M1", "P2025M2", "P2025M2", "P2025M3", + "P2025M3", "P2025M4", "P2025M4", "P2025M5", "P2025M5", + "C101.1", "C101.1", "C102.1", "C102.1", "C103.1", "C103.1", + "C104.1", "C104.1", "C105.1", "C105.1"), + Z_score = c(0.31, 2.34, 2.45, 1.45, 2.14, -1.44, 12.18, -0.18, 3.22, -3.18, + 0.45, -1.86, 2.89, -1.88, 0.54, 1.58, 0.53, 0.35, 3.46, 0.14) + ) + + write.table(test_acyl_carnitines_df, sep = "\t", quote = FALSE, row.names = FALSE, + file = "tests/testthat/fixtures/test_acyl_carnitines_df.txt") + + test_crea_gua_df <- data.frame( + HMDB_name = c( + "metab4 ", + "metab11 ", + "metab4 ", + "metab11 ", + "metab4 ", + "metab11 ", + "metab4 ", + "metab11 ", + "metab4 ", + "metab11 ", + "metab4 ", + "metab11 ", + "metab4 ", + "metab11 ", + "metab4 ", + "metab11 ", + "metab4 ", + "metab11 ", + "metab4 ", + "metab11 " + ), + Sample = c("P2025M1", "P2025M1", "P2025M2", "P2025M2", "P2025M3", + "P2025M3", "P2025M4", "P2025M4", "P2025M5", "P2025M5", + "C101.1", "C101.1", "C102.1", "C102.1", "C103.1", "C103.1", + "C104.1", "C104.1", "C105.1", "C105.1"), + Z_score = c(0.84, -0.46, -0.15, -1.51, -0.78, 1.68, 0.84, 1.48, 0.47, 1.18, + 0.58, 2.47, 5.46, -0.68, 3.84, 0.84, -0.54, 1.48, -0.15, 1.48) + ) + + write.table(test_crea_gua_df, sep = "\t", quote = FALSE, row.names = FALSE, + file = "tests/testthat/fixtures/test_crea_gua_df.txt") +} + +make_test_expected_biomark_df <- function() { + test_expected_biomarkers_df <- data.frame( + HMDB_code = c("HMDB002", "HMDB002", "HMDB005", "HMDB005", "HMDB005", "HMDB009", "HMDB012", "HMDB012", "HMDB020", "HMDB020", + "HMDB025", "HMDB025", "HMDB025", "HMDB028", "HMDB028"), + HMDB_name = c("metab2", "metab2", "metab5", "metab5", "metab5", "metab9", "metab12", "metab12", "metab20", "metab20", + "metab25", "metab25", "metab25", "metab28", "metab28"), + Disease = c("Disease A", "Disease B", "Disease B", "Disease B", "Disease C", "Disease D", "Disease A", "Disease E", + "Disease F", "Disease C", "Disease F", "Disease G", "Disease D", "Disease G", "Disease E"), + M.z = c("1.2", "1.2", "2.5", "2.5", "2.5", "3.0", "4.5", "4.5", "2.5", "2.5", "5.1", "5.1", "5.1", "5.8", "5.8"), + Change = c("Increase", "Decrease", "Increase", "Increase", "Decrease", "Decrease", "Increase", "Increase", "Decrease", + "Increase", "Increase", "Decrease", "Increase", "Decrease", "Increase"), + Total_Weight = c(10.0, -1.5, 2.0, 5.0, -2.5, -3.0, 14.5, 4.0, + -7.5, 6.0, 20.0, -5.0, 3.0, -1.5, 4.0), + Absolute_Weight = c(10.0, 1.5, 2.0, 5.0, 2.5, 3.0, 14.5, 4.0, + 7.5, 6.0, 20.0, 5.0, 3.0, 1.5, 4.0), + Dispensability = c("Dispensable", "Dispensable", "Indispensable", "Dispensable", "Dispensable", "Indispensable", + "Dispensable", "Dispensable", "Indispensable", "Indispensable", "Dispensable", "Dispensable", + "Dispensable", "Dispensable", "Indispensable") + ) + + write.table(test_expected_biomarkers_df, sep = "\t", quote = FALSE, row.names = FALSE, + file = "tests/testthat/fixtures/test_expected_biomarkers_df.txt") +} + +make_test_probability_score_df <- function() { + test_probability_score_df <- data.frame( + Disease = c("Disease A", "Disease B", "Disease C", "Disease D", "Disease E", "Disease F", "Disease G"), + P2025M1 = c(10.9, 0.953, 12.1, 0, 44.3, 0, -38.7), + P2025M2 = c(-10.9, 0, 0, -12.5, 0, -77.4, 38.7), + P2025M3 = c(49.9, 2.29, 0, 0, 0, -77.4, 38.7), + P2025M4 = c(-49.9, 0, 12.1, 18.2, 28.1, 0, -38.7) + ) + + write.table(test_probability_score_df, sep = "\t", quote = FALSE, row.names = FALSE, + file = "tests/testthat/fixtures/test_probability_score_df.txt") +} + +make_zscore_dfs <- function() { + test_zscore_patients_df <- data.frame( + HMDB_code = c("HMDB001", "HMDB002", "HMDB003", "HMDB004", "HMDB011", "HMDB012", "HMDB000TT1", "HMDB000TT2", "HMDB000TT3"), + HMDB_name = c("metab1", "metab2", "metab3", "metab4", "metab5", "metab6", "Test_ratio1", "Test_ratio2", "Test_ratio3"), + P2025M1 = c(0.31, 1.84, 2.34, 0.84, -0.46, 0.14, -0.58, 0.48, -0.45), + P2025M2 = c(2.45, 0.48, 1.45, -0.15, -1.51, 3.56, -0.71, 0.39, -0.54), + P2025M3 = c(2.14, 0.15, -1.44, -0.78, 1.68, 0.51, -0.22, 0.38, -0.48), + P2025M4 = c(12.18, 2.48, -0.18, 0.84, 1.48, -2.45, -0.21, 0.51, -0.29), + P2025M5 = c(3.22, 0.48, -3.18, 0.47, 1.18, 2.14, 1.74, -1.78, 1.78) + ) + + test_zscore_controls_df <- data.frame( + HMDB_code = c("HMDB001", "HMDB002", "HMDB003", "HMDB004", "HMDB011", "HMDB012", "HMDB000TT1", "HMDB000TT2", "HMDB000TT3"), + HMDB_name = c("metab1", "metab2", "metab3", "metab4", "metab5", "metab6", "Test_ratio1", "Test_ratio2", "Test_ratio3"), + C101.1 = c(0.45, 1.67, -1.86, 0.58, 2.47, -0.56, -0.58, 0.48, -0.45), + C102.1 = c(2.89, 0.79, -1.88, 5.46, -0.68, 1.65, -0.71, 0.39, -0.54), + C103.1 = c(0.54, -0.85, 1.58, 3.84, 0.84, -1.11, -0.22, 0.38, -0.48), + C104.1 = c(0.53, 1.84, 0.35, -0.54, 1.48, 0.43, -0.21, 0.51, -0.29), + C105.1 = c(3.46, -1.31, 0.14, -0.15, 1.48, 0.36, 1.74, -1.78, 1.78) + ) + + write.table(test_zscore_patients_df, sep = "\t", quote = FALSE, row.names = FALSE, + file = "tests/testthat/fixtures/GenerateViolinPlots/test_zscore_patients_df.txt") + write.table(test_zscore_controls_df, sep = "\t", quote = FALSE, row.names = FALSE, + file = "tests/testthat/fixtures/GenerateViolinPlots/test_zscore_controls_df.txt") +} + +make_test_metabolite_class_dfs <- function() { + test_metabolite_class_patients_df <- data.frame( + HMDB_name = c("HMDB001", "HMDB002", "HMDB003", "HMDB004", "HMDB011", "HMDB012", "HMDB000TT1", "HMDB000TT2", "HMDB000TT3", + "HMDB001", "HMDB002", "HMDB003", "HMDB004", "HMDB011", "HMDB012", "HMDB000TT1", "HMDB000TT2", "HMDB000TT3", + "HMDB001", "HMDB002", "HMDB003", "HMDB004", "HMDB011", "HMDB012", "HMDB000TT1", "HMDB000TT2", "HMDB000TT3"), + Sample = c("P2025M1", "P2025M1", "P2025M1", "P2025M1", "P2025M1", "P2025M1", "P2025M1", "P2025M1", "P2025M1", + "P2025M2", "P2025M2", "P2025M2", "P2025M2", "P2025M2", "P2025M2", "P2025M2", "P2025M2", "P2025M2", + "P2025M3", "P2025M3", "P2025M3", "P2025M3", "P2025M3", "P2025M3", "P2025M3", "P2025M3", "P2025M3"), + Z_score = c(0.31, 1.84, 2.34, 0.84, -0.46, 0.14, -0.58, 0.48, -0.45, + 2.45, 0.48, 1.45, -0.15, -1.51, 3.56, -0.71, 0.39, -0.54, + 2.14, 0.15, -1.44, -0.78, 1.68, 0.51, -0.22, 0.38, -0.48) + ) + + test_metabolite_class_controls_df <- data.frame( + HMDB_name = c("HMDB001", "HMDB002", "HMDB003", "HMDB004", "HMDB011", "HMDB012", "HMDB000TT1", "HMDB000TT2", "HMDB000TT3", + "HMDB001", "HMDB002", "HMDB003", "HMDB004", "HMDB011", "HMDB012", "HMDB000TT1", "HMDB000TT2", "HMDB000TT3", + "HMDB001", "HMDB002", "HMDB003", "HMDB004", "HMDB011", "HMDB012", "HMDB000TT1", "HMDB000TT2", "HMDB000TT3"), + Sample = c("C101.1", "C101.1", "C101.1", "C101.1", "C101.1", "C101.1", "C101.1", "C101.1", "C101.1", + "C102.1", "C102.1", "C102.1", "C102.1", "C102.1", "C102.1", "C102.1", "C102.1", "C102.1", + "C103.1", "C103.1", "C103.1", "C103.1", "C103.1", "C103.1", "C103.1", "C103.1", "C103.1"), + Z_score = c(0.45, 1.67, -1.86, 0.58, 2.47, -0.56, -0.58, 0.48, -0.45, + 2.89, 0.79, -1.88, 5.46, -0.68, 1.65, -0.71, 0.39, -0.54, + 0.54, -0.85, 1.58, 3.84, 0.84, -1.11, -0.22, 0.38, -0.48) + ) + + write.table(test_metabolite_class_patients_df, sep = "\t", quote = FALSE, row.names = FALSE, + file = "tests/testthat/fixtures/GenerateViolinPlots/test_metabolite_class_patients_df.txt") + write.table(test_metabolite_class_controls_df, sep = "\t", quote = FALSE, row.names = FALSE, + file = "tests/testthat/fixtures/GenerateViolinPlots/test_metabolite_class_controls_df.txt") + +} diff --git a/DIMS/tests/testthat/fixtures/make_test_outlist_df.R b/DIMS/tests/testthat/fixtures/make_test_outlist_df.R index f5ee8b4..d1b451a 100644 --- a/DIMS/tests/testthat/fixtures/make_test_outlist_df.R +++ b/DIMS/tests/testthat/fixtures/make_test_outlist_df.R @@ -24,7 +24,7 @@ make_test_outlist_df <- function() { HMDB_ID_all = c("HMDB001;HMDB011", "HMDB002", "HMDB003;HMDB013", "HMDB004"), sec_HMDB_ID = c("HMDB1;HMDB11", "", "HMDB3;HMDB13", "HMDB4"), HMDB_key = c("HMDB001", "HMDB002", "HMDB003", "HMDB004"), - sec_HMDB_ID_rlvc = c(c("HMDB1 | HMDB11", "HMDB2", "HMDB3", "HMDB4")), + sec_HMDB_ID_rlvnc = c(c("HMDB1 | HMDB11", "HMDB2", "HMDB3", "HMDB4")), name = c("metab_1 | metab_11", "metab_2", "metab_3", "metab_4"), relevance = c("Endogenous, relevant", "Endogenous, relevant | Exogenous", "Endogenous, relevant", "Endogenous, relevant"), descr = c("descr1", "descr2", "descr3", "descr4"), diff --git a/DIMS/tests/testthat/fixtures/test_acyl_carnitines_controls.txt b/DIMS/tests/testthat/fixtures/test_acyl_carnitines_controls.txt new file mode 100644 index 0000000..acf7277 --- /dev/null +++ b/DIMS/tests/testthat/fixtures/test_acyl_carnitines_controls.txt @@ -0,0 +1,11 @@ +HMDB_name Sample Z_score +metab1 C101.1 0.45 +metab3 C101.1 -1.86 +metab1 C102.1 2.89 +metab3 C102.1 -1.88 +metab1 C103.1 0.54 +metab3 C103.1 1.58 +metab1 C104.1 0.53 +metab3 C104.1 0.35 +metab1 C105.1 3.46 +metab3 C105.1 0.14 diff --git a/DIMS/tests/testthat/fixtures/test_acyl_carnitines_df.txt b/DIMS/tests/testthat/fixtures/test_acyl_carnitines_df.txt new file mode 100644 index 0000000..988ce26 --- /dev/null +++ b/DIMS/tests/testthat/fixtures/test_acyl_carnitines_df.txt @@ -0,0 +1,21 @@ +HMDB_name Sample Z_score +metab1 P2025M1 0.31 +metab3 P2025M1 2.34 +metab1 P2025M2 2.45 +metab3 P2025M2 1.45 +metab1 P2025M3 2.14 +metab3 P2025M3 -1.44 +metab1 P2025M4 12.18 +metab3 P2025M4 -0.18 +metab1 P2025M5 3.22 +metab3 P2025M5 -3.18 +metab1 C101.1 0.45 +metab3 C101.1 -1.86 +metab1 C102.1 2.89 +metab3 C102.1 -1.88 +metab1 C103.1 0.54 +metab3 C103.1 1.58 +metab1 C104.1 0.53 +metab3 C104.1 0.35 +metab1 C105.1 3.46 +metab3 C105.1 0.14 diff --git a/DIMS/tests/testthat/fixtures/test_acyl_carnitines_patients.txt b/DIMS/tests/testthat/fixtures/test_acyl_carnitines_patients.txt new file mode 100644 index 0000000..0d9686e --- /dev/null +++ b/DIMS/tests/testthat/fixtures/test_acyl_carnitines_patients.txt @@ -0,0 +1,11 @@ +HMDB_name Sample Z_score +metab1 P2025M1 0.31 +metab3 P2025M1 2.34 +metab1 P2025M2 2.45 +metab3 P2025M2 1.45 +metab1 P2025M3 2.14 +metab3 P2025M3 -1.44 +metab1 P2025M4 12.18 +metab3 P2025M4 -0.18 +metab1 P2025M5 3.22 +metab3 P2025M5 -3.18 diff --git a/DIMS/tests/testthat/fixtures/test_crea_gua_controls.txt b/DIMS/tests/testthat/fixtures/test_crea_gua_controls.txt new file mode 100644 index 0000000..184dc09 --- /dev/null +++ b/DIMS/tests/testthat/fixtures/test_crea_gua_controls.txt @@ -0,0 +1,11 @@ +HMDB_name Sample Z_score +metab4 C101.1 0.58 +metab11 C101.1 2.47 +metab4 C102.1 5.46 +metab11 C102.1 -0.68 +metab4 C103.1 3.84 +metab11 C103.1 0.84 +metab4 C104.1 -0.54 +metab11 C104.1 1.48 +metab4 C105.1 -0.15 +metab11 C105.1 1.48 diff --git a/DIMS/tests/testthat/fixtures/test_crea_gua_df.txt b/DIMS/tests/testthat/fixtures/test_crea_gua_df.txt new file mode 100644 index 0000000..1219723 --- /dev/null +++ b/DIMS/tests/testthat/fixtures/test_crea_gua_df.txt @@ -0,0 +1,21 @@ +HMDB_name Sample Z_score +metab4 P2025M1 0.84 +metab11 P2025M1 -0.46 +metab4 P2025M2 -0.15 +metab11 P2025M2 -1.51 +metab4 P2025M3 -0.78 +metab11 P2025M3 1.68 +metab4 P2025M4 0.84 +metab11 P2025M4 1.48 +metab4 P2025M5 0.47 +metab11 P2025M5 1.18 +metab4 C101.1 0.58 +metab11 C101.1 2.47 +metab4 C102.1 5.46 +metab11 C102.1 -0.68 +metab4 C103.1 3.84 +metab11 C103.1 0.84 +metab4 C104.1 -0.54 +metab11 C104.1 1.48 +metab4 C105.1 -0.15 +metab11 C105.1 1.48 diff --git a/DIMS/tests/testthat/fixtures/test_crea_gua_patients.txt b/DIMS/tests/testthat/fixtures/test_crea_gua_patients.txt new file mode 100644 index 0000000..6630278 --- /dev/null +++ b/DIMS/tests/testthat/fixtures/test_crea_gua_patients.txt @@ -0,0 +1,11 @@ +HMDB_name Sample Z_score +metab4 P2025M1 0.84 +metab11 P2025M1 -0.46 +metab4 P2025M2 -0.15 +metab11 P2025M2 -1.51 +metab4 P2025M3 -0.78 +metab11 P2025M3 1.68 +metab4 P2025M4 0.84 +metab11 P2025M4 1.48 +metab4 P2025M5 0.47 +metab11 P2025M5 1.18 diff --git a/DIMS/tests/testthat/fixtures/test_df_metabs_helix.txt b/DIMS/tests/testthat/fixtures/test_df_metabs_helix.txt new file mode 100644 index 0000000..4a55714 --- /dev/null +++ b/DIMS/tests/testthat/fixtures/test_df_metabs_helix.txt @@ -0,0 +1,16 @@ +HMDB_name Sample Z_score Helix_naam high_zscore low_zscore +metab1 P2025M1 0.31 Metab_1 2 -1.5 +metab1 P2025M2 2.45 Metab_1 2 -1.5 +metab1 P2025M3 2.14 Metab_1 2 -1.5 +metab1 P2025M4 12.18 Metab_1 2 -1.5 +metab1 P2025M5 3.22 Metab_1 2 -1.5 +metab4 P2025M1 0.84 Metab_4 2 -1.5 +metab11 P2025M1 -0.46 Metab_11 2 -1.5 +metab4 P2025M2 -0.15 Metab_4 2 -1.5 +metab11 P2025M2 -1.51 Metab_11 2 -1.5 +metab4 P2025M3 -0.78 Metab_4 2 -1.5 +metab11 P2025M3 1.68 Metab_11 2 -1.5 +metab4 P2025M4 0.84 Metab_4 2 -1.5 +metab11 P2025M4 1.48 Metab_11 2 -1.5 +metab4 P2025M5 0.47 Metab_4 2 -1.5 +metab11 P2025M5 1.18 Metab_11 2 -1.5 diff --git a/DIMS/tests/testthat/fixtures/test_expected_biomarkers_df.txt b/DIMS/tests/testthat/fixtures/test_expected_biomarkers_df.txt new file mode 100644 index 0000000..4038e7f --- /dev/null +++ b/DIMS/tests/testthat/fixtures/test_expected_biomarkers_df.txt @@ -0,0 +1,16 @@ +HMDB_code HMDB_name Disease M.z Change Total_Weight Absolute_Weight Dispensability +HMDB002 metab2 Disease A 1.2 Increase 10 10 Dispensable +HMDB002 metab2 Disease B 1.2 Decrease -1.5 1.5 Dispensable +HMDB005 metab5 Disease B 2.5 Increase 2 2 Indispensable +HMDB005 metab5 Disease B 2.5 Increase 5 5 Dispensable +HMDB005 metab5 Disease C 2.5 Decrease -2.5 2.5 Dispensable +HMDB009 metab9 Disease D 3.0 Decrease -3 3 Indispensable +HMDB012 metab12 Disease A 4.5 Increase 14.5 14.5 Dispensable +HMDB012 metab12 Disease E 4.5 Increase 4 4 Dispensable +HMDB020 metab20 Disease F 2.5 Decrease -7.5 7.5 Indispensable +HMDB020 metab20 Disease C 2.5 Increase 6 6 Indispensable +HMDB025 metab25 Disease F 5.1 Increase 20 20 Dispensable +HMDB025 metab25 Disease G 5.1 Decrease -5 5 Dispensable +HMDB025 metab25 Disease D 5.1 Increase 3 3 Dispensable +HMDB028 metab28 Disease G 5.8 Decrease -1.5 1.5 Dispensable +HMDB028 metab28 Disease E 5.8 Increase 4 4 Indispensable diff --git a/DIMS/tests/testthat/fixtures/test_intensities_zscore_df.txt b/DIMS/tests/testthat/fixtures/test_intensities_zscore_df.txt new file mode 100644 index 0000000..90edb16 --- /dev/null +++ b/DIMS/tests/testthat/fixtures/test_intensities_zscore_df.txt @@ -0,0 +1,7 @@ +"HMDB_code" "HMDB_name" "C101.1" "C102.1" "C103.1" "C104.1" "C105.1" "P2025M1" "P2025M2" "P2025M3" "P2025M4" "P2025M5" "mean_control" "sd_control" "C101.1_Zscore" "C102.1_Zscore" "C103.1_Zscore" "C104.1_Zscore" "C105.1_Zscore" "P2025M1_Zscore" "P2025M2_Zscore" "P2025M3_Zscore" "P2025M4_Zscore" "P2025M5_Zscore" +"HMDB001" "HMDB001" "metab1" 1000 1100 1300 1650 180000 1000 1100 1300 1650 180000 37010 79934.23 0.45 2.89 0.54 0.53 3.46 0.31 2.45 2.14 12.18 3.22 +"HMDB002" "HMDB002" "metab2" 1200 1700 750 925 1950 1200 1700 750 925 1950 1305 508.8 1.67 0.79 -0.85 1.84 -1.31 1.84 0.48 0.15 2.48 0.48 +"HMDB003" "HMDB003" "metab3" 1300 925 1000 1600 750 1300 925 1000 1600 750 1115 336.15 -1.86 -1.88 1.58 0.35 0.14 2.34 1.45 -1.44 -0.18 -3.18 +"HMDB004" "HMDB004" "metab4" 1400 1125 1220 1650 15050 1400 1125 1220 1650 15050 4089 6130.65 0.58 5.46 3.84 -0.54 -0.15 0.84 -0.15 -0.78 0.84 0.47 +"HMDB011" "HMDB011" "metab5" 1500 1200 1100 1025 1100 1100 1050 975 1700 10000 1185 186.75 2.47 -0.68 0.84 1.48 1.48 -0.46 -1.51 1.68 1.48 1.18 +"HMDB012" "HMDB012" "metab6" 1600 1050 1200 1150 1300 975 1175 1100 1750 1500 1260 210.36 -0.56 1.65 -1.11 0.43 0.36 0.14 3.56 0.51 -2.45 2.14 diff --git a/DIMS/tests/testthat/fixtures/test_metabolite_groups/Diagnostics/test_acyl_carnitines.txt b/DIMS/tests/testthat/fixtures/test_metabolite_groups/Diagnostics/test_acyl_carnitines.txt new file mode 100644 index 0000000..51c9852 --- /dev/null +++ b/DIMS/tests/testthat/fixtures/test_metabolite_groups/Diagnostics/test_acyl_carnitines.txt @@ -0,0 +1,4 @@ +HMDB_code HMDB_name Helix Helix_naam high_zscore low_zscore +HMDB001 metab1 ja Metab_1 2 -1.5 +HMDB003 metab3 nee Metab_3 2 -1.5 +HMDB000TT1 ratio1 ja Ratio_1 2 -1.5 diff --git a/DIMS/tests/testthat/fixtures/test_metabolite_groups/Diagnostics/test_crea_gua.txt b/DIMS/tests/testthat/fixtures/test_metabolite_groups/Diagnostics/test_crea_gua.txt new file mode 100644 index 0000000..08dd589 --- /dev/null +++ b/DIMS/tests/testthat/fixtures/test_metabolite_groups/Diagnostics/test_crea_gua.txt @@ -0,0 +1,3 @@ +HMDB_code HMDB_name Helix Helix_naam high_zscore low_zscore +HMDB004 metab4 ja Metab_4 2 -1.5 +HMDB011 metab11 ja Metab_11 2 -1.5 diff --git a/DIMS/tests/testthat/fixtures/test_metabolite_groups/Other/test_other.txt b/DIMS/tests/testthat/fixtures/test_metabolite_groups/Other/test_other.txt new file mode 100644 index 0000000..7601321 --- /dev/null +++ b/DIMS/tests/testthat/fixtures/test_metabolite_groups/Other/test_other.txt @@ -0,0 +1,4 @@ +HMDB_code HMDB_name Helix Helix_naam high_zscore low_zscore +HMDB004 metab4 ja Metab_4 2 -1.5 +HMDB011 metab11 ja Metab_11 2 -1.5 +HMDB000TT1 ratio1 ja Ratio_1 2 -1.5 diff --git a/DIMS/tests/testthat/fixtures/test_outlist.txt b/DIMS/tests/testthat/fixtures/test_outlist.txt index 957adc6..43a638a 100644 --- a/DIMS/tests/testthat/fixtures/test_outlist.txt +++ b/DIMS/tests/testthat/fixtures/test_outlist.txt @@ -1,4 +1,4 @@ -"plots" "C101.1" "C102.1" "C103.1" "C104.1" "C105.1" "C106.1" "C107.1" "C108.1" "C109.1" "C110.1" "C111.1" "C112.1" "P2.1" "P3.1" "HMDB_name" "HMDB_name_all" "HMDB_ID_all" "sec_HMDB_ID" "HMDB_key" "sec_HMDB_ID_rlvc" "name" "relevance" "descr" "origin" "fluids" "tissue" "disease" "pathway" "HMDB_code" +"plots" "C101.1" "C102.1" "C103.1" "C104.1" "C105.1" "C106.1" "C107.1" "C108.1" "C109.1" "C110.1" "C111.1" "C112.1" "P2.1" "P3.1" "HMDB_name" "HMDB_name_all" "HMDB_ID_all" "sec_HMDB_ID" "HMDB_key" "sec_HMDB_ID_rlvnc" "name" "relevance" "descr" "origin" "fluids" "tissue" "disease" "pathway" "HMDB_code" "HMDB001" NA 1000 1100 1300 1650 180000 1050 1150 1350 1450 1200 1050 1250 3000 5750 "metab_1" "metab_1;metab_11" "HMDB001;HMDB011" "HMDB1;HMDB11" "HMDB001" "HMDB1 | HMDB11" "metab_1 | metab_11" "Endogenous, relevant" "descr1" "Endogenous" "Blood" "Muscle" "disease 1" "pathway1" "HMDB001" "HMDB002" NA 1200 1700 750 925 1950 1100 1250 850 1025 950 1125 975 12500 2750 "metab_2" "metab_2" "HMDB002" "" "HMDB002" "HMDB2" "metab_2" "Endogenous, relevant | Exogenous" "descr2" "Endogenous | Exogenous" "Blood" "" "disease2" "pathway2" "HMDB002" "HMDB003" NA 1300 925 1000 1600 750 1200 825 1175 1500 1750 1300 1450 5500 6750 "metab_3" "metab_3;metab_13" "HMDB003;HMDB013" "HMDB3;HMDB13" "HMDB003" "HMDB3" "metab_3" "Endogenous, relevant" "descr3" "Endogenous" "Blood" "Prostate" "" "pathway3" "HMDB003" diff --git a/DIMS/tests/testthat/fixtures/test_probability_score_df.txt b/DIMS/tests/testthat/fixtures/test_probability_score_df.txt new file mode 100644 index 0000000..385fe1d --- /dev/null +++ b/DIMS/tests/testthat/fixtures/test_probability_score_df.txt @@ -0,0 +1,8 @@ +Disease P2025M1 P2025M2 P2025M3 P2025M4 +Disease A 10.9 -10.9 49.9 -49.9 +Disease B 0.953 0 2.29 0 +Disease C 12.1 0 0 12.1 +Disease D 0 -12.5 0 18.2 +Disease E 44.3 0 0 28.1 +Disease F 0 -77.4 -77.4 0 +Disease G -38.7 38.7 38.7 -38.7 diff --git a/DIMS/tests/testthat/fixtures/test_zscore_patient_df.txt b/DIMS/tests/testthat/fixtures/test_zscore_patient_df.txt new file mode 100644 index 0000000..0de283a --- /dev/null +++ b/DIMS/tests/testthat/fixtures/test_zscore_patient_df.txt @@ -0,0 +1,31 @@ +HMDB_code HMDB_name P2025M1 P2025M2 P2025M3 P2025M4 +HMDB001 metab1 1 -1 1 -1 +HMDB002 metab2 2 -2 2 -2 +HMDB003 metab3 3 -3 3 -3 +HMDB004 metab4 4 -4 4 -4 +HMDB005 metab5 5 -5 5 -5 +HMDB006 metab6 6 -6 6 -6 +HMDB007 metab7 7 -7 7 -7 +HMDB008 metab8 8 -8 8 -8 +HMDB009 metab9 9 -9 9 -9 +HMDB010 metab10 10 -10 10 -10 +HMDB011 metab11 11 -11 11 -11 +HMDB012 metab12 12 -12 12 -12 +HMDB013 metab13 13 -13 13 -13 +HMDB014 metab14 14 -14 14 -14 +HMDB015 metab15 15 -15 15 -15 +HMDB016 metab16 16 -16 -16 16 +HMDB017 metab17 17 -17 -17 17 +HMDB018 metab18 18 -18 -18 18 +HMDB019 metab19 19 -19 -19 19 +HMDB020 metab20 20 -20 -20 20 +HMDB021 metab21 21 -21 -21 21 +HMDB022 metab22 22 -22 -22 22 +HMDB023 metab23 23 -23 -23 23 +HMDB024 metab24 24 -24 -24 24 +HMDB025 metab25 25 -25 -25 25 +HMDB026 metab26 26 -26 -26 26 +HMDB027 metab27 27 -27 -27 27 +HMDB028 metab28 28 -28 -28 28 +HMDB029 metab29 29 -29 -29 29 +HMDB030 metab30 30 -30 -30 30 diff --git a/DIMS/tests/testthat/test_evaluate_tics.R b/DIMS/tests/testthat/test_evaluate_tics.R index b474a81..2682c39 100644 --- a/DIMS/tests/testthat/test_evaluate_tics.R +++ b/DIMS/tests/testthat/test_evaluate_tics.R @@ -7,15 +7,15 @@ source("../../preprocessing/evaluate_tics_functions.R") testthat::test_that("TICS are correctly accepted or rejected", { # It's necessary to copy/symlink the files to the current location for the combine_sum_adducts_parts function # local: setwd("~/Development/DIMS_refactor_PeakFinding_codereview/CustomModules/DIMS/tests/testthat") - test_files <- list.files("fixtures/", "test_evaluate_tics", full.names = TRUE) + test_files <- list.files("fixtures", "test_evaluate_tics", full.names = TRUE) file.symlink(file.path(test_files), getwd()) - + # create replication pattern to test on: technical_replicates <- paste0("test_evaluate_tics_file", 1:3) test_repl_pattern <- list(technical_replicates) names(test_repl_pattern) <- "sample1" test_thresh2remove <- 10^9 - + # test that output has two entries expect_equal(length(find_bad_replicates(test_repl_pattern, test_thresh2remove)), 2) # test that first technical replicate is removed in positive scan mode @@ -24,11 +24,12 @@ testthat::test_that("TICS are correctly accepted or rejected", { expect_equal(find_bad_replicates(test_repl_pattern, test_thresh2remove)$neg, "test_evaluate_tics_file3", TRUE) # test that output files are generated expect_equal(sum(grepl("miss_infusions_", list.files("./"))), 2) - + # Remove symlinked files - files_remove <- list.files("./", "SummedAdducts_test.RData", full.names = TRUE) + files_remove_tics <- list.files("./", "test_evaluate_tics_file", full.names = TRUE) + files_remove_miss <- list.files("./", "miss_infusions_", full.names = TRUE) + files_remove <- c(files_remove_tics, files_remove_miss) file.remove(files_remove) - }) # test remove_from_repl_pattern @@ -39,7 +40,7 @@ testthat::test_that("technical replicates are correctly removed from replication names(test_repl_pattern) <- "sample1" test_bad_samples <- "test_evaluate_tics_file2" test_nr_replicates <- 3 - + # test that the output contains 1 sample expect_equal(length(remove_from_repl_pattern(test_bad_samples, test_repl_pattern, test_nr_replicates)), 1) # test that the output for the sample contains 2 technical replicates @@ -57,10 +58,9 @@ testthat::test_that("overview of technical replicates is correctly created", { test_scanmode <- "positive" # test that overview is correctly created - expect_equal(get_overview_tech_reps(test_repl_pattern_filtered, test_scanmode)[ ,1], "sample1") - expect_equal(get_overview_tech_reps(test_repl_pattern_filtered, test_scanmode)[ ,3], "positive") - expect_true(get_overview_tech_reps(test_repl_pattern_filtered, test_scanmode)[ ,2] == - paste0(technical_replicates, collapse = ";"), TRUE) - -}) + expect_equal(get_overview_tech_reps(test_repl_pattern_filtered, test_scanmode)[, 1], "sample1") + expect_equal(get_overview_tech_reps(test_repl_pattern_filtered, test_scanmode)[, 3], "positive") + expect_true(get_overview_tech_reps(test_repl_pattern_filtered, test_scanmode)[, 2] == + paste0(technical_replicates, collapse = ";"), TRUE) +}) diff --git a/DIMS/tests/testthat/test_generate_excel.R b/DIMS/tests/testthat/test_generate_excel.R index fda4096..e4a7ff3 100644 --- a/DIMS/tests/testthat/test_generate_excel.R +++ b/DIMS/tests/testthat/test_generate_excel.R @@ -42,188 +42,45 @@ testthat::test_that("calculate_zscores: Calculating Z-scores using different met perc <- 5 outlier_threshold <- 2 - expect_type( - calculate_zscores( - test_outlist, - "_Zscore", - control_intensities, - NULL, - intensity_col_ids, - startcol - ), - "list" - ) - - expect_identical( - colnames( - calculate_zscores( - test_outlist, - "_Zscore", - control_intensities, - NULL, - intensity_col_ids, - startcol - ) - ), - c( - "plots", "C101.1", "C102.1", "C103.1", "C104.1", "C105.1", "C106.1", "C107.1", "C108.1", "C109.1", "C110.1", "C111.1", - "C112.1", "P2.1", "P3.1", "HMDB_name", "HMDB_name_all", "HMDB_ID_all", "sec_HMDB_ID", "HMDB_key", "sec_HMDB_ID_rlvc", - "name", "relevance", "descr", "origin", "fluids", "tissue", "disease", "pathway", "HMDB_code", "avg_ctrls", "sd_ctrls", - "nr_ctrls", "C101.1_Zscore", "C102.1_Zscore", "C103.1_Zscore", "C104.1_Zscore", "C105.1_Zscore", "C106.1_Zscore", - "C107.1_Zscore", "C108.1_Zscore", "C109.1_Zscore", "C110.1_Zscore", "C111.1_Zscore", "C112.1_Zscore", "P2.1_Zscore", - "P3.1_Zscore" - ) - ) - expect_equal( - round( - calculate_zscores( - test_outlist, - "_Zscore", - control_intensities, - NULL, - intensity_col_ids, - startcol - )$avg_ctrls, 3 - ), - c(16129.167, 1150.0, 1231.250, 4015.833), - tolerance = 0.001 - ) - expect_equal( - calculate_zscores( - test_outlist, - "_Zscore", - control_intensities, - NULL, - intensity_col_ids, - startcol - )$P2.1_Zscore, - c(-0.2544103, 32.4586955, 13.6066674, 0.4037668), - tolerance = 0.001 - ) - - expect_type( - calculate_zscores( - test_outlist, - "_RobustZscore", - control_col_idx, - perc, - intensity_col_ids, - startcol - ), - "list" - ) - - expect_identical( - colnames( - calculate_zscores( - test_outlist, - "_RobustZscore", - control_col_idx, - perc, - intensity_col_ids, - startcol - ) - )[34:47], - c( - "C101.1_RobustZscore", "C102.1_RobustZscore", "C103.1_RobustZscore", "C104.1_RobustZscore", "C105.1_RobustZscore", - "C106.1_RobustZscore", "C107.1_RobustZscore", "C108.1_RobustZscore", "C109.1_RobustZscore", "C110.1_RobustZscore", - "C111.1_RobustZscore", "C112.1_RobustZscore", "P2.1_RobustZscore", "P3.1_RobustZscore" - ) - ) - - expect_equal( - calculate_zscores( - test_outlist, - "_RobustZscore", - control_col_idx, - perc, - intensity_col_ids, - startcol - )$avg_ctrls, - c(1255.0, 1110.0, 1227.5, 2811.5), - tolerance = 0.001 - ) - - expect_equal( - calculate_zscores( - test_outlist, - "_RobustZscore", - control_col_idx, - perc, - intensity_col_ids, - startcol - )$P2.1_RobustZscore, - c(9.1511750, 46.9804468, 16.8039663, 0.8565111), - tolerance = 0.001 - ) - - expect_type( - calculate_zscores( - test_outlist, - "_OutlierRemovedZscore", - control_col_idx, - outlier_threshold, - intensity_col_ids, - startcol - ), - "list" - ) - - expect_identical( - colnames( - calculate_zscores( - test_outlist, - "_OutlierRemovedZscore", - control_col_idx, - outlier_threshold, - intensity_col_ids, - startcol - ) - )[34:47], - c( - "C101.1_OutlierRemovedZscore", "C102.1_OutlierRemovedZscore", "C103.1_OutlierRemovedZscore", - "C104.1_OutlierRemovedZscore", "C105.1_OutlierRemovedZscore", "C106.1_OutlierRemovedZscore", - "C107.1_OutlierRemovedZscore", "C108.1_OutlierRemovedZscore", "C109.1_OutlierRemovedZscore", - "C110.1_OutlierRemovedZscore", "C111.1_OutlierRemovedZscore", "C112.1_OutlierRemovedZscore", - "P2.1_OutlierRemovedZscore", "P3.1_OutlierRemovedZscore" - ) - ) - - expect_equal( - calculate_zscores( - test_outlist, - "_OutlierRemovedZscore", - control_col_idx, - outlier_threshold, - intensity_col_ids, - startcol - )$avg_ctrls, - c(1231.818, 1077.273, 1231.250, 2649.091), - tolerance = 0.001 - ) - expect_equal( - calculate_zscores( - test_outlist, - "_OutlierRemovedZscore", - control_col_idx, - outlier_threshold, - intensity_col_ids, - startcol - )$nr_ctrls, - c(11, 11, 12, 11) - ) - expect_equal( - calculate_zscores( - test_outlist, - "_OutlierRemovedZscore", - control_col_idx, - outlier_threshold, - intensity_col_ids, - startcol - )$P2.1_OutlierRemovedZscore, - c(8.9955723, 44.9136860, 13.6066674, 0.9345077), - tolerance = 0.001 - ) + expect_type(calculate_zscores(test_outlist, "_Zscore", control_intensities, NULL, intensity_col_ids, startcol), "list") + expect_identical(colnames(calculate_zscores(test_outlist, "_Zscore", control_intensities, NULL, intensity_col_ids, startcol)), + c("plots", "C101.1", "C102.1", "C103.1", "C104.1", "C105.1", "C106.1", "C107.1", "C108.1", "C109.1", "C110.1", + "C111.1", "C112.1", "P2.1", "P3.1", "HMDB_name", "HMDB_name_all", "HMDB_ID_all", "sec_HMDB_ID", + "HMDB_key", "sec_HMDB_ID_rlvnc", "name", "relevance", "descr", "origin", "fluids", "tissue", "disease", + "pathway", "HMDB_code", "avg_ctrls", "sd_ctrls", "nr_ctrls", "C101.1_Zscore", "C102.1_Zscore", "C103.1_Zscore", + "C104.1_Zscore", "C105.1_Zscore", "C106.1_Zscore", "C107.1_Zscore", "C108.1_Zscore", "C109.1_Zscore", + "C110.1_Zscore", "C111.1_Zscore", "C112.1_Zscore", "P2.1_Zscore", "P3.1_Zscore")) + expect_equal(round(calculate_zscores(test_outlist, "_Zscore", control_intensities, NULL, intensity_col_ids, startcol)$avg_ctrls, 3), + c(16129.167, 1150.0, 1231.250, 4015.833), tolerance = 0.001) + expect_equal(calculate_zscores(test_outlist, "_Zscore", control_intensities, NULL, intensity_col_ids, startcol)$P2.1_Zscore, + c(-0.2544103, 32.4586955, 13.6066674, 0.4037668), tolerance = 0.001) + + expect_type(calculate_zscores(test_outlist, "_RobustZscore", control_col_idx, perc, intensity_col_ids, startcol), "list") + expect_identical(colnames(calculate_zscores(test_outlist, "_RobustZscore", control_col_idx, perc, intensity_col_ids, startcol))[34:47], + c("C101.1_RobustZscore", "C102.1_RobustZscore", + "C103.1_RobustZscore", "C104.1_RobustZscore", "C105.1_RobustZscore", "C106.1_RobustZscore", + "C107.1_RobustZscore", "C108.1_RobustZscore", "C109.1_RobustZscore", "C110.1_RobustZscore", + "C111.1_RobustZscore", "C112.1_RobustZscore", "P2.1_RobustZscore", "P3.1_RobustZscore")) + expect_equal(calculate_zscores(test_outlist, "_RobustZscore", control_col_idx, perc, intensity_col_ids, startcol)$avg_ctrls, + c(1255.0, 1110.0, 1227.5, 2811.5), tolerance = 0.001) + expect_equal(calculate_zscores(test_outlist, "_RobustZscore", control_col_idx, perc, intensity_col_ids, startcol)$P2.1_RobustZscore, + c(9.1511750, 46.9804468, 16.8039663, 0.8565111), tolerance = 0.001) + + expect_type(calculate_zscores(test_outlist, "_OutlierRemovedZscore", control_col_idx, outlier_threshold, intensity_col_ids, startcol), "list") + expect_identical(colnames(calculate_zscores(test_outlist, "_OutlierRemovedZscore", control_col_idx, outlier_threshold, intensity_col_ids, startcol))[34:47], + c("C101.1_OutlierRemovedZscore", + "C102.1_OutlierRemovedZscore", "C103.1_OutlierRemovedZscore", "C104.1_OutlierRemovedZscore", + "C105.1_OutlierRemovedZscore", "C106.1_OutlierRemovedZscore", "C107.1_OutlierRemovedZscore", + "C108.1_OutlierRemovedZscore", "C109.1_OutlierRemovedZscore", "C110.1_OutlierRemovedZscore", + "C111.1_OutlierRemovedZscore", "C112.1_OutlierRemovedZscore", "P2.1_OutlierRemovedZscore", + "P3.1_OutlierRemovedZscore") + ) + expect_equal(calculate_zscores(test_outlist, "_OutlierRemovedZscore", control_col_idx, outlier_threshold, intensity_col_ids, startcol)$avg_ctrls, + c(1231.818, 1077.273, 1231.250, 2649.091), tolerance = 0.001) + expect_equal(calculate_zscores(test_outlist, "_OutlierRemovedZscore", control_col_idx, outlier_threshold, intensity_col_ids, startcol)$nr_ctrls, + c(11, 11, 12, 11)) + expect_equal(calculate_zscores(test_outlist, "_OutlierRemovedZscore", control_col_idx, outlier_threshold, intensity_col_ids, startcol)$P2.1_OutlierRemovedZscore, + c(8.9955723, 44.9136860, 13.6066674, 0.9345077), tolerance = 0.001) }) testthat::test_that("robust_scaler: Use robust scaler", { diff --git a/DIMS/tests/testthat/test_generate_violin_plots.R b/DIMS/tests/testthat/test_generate_violin_plots.R new file mode 100644 index 0000000..fd48cbe --- /dev/null +++ b/DIMS/tests/testthat/test_generate_violin_plots.R @@ -0,0 +1,955 @@ +# unit tests for GenerateViolinPlots + +suppressPackageStartupMessages(library("dplyr")) +library(reshape2) +library(openxlsx) +library(ggplot2) +suppressPackageStartupMessages(library("gridExtra")) +library(stringr) + +source("../../export/generate_violin_plots_functions.R") + +testthat::test_that("get_intensities_fraction_side: Get intensities for calculating the ratios", { + test_intensities_zscore_df <- read.delim(test_path("fixtures", "test_intensities_zscore_df.txt")) + test_intensity_cols <- c( + "C101.1", "C102.1", "C103.1", "C104.1", "C105.1", + "P2025M1", "P2025M2", "P2025M3", "P2025M4", "P2025M5" + ) + test_ratios_metabs_df <- data.frame( + HMDB.code = c("HMDBAA1", "HMDBAA2", "HMDBAB1"), + Ratio_name = c("ratio1", "ratio2", "ratio3"), + HMDB_numerator = c("HMDB001", "HMDB002plusHMDB003", "HMDB004"), + HMDB_denominator = c("HMDB011", "HMDB012", "one") + ) + expect_equal( + get_intensities_fraction_side( + test_ratios_metabs_df, + 2, + test_intensities_zscore_df, + "HMDB_numerator", + test_intensity_cols + ), + c(2500, 2625, 1750, 2525, 2700, 2500, 2625, 1750, 2525, 2700) + ) + expect_equal( + get_intensities_fraction_side( + test_ratios_metabs_df, + 3, + test_intensities_zscore_df, + "HMDB_denominator", + test_intensity_cols + ), + 1 + ) +}) + +testthat::test_that("get_sample_ids_with_zscores: Get samples that have both intensity and Z-score columns", { + test_intensities_zscore_df <- read.delim(test_path("fixtures", "test_intensities_zscore_df.txt")) + + test_intensity_cols <- c( + "C101.1", "C102.1", "C103.1", "C104.1", "C105.1", + "P2025M1", "P2025M2", "P2025M3", "P2025M4", "P2025M5" + ) + + expect_equal( + length(get_sample_ids_with_zscores(colnames(test_intensities_zscore_df), test_intensity_cols)), + 10 + ) + expect_equal( + get_sample_ids_with_zscores(colnames(test_intensities_zscore_df), test_intensity_cols), + c( + "C101.1_Zscore", "C102.1_Zscore", "C103.1_Zscore", "C104.1_Zscore", "C105.1_Zscore", + "P2025M1_Zscore", "P2025M2_Zscore", "P2025M3_Zscore", "P2025M4_Zscore", "P2025M5_Zscore" + ) + ) + + test_intensity_cols <- c("C101.1", "C102.1", "C103.1", "C104.1", "C105.1", "P2025M1", "P2025M2", "P2025M3") + expect_equal( + length(get_sample_ids_with_zscores(colnames(test_intensities_zscore_df), test_intensity_cols)), + 8 + ) + expect_equal( + get_sample_ids_with_zscores(colnames(test_intensities_zscore_df), test_intensity_cols), + c( + "C101.1_Zscore", "C102.1_Zscore", "C103.1_Zscore", "C104.1_Zscore", "C105.1_Zscore", + "P2025M1_Zscore", "P2025M2_Zscore", "P2025M3_Zscore" + ) + ) +}) + +testthat::test_that("get_list_dataframes_from_dir: Get a list with dataframes from metabilite files in a directory", { + test_path_metabolite_groups <- test_path("fixtures/test_metabolite_groups/Diagnostics") + + expect_type(get_list_dataframes_from_dir(test_path_metabolite_groups), "list") + expect_identical( + names(get_list_dataframes_from_dir(test_path_metabolite_groups)), + c("test_acyl_carnitines", "test_crea_gua") + ) + + expect_identical( + colnames(get_list_dataframes_from_dir(test_path_metabolite_groups)$test_acyl_carnitines), + c("HMDB_code", "HMDB_name", "Helix", "Helix_naam", "high_zscore", "low_zscore") + ) + expect_identical( + get_list_dataframes_from_dir(test_path_metabolite_groups)$test_acyl_carnitines$HMDB_name, + c("metab1", "metab3", "ratio1") + ) + expect_identical( + get_list_dataframes_from_dir(test_path_metabolite_groups)$test_acyl_carnitines$Helix, + c("ja", "nee", "ja") + ) + + expect_identical( + colnames(get_list_dataframes_from_dir(test_path_metabolite_groups)$test_crea_gua), + c("HMDB_code", "HMDB_name", "Helix", "Helix_naam", "high_zscore", "low_zscore") + ) + expect_identical( + get_list_dataframes_from_dir(test_path_metabolite_groups)$test_crea_gua$HMDB_name, + c("metab4", "metab11") + ) + expect_identical( + get_list_dataframes_from_dir(test_path_metabolite_groups)$test_crea_gua$Helix, + c("ja", "ja") + ) +}) + +testthat::test_that("merge_metabolite_info_zscores: Combine metabolite info dataframe and Z-score dataframe", { + test_acyl_carnitines_df <- read.delim(test_path("fixtures/test_metabolite_groups/Diagnostics", "test_acyl_carnitines.txt")) + test_crea_gua_df <- read.delim(test_path("fixtures/test_metabolite_groups/Diagnostics", "test_crea_gua.txt")) + + test_metab_list_all <- list(test_acyl_carnitines_df, test_crea_gua_df) + names(test_metab_list_all) <- c("test_acyl_carnitines", "test_crea_gua") + + test_patient_cols <- c("P2025M1_Zscore", "P2025M2_Zscore", "P2025M3_Zscore", "P2025M4_Zscore", "P2025M5_Zscore") + test_intensities_zscore_df <- read.delim(test_path("fixtures", "test_intensities_zscore_df.txt")) + test_zscore_patients_df <- test_intensities_zscore_df %>% select(HMDB_code, HMDB_name, any_of(test_patient_cols)) + + expect_type(merge_metabolite_info_zscores(test_metab_list_all, test_zscore_patients_df), "list") + expect_identical( + names(merge_metabolite_info_zscores(test_metab_list_all, test_zscore_patients_df)), + c("test_acyl_carnitines", "test_crea_gua") + ) + + expect_identical( + colnames(merge_metabolite_info_zscores(test_metab_list_all, test_zscore_patients_df)$test_acyl_carnitines), + c("HMDB_name", "Sample", "Z_score") + ) + expect_identical( + (merge_metabolite_info_zscores( + test_metab_list_all, + test_zscore_patients_df + )$test_acyl_carnitines$Z_score), + c(0.31, 2.34, 2.45, 1.45, 2.14, -1.44, 12.18, -0.18, 3.22, -3.18) + ) + expect_identical( + as.character(merge_metabolite_info_zscores( + test_metab_list_all, + test_zscore_patients_df + )$test_acyl_carnitines$Sample), + c( + "P2025M1_Zscore", "P2025M1_Zscore", "P2025M2_Zscore", "P2025M2_Zscore", "P2025M3_Zscore", + "P2025M3_Zscore", "P2025M4_Zscore", "P2025M4_Zscore", "P2025M5_Zscore", "P2025M5_Zscore" + ) + ) + expect_equal( + nchar(merge_metabolite_info_zscores( + test_metab_list_all, + test_zscore_patients_df + )$test_acyl_carnitines$HMDB_name[1]), + 45 + ) + + expect_identical( + colnames(merge_metabolite_info_zscores(test_metab_list_all, test_zscore_patients_df)$test_crea_gua), + c("HMDB_name", "Sample", "Z_score") + ) + expect_identical( + (merge_metabolite_info_zscores( + test_metab_list_all, + test_zscore_patients_df + )$test_crea_gua$Z_score), + c(0.84, -0.46, -0.15, -1.51, -0.78, 1.68, 0.84, 1.48, 0.47, 1.18) + ) + expect_identical( + as.character(merge_metabolite_info_zscores( + test_metab_list_all, + test_zscore_patients_df + )$test_crea_gua$Sample), + c( + "P2025M1_Zscore", "P2025M1_Zscore", "P2025M2_Zscore", "P2025M2_Zscore", "P2025M3_Zscore", + "P2025M3_Zscore", "P2025M4_Zscore", "P2025M4_Zscore", "P2025M5_Zscore", "P2025M5_Zscore" + ) + ) + expect_equal( + nchar(merge_metabolite_info_zscores( + test_metab_list_all, + test_zscore_patients_df + )$test_crea_gua$HMDB_name[1]), + 45 + ) +}) + +testthat::test_that("get_data_per_metabolite_class: Combine patient and control data for each page of the violinplot pdf", { + test_acyl_carnitines_pat <- read.delim(test_path("fixtures/", "test_acyl_carnitines_patients.txt")) + test_acyl_carnitines_ctrl <- read.delim(test_path("fixtures/", "test_acyl_carnitines_controls.txt")) + + test_crea_gua_pat <- read.delim(test_path("fixtures/", "test_crea_gua_patients.txt")) + test_crea_gua_ctrl <- read.delim(test_path("fixtures/", "test_crea_gua_controls.txt")) + + test_metab_interest_sorted <- list(test_acyl_carnitines_pat, test_crea_gua_pat) + names(test_metab_interest_sorted) <- c("test_acyl_carnitines", "test_crea_gua") + + test_metab_interest_contr <- list(test_acyl_carnitines_ctrl, test_crea_gua_ctrl) + names(test_metab_interest_contr) <- c("test_acyl_carnitines", "test_crea_gua") + + test_nr_plots_perpage <- 1 + test_nr_pat <- 5 + test_nr_contr <- 5 + + expect_type( + get_data_per_metabolite_class( + test_metab_interest_sorted, test_metab_interest_contr, + test_nr_plots_perpage, test_nr_pat, test_nr_contr + ), + "list" + ) + expect_equal( + length(get_data_per_metabolite_class( + test_metab_interest_sorted, test_metab_interest_contr, + test_nr_plots_perpage, test_nr_pat, test_nr_contr + )), + 4 + ) + expect_identical( + names(get_data_per_metabolite_class( + test_metab_interest_sorted, test_metab_interest_contr, + test_nr_plots_perpage, test_nr_pat, test_nr_contr + )), + c("test_acyl_carnitines_1", "test_acyl_carnitines_2", "test_crea_gua_1", "test_crea_gua_2") + ) + expect_identical( + unique(get_data_per_metabolite_class( + test_metab_interest_sorted, test_metab_interest_contr, + test_nr_plots_perpage, test_nr_pat, + test_nr_contr + )$test_acyl_carnitines_1$HMDB_name), + c("metab1 ") + ) + expect_identical( + get_data_per_metabolite_class( + test_metab_interest_sorted, test_metab_interest_contr, + test_nr_plots_perpage, test_nr_pat, + test_nr_contr + )$test_acyl_carnitines_1$Sample, + c("P2025M1", "P2025M2", "P2025M3", "P2025M4", "P2025M5", "C101.1", "C102.1", "C103.1", "C104.1", "C105.1") + ) + + test_nr_plots_perpage <- 2 + + expect_equal( + length(get_data_per_metabolite_class( + test_metab_interest_sorted, test_metab_interest_contr, + test_nr_plots_perpage, test_nr_pat, test_nr_contr + )), + 2 + ) + expect_identical( + names(get_data_per_metabolite_class( + test_metab_interest_sorted, test_metab_interest_contr, + test_nr_plots_perpage, test_nr_pat, test_nr_contr + )), + c("test_acyl_carnitines_1", "test_crea_gua_1") + ) + expect_identical( + unique(get_data_per_metabolite_class( + test_metab_interest_sorted, test_metab_interest_contr, + test_nr_plots_perpage, test_nr_pat, + test_nr_contr + )$test_acyl_carnitines_1$HMDB_name), + c("metab1 ", "metab3 ") + ) + expect_identical( + get_data_per_metabolite_class( + test_metab_interest_sorted, test_metab_interest_contr, + test_nr_plots_perpage, test_nr_pat, + test_nr_contr + )$test_acyl_carnitines_1$Sample, + c( + "P2025M1", "P2025M1", "P2025M2", "P2025M2", "P2025M3", + "P2025M3", "P2025M4", "P2025M4", "P2025M5", "P2025M5", + "C101.1", "C101.1", "C102.1", "C102.1", "C103.1", "C103.1", "C104.1", "C104.1", "C105.1", "C105.1" + ) + ) +}) + +testthat::test_that("prepare_helix_patient_data: Generate a dataframe with information for Helix", { + test_acyl_carnitines_pat <- read.delim(test_path("fixtures", "test_acyl_carnitines_patients.txt")) + test_crea_gua_pat <- read.delim(test_path("fixtures", "test_crea_gua_patients.txt")) + + test_metab_interest_sorted <- list(test_acyl_carnitines_pat, test_crea_gua_pat) + names(test_metab_interest_sorted) <- c("test_acyl_carnitines", "test_crea_gua") + + test_acyl_carnitines_df <- read.delim(test_path("fixtures/test_metabolite_groups/Diagnostics", "test_acyl_carnitines.txt")) + test_crea_gua_df <- read.delim(test_path("fixtures/test_metabolite_groups/Diagnostics", "test_crea_gua.txt")) + + test_metab_list_all <- list(test_acyl_carnitines_df, test_crea_gua_df) + names(test_metab_list_all) <- c("test_acyl_carnitines", "test_crea_gua") + + expect_identical( + colnames(prepare_helix_patient_data(test_metab_interest_sorted, test_metab_list_all)), + c("HMDB_name", "Sample", "Z_score", "Helix_naam", "high_zscore", "low_zscore") + ) + expect_equal( + dim(prepare_helix_patient_data(test_metab_interest_sorted, test_metab_list_all)), + c(15, 6) + ) + expect_identical( + unique(prepare_helix_patient_data(test_metab_interest_sorted, test_metab_list_all)$HMDB_name), + c("metab1", "metab4", "metab11") + ) + expect_false("ratio1" %in% prepare_helix_patient_data(test_metab_interest_sorted, test_metab_list_all)$HMDB_name) + expect_equal( + prepare_helix_patient_data(test_metab_interest_sorted, test_metab_list_all)$Z_score, + c(0.31, 2.45, 2.14, 12.18, 3.22, 0.84, -0.46, -0.15, -1.51, -0.78, 1.68, 0.84, 1.48, 0.47, 1.18) + ) +}) + +testthat::test_that("is_diagnostic_patients: Check for diagnostic patients", { + test_patient_column <- c("P2025M1", "P2025M2", "P2025M3", "P2025M4", "C101.1", "C102.1", "P2025D1", "P225M1") + + expect_equal(is_diagnostic_patients(test_patient_column), c(TRUE, TRUE, TRUE, TRUE, FALSE, FALSE, FALSE, FALSE)) + expect_equal(length(is_diagnostic_patients(test_patient_column)), 8) +}) + +testthat::test_that("add_lab_id_and_onderzoeksnr:Adding labnummer and Onderzoeksnummer to the Helix dataframe", { + test_df_metabs_helix <- read.delim(test_path("fixtures/", "test_df_metabs_helix.txt")) + + test_df_metabs_helix <- test_df_metabs_helix %>% + group_by(Sample) %>% + mutate(Vial = cur_group_id()) %>% + ungroup() + + expect_true("labnummer" %in% colnames(add_lab_id_and_onderzoeksnr(test_df_metabs_helix))) + expect_true("Onderzoeksnummer" %in% colnames(add_lab_id_and_onderzoeksnr(test_df_metabs_helix))) + expect_identical( + unique(add_lab_id_and_onderzoeksnr(test_df_metabs_helix)$labnummer), + c("2025M1", "2025M2", "2025M3", "2025M4", "2025M5") + ) + expect_identical( + unique(add_lab_id_and_onderzoeksnr(test_df_metabs_helix)$Onderzoeksnummer), + c("MB2025/1", "MB2025/2", "MB2025/3", "MB2025/4", "MB2025/5") + ) +}) + +testthat::test_that("transform_metab_df_to_helix_df: Make the output for Helix", { + test_protocol_name <- "test_protocol_name" + + test_df_metabs_helix <- read.delim(test_path("fixtures/", "test_df_metabs_helix.txt")) + + expect_equal( + dim(transform_metab_df_to_helix_df(test_protocol_name, test_df_metabs_helix)), + c(15, 6) + ) + expect_identical( + colnames(transform_metab_df_to_helix_df(test_protocol_name, test_df_metabs_helix)), + c("Vial", "labnummer", "Onderzoeksnummer", "Protocol", "Name", "Amount") + ) + expect_identical( + unique(transform_metab_df_to_helix_df(test_protocol_name, test_df_metabs_helix)$Protocol), + "test_protocol_name" + ) + expect_identical( + unique(transform_metab_df_to_helix_df(test_protocol_name, test_df_metabs_helix)$labnummer), + c("2025M1", "2025M2", "2025M3", "2025M4", "2025M5") + ) + expect_identical( + unique(transform_metab_df_to_helix_df(test_protocol_name, test_df_metabs_helix)$Onderzoeksnummer), + c("MB2025/1", "MB2025/2", "MB2025/3", "MB2025/4", "MB2025/5") + ) + expect_equal( + transform_metab_df_to_helix_df(test_protocol_name, test_df_metabs_helix)$Amount, + c(0.31, 2.45, 2.14, 12.18, 3.22, 0.84, -0.46, -0.15, -1.51, -0.78, 1.68, 0.84, 1.48, 0.47, 1.18) + ) +}) + +testthat::test_that("get_top_metabolites_df: Create a dataframe with the top metabolites", { + test_df_metabs_helix <- read.delim(test_path("fixtures/", "test_df_metabs_helix.txt")) + test_patient_id <- "P2025M1" + + expect_equal( + dim(get_top_metabolites_df(test_patient_id, test_df_metabs_helix)), + c(2, 2) + ) + expect_equal( + colnames(get_top_metabolites_df(test_patient_id, test_df_metabs_helix)), + c("Metabolite", "Z-score") + ) + + test_patient_id <- "P2025M2" + expect_equal( + dim(get_top_metabolites_df(test_patient_id, test_df_metabs_helix)), + c(4, 2) + ) + expect_equal( + get_top_metabolites_df(test_patient_id, test_df_metabs_helix)$`Z-score`, + c("", "2.45", "", "-1.51") + ) + + test_patient_id <- "P2025M4" + expect_equal( + dim(get_top_metabolites_df(test_patient_id, test_df_metabs_helix)), + c(3, 2) + ) + expect_equal( + get_top_metabolites_df(test_patient_id, test_df_metabs_helix)$`Z-score`, + c("", "12.18", "") + ) +}) + +testthat::test_that("prepare_toplist: Create a dataframe with the top 20 highest and top 10 lowest metabolites", { + test_zscore_patient_df <- read.delim(test_path("fixtures/", "test_zscore_patient_df.txt")) + test_patient_id <- "P2025M1" + test_num_of_highest_metabs <- 20 + test_num_of_lowest_metabs <- 10 + + expect_equal( + dim(prepare_toplist( + test_patient_id, + test_zscore_patient_df, + test_num_of_highest_metabs, + test_num_of_lowest_metabs + )), + c(32, 3) + ) + expect_equal( + colnames(prepare_toplist( + test_patient_id, + test_zscore_patient_df, + test_num_of_highest_metabs, + test_num_of_lowest_metabs + )), + c("HMDB_ID", "Metabolite", "Z-score") + ) + expect_equal( + prepare_toplist( + test_patient_id, + test_zscore_patient_df, + test_num_of_highest_metabs, + test_num_of_lowest_metabs + )$HMDB_ID, + c( + "Increased", "HMDB030", "HMDB029", "HMDB028", "HMDB027", "HMDB026", "HMDB025", "HMDB024", "HMDB023", + "HMDB022", "HMDB021", "HMDB020", "HMDB019", "HMDB018", "HMDB017", "HMDB016", "HMDB015", "HMDB014", "HMDB013", + "HMDB012", "HMDB011", "Decreased", "HMDB001", "HMDB002", "HMDB003", "HMDB004", "HMDB005", "HMDB006", + "HMDB007", "HMDB008", "HMDB009", "HMDB010" + ) + ) + expect_equal( + prepare_toplist( + test_patient_id, + test_zscore_patient_df, + test_num_of_highest_metabs, + test_num_of_lowest_metabs + )$`Z-score`, + c( + "", "30", "29", "28", "27", "26", "25", "24", "23", "22", "21", "20", "19", "18", "17", "16", "15", + "14", "13", "12", "11", "", "1", "2", "3", "4", "5", "6", "7", "8", "9", "10" + ) + ) + + test_patient_id <- "P2025M2" + + expect_equal( + prepare_toplist( + test_patient_id, + test_zscore_patient_df, + test_num_of_highest_metabs, + test_num_of_lowest_metabs + )$Metabolite, + c( + "", "metab1", "metab2", "metab3", "metab4", "metab5", "metab6", "metab7", "metab8", "metab9", "metab10", + "metab11", "metab12", "metab13", "metab14", "metab15", "metab16", "metab17", "metab18", "metab19", "metab20", + "", "metab30", "metab29", "metab28", "metab27", "metab26", "metab25", "metab24", "metab23", "metab22", + "metab21" + ) + ) + expect_equal( + prepare_toplist( + test_patient_id, + test_zscore_patient_df, + test_num_of_highest_metabs, + test_num_of_lowest_metabs + )$`Z-score`, + c( + "", "-1", "-2", "-3", "-4", "-5", "-6", "-7", "-8", "-9", "-10", "-11", "-12", "-13", "-14", "-15", + "-16", "-17", "-18", "-19", "-20", "", "-30", "-29", "-28", "-27", "-26", "-25", "-24", "-23", "-22", "-21" + ) + ) +}) + +testthat::test_that("create_pdf_violin_plots: Create a pdf with a table of top metabolites and violin plots", { + local_edition(3) + temp_dir <- "./" + dir.create(paste0(temp_dir, "violin_plots/")) + + test_pdf_dir <- paste0(temp_dir, "violin_plots/") + test_patient_id <- "P2025M1" + test_explanation <- "Unit test Generate Violin Plots" + + test_acyl_carnitines_df <- read.delim(test_path("fixtures/", "test_acyl_carnitines_df.txt")) + attr(test_acyl_carnitines_df, "y_order") <- rev(unique(test_acyl_carnitines_df$HMDB_name)) + test_crea_gua_df <- read.delim(test_path("fixtures/", "test_crea_gua_df.txt")) + attr(test_crea_gua_df, "y_order") <- rev(unique(test_crea_gua_df$HMDB_name)) + + test_metab_perpage <- list(test_acyl_carnitines_df, test_crea_gua_df) + names(test_metab_perpage) <- c("test_acyl_carnitines", "test_crea_gua") + + test_top_metab_pt <- data.frame( + Metabolite = c("Increased", "metab1", "Decreased", "metab11"), + `Z-score` = c("", "2.45", "", "-1.51") + ) + + expect_silent(create_pdf_violin_plots( + test_pdf_dir, + test_patient_id, + test_metab_perpage, + test_top_metab_pt, + test_explanation + )) + + out_pdf_violinplots <- file.path(test_pdf_dir, "R_P2025M1.pdf") + expect_true(file.exists(out_pdf_violinplots)) + content_pdf_violinplots <- pdftools::pdf_text(out_pdf_violinplots) + expect_snapshot(content_pdf_violinplots) + + unlink(test_pdf_dir, recursive = TRUE) +}) + +testthat::test_that("create_violin_plot: Create a violin plot", { + test_patient_id <- "P2025M1" + test_sub_perpage <- "test acyl carnitines" + + test_acyl_carnitines_df <- read.delim(test_path("fixtures/", "test_acyl_carnitines_df.txt")) + attr(test_acyl_carnitines_df, "y_order") <- rev(unique(test_acyl_carnitines_df$HMDB_name)) + + test_patient_zscore_df <- test_acyl_carnitines_df %>% filter(Sample == test_patient_id) + + test_metab_zscores_df <- test_acyl_carnitines_df %>% filter(Sample != test_patient_id) + + expect_silent(create_violin_plot(test_metab_zscores_df, test_patient_zscore_df, test_sub_perpage, test_patient_id)) + + expect_doppelganger("violin_plot_P2025M1", create_violin_plot( + test_metab_zscores_df, test_patient_zscore_df, + test_sub_perpage, test_patient_id + )) +}) + +testthat::test_that("run_diem_algorithm :Run dIEM algorithm", { + test_expected_biomarkers_df <- read.delim(test_path("fixtures/", "test_expected_biomarkers_df.txt")) + test_zscore_patient_df <- read.delim(test_path("fixtures/", "test_zscore_patient_df.txt")) + test_sample_cols <- c("P2025M1", "P2025M2", "P2025M3", "P2025M4") + + expect_equal( + dim(run_diem_algorithm(test_expected_biomarkers_df, test_zscore_patient_df, test_sample_cols)), + c(7, 5) + ) + expect_identical( + colnames(run_diem_algorithm(test_expected_biomarkers_df, test_zscore_patient_df, test_sample_cols)), + c("Disease", "P2025M1", "P2025M2", "P2025M3", "P2025M4") + ) + expect_identical( + run_diem_algorithm(test_expected_biomarkers_df, test_zscore_patient_df, test_sample_cols)$Disease, + c("Disease A", "Disease B", "Disease C", "Disease D", "Disease E", "Disease F", "Disease G") + ) + expect_equal(run_diem_algorithm(test_expected_biomarkers_df, test_zscore_patient_df, test_sample_cols)$P2025M1, + c(10.94172, 0.95343, 12.12121, 0.00000, 44.28850, 0.00000, -38.70370), + tolerance = 0.0001 + ) +}) + +testthat::test_that("rank_patient_zscores: Ranking Z-scores for a patient", { + test_zscore_col <- c(1, 5, 6, 2, 7, -2, 3) + + expect_equal(length(rank_patient_zscores(test_zscore_col)), 7) + + expect_identical( + rank_patient_zscores(test_zscore_col), + c(6, 3, 2, 5, 1, 1, 4) + ) + + test_zscore_col <- c(3, 2, 1, 3) + + expect_identical( + rank_patient_zscores(test_zscore_col), + c(1, 2, 3, 1) + ) + + test_zscore_col <- c(-1, -2, -3, -4) + + expect_identical( + rank_patient_zscores(test_zscore_col), + c(4, 3, 2, 1) + ) +}) + +testthat::test_that("save_prob_scores_to_excel: Saving the probability score dataframe as an Excel file", { + local_edition(3) + test_probability_score_df <- read.delim(test_path("fixtures/", "test_probability_score_df.txt")) + + test_run_name <- "test_run" + out_excel_file <- file.path(paste0("dIEM_algoritme_output_", test_run_name, ".xlsx")) + + expect_silent(save_prob_scores_to_excel(test_probability_score_df, test_run_name)) + expect_true(file.exists(out_excel_file)) + + content_excel_file <- openxlsx::read.xlsx(out_excel_file, sheet = 1) + expect_snapshot_output(content_excel_file) + + file.remove(out_excel_file, recursive = TRUE) +}) + +testthat::test_that("prepare_intensities_zscore_df: Preparing the intensities and Z-score dataframe", { + test_intensities_zscore_df <- read.delim(test_path("fixtures/", "test_outlist.txt")) + test_intensities_zscore_df$nr_ctrls <- c(28, 27, 30, 25) + test_intensities_zscore_df$avg_ctrls <- c(16129.17, 1150, 1231.25, 4015.833) + test_intensities_zscore_df$sd_ctrls <- c(51606.27, 349.6752, 313.7249, 6152.479) + + prepare_intensities_zscore_df(test_intensities_zscore_df) + + expect_equal( + colnames(prepare_intensities_zscore_df(test_intensities_zscore_df)), + c( + "HMDB_code", "HMDB_name", "C101.1", "C102.1", "C103.1", "C104.1", "C105.1", "C106.1", "C107.1", "C108.1", + "C109.1", "C110.1", "C111.1", "C112.1", "P2.1", "P3.1", "mean_controls", "sd_controls" + ) + ) + + expect_equal( + unname(sapply(prepare_intensities_zscore_df(test_intensities_zscore_df), class)), + c( + "character", "character", "numeric", "numeric", "numeric", "numeric", "numeric", "numeric", "numeric", + "numeric", "numeric", "numeric", "numeric", "numeric", "numeric", "numeric", "numeric", "numeric" + ) + ) +}) + +testthat::test_that("get_colnames_samples: Get all column names containing a specific prefix", { + test_colnames <- c( + "HMDB_name", "HMDB_code", "P1001", "P1001_Zscore", "P1002", + "P1003", "P1004", "C101_Zscore", "C102", "C103" + ) + test_intensities_zscore_df <- read.delim(test_path("fixtures/", "test_intensities_zscore_df.txt")) + + expect_equal(get_colnames_samples(test_intensities_zscore_df, "P"), c("P2025M1", "P2025M2", "P2025M3", "P2025M4", "P2025M5")) + expect_equal(get_colnames_samples(test_intensities_zscore_df, "C"), c("C101.1", "C102.1", "C103.1", "C104.1", "C105.1")) +}) + +testthat::test_that("add_zscores_ratios_to_df: Add Zscores for multiple ratios to the dataframe", { + test_outlist_df <- read.delim(test_path("fixtures/GenerateViolinPlots", "test_outlist_df.txt")) + + test_metabolites_ratios_df <- data.frame( + HMDB.code = c("HMDB000TT1", "HMDB000TT2", "HMDB000TT3"), + Ratio_name = c("Test_ratio1", "Test_ratio2", "Test_ratio3"), + HMDB_numerator = c("HMDB001", "HMDB003plusHMDB003", "HMDB001plusHMDB003plusHMDB004"), + HMDB_denominator = c("HMDB002", "HMDB004", "one") + ) + + test_all_sample_ids <- c( + "C101.1", "C102.1", "C103.1", "C104.1", "C105.1", "P2025M1", "P2025M2", "P2025M3", + "P2025M4", "P2025M5" + ) + + expect_equal( + add_zscores_ratios_to_df(test_outlist_df, test_metabolites_ratios_df, test_all_sample_ids)$HMDB_code, + c("HMDB001", "HMDB002", "HMDB003", "HMDB004", "HMDB011", "HMDB012", "HMDB000TT1", "HMDB000TT2", "HMDB000TT3") + ) + expect_equal( + add_zscores_ratios_to_df( + test_outlist_df, + test_metabolites_ratios_df, + test_all_sample_ids + )$C101.1, + c(1000, 1200, 1300, 1400, 1500, 1600, -0.2630344, -0.1069152, 11.8533096) + ) + expect_equal( + add_zscores_ratios_to_df( + test_outlist_df, + test_metabolites_ratios_df, + test_all_sample_ids + )$C101.1_Zscore, + c(0.45, 1.67, -1.86, 0.58, 2.47, -0.56, -0.5899371, 0.4858991, -0.4552026), + tolerance = 0.0001 + ) +}) + +testthat::test_that("calculate_zscore_ratios: Calculate Zscores for ratios", { + test_outlist_df <- read.delim(test_path("fixtures/GenerateViolinPlots", "test_outlist_df.txt")) + test_intensities_zscores_df <- prepare_intensities_zscore_df(test_outlist_df) + + test_metabolites_ratios_df <- data.frame( + HMDB.code = c("HMDB000TT1", "HMDB000TT2", "HMDB000TT3"), + Ratio_name = c("Test_ratio1", "Test_ratio2", "Test_ratio3"), + HMDB_numerator = c("HMDB001", "HMDB003plusHMDB003", "HMDB001plusHMDB003plusHMDB004"), + HMDB_denominator = c("HMDB002", "HMDB004", "one") + ) + + test_all_sample_ids <- c( + "C101.1", "C102.1", "C103.1", "C104.1", "C105.1", "P2025M1", "P2025M2", "P2025M3", + "P2025M4", "P2025M5" + ) + + expect_equal( + calculate_zscore_ratios(test_metabolites_ratios_df, test_outlist_df, test_all_sample_ids)$HMDB_code, + c("HMDB000TT1", "HMDB000TT2", "HMDB000TT3") + ) + expect_equal( + calculate_zscore_ratios(test_metabolites_ratios_df, test_outlist_df, test_all_sample_ids)$C101.1, + c(-0.2630344, -0.1069152, 11.8533096) + ) + expect_equal( + calculate_zscore_ratios(test_metabolites_ratios_df, test_outlist_df, test_all_sample_ids)$C101.1_Zscore, + c(-0.5899371, 0.4858991, -0.4552026), + tolerance = 0.0001 + ) +}) + +testthat::test_that("get_list_page_plot_data: Get a list of dataframes for each chunk", { + test_metab_class_patients_df <- read.delim(test_path( + "fixtures/GenerateViolinPlots", + "test_metabolite_class_patients_df.txt" + )) + test_metab_class_controls_df <- read.delim(test_path( + "fixtures/GenerateViolinPlots", + "test_metabolite_class_controls_df.txt" + )) + test_nr_plots_perpage <- 2 + test_metabolite_in_chunks <- list( + c("HMDB001", "HMDB002", "HMDB003"), + c("HMDB004", "HMDB011", "HMDB012"), + c("HMDB000TT1", "HMDB000TT2", "HMDB000TT3") + ) + + t <- get_list_page_plot_data( + test_metabolite_in_chunks, + test_metab_class_patients_df, + test_metab_class_controls_df, + test_nr_plots_perpage + ) + + expect_type(get_list_page_plot_data( + test_metabolite_in_chunks, + test_metab_class_patients_df, + test_metab_class_controls_df, + test_nr_plots_perpage + ), "list") + + expect_equal(length(get_list_page_plot_data( + test_metabolite_in_chunks, + test_metab_class_patients_df, + test_metab_class_controls_df, + test_nr_plots_perpage + )), 3) + + test_plot_data_list <- get_list_page_plot_data( + test_metabolite_in_chunks, + test_metab_class_patients_df, + test_metab_class_controls_df, + test_nr_plots_perpage + ) + for (num_chunk in seq_along(test_metabolite_in_chunks)) { + expect_equal(unique(test_plot_data_list[[num_chunk]]$HMDB_name), test_metabolite_in_chunks[[num_chunk]]) + expect_equal( + unique(test_plot_data_list[[num_chunk]]$Sample), + c("P2025M1", "P2025M2", "P2025M3", "C101.1", "C102.1", "C103.1") + ) + } +}) + +testthat::test_that("make_and_save_violin_plot_pdfs: Make and save violin plots for each patient in a PDF", { + test_zscore_patients_df <- read.delim(test_path("fixtures/GenerateViolinPlots", "test_zscore_patients_df.txt")) + test_zscore_controls_df <- read.delim(test_path("fixtures/GenerateViolinPlots", "test_zscore_controls_df.txt")) + test_path_metabolite_groups <- test_path("fixtures/test_metabolite_groups") + test_nr_plots_perpage <- 2 + test_number_of_samples <- list( + controls = 5, + patients = 5 + ) + test_run_name <- "unit_test" + test_protocol_name <- "UNIT_TEST_PROTOCOL" + test_explanation_violin_plot <- "Unit test violin plot pdfs" + test_number_of_metabolites <- list( + highest = 2, + lowest = 1 + ) + + expect_silent(make_and_save_violin_plot_pdfs( + test_zscore_patients_df, + test_zscore_controls_df, + test_path_metabolite_groups, + test_nr_plots_perpage, + test_number_of_samples, + test_run_name, + test_protocol_name, + test_explanation_violin_plot, + test_number_of_metabolites + )) + + patient_ids <- c("P2025M1", "P2025M2", "P2025M3", "P2025M4", "P2025M5") + for (patient_id in patient_ids) { + pdf_file_name_diagnotics <- file.path(paste0("Diagnostics/MB", gsub("^P|M", "", patient_id), "_DIMS_PL_DIAG.pdf")) + pdf_file_name_other <- file.path(paste0("Other/R_", patient_id, ".pdf")) + + expect_true(file.exists(pdf_file_name_diagnotics)) + expect_true(file.exists(pdf_file_name_other)) + } + expect_true(file.exists("output_Helix_unit_test.csv")) + + unlink(c("Diagnostics", "Other"), recursive = TRUE) + file.remove("output_Helix_unit_test.csv") +}) + +testthat::test_that("get_probabilities_top_iems: Get the IEM probabilities for a patient for all diseases", { + test_patient_top_iems_probs <- data.frame( + Disease = c("Disease A", "Disease B", "Disease C", "Disease D"), + P2025M1 = c(100, 75, 50, 25) + ) + test_expected_biomarkers_df <- read.delim(test_path("fixtures", "test_expected_biomarkers_df.txt")) + test_patient_id <- "P2025M1" + + + expect_type( + get_probabilities_top_iems(test_patient_top_iems_probs, test_expected_biomarkers_df, test_patient_id), + "list" + ) + + expect_equal( + length(get_probabilities_top_iems(test_patient_top_iems_probs, test_expected_biomarkers_df, test_patient_id)), + 4 + ) + expect_equal( + names(get_probabilities_top_iems(test_patient_top_iems_probs, test_expected_biomarkers_df, test_patient_id)), + c( + "Disease A, probability score 100", "Disease B, probability score 75", "Disease C, probability score 50", + "Disease D, probability score 25" + ) + ) + + expect_equal( + get_probabilities_top_iems( + test_patient_top_iems_probs, + test_expected_biomarkers_df, + test_patient_id + )$"Disease A, probability score 100", + data.frame( + HMDB_code = c("HMDB002", "HMDB012"), + HMDB_name = c("metab2", "metab12") + ) + ) +}) + +testthat::test_that("make_and_save_diem_plots: Make and save dIEM plots", { + test_diem_probability_score <- data.frame( + Disease = c("Disease A", "Disease B", "Disease C", "Disease D", "Disease E"), + P2025M1 = c(100, 75, 50, 25, 12.5), + P2025M2 = c(25, 0, 2, 8, 3), + P2025M3 = c(0, 1, 2, 3, 4) + ) + test_patient_ids <- c("P2025M1", "P2025M2", "P2025M3") + test_expected_biomarkers_df <- read.delim(test_path("fixtures", "test_expected_biomarkers_df.txt")) + test_zscore_patients_df <- read.delim(test_path("fixtures/GenerateViolinPlots", "test_zscore_patients_df.txt")) + test_zscore_controls_df <- read.delim(test_path("fixtures/GenerateViolinPlots", "test_zscore_controls_df.txt")) + test_nr_plots_perpage <- 2 + test_number_of_samples <- list( + controls = 5, + patients = 5 + ) + test_number_of_metabolites <- list( + highest = 2, + lowest = 1 + ) + test_iem_variables <- list( + top_number_iem_diseases = 5, + threshold_iem = 5 + ) + test_explanation_violin_plot <- "Unit test violin plot pdfs" + + expect_silent(make_and_save_diem_plots( + test_diem_probability_score, + test_patient_ids, + test_expected_biomarkers_df, + test_zscore_patients_df, + test_zscore_controls_df, + test_nr_plots_perpage, + test_number_of_samples, + test_number_of_metabolites, + test_iem_variables, + test_explanation_violin_plot + )) + unlink("dIEM_plots/", recursive = TRUE) + + expect_equal(make_and_save_diem_plots( + test_diem_probability_score, + test_patient_ids, + test_expected_biomarkers_df, + test_zscore_patients_df, + test_zscore_controls_df, + test_nr_plots_perpage, + test_number_of_samples, + test_number_of_metabolites, + test_iem_variables, + test_explanation_violin_plot + ), "P2025M3") + + expect_true(file.exists("dIEM_plots/IEM_P2025M1.pdf")) + expect_true(file.exists("dIEM_plots/IEM_P2025M2.pdf")) + + unlink("dIEM_plots/", recursive = TRUE) +}) + +testthat::test_that("make_metabolite_order: Make the order of metabolites for the violin plots", { + test_metabolites_vector <- c("metab1", "metab2", "metab3") + test_num_plots_per_page <- 5 + + make_metabolite_order(test_num_plots_per_page, test_metabolites_vector) + + expect_equal(length(make_metabolite_order(test_num_plots_per_page, test_metabolites_vector)), 5) + expect_equal( + make_metabolite_order(test_num_plots_per_page, test_metabolites_vector), + c("metab1", "metab2", "metab3", " ", " ") + ) +}) + +testthat::test_that("pad_truncate_hmdb_names: Pad or truncate HMDB names to a fixed width", { + test_dataframe <- data.frame( + HMDB_name = c("metab1", "metabolite2", "metabo3") + ) + test_width <- 10 + test_pad_character <- "+" + + expect_equal( + nrow(pad_truncate_hmdb_names(test_dataframe, test_width, test_pad_character)), + 3 + ) + expect_equal( + pad_truncate_hmdb_names(test_dataframe, test_width, test_pad_character)$HMDB_name, + c("metab1++++", "metabol...", "metabo3+++") + ) + + test_df <- pad_truncate_hmdb_names(test_dataframe, test_width, test_pad_character) + for (row in seq_len(nrow(test_df))) { + expect_equal(nchar(test_df[row, "HMDB_name"]), 10) + } +}) + +testthat::test_that("save_patient_no_iem: Save a list of patient IDs to a text file", { + local_edition(3) + test_threshold_iem <- 5 + test_patient_no_iem <- c("Patient1", "Patient2") + + expect_silent(save_patient_no_iem(test_threshold_iem, test_patient_no_iem)) + + expect_true(file.exists("missing_probability_scores.txt")) + + expect_snapshot_file("missing_probability_scores.txt") + file.remove("missing_probability_scores.txt") +})