diff --git a/.gitignore b/.gitignore index 8faabc57..f84ea082 100644 --- a/.gitignore +++ b/.gitignore @@ -12,7 +12,8 @@ tests/testthat/Rplots.pdf *.ASOIAF.ged ASOIAF.ged *.Rproj - +benchmark_results.csv .vscode/launch.json dataRelatedPairs_new2.csv data-raw/ASOIAF_040725.ged +dataRelatedPairs.csv diff --git a/NAMESPACE b/NAMESPACE index 47496f84..69e23307 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -2,6 +2,9 @@ export(SimPed) export(allGens) +export(assignCoupleIDs) +export(calcAllGens) +export(calcFamilySize) export(calculateRelatedness) export(checkIDs) export(checkParentIDs) @@ -17,6 +20,7 @@ export(famSizeCal) export(fitComponentModel) export(identifyComponentModel) export(inferRelatedness) +export(insertEven) export(makeInbreeding) export(makeTwins) export(parseTree) @@ -30,14 +34,21 @@ export(ped2maternal) export(ped2mit) export(ped2paternal) export(plotPedigree) +export(readGed) export(readGedcom) export(readWikifamilytree) +export(readgedcom) export(recodeSex) export(related_coef) export(relatedness) export(repairSex) export(resample) export(simulatePedigree) +export(sizeAllGens) +export(summariseFamilies) +export(summariseMatrilines) +export(summarisePatrilines) +export(summarisePedigrees) export(summarizeFamilies) export(summarizeMatrilines) export(summarizePatrilines) diff --git a/NEWS.md b/NEWS.md index d8bd3dd0..c0daa9a2 100644 --- a/NEWS.md +++ b/NEWS.md @@ -2,7 +2,11 @@ * revived checkParents function to check for handling phantom parents and missing parents * added tests for checkParents function * added GoT analysis -* reduced complexity of com2links and summarizePedigree with the use of subfunctions +* reduced complexity of com2links, summarizePedigree, and checkIDs with the use of subfunctions +* allow verbose argument to be passed to standardizeColnames +* list SimPed and related_coef as aliases for functions +* harmonizing function names like calcFamilySize from famSizeCal +* implemented adjBeta function to evaluate alternative build methods # 
BGmisc 1.3.5.1 * Setting the default for the `sparse` argument in `ped2com()` to TRUE diff --git a/R/buildPedigree.R b/R/buildPedigree.R index 3d94ecad..98dfbaa3 100644 --- a/R/buildPedigree.R +++ b/R/buildPedigree.R @@ -88,7 +88,8 @@ ped2graph <- function(ped, ...) { # Check ped/data.fram if (!inherits(ped, "data.frame")) { - stop("ped should be a data.frame or inherit to a data.frame")} + stop("ped should be a data.frame or inherit to a data.frame") + } # Handle adjacent argument adjacent <- match.arg(tolower(adjacent)[1], choices = c( @@ -182,8 +183,10 @@ ped2maternal <- function(ped, personID = "ID", momID = "momID", dadID = "dadID", matID = "matID", ...) { # Call to wrapper function - .ped2id(ped = ped, personID = personID, momID = momID, - dadID = dadID, famID = matID, type = "mothers") + .ped2id( + ped = ped, personID = personID, momID = momID, + dadID = dadID, famID = matID, type = "mothers" + ) } #' Add a paternal line ID variable to a pedigree @@ -203,6 +206,8 @@ ped2paternal <- function(ped, personID = "ID", momID = "momID", dadID = "dadID", patID = "patID", ...) { # Call to wrapper function - .ped2id(ped = ped, personID = personID, momID = momID, - dadID = dadID, famID = patID, type = "fathers") + .ped2id( + ped = ped, personID = personID, momID = momID, + dadID = dadID, famID = patID, type = "fathers" + ) } diff --git a/R/calculateFamilySize.R b/R/calculateFamilySize.R index 51365d85..06cc3f16 100644 --- a/R/calculateFamilySize.R +++ b/R/calculateFamilySize.R @@ -5,7 +5,7 @@ #' @param marR Mating rate (numeric value ranging from 0 to 1). #' @return Returns a vector containing the number of individuals in every generation. 
#' @export -allGens <- function(kpc, Ngen, marR) { +calcAllGens <- function(kpc, Ngen, marR) { # Check if the number of generations is valid if (Ngen < 1) { stop("The number of generations should be an integer greater or equal than 1") @@ -23,14 +23,16 @@ allGens <- function(kpc, Ngen, marR) { } return(allGens) } - +#' @rdname calcAllGens +#' @export +allGens <- calcAllGens #' sizeAllGens #' An internal supporting function for \code{simulatePedigree}. -#' @inheritParams allGens +#' @inheritParams calcAllGens #' @return Returns a vector including the number of individuals in every generation. -sizeAllGens <- function(kpc, Ngen, marR) { +calcFamilySizeByGen <- function(kpc, Ngen, marR) { Nmid <- Ngen - 2 midGens <- numeric(length = Nmid) @@ -46,14 +48,16 @@ sizeAllGens <- function(kpc, Ngen, marR) { # print(allGens) return(allGens) } - +#' @rdname calcFamilySizeByGen +#' @export +sizeAllGens <- calcFamilySizeByGen #' famSizeCal #' A function to calculate the total number of individuals in a pedigree given parameters. This is a supporting function for function \code{simulatePedigree} -#' @inheritParams allGens +#' @inheritParams calcAllGens #' @return Returns a numeric value indicating the total pedigree size. 
#' @export -famSizeCal <- function(kpc, Ngen, marR) { +calcFamilySize <- function(kpc, Ngen, marR) { if (Ngen < 1) { stop("The number of generations should be an integer greater than or equal to 1") } else if (Ngen == 1) { @@ -71,3 +75,8 @@ famSizeCal <- function(kpc, Ngen, marR) { } return(size) } + +#' @rdname calcFamilySize +#' @export +#' +famSizeCal <- calcFamilySize diff --git a/R/checkIDs.R b/R/checkIDs.R index b9da9a6d..b687f4c7 100644 --- a/R/checkIDs.R +++ b/R/checkIDs.R @@ -17,7 +17,7 @@ #' @export checkIDs <- function(ped, verbose = FALSE, repair = FALSE) { # Standardize column names in the input dataframe - ped <- standardizeColnames(ped) + ped <- standardizeColnames(ped, verbose = verbose) # Initialize a list to store validation results validation_results <- list() @@ -28,91 +28,15 @@ checkIDs <- function(ped, verbose = FALSE, repair = FALSE) { } # Identify non-unique IDs - duplicated_ids <- ped$ID[duplicated(ped$ID) | duplicated(ped$ID, fromLast = TRUE)] - + id_check <- checkIDuniqueness(ped = ped, verbose = verbose) - # Update the validation_results list - if (length(duplicated_ids) > 0) { - if (verbose) { - cat(paste0(length(duplicated_ids), " non-unique IDs found.\n")) - } - validation_results$all_unique_ids <- FALSE - validation_results$total_non_unique_ids <- length(duplicated_ids) - validation_results$non_unique_ids <- unique(duplicated_ids) - } else { - if (verbose) { - cat("All IDs are unique.\n") - } - validation_results$all_unique_ids <- TRUE - validation_results$total_non_unique_ids <- 0 - validation_results$non_unique_ids <- NULL - } if (verbose) { cat("Step 2: Checking for within row duplicats...\n") - cat("Is own father?\n") - } - is_own_father <- ped$ID[ped$ID == ped$dadID & !is.na(ped$dadID)] - if (verbose) { - cat("Is own mother?\n") - } - is_own_mother <- ped$ID[ped$ID == ped$momID & !is.na(ped$momID)] - if (verbose) { - cat("Is mother father?\n") } - duplicated_parents <- ped$ID[ped$dadID == ped$momID & !is.na(ped$dadID) & 
!is.na(ped$momID)] + row_check <- checkWithinRowDuplicates(ped = ped, verbose = verbose) - # get the total number of within row duplicates - validation_results$total_own_father <- length(is_own_father) - validation_results$total_own_mother <- length(is_own_mother) - validation_results$total_duplicated_parents <- length(duplicated_parents) - validation_results$total_within_row_duplicates <- sum(length(is_own_father), length(is_own_mother), length(duplicated_parents)) - # Update the validation_results list + validation_results <- c(id_check, row_check) - if (validation_results$total_within_row_duplicates > 0) { - if (verbose) { - cat(paste0( - validation_results$total_within_row_duplicates, - " within row duplicates found.\n" - )) - } - validation_results$within_row_duplicates <- TRUE - if (validation_results$total_own_father > 0) { - validation_results$is_own_father_ids <- unique(is_own_father) - if (verbose) { - cat(paste0( - validation_results$total_own_father, - " individuals are their own fathers.\n" - )) - } - } - if (validation_results$total_own_mother > 0) { - validation_results$is_own_mother_ids <- unique(is_own_mother) - if (verbose) { - cat(paste0( - validation_results$total_own_mother, - " individuals are their own mothers.\n" - )) - } - } - if (validation_results$total_duplicated_parents > 0) { - validation_results$duplicated_parents_ids <- unique(duplicated_parents) - if (verbose) { - cat(paste0( - validation_results$total_duplicated_parents, - " individuals have the same mother and father.\n" - )) - } - } - } else { - if (verbose) { - cat("No within row duplicates found.\n") - } - validation_results$within_row_duplicates <- FALSE - validation_results$total_within_row_duplicates <- 0 - validation_results$is_own_father_ids <- NULL - validation_results$is_own_mother_ids <- NULL - validation_results$duplicated_parents_ids <- NULL - } if (verbose) { cat("Validation Results:\n") print(validation_results) @@ -163,9 +87,82 @@ checkIDs <- function(ped, verbose 
= FALSE, repair = FALSE) { #' Repair Missing IDs #' #' This function repairs missing IDs in a pedigree. -#' @param ped A pedigree object -#' @param verbose A logical indicating whether to print progress messages +#' @inheritParams checkIDs #' @return A corrected pedigree repairIDs <- function(ped, verbose = FALSE) { checkIDs(ped = ped, verbose = verbose, repair = TRUE) } + +#' Check for duplicated individual IDs +#' +#' This function checks for duplicated individual IDs in a pedigree. +#' +#' @inheritParams checkIDs +#' @return A list containing the results of the check +#' +checkIDuniqueness <- function(ped, verbose = FALSE) { + # Identify non-unique IDs + + duplicated_ids <- ped$ID[duplicated(ped$ID) | duplicated(ped$ID, fromLast = TRUE)] + + if (verbose) { + if (length(duplicated_ids) > 0) { + cat(length(duplicated_ids), " non-unique IDs found.\n") + } else { + cat("All IDs are unique.\n") + } + } + + # Update the validation_results list + list( + all_unique_ids = length(duplicated_ids) == 0, + total_non_unique_ids = length(duplicated_ids), + non_unique_ids = if (length(duplicated_ids) > 0) unique(duplicated_ids) else NULL + ) +} + + + +#' Check for within-row duplicates (self-parents, same mom/dad) +#' +#' This function checks for within-row duplicates in a pedigree. +#' +#' @inheritParams checkIDs +#' @return A list containing the results of the check +#' +checkWithinRowDuplicates <- function(ped, verbose = FALSE) { + # is the individual their own father or mother? + is_own_father <- ped$ID[ped$ID == ped$dadID & !is.na(ped$dadID)] + is_own_mother <- ped$ID[ped$ID == ped$momID & !is.na(ped$momID)] + + # is mother and father the same? 
+ duplicated_parents <- ped$ID[ + ped$dadID == ped$momID & + !is.na(ped$dadID) & !is.na(ped$momID) + ] + + # get the total number of within row duplicates + total <- length(is_own_father) + length(is_own_mother) + length(duplicated_parents) + + if (verbose) { + if (total > 0) { + cat(total, " within row duplicates found.\n") + if (length(is_own_father) > 0) cat(length(is_own_father), " individuals are their own fathers.\n") + if (length(is_own_mother) > 0) cat(length(is_own_mother), " individuals are their own mothers.\n") + if (length(duplicated_parents) > 0) cat(length(duplicated_parents), " individuals have the same mother and father.\n") + } else { + cat("No within row duplicates found.\n") + } + } + # Update the validation_results list + list( + total_own_father = length(is_own_father), + total_own_mother = length(is_own_mother), + total_duplicated_parents = length(duplicated_parents), + total_within_row_duplicates = total, + within_row_duplicates = total > 0, + is_own_father_ids = if (length(is_own_father) > 0) unique(is_own_father) else NULL, + is_own_mother_ids = if (length(is_own_mother) > 0) unique(is_own_mother) else NULL, + duplicated_parents_ids = if (length(duplicated_parents) > 0) unique(duplicated_parents) else NULL + ) +} diff --git a/R/checkParents.R b/R/checkParents.R index eab8e0ca..43c8a924 100644 --- a/R/checkParents.R +++ b/R/checkParents.R @@ -26,7 +26,7 @@ checkParentIDs <- function(ped, verbose = FALSE, repair = FALSE, parentswithoutrow = repair) { # Standardize column names in the input dataframe ped_og <- ped - ped <- standardizeColnames(ped) + ped <- standardizeColnames(ped, verbose = verbose) # Initialize a list to store validation results @@ -145,7 +145,7 @@ checkParentIDs <- function(ped, verbose = FALSE, repair = FALSE, # Are any parents in both momID and dadID? 
momdad <- intersect(ped$dadID, ped$momID) - if (!is.na(momdad) && length(momdad) > 0) { + if (length(momdad) > 0 && !is.na(momdad)) { validation_results$parents_in_both <- momdad if (verbose) { cat(paste( @@ -185,21 +185,36 @@ checkParentIDs <- function(ped, verbose = FALSE, repair = FALSE, - if (!is.na(validation_results$female_var)) { + if (length(validation_results$female_var) > 0 && !is.na(validation_results$female_var)) { corrected_moms <- ped$ID[mom_indices[!is.na(mom_indices)]] ped$sex[mom_indices[!is.na(mom_indices)]] <- validation_results$female_var + changes$corrected_mom_sex <- corrected_moms + if (verbose && length(corrected_moms) > 0) { + cat("Corrected sex of moms for:", paste(corrected_moms, collapse = ", "), "\n") + } + } else { + corrected_moms <- ped$ID[mom_indices[!is.na(mom_indices)]] + ped$sex[mom_indices[!is.na(mom_indices)]] <- 0 + changes$corrected_mom_sex <- corrected_moms if (verbose && length(corrected_moms) > 0) { cat("Corrected sex of moms for:", paste(corrected_moms, collapse = ", "), "\n") } } - if (!is.na(validation_results$male_var)) { + if (length(validation_results$male_var) > 0 && !is.na(validation_results$male_var)) { corrected_dads <- ped$ID[dad_indices[!is.na(dad_indices)]] ped$sex[dad_indices[!is.na(dad_indices)]] <- validation_results$male_var changes$corrected_dad_sex <- corrected_dads if (verbose && length(corrected_dads) > 0) { cat("Corrected sex of dads for:", paste(corrected_dads, collapse = ", "), "\n") } + } else { + corrected_dads <- ped$ID[dad_indices[!is.na(dad_indices)]] + ped$sex[dad_indices[!is.na(dad_indices)]] <- 1 + changes$corrected_dad_sex <- corrected_dads + if (verbose && length(corrected_dads) > 0) { + cat("Corrected sex of dads for:", paste(corrected_dads, collapse = ", "), "\n") + } } } } @@ -218,7 +233,7 @@ checkParentIDs <- function(ped, verbose = FALSE, repair = FALSE, new_entry$ID <- new_id new_entry$dadID <- NA new_entry$momID <- NA - new_entry$sex <- validation_results$male_var + new_entry$sex <- 
if (length(validation_results$male_var) > 0 && !is.na(validation_results$male_var)) validation_results$male_var else 1 new_entries <- rbind(new_entries, new_entry) } @@ -231,7 +246,7 @@ checkParentIDs <- function(ped, verbose = FALSE, repair = FALSE, new_entry$ID <- new_id new_entry$dadID <- NA new_entry$momID <- NA - new_entry$sex <- validation_results$female_var + new_entry$sex <- if (length(validation_results$female_var) > 0 && !is.na(validation_results$female_var)) validation_results$female_var else 0 new_entries <- rbind(new_entries, new_entry) } @@ -250,39 +265,10 @@ checkParentIDs <- function(ped, verbose = FALSE, repair = FALSE, cat("Added phantom moms for:", paste(changes$phantom_moms_added, collapse = ", "), "\n") } } - # add phantom parents + # add parents who appear in momID or dadID but are missing from ID if (parentswithoutrow) { # Add parents who appear in momID or dadID but are missing from ID - listed_parents <- unique(c(ped$momID, ped$dadID)) - listed_parents <- listed_parents[!is.na(listed_parents)] - - existing_ids <- ped$ID - missing_parents <- setdiff(listed_parents, existing_ids) - - if (length(missing_parents) > 0) { - if (verbose) { - cat("Adding parents who were listed in momID/dadID but missing from ID:\n") - print(missing_parents) - } - - for (pid in missing_parents) { - role <- unique( - c( - if (pid %in% ped$momID) "mom" else NULL, - if (pid %in% ped$dadID) "dad" else NULL - ) - ) - inferred_sex <- if ("mom" %in% role) validation_results$female_var else validation_results$male_var - - new_row <- ped[1, ] - new_row$ID <- pid - new_row$dadID <- NA - new_row$momID <- NA - new_row$sex <- inferred_sex - new_entries <- rbind(new_entries, new_row) - } - } - ped <- merge(ped, new_entries, all = TRUE) + ped <- addRowlessParents(ped = ped, verbose = verbose, validation_results = validation_results) } if (verbose) { @@ -300,3 +286,50 @@ checkParentIDs <- function(ped, verbose = FALSE, repair = FALSE, repairParentIDs <- function(ped, verbose = 
FALSE) { checkParentIDs(ped = ped, verbose = verbose, repair = TRUE) } + +#' Add addRowlessParents +#' +#' This function adds parents who appear in momID or dadID but are missing from ID +#' @inheritParams checkParentIDs +#' @param validation_results validation results + +addRowlessParents <- function(ped, verbose, validation_results) { + # Add parents who appear in momID or dadID but are missing from ID + new_entries <- data.frame() + + listed_parents <- unique(c(ped$momID, ped$dadID)) + listed_parents <- listed_parents[!is.na(listed_parents)] + + existing_ids <- ped$ID + missing_parents <- setdiff(listed_parents, existing_ids) + + if (length(missing_parents) > 0) { + if (verbose) { + cat("Adding parents who were listed in momID/dadID but missing from ID:\n") + print(missing_parents) + } + + for (pid in missing_parents) { + role <- unique( + c( + if (pid %in% ped$momID) "mom" else NULL, + if (pid %in% ped$dadID) "dad" else NULL + ) + ) + inferred_sex <- if ("mom" %in% role) validation_results$female_var else validation_results$male_var + + new_row <- ped[1, ] + new_row$ID <- pid + new_row$dadID <- NA + new_row$momID <- NA + new_row$sex <- inferred_sex + new_entries <- rbind(new_entries, new_row) + } + + ped <- merge(ped, new_entries, all = TRUE) + if (verbose) { + cat("Added phantom parents for:", paste(new_entries$ID, collapse = ", "), "\n") + } + } + return(ped) +} diff --git a/R/checkSex.R b/R/checkSex.R index d220dec6..8fbabf6a 100644 --- a/R/checkSex.R +++ b/R/checkSex.R @@ -37,7 +37,7 @@ #' checkSex <- function(ped, code_male = NULL, code_female = NULL, verbose = FALSE, repair = FALSE) { # Standardize column names in the input dataframe - ped <- standardizeColnames(ped) + ped <- standardizeColnames(ped, verbose = verbose) # TO DO: bypass the rest of the function if recode_only is TRUE diff --git a/R/cleanPedigree.R b/R/cleanPedigree.R index ffd8054e..737b558c 100644 --- a/R/cleanPedigree.R +++ b/R/cleanPedigree.R @@ -16,9 +16,9 @@ standardizeColnames <- 
function(df, verbose = FALSE) { "fam" = "^(?:fam(?:ily)?[\\.\\-_]?(?:id)?)", "ID" = "^(?:i(?:d$|ndiv(?:idual)?)|p(?:erson)?[\\.\\-_]?id)", "gen" = "^(?:gen(?:s|eration)?)", - "dadID" = "^(?:d(?:ad)?id|paid|fatherid|pid[\\.\\-_]?fath[er]*)", + "dadID" = "^(?:d(?:ad)?id|paid|fatherid|pid[\\.\\-_]?fath[er]*|sire)", "patID" = "^(?:dat[\\.\\-_]?id|pat[\\.\\-_]?id|paternal[\\.\\-_]?(?:id)?)", - "momID" = "^(?:m(?:om|a|other)?[\\.\\-_]?id|pid[\\.\\-_]?moth[er]*)", + "momID" = "^(?:m(?:om|a|other)?[\\.\\-_]?id|pid[\\.\\-_]?moth[er]*|dame)", "matID" = "^(?:mat[\\.\\-_]?id|maternal[\\.\\-_]?(?:id)?)", "spID" = "^(?:s(?:pt)?id|spouse[\\.\\-_]?(?:id)?|partner[\\.\\-_]?(?:id)?|husb(?:and)?[\\.\\-_]?id|wife[\\.\\-_]?(?:id)?|pid[\\.\\-_]?spouse1?)", "twinID" = "^(?:twin[\\.\\-_]?(?:id)?)", @@ -60,7 +60,7 @@ standardizeColnames <- function(df, verbose = FALSE) { # check_sex = TRUE, # check_parents = TRUE, # verbose = FALSE) { -# corrected_ped <- ped <- standardizeColnames(ped) +# corrected_ped <- ped <- standardizeColnames(ped, verbose = verbose) # if (verbose) { # print("Repairing pedigree...") # } diff --git a/R/computeRelatedness.R b/R/computeRelatedness.R index e3f53b8b..3fa91ffd 100644 --- a/R/computeRelatedness.R +++ b/R/computeRelatedness.R @@ -64,6 +64,14 @@ calculateRelatedness <- function( return(coef) } + +#' @rdname calculateRelatedness +#' @export +related_coef <- function(...) { + warning("The 'related_coef' function is deprecated. Please use 'calculateRelatedness' instead.") + calculateRelatedness(...) +} + #' Infer Relatedness Coefficient #' #' @description @@ -79,6 +87,7 @@ calculateRelatedness <- function( #' @param aceA Numeric. Proportion of variance attributable to additive genetic variance. Must be between 0 and 1. Default is 0.9. #' @param aceC Numeric. Proportion of variance attributable to shared environmental variance. Must be between 0 and 1. Default is 0. #' @param sharedC Numeric. Proportion of shared environment shared between the two individuals. 
Must be between 0 (no shared environment) and 1 (completely shared environment). Default is 0. +#' @param ... Further named arguments that may be passed to another function. #' #' @return #' Numeric. The calculated relatedness coefficient (`est_r`). @@ -97,6 +106,13 @@ inferRelatedness <- function(obsR, aceA = .9, aceC = 0, sharedC = 0) { return(calc_r) } +#' @rdname inferRelatedness +#' @export +relatedness <- function(...) { + warning("The 'relatedness' function is deprecated. Please use 'inferRelatedness' instead.") + inferRelatedness(...) +} + #' Falconer's Formula #' #' @description diff --git a/R/convertPedigree.R b/R/convertPedigree.R index 0f454acc..86e9970a 100644 --- a/R/convertPedigree.R +++ b/R/convertPedigree.R @@ -17,8 +17,9 @@ #' @param flatten.diag logical. If TRUE, overwrite the diagonal of the final relatedness matrix with ones #' @param standardize.colnames logical. If TRUE, standardize the column names of the pedigree dataset #' @param transpose_method character. The method to use for computing the transpose. Options are "tcrossprod", "crossprod", or "star" -#' @param adjacency_method character. The method to use for computing the adjacency matrix. Options are "loop" or "indexed" +#' @param adjacency_method character. The method to use for computing the adjacency matrix. Options are "loop", "indexed", direct or beta #' @param isChild_method character. The method to use for computing the isChild matrix. Options are "classic" or "partialparent" +#' @param adjBeta_method numeric The method to use for computing the building the adjacency_method matrix when using the "beta" build #' @param ... additional arguments to be passed to \code{\link{ped2com}} #' @details The algorithms and methodologies used in this function are further discussed and exemplified in the vignette titled "examplePedigreeFunctions". For more advanced scenarios and detailed explanations, consult this vignette. 
#' @export @@ -31,7 +32,7 @@ ped2com <- function(ped, component, flatten.diag = FALSE, standardize.colnames = TRUE, transpose_method = "tcrossprod", - adjacency_method = "indexed", + adjacency_method = "direct", isChild_method = "classic", saveable = FALSE, resume = FALSE, @@ -40,6 +41,7 @@ ped2com <- function(ped, component, save_rate_parlist = 100000 * save_rate, update_rate = 100, save_path = "checkpoint/", + adjBeta_method = NULL, ...) { #------ # Checkpointing @@ -89,8 +91,8 @@ ped2com <- function(ped, component, if (!transpose_method %in% c("tcrossprod", "crossprod", "star", "tcross.alt.crossprod", "tcross.alt.star")) { stop("Invalid method specified. Choose from 'tcrossprod', 'crossprod', or 'star' or 'tcross.alt.crossprod' or 'tcross.alt.star'.") } - if (!adjacency_method %in% c("indexed", "loop", "direct")) { - stop("Invalid method specified. Choose from 'indexed', 'loop', or 'direct'.") + if (!adjacency_method %in% c("indexed", "loop", "direct", "beta")) { + stop("Invalid method specified. Choose from 'indexed', 'loop', 'direct', or 'beta'.") } # standardize colnames @@ -160,7 +162,8 @@ ped2com <- function(ped, component, lastComputed = lastComputed, nr = nr, parList = parList, - lens = lens + lens = lens, + adjBeta_method = adjBeta_method ) # Construct sparse matrix @@ -224,7 +227,7 @@ ped2com <- function(ped, component, } else { # isChild is the 'S' matrix from RAM - isChild <- isChild(isChild_method=isChild_method, ped=ped) + isChild <- isChild(isChild_method = isChild_method, ped = ped) if (saveable) { saveRDS(isChild, file = checkpoint_files$isChild) @@ -406,7 +409,8 @@ ped2mit <- ped2mt <- function(ped, max.gen = 25, resume = resume, save_rate_gen = save_rate_gen, save_rate_parlist = save_rate_parlist, - save_path = save_path + save_path = save_path, + ... 
) } @@ -422,7 +426,7 @@ ped2cn <- function(ped, max.gen = 25, sparse = TRUE, verbose = FALSE, saveable = FALSE, resume = FALSE, save_rate = 5, - adjacency_method = "indexed", + adjacency_method = "direct", save_rate_gen = save_rate, save_rate_parlist = 1000 * save_rate, save_path = "checkpoint/", @@ -442,7 +446,8 @@ ped2cn <- function(ped, max.gen = 25, sparse = TRUE, verbose = FALSE, resume = resume, save_rate_gen = save_rate_gen, save_rate_parlist = save_rate_parlist, - save_path = save_path + save_path = save_path, + ... ) } #' Take a pedigree and turn it into an extended environmental relatedness matrix @@ -540,8 +545,7 @@ ped2ce <- function(ped, .adjIndexed <- function(ped, component, saveable, resume, save_path, verbose, lastComputed, nr, checkpoint_files, update_rate, - parList, lens, save_rate_parlist, - ...) { + parList, lens, save_rate_parlist) { # Loop through each individual in the pedigree # Build the adjacency matrix for parent-child relationships # Is person in column j the parent of the person in row i? .5 for yes, 0 for no. @@ -599,7 +603,7 @@ ped2ce <- function(ped, .adjDirect <- function(ped, component, saveable, resume, save_path, verbose, lastComputed, nr, checkpoint_files, update_rate, - parList, lens, save_rate_parlist, + parList, lens, save_rate_parlist, adjBeta_method, ...) { # Loop through each individual in the pedigree # Build the adjacency matrix for parent-child relationships @@ -615,8 +619,60 @@ ped2ce <- function(ped, iss <- c(mIDs$rID, dIDs$rID) jss <- c(mIDs$cID, dIDs$cID) } else if (component %in% c("common nuclear")) { - stop("Common Nuclear component is not yet implemented for direct method. Use index method.\n") - # change to warning and call indexed version + # message("Common Nuclear component is not yet implemented for direct method. 
Using index method.\n") + + # 1) Create a logical mask for only known parents + mask <- !is.na(ped$momID) & !is.na(ped$dadID) + + # 2) Create a single hash label for each known (momID, dadID) pair + base <- max(ped$ID, na.rm = TRUE) + 1L + pairCode <- ped$momID[mask] + base * ped$dadID[mask] + + # 3) Factor that label => each row with the same (mom,dad) gets the same integer code + childVec <- which(mask) + + # 4) Group children by pairCode, so each group is "all children with that (mom,dad)" + groupList <- split(childVec, pairCode) + + # 5) For each group with >1 children, form pairwise adjacency (i->j) for i != j + iss_list <- vector("list", length(groupList)) + jss_list <- vector("list", length(groupList)) + counter <- 1 + + for (g in groupList) { + k <- length(g) + if (k > 1) { + # We'll form all k^2 combos, then remove the diagonal i=j + # rep() calls faster than expand.grid + + # v = each child repeated k times + # w = entire group repeated once for each child + v <- rep(g, each = k) # row index + w <- rep(g, times = k) # col index + + keep <- (v != w) # remove diagonal where v == w + iss_list[[counter]] <- v[keep] + jss_list[[counter]] <- w[keep] + counter <- counter + 1 + } + } + + + iss <- unlist(iss_list, use.names = FALSE) + jss <- unlist(jss_list, use.names = FALSE) + + # list_of_adjacency <- .adjBeta(ped=ped,adjBeta_method=adjBeta_method, + # component = component, + # saveable = saveable, resume = resume, + # save_path = save_path, verbose = verbose, + # lastComputed = lastComputed, nr = nr, + # checkpoint_files = checkpoint_files, + # update_rate = update_rate, + # parList = parList, + # lens = lens, save_rate_parlist = save_rate_parlist, + # ...) 
+ + # return(list_of_adjacency) } else if (component %in% c("mitochondrial")) { mIDs <- stats::na.omit(data.frame(rID = ped$ID, cID = ped$momID)) iss <- c(mIDs$rID) @@ -641,11 +697,11 @@ ped2ce <- function(ped, #' @param checkpoint_files a list of checkpoint files compute_parent_adjacency <- function(ped, component, - adjacency_method = "indexed", + adjacency_method = "direct", saveable, resume, - save_path, verbose, - lastComputed, nr, checkpoint_files, update_rate, - parList, lens, save_rate_parlist, + save_path, verbose = FALSE, + lastComputed = 0, nr, checkpoint_files, update_rate, + parList, lens, save_rate_parlist, adjBeta_method = NULL, ...) { if (adjacency_method == "loop") { if (lastComputed < nr) { # Original version @@ -704,8 +760,26 @@ compute_parent_adjacency <- function(ped, component, ... ) } + } else if (adjacency_method == "beta") { + list_of_adjacency <- .adjBeta( + ped = ped, + adjBeta_method = adjBeta_method, + component = component, + saveable = saveable, + resume = resume, + save_path = save_path, + verbose = verbose, + lastComputed = lastComputed, + nr = nr, + checkpoint_files = checkpoint_files, + update_rate = update_rate, + parList = parList, + lens = lens, + save_rate_parlist = save_rate_parlist, + ... + ) } else { - stop("Invalid method specified. Choose from 'loop', 'direct', or 'indexed'.") + stop("Invalid method specified. Choose from 'loop', 'direct', 'indexed', or beta") } if (saveable) { saveRDS(parList, file = checkpoint_files$parList) @@ -735,3 +809,204 @@ isChild <- function(isChild_method, ped) { }) } } + + +.adjBeta <- function(ped, component, + adjBeta_method = 5, + parList = NULL, + lastComputed = 0, + nr = NULL, + lens = NULL, + saveable = FALSE, + resume = FALSE, + save_path = NULL, + verbose = FALSE, + save_rate_parlist = NULL, + update_rate = NULL, + checkpoint_files = NULL, + ...) { # 1) Pairwise compare mother IDs + if (adjBeta_method == 1) { + # gets slow when data are bigger. 
much slower than indexed + momMatch <- outer(ped$momID, ped$momID, FUN = "==") + momMatch[is.na(momMatch)] <- FALSE + + # 2) Pairwise compare father IDs + dadMatch <- outer(ped$dadID, ped$dadID, FUN = "==") + dadMatch[is.na(dadMatch)] <- FALSE + + # 3) Sibling adjacency if both mom & dad match + adj <- momMatch & dadMatch + + # 4) Extract indices where adj[i,j] is TRUE + w <- which(adj, arr.ind = TRUE) + # iss <- w[, 1] + # jss <- w[, 2] + # + list_of_adjacency <- list( + iss = w[, 1], + jss = w[, 2] + ) + } else if (adjBeta_method == 2) { + # 1) Create a logical mask for known parents + mask <- !is.na(ped$momID) & !is.na(ped$dadID) + + # 2) Create a single string label for each known (momID, dadID) pair + pairLabel <- paste0(ped$momID[mask], "_", ped$dadID[mask]) + + # 3) Factor that label => each row with the same (mom,dad) gets the same integer code + # This is "creating a new ID" for each unique parent pair + pairCode <- match(pairLabel, unique(pairLabel)) + + # childVec are the row indices in 'ped' that have known parents + childVec <- which(mask) # length(childVec) = sum(mask) + + # 4) Group children by pairCode, so each group is "all children with that (mom,dad)" + groupList <- split(childVec, pairCode) + + # 5) For each group with >1 children, form pairwise adjacency i->j + iss_list <- list() + jss_list <- list() + counter <- 1 + + for (g in groupList) { + if (length(g) > 1) { + combos <- expand.grid(g, g, KEEP.OUT.ATTRS = FALSE) + combos <- combos[combos[, 1] != combos[, 2], , drop = FALSE] + iss_list[[counter]] <- combos[, 1] + jss_list[[counter]] <- combos[, 2] + counter <- counter + 1 + } + } + # iss <- unlist(iss_list, use.names = FALSE) + # jss <- unlist(jss_list, use.names = FALSE) + + list_of_adjacency <- list( + iss = unlist(iss_list, use.names = FALSE), + jss = unlist(jss_list, use.names = FALSE) + ) + } else if (adjBeta_method == 3) { + nr <- nrow(ped) + # terrible + # Define a scalar-checking function: + f_check <- function(i, j) { + # i, j are 
each single integers + # Return one boolean: do they share both parents? + !is.na(ped$momID[i]) && !is.na(ped$dadID[i]) && + !is.na(ped$momID[j]) && !is.na(ped$dadID[j]) && + (ped$momID[i] == ped$momID[j]) && + (ped$dadID[i] == ped$dadID[j]) + } + + # Vectorize it so outer() will produce an nr x nr matrix + vf_check <- Vectorize(f_check) + + # Now outer() calls vf_check(...) in a way that yields scalar results + adj <- outer(seq_len(nr), seq_len(nr), FUN = vf_check) + + # Extract which cells of adj are TRUE + w <- which(adj, arr.ind = TRUE) + # iss <- w[, 1] + # jss <- w[, 2] + + list_of_adjacency <- list( + iss = iss <- w[, 1], + jss = jss <- w[, 2] + ) + } else if (adjBeta_method == 4) { + # 1) Create a logical mask for known parents + mask <- !is.na(ped$momID) & !is.na(ped$dadID) + + # 2) Create a single string label for each known (momID, dadID) pair + pairLabel <- paste0(ped$momID[mask], "_", ped$dadID[mask]) + + # 3) Factor that label => each row with the same (mom,dad) gets the same integer code + pairCode <- match(pairLabel, unique(pairLabel)) + + # childVec are the row indices in 'ped' that have known parents + childVec <- which(mask) + + # 4) Group children by pairCode, so each group is "all children with that (mom,dad)" + groupList <- split(childVec, pairCode) + + # 5) For each group with >1 children, form pairwise adjacency (i->j) for i != j + iss_list <- vector("list", length(groupList)) + jss_list <- vector("list", length(groupList)) + counter <- 1 + + for (g in groupList) { + k <- length(g) + if (k > 1) { + # We'll form all k^2 combos, then remove the diagonal i=j + # Instead of expand.grid, do rep() calls: + + # v = each child repeated k times + # w = entire group repeated once for each child + v <- rep(g, each = k) # row index + w <- rep(g, times = k) # col index + + keep <- (v != w) # remove diagonal where v == w + iss_list[[counter]] <- v[keep] + jss_list[[counter]] <- w[keep] + counter <- counter + 1 + } + } + + list_of_adjacency <- list( + iss 
= unlist(iss_list, use.names = FALSE), + jss = unlist(jss_list, use.names = FALSE) + ) + } else if (adjBeta_method == 5) { + # 1) Create a logical mask for known parents + mask <- !is.na(ped$momID) & !is.na(ped$dadID) + + # 2) Create a single hash label for each known (momID, dadID) pair + # pairLabel <- paste0(ped$momID[mask], "_", ped$dadID[mask]) + base <- max(ped$ID, na.rm = TRUE) + 1L + pairCode <- ped$momID[mask] + base * ped$dadID[mask] + + # 3) Factor that label => each row with the same (mom,dad) gets the same integer code + childVec <- which(mask) + + # 4) Group children by pairCode, so each group is "all children with that (mom,dad)" + groupList <- split(childVec, pairCode) + + # 5) For each group with >1 children, form pairwise adjacency (i->j) for i != j + iss_list <- vector("list", length(groupList)) + jss_list <- vector("list", length(groupList)) + counter <- 1 + + for (g in groupList) { + k <- length(g) + if (k > 1) { + # We'll form all k^2 combos, then remove the diagonal i=j + # Instead of expand.grid, do rep() calls: + + # v = each child repeated k times + # w = entire group repeated once for each child + v <- rep(g, each = k) # row index + w <- rep(g, times = k) # col index + + keep <- (v != w) # remove diagonal where v == w + iss_list[[counter]] <- v[keep] + jss_list[[counter]] <- w[keep] + counter <- counter + 1 + } + } + + list_of_adjacency <- list( + iss = unlist(iss_list, use.names = FALSE), + jss = unlist(jss_list, use.names = FALSE) + ) + } else { + list_of_adjacency <- .adjIndexed( + ped = ped, component = component, + saveable = saveable, resume = resume, + save_path = save_path, verbose = verbose, + lastComputed = lastComputed, nr = nr, + checkpoint_files = checkpoint_files, + update_rate = update_rate, parList = parList, + lens = lens, save_rate_parlist = save_rate_parlist + ) + } + return(list_of_adjacency) +} diff --git a/R/helpGeneric.R b/R/helpGeneric.R index e8ef381c..4e1d8ca9 100644 --- a/R/helpGeneric.R +++ b/R/helpGeneric.R @@ 
-24,10 +24,9 @@ rmvn <- function(n, sigma) { #' @param x vector of any length #' @return replaces null values in a vector to NA #' -nullToNA <- function(x) { +null2NA <- function(x) { if (length(x) == 0) { x <- NA - # Handle case when x is a list } else if (is.list(x)) { for (i in seq_along(x)) { @@ -39,6 +38,9 @@ nullToNA <- function(x) { return(x) } +#' @rdname null2NA +#' +nullToNA <- null2NA #' modified tryCatch function #' @@ -46,10 +48,14 @@ nullToNA <- function(x) { #' @keywords internal #' @return Fuses the nullToNA function with efunc #' -try_na <- function(x) { - nullToNA(tryCatch(x, error = efunc)) +tryNA <- function(x) { + null2NA(tryCatch(x, error = efunc)) } +#' @rdname tryNA +#' @keywords internal +#' +try_na <- tryNA #' Compute the null space of a matrix #' #' @param M a matrix of which the null space is desired @@ -97,73 +103,3 @@ resample <- function(x, ...) { } x[sample.int(length(x), ...)] } - - -#' SimPed (Deprecated) -#' -#' This function is a wrapper around the new `simulatePedigree` function. -#' `SimPed` has been deprecated, and it's advised to use `simulatePedigree` directly. -#' -#' @param ... Arguments to be passed to `simulatePedigree`. -#' @return The same result as calling `simulatePedigree`. -#' @seealso \code{\link{simulatePedigree}} for the updated function. -#' @description When calling this function, a warning will be issued about its deprecation. -#' @keywords deprecated -#' @examples -#' \dontrun{ -#' # This is an example of the deprecated function: -#' SimPed(...) -#' # It is recommended to use: -#' simulatePedigree(...) -#' } -#' @export -SimPed <- function(...) { # nolint: object_name_linter. - warning("The 'SimPed' function is deprecated. Please use 'simulatePedigree' instead.") - simulatePedigree(...) -} - -#' related_coef (Deprecated) -#' -#' This function is a wrapper around the new `calculateRelatedness` function. -#' `related_coef` has been deprecated, and it's advised to use `calculateRelatedness` directly. 
-#' -#' @param ... Arguments to be passed to `calculateRelatedness`. -#' @return The same result as calling `calculateRelatedness`. -#' @seealso \code{\link{calculateRelatedness}} for the updated function. -#' @description When calling this function, a warning will be issued about its deprecation. -#' @keywords deprecated -#' @examples -#' \dontrun{ -#' # This is an example of the deprecated function: -#' related_coef(...) -#' # It is recommended to use: -#' calculateRelatedness(...) -#' } -#' @export -related_coef <- function(...) { - warning("The 'related_coef' function is deprecated. Please use 'calculateRelatedness' instead.") - calculateRelatedness(...) -} - -#' relatedness (Deprecated) -#' -#' This function is a wrapper around the new `inferRelatedness` function. -#' `relatedness` has been deprecated, and it's advised to use `inferRelatedness` directly. -#' -#' @param ... Arguments to be passed to `inferRelatedness`. -#' @return The same result as calling `inferRelatedness`. -#' @seealso \code{\link{inferRelatedness}} for the updated function. -#' @description When calling this function, a warning will be issued about its deprecation. -#' @keywords deprecated -#' @examples -#' \dontrun{ -#' # This is an example of the deprecated function: -#' relatedness(...) -#' # It is recommended to use: -#' inferRelatedness(...) -#' } -#' @export -relatedness <- function(...) { - warning("The 'relatedness' function is deprecated. Please use 'inferRelatedness' instead.") - inferRelatedness(...) -} diff --git a/R/helpPedigree.R b/R/helpPedigree.R index 7b6bb4be..87263fb9 100644 --- a/R/helpPedigree.R +++ b/R/helpPedigree.R @@ -39,15 +39,17 @@ createGenDataFrame <- function(sizeGens, genIndex, idGen) { #' #' @param idGen Vector of IDs for the generation. #' @param sexR Numeric value indicating the sex ratio (proportion of males). +#' @param code_male The value to use for males. Default is "M" +#' @param code_female The value to use for females. 
Default is "F" #' @return Vector of sexes ("M" for male, "F" for female) for the offspring. #' @importFrom stats runif -determineSex <- function(idGen, sexR) { +determineSex <- function(idGen, sexR, code_male = "M", code_female = "F") { if (runif(1) > .5) { - sexVec1 <- rep("M", floor(length(idGen) * sexR)) - sexVec2 <- rep("F", length(idGen) - length(sexVec1)) + sexVec1 <- rep(code_male, floor(length(idGen) * sexR)) + sexVec2 <- rep(code_female, length(idGen) - length(sexVec1)) } else { - sexVec1 <- rep("F", floor(length(idGen) * (1 - sexR))) - sexVec2 <- rep("M", length(idGen) - length(sexVec1)) + sexVec1 <- rep(code_female, floor(length(idGen) * (1 - sexR))) + sexVec2 <- rep(code_male, length(idGen) - length(sexVec1)) } sexVec <- sample(c(sexVec1, sexVec2)) return(sexVec) @@ -60,7 +62,8 @@ determineSex <- function(idGen, sexR) { #' #' @param df_Ngen The dataframe for the current generation, including columns for individual IDs and spouse IDs. #' @return The input dataframe augmented with a 'coupleId' column, where each mated pair has a unique identifier. -assignCoupleIds <- function(df_Ngen) { +#' @export +assignCoupleIDs <- function(df_Ngen) { df_Ngen$coupleId <- NA_character_ # Initialize the coupleId column with NAs usedCoupleIds <- character() # Initialize an empty character vector to track used IDs @@ -86,6 +89,10 @@ assignCoupleIds <- function(df_Ngen) { return(df_Ngen) } + +#' @rdname assignCoupleIDs +assignCoupleIds <- assignCoupleIDs + #' Generate or Adjust Number of Kids per Couple Based on Mating Rate #' #' This function generates or adjusts the number of kids per couple in a generation diff --git a/R/insertEven.R b/R/insertEven.R index ba2d55e9..573bd8c9 100644 --- a/R/insertEven.R +++ b/R/insertEven.R @@ -12,7 +12,7 @@ #' @export #' @seealso \code{\link{SimPed}} for the main function that uses this supporting function. 
-evenInsert <- function(m, n, verbose = FALSE) { +insertEven <- function(m, n, verbose = FALSE) { if (length(m) > length(n)) { temp <- m m <- n @@ -36,3 +36,7 @@ evenInsert <- function(m, n, verbose = FALSE) { return(vec) } + +#' @rdname insertEven +#' @export +evenInsert <- insertEven diff --git a/R/makeLinks.R b/R/makeLinks.R index 96768fd5..37c44274 100644 --- a/R/makeLinks.R +++ b/R/makeLinks.R @@ -17,10 +17,12 @@ #' @param legacy Logical. If TRUE, uses the legacy branch of the function. #' @param outcome_name Character string representing the outcome name (used in file naming). #' @param drop_upper_triangular Logical. If TRUE, drops the upper triangular portion of the matrix. +#' @param include_all_links_1ped Logical. If TRUE, includes all links in the output. (Default is true when only one ped is provided) #' @param ... Additional arguments to be passed to \code{\link{com2links}} #' #' @return A data frame of related pairs if \code{writetodisk} is FALSE; otherwise, writes the results to disk. -#' @export +#' @export com2links + com2links <- function( rel_pairs_file = "dataRelatedPairs.csv", ad_ped_matrix = NULL, @@ -38,433 +40,268 @@ com2links <- function( legacy = FALSE, outcome_name = "data", drop_upper_triangular = TRUE, + include_all_links_1ped = FALSE, ...) { - # Non-legacy mode processing - - if (!legacy) { - # --- Input Validations and Preprocessing --- - - # Ensure that at least one relationship matrix is provided. - if (is.null(ad_ped_matrix) && is.null(mit_ped_matrix) && is.null(cn_ped_matrix)) { - stop("At least one of 'ad_ped_matrix', 'mit_ped_matrix', or 'cn_ped_matrix' must be provided.") - } - # Validate and convert ad_ped_matrix to a sparse dgCMatrix if provided. - if (!is.null(ad_ped_matrix)) { - ad_ped_matrix <- validate_and_convert_matrix(mat=ad_ped_matrix, - name = "ad_ped_matrix") - } + # --- Input Validations and Preprocessing --- - # Validate and convert cn_ped_matrix to a sparse dgCMatrix if provided. 
- if (!is.null(cn_ped_matrix)) { - cn_ped_matrix <- validate_and_convert_matrix(mat=cn_ped_matrix, - name="cn_ped_matrix", - ensure_symmetric = TRUE) - } - - # Validate and process mit_ped_matrix: convert and ensure binary values. - if (!is.null(mit_ped_matrix)) { - - mit_ped_matrix <- validate_and_convert_matrix(mat=mit_ped_matrix, - name="mit_ped_matrix",force_binary = TRUE, - ensure_symmetric = TRUE) - } - - # --- Build IDs and Prepare Matrix Pointers --- - - # Extract individual IDs from the first available matrix. - ids <- NULL - if (!is.null(cn_ped_matrix)) { - ids <- as.numeric(dimnames(cn_ped_matrix)[[1]]) - nc <- ncol(cn_ped_matrix) - } else if (!is.null(ad_ped_matrix)) { - ids <- as.numeric(dimnames(ad_ped_matrix)[[1]]) - nc <- ncol(ad_ped_matrix) - } else if (!is.null(mit_ped_matrix)) { - ids <- as.numeric(dimnames(mit_ped_matrix)[[1]]) - nc <- ncol(mit_ped_matrix) - } + # Ensure that at least one relationship matrix is provided. + if (is.null(ad_ped_matrix) && is.null(mit_ped_matrix) && is.null(cn_ped_matrix)) { + stop("At least one of 'ad_ped_matrix', 'mit_ped_matrix', or 'cn_ped_matrix' must be provided.") + } + # Validate and convert ad_ped_matrix to a sparse dgCMatrix if provided. + if (!is.null(ad_ped_matrix)) { + ad_ped_matrix <- validate_and_convert_matrix( + mat = ad_ped_matrix, + name = "ad_ped_matrix" + ) + } - if (is.null(ids)) { - stop("Could not extract IDs from the provided matrices.") - } + # Validate and convert cn_ped_matrix to a sparse dgCMatrix if provided. + if (!is.null(cn_ped_matrix)) { + cn_ped_matrix <- validate_and_convert_matrix( + mat = cn_ped_matrix, + name = "cn_ped_matrix", + ensure_symmetric = TRUE + ) + } - # Count how many matrices are provided. - sum_nulls <- sum(!is.null(ad_ped_matrix), - !is.null(mit_ped_matrix), - !is.null(cn_ped_matrix), - na.rm = TRUE + # Validate and process mit_ped_matrix: convert and ensure binary values. 
+ if (!is.null(mit_ped_matrix)) { + mit_ped_matrix <- validate_and_convert_matrix( + mat = mit_ped_matrix, + name = "mit_ped_matrix", force_binary = TRUE, + ensure_symmetric = TRUE ) - if (verbose) { - print(sum_nulls) - } + } - # Extract the internal pointers (p, i, and x slots) for each provided matrix. - if (!is.null(ad_ped_matrix)) { - ad_ped_p <- ad_ped_matrix@p + 1L - ad_ped_i <- ad_ped_matrix@i + 1L - ad_ped_x <- ad_ped_matrix@x - } - if (!is.null(mit_ped_matrix)) { - mt_p <- mit_ped_matrix@p + 1L - mt_i <- mit_ped_matrix@i + 1L - mt_x <- mit_ped_matrix@x - } - if (!is.null(cn_ped_matrix)) { - cn_p <- cn_ped_matrix@p + 1L - cn_i <- cn_ped_matrix@i + 1L - cn_x <- cn_ped_matrix@x - } + # --- Build IDs and Prepare Matrix Pointers --- - # --- Process Based on the Number of Provided Matrices --- - # --- Case: All Three Matrices Provided --- - if (sum_nulls == 3) { - # Set pointers for all three matrices. - newColPos1 <- ad_ped_p - iss1 <- ad_ped_i - x1 <- ad_ped_x + # Extract individual IDs from the first available matrix. + ids <- NULL - newColPos2 <- mt_p - iss2 <- mt_i - x2 <- mt_x - newColPos3 <- cn_p - iss3 <- cn_i - x3 <- cn_x - # Define relationship column names. - relNames <- c("addRel", "mitRel", "cnuRel") + if (!is.null(cn_ped_matrix)) { + ids <- as.numeric(dimnames(cn_ped_matrix)[[1]]) + nc <- ncol(cn_ped_matrix) + } else if (!is.null(ad_ped_matrix)) { + ids <- as.numeric(dimnames(ad_ped_matrix)[[1]]) + nc <- ncol(ad_ped_matrix) + } else if (!is.null(mit_ped_matrix)) { + ids <- as.numeric(dimnames(mit_ped_matrix)[[1]]) + nc <- ncol(mit_ped_matrix) + } - # Optionally remove the original pointers to free memory. 
- if (gc == TRUE) { - remove(ad_ped_p, ad_ped_i, ad_ped_x, mt_p, mt_i, mt_x, cn_p, cn_i, cn_x) - } - if (verbose) { - message("All 3 matrix is present") - } + if (is.null(ids)) { + stop("Could not extract IDs from the provided matrices.") + } - # File names - # rel_pairs_file <- paste0(outcome_name, "_dataRelatedPairs.csv") - # mapa_id_file <- paste0(outcome_name, "_data_mapaID.csv") + # --- matrix_case construction and switch dispatch --- + matrix_case <- paste(sort(c( + if (!is.null(ad_ped_matrix)) "ad" else NULL, + if (!is.null(mit_ped_matrix)) "mt" else NULL, + if (!is.null(cn_ped_matrix)) "cn" else NULL + )), collapse = "-") - # Initialize the related pairs file with headers. - df_relpairs <- initialize_empty_df(relNames = relNames) + if (verbose) { + print(matrix_case) + } - # Write the headers to the related pairs file. - if (writetodisk == TRUE) { - utils::write.table( - df_relpairs, - file = rel_pairs_file, sep = ",", append = FALSE, row.names = FALSE - ) + switch(matrix_case, + "ad" = process_one( + matrix = ad_ped_matrix, + rel_name = "addRel", + ids = ids, + nc = nc, + rel_pairs_file = rel_pairs_file, + writetodisk = writetodisk, + write_buffer_size = write_buffer_size, + drop_upper_triangular = drop_upper_triangular, + update_rate = update_rate, + verbose = verbose, + gc = gc, + include_all_links = include_all_links_1ped, + ... + ), + "mt" = process_one( + matrix = mit_ped_matrix, + rel_name = "mitRel", + ids = ids, + nc = nc, + rel_pairs_file = rel_pairs_file, + writetodisk = writetodisk, + write_buffer_size = write_buffer_size, + drop_upper_triangular = drop_upper_triangular, + update_rate = update_rate, + verbose = verbose, + gc = gc, + include_all_links = include_all_links_1ped, + ... 
+ ), + "cn" = process_one( + matrix = cn_ped_matrix, + rel_name = "cnuRel", + ids = ids, + nc = nc, + rel_pairs_file = rel_pairs_file, + writetodisk = writetodisk, + write_buffer_size = write_buffer_size, + drop_upper_triangular = drop_upper_triangular, + update_rate = update_rate, + verbose = verbose, + gc = gc, + include_all_links = include_all_links_1ped, + ... + ), + "ad-mt" = process_two( + matrix1 = ad_ped_matrix, + name1 = "addRel", + matrix2 = mit_ped_matrix, + name2 = "mitRel", + ids = ids, + nc = nc, + rel_pairs_file = rel_pairs_file, + writetodisk = writetodisk, + write_buffer_size = write_buffer_size, + drop_upper_triangular = drop_upper_triangular, + update_rate = update_rate, + verbose = verbose, + gc = gc, + ... + ), + "ad-cn" = process_two( + matrix1 = ad_ped_matrix, + name1 = "addRel", + matrix2 = cn_ped_matrix, + name2 = "cnuRel", + ids = ids, + nc = nc, + rel_pairs_file = rel_pairs_file, + writetodisk = writetodisk, + write_buffer_size = write_buffer_size, + drop_upper_triangular = drop_upper_triangular, + update_rate = update_rate, + verbose = verbose, + gc = gc, + ... + ), + "cn-mt" = process_two( + matrix1 = cn_ped_matrix, + name1 = "cnuRel", + matrix2 = mit_ped_matrix, + name2 = "mitRel", + ids = ids, + nc = nc, + rel_pairs_file = rel_pairs_file, + writetodisk = writetodisk, + write_buffer_size = write_buffer_size, + drop_upper_triangular = drop_upper_triangular, + update_rate = update_rate, + verbose = verbose, + gc = gc, + ... + ), + "ad-cn-mt" = process_all_three( + mat1 = ad_ped_matrix, + name1 = "addRel", + mat2 = mit_ped_matrix, + name2 = "mitRel", + mat3 = cn_ped_matrix, + name3 = "cnuRel", + ids = ids, + nc = nc, + rel_pairs_file = rel_pairs_file, + writetodisk = writetodisk, + write_buffer_size = write_buffer_size, + drop_upper_triangular = drop_upper_triangular, + update_rate = update_rate, + verbose = verbose, + gc = gc, + ... 
+ ), + stop("Unsupported matrix combination") + ) +} +#' Convert Sparse Relationship Matrices to Kinship Links for one Matrix +#' @inheritParams com2links +#' @param include_all_links Logical. If TRUE, all links are included in the output. +#' @keywords internal - # Prepare an empty buffer for batching writes. - write_buffer <- list() - remove(df_relpairs) - } - # Loop over each column (individual) in the matrix. - for (j in 1L:nc) { - ID2 <- ids[j] - - # Extract column indices for the 1st component - ncp1 <- newColPos1[j] - ncp1p <- newColPos1[j + 1L] - cond1 <- ncp1 < ncp1p - if (cond1) { - vv1 <- ncp1:(ncp1p - 1L) - iss1vv <- iss1[vv1] - } - # Extract indices for the 2nd component - ncp2 <- newColPos2[j] - ncp2p <- newColPos2[j + 1L] - cond2 <- ncp2 < ncp2p - if (cond2) { - vv2 <- ncp2:(ncp2p - 1L) - iss2vv <- iss2[vv2] - } - # Extract indices for the 3rd component - ncp3 <- newColPos3[j] - ncp3p <- newColPos3[j + 1L] - cond3 <- ncp3 < ncp3p - if (cond3) { - vv3 <- ncp3:(ncp3p - 1L) - iss3vv <- iss3[vv3] - } +process_one <- function(matrix, rel_name, ids, nc, rel_pairs_file, writetodisk, + write_buffer_size, drop_upper_triangular, update_rate, verbose, gc, + include_all_links = TRUE, ...) { + if (include_all_links == FALSE) { + # Extract pointers and indices from the matrix. + newColPos <- matrix@p + 1L + iss <- matrix@i + 1L + x <- matrix@x - # Create a unique, sorted set of row indices from all provided matrices. - u <- sort(igraph::union(igraph::union(if (cond1) { - iss1vv - }, if (cond2) { - iss2vv - }), if (cond3) { - iss3vv - })) - - # If any relationships exist for this individual, build the related pairs. - if (cond1 || cond2 || cond3) { - ID1 <- ids[u] - tds <- data.frame(ID1 = ID1, ID2 = ID2) - tds[[relNames[1]]] <- 0 - tds[[relNames[2]]] <- 0 - tds[[relNames[3]]] <- 0 - - # Assign the relationship coefficients from each matrix. 
- if (cond1) { - tds[u %in% iss1vv, relNames[1]] <- x1[vv1] - } - if (cond2) { - tds[u %in% iss2vv, relNames[2]] <- x2[vv2] - } - if (cond3) { - tds[u %in% iss3vv, relNames[3]] <- x3[vv3] - } + # Initialize the related pairs file with headers. + df_relpairs <- initialize_empty_df(relNames = rel_name) - # Optionally drop upper-triangular entries. - if (drop_upper_triangular == TRUE) { - tds <- tds[tds$ID1 <= tds$ID2, ] # or < if you want strictly lower triangle - } + if (writetodisk == TRUE) { + utils::write.table( + df_relpairs, + file = rel_pairs_file, sep = ",", append = FALSE, row.names = FALSE + ) - # Write the batch to disk or accumulate in the data frame. - if (nrow(tds) > 0) { - if (writetodisk == TRUE) { - write_buffer[[length(write_buffer) + 1]] <- tds - - if (length(write_buffer) >= write_buffer_size) { # Write in batches - utils::write.table(do.call(rbind, write_buffer), - file = rel_pairs_file, - row.names = FALSE, col.names = FALSE, append = TRUE, sep = "," - ) - write_buffer <- list() - } - } else { - df_relpairs <- rbind(df_relpairs, tds) - } - } - } - if (verbose && (j %% update_rate == 0L)) { - cat("Done with", j, "of", nc, "\n") - } - } - } else if (sum_nulls == 2) { - # --- Case: Two Matrices Provided --- - # Set pointers and relationship names based on which matrix is missing. 
- - if (is.null(ad_ped_matrix)) { - newColPos1 <- mt_p - iss1 <- mt_i - x1 <- mt_x - newColPos2 <- cn_p - iss2 <- cn_i - x2 <- cn_x - relNames <- c("mitRel", "cnuRel") - if (gc == TRUE) { - remove(mt_p, mt_i, mt_x, cn_p, cn_i, cn_x) - } - } - if (is.null(mit_ped_matrix)) { - newColPos1 <- ad_ped_p - iss1 <- ad_ped_i - x1 <- ad_ped_x - newColPos2 <- cn_p - iss2 <- cn_i - x2 <- cn_x - relNames <- c("addRel", "cnuRel") - if (gc == TRUE) { - remove(ad_ped_p, ad_ped_i, ad_ped_x, cn_p, cn_i, cn_x) - } - } - if (is.null(cn_ped_matrix)) { - newColPos1 <- ad_ped_p - iss1 <- ad_ped_i - x1 <- ad_ped_x - newColPos2 <- mt_p - iss2 <- mt_i - x2 <- mt_x - relNames <- c("addRel", "mitRel") - if (gc == TRUE) { - remove(ad_ped_p, ad_ped_i, ad_ped_x, mt_p, mt_i, mt_x) - } - } + # Prepare an empty buffer for batching writes. + write_buffer <- list() + remove(df_relpairs) + } - # Initialize the related pairs file with the appropriate headers. - df_relpairs <- initialize_empty_df(relNames = relNames) + # Process each column in the matrix. + for (j in 1L:nc) { + ID2 <- ids[j] - if (writetodisk == TRUE) { - utils::write.table( - df_relpairs, - file = rel_pairs_file, sep = ",", append = FALSE, row.names = FALSE - ) - # initial buffer - write_buffer <- list() - remove(df_relpairs) + # Extract column indices + ncp <- newColPos[j] + ncpp <- newColPos[j + 1L] + cond <- ncp < ncpp + if (cond) { + vv <- ncp:(ncpp - 1L) + issvv <- iss[vv] } - # Process each column to extract relationships. - for (j in 1L:nc) { - ID2 <- ids[j] - - # Extract indices from the first matrix. - ncp1 <- newColPos1[j] - ncp1p <- newColPos1[j + 1L] - cond1 <- ncp1 < ncp1p - if (cond1) { - vv1 <- ncp1:(ncp1p - 1L) - iss1vv <- iss1[vv1] - } - # Extract indices from the second matrix. - ncp2 <- newColPos2[j] - ncp2p <- newColPos2[j + 1L] - cond2 <- ncp2 < ncp2p - if (cond2) { - vv2 <- ncp2:(ncp2p - 1L) - iss2vv <- iss2[vv2] - } - - # Merge the indices from both matrices. 
- u <- sort(igraph::union(if (cond1) { - iss1vv - }, if (cond2) { - iss2vv - })) - - # Create related pairs if relationships are found. - if (cond1 || cond2) { - ID1 <- ids[u] - tds <- data.frame(ID1 = ID1, ID2 = ID2) - tds[[relNames[1]]] <- 0 - tds[[relNames[2]]] <- 0 - - if (cond1) { - tds[u %in% iss1vv, relNames[1]] <- x1[vv1] - } - if (cond2) { - tds[u %in% iss2vv, relNames[2]] <- x2[vv2] - } - if (drop_upper_triangular == TRUE) { - tds <- tds[tds$ID1 <= tds$ID2, ] # or < if you want strictly lower triangle - } + # Create a unique set of row indices. + u <- sort(issvv) - # Write the batch to disk or accumulate in the data frame. - if (nrow(tds) > 0) { - if (writetodisk == TRUE) { - write_buffer[[length(write_buffer) + 1]] <- tds - - if (length(write_buffer) >= write_buffer_size) { # Write in batches - utils::write.table(do.call(rbind, write_buffer), - file = rel_pairs_file, - row.names = FALSE, col.names = FALSE, append = TRUE, sep = "," - ) - write_buffer <- list() - } - } else { - df_relpairs <- rbind(df_relpairs, tds) - } - } - } - if (verbose && (j %% update_rate == 0L)) { - cat("Done with", j, "of", nc, "\n") - } - } - } else if (sum_nulls == 1) { - # --- Case: Only One Matrix Provided --- - if (verbose) { - message("Only one matrix is present") - } - if (!is.null(ad_ped_matrix)) { - newColPos1 <- ad_ped_p - iss1 <- ad_ped_i - x1 <- ad_ped_x - relNames <- c("addRel") - if (gc == TRUE) { - remove(ad_ped_p, ad_ped_i, ad_ped_x) + # If any relationships exist for this individual, build the related pairs. + if (cond) { + # Create a data frame with unique pairs. 
+ ID1 <- ids[u] + tds <- data.frame(ID1 = ID1, ID2 = ID2) + tds[[rel_name]] <- 0 + if (cond) { + tds[u %in% issvv, rel_name] <- x[vv] } - } - if (!is.null(mit_ped_matrix)) { - newColPos1 <- mt_p - iss1 <- mt_i - x1 <- mt_x - relNames <- c("mitRel") - if (gc == TRUE) { - remove(mt_p, mt_i, mt_x) + if (drop_upper_triangular == TRUE) { + tds <- tds[tds$ID1 <= tds$ID2, ] # or < if you want strictly lower triangle } - } - if (!is.null(cn_ped_matrix)) { - newColPos1 <- cn_p - iss1 <- cn_i - x1 <- cn_x - relNames <- c("cnuRel") - if (gc == TRUE) { - remove(cn_p, cn_i, cn_x) - } - } - - # Initialize the related pairs file. - df_relpairs <- initialize_empty_df(relNames = relNames) - - if (writetodisk == TRUE) { - utils::write.table( - df_relpairs, - file = rel_pairs_file, sep = ",", append = FALSE, row.names = FALSE - ) - - # initial buffer - write_buffer <- list() - - remove(df_relpairs) - } - - # Process each column. - for (j in 1L:nc) { - ID2 <- ids[j] - # Extract column indices - ncp1 <- newColPos1[j] - ncp1p <- newColPos1[j + 1L] - cond1 <- ncp1 < ncp1p - if (cond1) { - vv1 <- ncp1:(ncp1p - 1L) - iss1vv <- iss1[vv1] - } - - # Use the indices from the single matrix. - u <- sort(iss1vv) - - if (cond1) { - ID1 <- ids[u] - tds <- data.frame(ID1 = ID1, ID2 = ID2) - tds[[relNames[1]]] <- 0 - - if (cond1) { - tds[u %in% iss1vv, relNames[1]] <- x1[vv1] - } - if (drop_upper_triangular == TRUE) { - tds <- tds[tds$ID1 <= tds$ID2, ] # or < if you want strictly lower triangle - } - # Write the batch to disk or accumulate in the data frame. 
- if (nrow(tds) > 0) { - if (writetodisk == TRUE) { - write_buffer[[length(write_buffer) + 1]] <- tds - - if (length(write_buffer) >= write_buffer_size) { # Write in batches - utils::write.table(do.call(rbind, write_buffer), - file = rel_pairs_file, - row.names = FALSE, col.names = FALSE, append = TRUE, sep = "," - ) - write_buffer <- list() - } - } else { - df_relpairs <- rbind(df_relpairs, tds) + # Write the batch to disk or accumulate in the data frame. + if (nrow(tds) > 0) { + if (writetodisk == TRUE) { + write_buffer[[length(write_buffer) + 1]] <- tds + + if (length(write_buffer) >= write_buffer_size) { # Write in batches + utils::write.table(do.call(rbind, write_buffer), + file = rel_pairs_file, + row.names = FALSE, col.names = FALSE, append = TRUE, sep = "," + ) + write_buffer <- list() } + } else { + df_relpairs <- rbind(df_relpairs, tds) } } - if (verbose && (j %% update_rate == 0L)) { - cat("Done with", j, "of", nc, "\n") - } } - } else { - stop("No matrices provided") + if (verbose && (j %% update_rate == 0L)) { + cat("Done with", j, "of", nc, "\n") + } } - # If not writing to disk, return the accumulated data frame. if (writetodisk == FALSE) { return(df_relpairs) @@ -476,150 +313,234 @@ com2links <- function( row.names = FALSE, col.names = FALSE, append = TRUE, sep = "," ) } - # return(NULL) } - } else if (legacy) { - # --- Legacy Mode --- - # In legacy mode, convert matrices to the expected symmetric formats. 
- com2links.legacy( + if (gc == TRUE) { + remove(newColPos, iss, x) + } + } else { + matrix2 <- matrix(rep(1, length(ids)^2), + nrow = length(ids), + dimnames = list(ids, ids) + ) + process_two( + matrix2 = matrix, name2 = rel_name, + matrix1 = methods::as(matrix2, "CsparseMatrix"), + name1 = "phantom", + ids = ids, + nc = nc, rel_pairs_file = rel_pairs_file, - ad_ped_matrix = ad_ped_matrix, - mit_ped_matrix = mit_ped_matrix, - cn_ped_matrix = cn_ped_matrix, + writetodisk = writetodisk, + write_buffer_size = write_buffer_size, + drop_upper_triangular = drop_upper_triangular, update_rate = update_rate, verbose = verbose, - outcome_name = outcome_name + gc = gc ) - return(NULL) } +} - # --- End of Legacy Mode --- +process_all_three <- function( + mat1, name1, + mat2, name2, + mat3, name3, + ids, nc, + rel_pairs_file, + writetodisk, + write_buffer_size, + drop_upper_triangular, + update_rate, + verbose, + gc, + ...) { + # Extract matrix slots + p1 <- mat1@p + 1L + i1 <- mat1@i + 1L + x1 <- mat1@x + p2 <- mat2@p + 1L + i2 <- mat2@i + 1L + x2 <- mat2@x + p3 <- mat3@p + 1L + i3 <- mat3@i + 1L + x3 <- mat3@x + + relNames <- c(name1, name2, name3) + df_relpairs <- initialize_empty_df(relNames) + + if (writetodisk) { + utils::write.table(df_relpairs, file = rel_pairs_file, sep = ",", append = FALSE, row.names = FALSE) + write_buffer <- list() + rm(df_relpairs) + } - # Merge and write the parentage matrices - # df <- full_join(mat_ped_matrix %>% arrange(ID), pat_ped_matrix %>% arrange(ID)) + for (j in seq_len(nc)) { + ID2 <- ids[j] - # write.table(df, file = mapa_id_file, sep = ",", append = FALSE, row.names = FALSE) -} + # Get index spans + v1 <- if (p1[j] < p1[j + 1L]) { + idx <- p1[j]:(p1[j + 1L] - 1L) + list(i = i1[idx], x = x1[idx]) + } else { + NULL + } + v2 <- if (p2[j] < p2[j + 1L]) { + idx <- p2[j]:(p2[j + 1L] - 1L) + list(i = i2[idx], x = x2[idx]) + } else { + NULL + } + v3 <- if (p3[j] < p3[j + 1L]) { + idx <- p3[j]:(p3[j + 1L] - 1L) + list(i = i3[idx], x = 
x3[idx]) + } else { + NULL + } -#' Convert Pedigree Matrices to Related Pairs File (Legacy) -#' @description -#' This legacy function converts pedigree matrices into a related pairs file. -#' @inheritParams com2links + # Union of index positions + u <- sort(unique(c( + if (!is.null(v1)) v1$i else NULL, + if (!is.null(v2)) v2$i else NULL, + if (!is.null(v3)) v3$i else NULL + ))) + if (length(u) > 0) { + ID1 <- ids[u] + tds <- data.frame(ID1 = ID1, ID2 = ID2) + tds[[name1]] <- if (!is.null(v1)) ifelse(u %in% v1$i, v1$x[match(u, v1$i)], 0) else 0 + tds[[name2]] <- if (!is.null(v2)) ifelse(u %in% v2$i, v2$x[match(u, v2$i)], 0) else 0 + tds[[name3]] <- if (!is.null(v3)) ifelse(u %in% v3$i, v3$x[match(u, v3$i)], 0) else 0 + + if (drop_upper_triangular) { + tds <- tds[tds$ID1 <= tds$ID2, ] + } -com2links.legacy <- function( - rel_pairs_file = "dataRelatedPairs.csv", - ad_ped_matrix = NULL, - mit_ped_matrix = mt_ped_matrix, - mt_ped_matrix = NULL, - cn_ped_matrix = NULL, - update_rate = 500, - verbose = FALSE, - outcome_name = "data", + if (nrow(tds) > 0) { + if (writetodisk) { + write_buffer[[length(write_buffer) + 1L]] <- tds + if (length(write_buffer) >= write_buffer_size) { + utils::write.table(do.call(rbind, write_buffer), + file = rel_pairs_file, row.names = FALSE, + col.names = FALSE, append = TRUE, sep = "," + ) + write_buffer <- list() + } + } else { + df_relpairs <- rbind(df_relpairs, tds) + } + } + } + + if (verbose && (j %% update_rate == 0L)) { + cat("Done with", j, "of", nc, "\n") + } + } + + if (!writetodisk) { + return(df_relpairs) + } else if (length(write_buffer) > 0) { + utils::write.table(do.call(rbind, write_buffer), + file = rel_pairs_file, row.names = FALSE, + col.names = FALSE, append = TRUE, sep = "," + ) + } + + invisible(NULL) +} + +process_two <- function( + matrix1, name1, + matrix2, name2, + ids, nc, + rel_pairs_file, + writetodisk, + write_buffer_size, + drop_upper_triangular, + update_rate, + verbose, + gc, ...) 
{ - # --- Legacy Mode --- - if (verbose) { - message("Using legacy mode") + # Extract internal slots + p1 <- matrix1@p + 1L + i1 <- matrix1@i + 1L + x1 <- matrix1@x + p2 <- matrix2@p + 1L + i2 <- matrix2@i + 1L + x2 <- matrix2@x + + relNames <- c(name1, name2) + df_relpairs <- initialize_empty_df(relNames) + + if (writetodisk) { + utils::write.table(df_relpairs, file = rel_pairs_file, sep = ",", append = FALSE, row.names = FALSE) + write_buffer <- list() + rm(df_relpairs) + } + + for (j in seq_len(nc)) { + ID2 <- ids[j] + + # Get index/value slices + v1 <- if (p1[j] < p1[j + 1L]) { + idx <- p1[j]:(p1[j + 1L] - 1L) + list(i = i1[idx], x = x1[idx]) + } else { + NULL } - # In legacy mode, convert matrices to the expected symmetric formats. - - # load(paste0(outcome_name,'_dataBiggestCnPedigree.Rdata')) - # biggestCnPed <- methods::as(biggestCnPed, "symmetricMatrix") - # load(paste0(outcome_name,'_dataBiggestPedigree.Rdata')) - # load(paste0(outcome_name,'_dataBiggestMtPedigree.Rdata')) - - # rel_pairs_file <- paste0(outcome_name,'_datacnmitBiggestRelatedPairsTake3.csv') - - biggestMtPed <- mit_ped_matrix - remove(mit_ped_matrix) - biggestCnPed <- methods::as(cn_ped_matrix, "symmetricMatrix") - remove(cn_ped_matrix) - biggestPed <- ad_ped_matrix - remove(ad_ped_matrix) - biggestMtPed@x[biggestMtPed@x > 0] <- 1 - - # Set the output file name. - if (exists("rel_pairs_file")) { - fname <- rel_pairs_file + v2 <- if (p2[j] < p2[j + 1L]) { + idx <- p2[j]:(p2[j + 1L] - 1L) + list(i = i2[idx], x = x2[idx]) } else { - fname <- paste0(outcome_name, "_dataBiggestRelatedPairsTake2.csv") + NULL } - # Initialize the output file with headers. - ds <- data.frame(ID1 = numeric(0), ID2 = numeric(0), - addRel = numeric(0), - mitRel = numeric(0), cnuRel = numeric(0)) - - utils::write.table(ds, file = fname, sep = ",", - append = FALSE, row.names = FALSE) - - # Extract IDs from the common nuclear matrix. 
- ids <- as.numeric(dimnames(biggestCnPed)[[1]]) - - # Extract pointers from the legacy matrices. - newColPos1 <- biggestPed@p + 1L - iss1 <- biggestPed@i + 1L - newColPos2 <- biggestMtPed@p + 1L - iss2 <- biggestMtPed@i + 1L - newColPos3 <- biggestCnPed@p + 1L - iss3 <- biggestCnPed@i + 1L - nc <- ncol(biggestPed) - - # Process each individual. - for (j in 1L:nc) { - ID2 <- ids[j] - ncp1 <- newColPos1[j] - ncp1p <- newColPos1[j + 1L] - cond1 <- ncp1 < ncp1p - if (cond1) { - vv1 <- ncp1:(ncp1p - 1L) - iss1vv <- iss1[vv1] - } - ncp2 <- newColPos2[j] - ncp2p <- newColPos2[j + 1L] - cond2 <- ncp2 < ncp2p - if (cond2) { - vv2 <- ncp2:(ncp2p - 1L) - iss2vv <- iss2[vv2] - } - ncp3 <- newColPos3[j] - ncp3p <- newColPos3[j + 1L] - cond3 <- ncp3 < ncp3p - if (cond3) { - vv3 <- ncp3:(ncp3p - 1L) - iss3vv <- iss3[vv3] + + # Union of indices from both matrices + u <- sort(unique(c( + if (!is.null(v1)) v1$i else NULL, + if (!is.null(v2)) v2$i else NULL + ))) + + if (length(u) > 0) { + ID1 <- ids[u] + tds <- data.frame(ID1 = ID1, ID2 = ID2) + tds[[name1]] <- if (!is.null(v1)) ifelse(u %in% v1$i, v1$x[match(u, v1$i)], 0) else 0 + tds[[name2]] <- if (!is.null(v2)) ifelse(u %in% v2$i, v2$x[match(u, v2$i)], 0) else 0 + + if (drop_upper_triangular) { + tds <- tds[tds$ID1 <= tds$ID2, ] } - # Merge indices from all three matrices. 
- u <- sort(igraph::union(igraph::union(if (cond1) { - iss1vv - }, if (cond2) { - iss2vv - }), if (cond3) { - iss3vv - })) - # browser() - if (cond1 || cond2 || cond3) { - ID1 <- ids[u] - tds <- data.frame(ID1 = ID1, ID2 = ID2, - addRel = 0, mitRel = 0, cnuRel = 0) - if (cond1) { - tds$addRel[u %in% iss1vv] <- biggestPed@x[vv1] - } - if (cond2) { - tds$mitRel[u %in% iss2vv] <- biggestMtPed@x[vv2] - } - if (cond3) { - tds$cnuRel[u %in% iss3vv] <- biggestCnPed@x[vv3] + if (nrow(tds) > 0) { + if (writetodisk) { + write_buffer[[length(write_buffer) + 1L]] <- tds + if (length(write_buffer) >= write_buffer_size) { + utils::write.table(do.call(rbind, write_buffer), + file = rel_pairs_file, row.names = FALSE, + col.names = FALSE, append = TRUE, sep = "," + ) + write_buffer <- list() + } + } else { + df_relpairs <- rbind(df_relpairs, tds) } - utils::write.table(tds, file = fname, row.names = FALSE, - col.names = FALSE, append = TRUE, sep = ",") - } - if (!(j %% update_rate)) { - cat(paste0("Done with ", j, " of ", nc, "\n")) } } - return(NULL) + + if (verbose && (j %% update_rate == 0L)) { + cat("Done with", j, "of", nc, "\n") + } + } + + if (!writetodisk) { + return(df_relpairs) + } else if (length(write_buffer) > 0) { + utils::write.table(do.call(rbind, write_buffer), + file = rel_pairs_file, row.names = FALSE, + col.names = FALSE, append = TRUE, sep = "," + ) } + invisible(NULL) +} + + #' @title validate_and_convert_matrix #' @description #' This function validates and converts a matrix to a specific format. @@ -631,11 +552,14 @@ com2links.legacy <- function( #' #' @return The validated and converted matrix. 
validate_and_convert_matrix <- function(mat, name, ensure_symmetric = FALSE, force_binary = FALSE) { - if (!inherits(mat, c("matrix", "dgCMatrix", "dsCMatrix"))) { - stop(paste0("The '", name, "' must be a matrix or dgCMatrix.")) + if (!inherits(mat, c( + "matrix", "dgCMatrix", "dsCMatrix", "generalMatrix", + "symmetricMatrix", "triangularMatrix", "dsyMatrix", "dspMatrix", "dsyMatrix", "CsparseMatrix" + ))) { + stop(paste0("The '", name, "' must be a matrix or generalMatrix")) } - if (!inherits(mat, "dgCMatrix")) { - mat <- methods::as(mat, if (ensure_symmetric) "symmetricMatrix" else "dgCMatrix") + if (!inherits(mat, "generalMatrix")) { + mat <- methods::as(mat, if (ensure_symmetric) "symmetricMatrix" else "generalMatrix") } if (force_binary) { mat@x[mat@x > 0] <- 1 diff --git a/R/makeLinkslegacy.R b/R/makeLinkslegacy.R new file mode 100644 index 00000000..23265d05 --- /dev/null +++ b/R/makeLinkslegacy.R @@ -0,0 +1,630 @@ +#' Convert Sparse Relationship Matrices to Kinship Links +#' @inheritParams com2links +#' @keywords internal + + +com2links.legacy <- function( + rel_pairs_file = "dataRelatedPairs.csv", + ad_ped_matrix = NULL, + mit_ped_matrix = mt_ped_matrix, + mt_ped_matrix = NULL, + cn_ped_matrix = NULL, + # pat_ped_matrix = NULL, + # mat_ped_matrix = NULL, + # mapa_id_file = "data_mapaID.csv", + write_buffer_size = 1000, + update_rate = 1000, + gc = TRUE, + writetodisk = TRUE, + verbose = FALSE, + legacy = FALSE, + outcome_name = "data", + drop_upper_triangular = TRUE, + ...) { + # Non-legacy mode processing + + if (!legacy) { + # --- Input Validations and Preprocessing --- + + # Ensure that at least one relationship matrix is provided. + if (is.null(ad_ped_matrix) && is.null(mit_ped_matrix) && is.null(cn_ped_matrix)) { + stop("At least one of 'ad_ped_matrix', 'mit_ped_matrix', or 'cn_ped_matrix' must be provided.") + } + # Validate and convert ad_ped_matrix to a sparse dgCMatrix if provided. 
+ if (!is.null(ad_ped_matrix)) { + ad_ped_matrix <- validate_and_convert_matrix( + mat = ad_ped_matrix, + name = "ad_ped_matrix" + ) + } + + # Validate and convert cn_ped_matrix to a sparse dgCMatrix if provided. + if (!is.null(cn_ped_matrix)) { + cn_ped_matrix <- validate_and_convert_matrix( + mat = cn_ped_matrix, + name = "cn_ped_matrix", + ensure_symmetric = TRUE + ) + } + + # Validate and process mit_ped_matrix: convert and ensure binary values. + if (!is.null(mit_ped_matrix)) { + mit_ped_matrix <- validate_and_convert_matrix( + mat = mit_ped_matrix, + name = "mit_ped_matrix", force_binary = TRUE, + ensure_symmetric = TRUE + ) + } + + # --- Build IDs and Prepare Matrix Pointers --- + + # Extract individual IDs from the first available matrix. + ids <- NULL + + + + if (!is.null(cn_ped_matrix)) { + ids <- as.numeric(dimnames(cn_ped_matrix)[[1]]) + nc <- ncol(cn_ped_matrix) + } else if (!is.null(ad_ped_matrix)) { + ids <- as.numeric(dimnames(ad_ped_matrix)[[1]]) + nc <- ncol(ad_ped_matrix) + } else if (!is.null(mit_ped_matrix)) { + ids <- as.numeric(dimnames(mit_ped_matrix)[[1]]) + nc <- ncol(mit_ped_matrix) + } + + if (is.null(ids)) { + stop("Could not extract IDs from the provided matrices.") + } + + + # Construct case identifier + matrix_case <- paste(sort(c( + if (!is.null(ad_ped_matrix)) "ad" else NULL, + if (!is.null(mit_ped_matrix)) "mt" else NULL, + if (!is.null(cn_ped_matrix)) "cn" else NULL + )), collapse = "-") + + + # Count how many matrices are provided. + sum_nulls <- sum(!is.null(ad_ped_matrix), + !is.null(mit_ped_matrix), + !is.null(cn_ped_matrix), + na.rm = TRUE + ) + if (verbose) { + print(matrix_case) + } + + # Extract the internal pointers (p, i, and x slots) for each provided matrix. 
+ if (!is.null(ad_ped_matrix)) { + ad_ped_p <- ad_ped_matrix@p + 1L + ad_ped_i <- ad_ped_matrix@i + 1L + ad_ped_x <- ad_ped_matrix@x + } + if (!is.null(mit_ped_matrix)) { + mt_p <- mit_ped_matrix@p + 1L + mt_i <- mit_ped_matrix@i + 1L + mt_x <- mit_ped_matrix@x + } + if (!is.null(cn_ped_matrix)) { + cn_p <- cn_ped_matrix@p + 1L + cn_i <- cn_ped_matrix@i + 1L + cn_x <- cn_ped_matrix@x + } + + # --- Process Based on the Number of Provided Matrices --- + # --- Case: All Three Matrices Provided --- + if (sum_nulls == 3) { + # Set pointers for all three matrices. + newColPos1 <- ad_ped_p + iss1 <- ad_ped_i + x1 <- ad_ped_x + + newColPos2 <- mt_p + iss2 <- mt_i + x2 <- mt_x + + newColPos3 <- cn_p + iss3 <- cn_i + x3 <- cn_x + + # Define relationship column names. + relNames <- c("addRel", "mitRel", "cnuRel") + + # Optionally remove the original pointers to free memory. + if (gc == TRUE) { + remove(ad_ped_p, ad_ped_i, ad_ped_x, mt_p, mt_i, mt_x, cn_p, cn_i, cn_x) + } + if (verbose) { + message("All 3 matrix is present") + } + + # File names + # rel_pairs_file <- paste0(outcome_name, "_dataRelatedPairs.csv") + # mapa_id_file <- paste0(outcome_name, "_data_mapaID.csv") + + # Initialize the related pairs file with headers. + df_relpairs <- initialize_empty_df(relNames = relNames) + + # Write the headers to the related pairs file. + if (writetodisk == TRUE) { + utils::write.table( + df_relpairs, + file = rel_pairs_file, sep = ",", append = FALSE, row.names = FALSE + ) + + # Prepare an empty buffer for batching writes. + write_buffer <- list() + remove(df_relpairs) + } + + # Loop over each column (individual) in the matrix. 
+ for (j in 1L:nc) { + ID2 <- ids[j] + + # Extract column indices for the 1st component + ncp1 <- newColPos1[j] + ncp1p <- newColPos1[j + 1L] + cond1 <- ncp1 < ncp1p + if (cond1) { + vv1 <- ncp1:(ncp1p - 1L) + iss1vv <- iss1[vv1] + } + # Extract indices for the 2nd component + ncp2 <- newColPos2[j] + ncp2p <- newColPos2[j + 1L] + cond2 <- ncp2 < ncp2p + if (cond2) { + vv2 <- ncp2:(ncp2p - 1L) + iss2vv <- iss2[vv2] + } + + # Extract indices for the 3rd component + ncp3 <- newColPos3[j] + ncp3p <- newColPos3[j + 1L] + cond3 <- ncp3 < ncp3p + if (cond3) { + vv3 <- ncp3:(ncp3p - 1L) + iss3vv <- iss3[vv3] + } + + # Create a unique, sorted set of row indices from all provided matrices. + u <- sort(igraph::union(igraph::union(if (cond1) { + iss1vv + }, if (cond2) { + iss2vv + }), if (cond3) { + iss3vv + })) + + # If any relationships exist for this individual, build the related pairs. + if (cond1 || cond2 || cond3) { + ID1 <- ids[u] + tds <- data.frame(ID1 = ID1, ID2 = ID2) + tds[[relNames[1]]] <- 0 + tds[[relNames[2]]] <- 0 + tds[[relNames[3]]] <- 0 + + # Assign the relationship coefficients from each matrix. + if (cond1) { + tds[u %in% iss1vv, relNames[1]] <- x1[vv1] + } + if (cond2) { + tds[u %in% iss2vv, relNames[2]] <- x2[vv2] + } + if (cond3) { + tds[u %in% iss3vv, relNames[3]] <- x3[vv3] + } + + # Optionally drop upper-triangular entries. + if (drop_upper_triangular == TRUE) { + tds <- tds[tds$ID1 <= tds$ID2, ] # or < if you want strictly lower triangle + } + + # Write the batch to disk or accumulate in the data frame. 
+ if (nrow(tds) > 0) { + if (writetodisk == TRUE) { + write_buffer[[length(write_buffer) + 1]] <- tds + + if (length(write_buffer) >= write_buffer_size) { # Write in batches + utils::write.table(do.call(rbind, write_buffer), + file = rel_pairs_file, + row.names = FALSE, col.names = FALSE, append = TRUE, sep = "," + ) + write_buffer <- list() + } + } else { + df_relpairs <- rbind(df_relpairs, tds) + } + } + } + if (verbose && (j %% update_rate == 0L)) { + cat("Done with", j, "of", nc, "\n") + } + } + } else if (sum_nulls == 2) { + # --- Case: Two Matrices Provided --- + # Set pointers and relationship names based on which matrix is missing. + + if (is.null(ad_ped_matrix)) { + newColPos1 <- mt_p + iss1 <- mt_i + x1 <- mt_x + newColPos2 <- cn_p + iss2 <- cn_i + x2 <- cn_x + relNames <- c("mitRel", "cnuRel") + if (gc == TRUE) { + remove(mt_p, mt_i, mt_x, cn_p, cn_i, cn_x) + } + } + if (is.null(mit_ped_matrix)) { + newColPos1 <- ad_ped_p + iss1 <- ad_ped_i + x1 <- ad_ped_x + newColPos2 <- cn_p + iss2 <- cn_i + x2 <- cn_x + relNames <- c("addRel", "cnuRel") + if (gc == TRUE) { + remove(ad_ped_p, ad_ped_i, ad_ped_x, cn_p, cn_i, cn_x) + } + } + if (is.null(cn_ped_matrix)) { + newColPos1 <- ad_ped_p + iss1 <- ad_ped_i + x1 <- ad_ped_x + newColPos2 <- mt_p + iss2 <- mt_i + x2 <- mt_x + relNames <- c("addRel", "mitRel") + if (gc == TRUE) { + remove(ad_ped_p, ad_ped_i, ad_ped_x, mt_p, mt_i, mt_x) + } + } + + # Initialize the related pairs file with the appropriate headers. + df_relpairs <- initialize_empty_df(relNames = relNames) + + if (writetodisk == TRUE) { + utils::write.table( + df_relpairs, + file = rel_pairs_file, sep = ",", append = FALSE, row.names = FALSE + ) + # initial buffer + write_buffer <- list() + remove(df_relpairs) + } + + # Process each column to extract relationships. + for (j in 1L:nc) { + ID2 <- ids[j] + + # Extract indices from the first matrix. 
+ ncp1 <- newColPos1[j] + ncp1p <- newColPos1[j + 1L] + cond1 <- ncp1 < ncp1p + if (cond1) { + vv1 <- ncp1:(ncp1p - 1L) + iss1vv <- iss1[vv1] + } + # Extract indices from the second matrix. + ncp2 <- newColPos2[j] + ncp2p <- newColPos2[j + 1L] + cond2 <- ncp2 < ncp2p + if (cond2) { + vv2 <- ncp2:(ncp2p - 1L) + iss2vv <- iss2[vv2] + } + + # Merge the indices from both matrices. + u <- sort(igraph::union(if (cond1) { + iss1vv + }, if (cond2) { + iss2vv + })) + + # Create related pairs if relationships are found. + if (cond1 || cond2) { + ID1 <- ids[u] + tds <- data.frame(ID1 = ID1, ID2 = ID2) + tds[[relNames[1]]] <- 0 + tds[[relNames[2]]] <- 0 + + if (cond1) { + tds[u %in% iss1vv, relNames[1]] <- x1[vv1] + } + if (cond2) { + tds[u %in% iss2vv, relNames[2]] <- x2[vv2] + } + if (drop_upper_triangular == TRUE) { + tds <- tds[tds$ID1 <= tds$ID2, ] # or < if you want strictly lower triangle + } + + # Write the batch to disk or accumulate in the data frame. + if (nrow(tds) > 0) { + if (writetodisk == TRUE) { + write_buffer[[length(write_buffer) + 1]] <- tds + + if (length(write_buffer) >= write_buffer_size) { # Write in batches + utils::write.table(do.call(rbind, write_buffer), + file = rel_pairs_file, + row.names = FALSE, col.names = FALSE, append = TRUE, sep = "," + ) + write_buffer <- list() + } + } else { + df_relpairs <- rbind(df_relpairs, tds) + } + } + } + if (verbose && (j %% update_rate == 0L)) { + cat("Done with", j, "of", nc, "\n") + } + } + } else if (sum_nulls == 1) { + # --- Case: Only One Matrix Provided --- + if (verbose) { + message("Only one matrix is present") + } + if (!is.null(ad_ped_matrix)) { + newColPos1 <- ad_ped_p + iss1 <- ad_ped_i + x1 <- ad_ped_x + relNames <- c("addRel") + if (gc == TRUE) { + remove(ad_ped_p, ad_ped_i, ad_ped_x) + } + } + if (!is.null(mit_ped_matrix)) { + newColPos1 <- mt_p + iss1 <- mt_i + x1 <- mt_x + relNames <- c("mitRel") + if (gc == TRUE) { + remove(mt_p, mt_i, mt_x) + } + } + if (!is.null(cn_ped_matrix)) { + newColPos1 
<- cn_p + iss1 <- cn_i + x1 <- cn_x + relNames <- c("cnuRel") + if (gc == TRUE) { + remove(cn_p, cn_i, cn_x) + } + } + + # Initialize the related pairs file. + df_relpairs <- initialize_empty_df(relNames = relNames) + + if (writetodisk == TRUE) { + utils::write.table( + df_relpairs, + file = rel_pairs_file, sep = ",", append = FALSE, row.names = FALSE + ) + + # initial buffer + write_buffer <- list() + + remove(df_relpairs) + } + + # Process each column. + for (j in 1L:nc) { + ID2 <- ids[j] + # Extract column indices + ncp1 <- newColPos1[j] + ncp1p <- newColPos1[j + 1L] + cond1 <- ncp1 < ncp1p + if (cond1) { + vv1 <- ncp1:(ncp1p - 1L) + iss1vv <- iss1[vv1] + } + + # Use the indices from the single matrix. + u <- sort(iss1vv) + + if (cond1) { + ID1 <- ids[u] + tds <- data.frame(ID1 = ID1, ID2 = ID2) + tds[[relNames[1]]] <- 0 + + if (cond1) { + tds[u %in% iss1vv, relNames[1]] <- x1[vv1] + } + if (drop_upper_triangular == TRUE) { + tds <- tds[tds$ID1 <= tds$ID2, ] # or < if you want strictly lower triangle + } + + # Write the batch to disk or accumulate in the data frame. + if (nrow(tds) > 0) { + if (writetodisk == TRUE) { + write_buffer[[length(write_buffer) + 1]] <- tds + + if (length(write_buffer) >= write_buffer_size) { # Write in batches + utils::write.table(do.call(rbind, write_buffer), + file = rel_pairs_file, + row.names = FALSE, col.names = FALSE, append = TRUE, sep = "," + ) + write_buffer <- list() + } + } else { + df_relpairs <- rbind(df_relpairs, tds) + } + } + } + if (verbose && (j %% update_rate == 0L)) { + cat("Done with", j, "of", nc, "\n") + } + } + } else { + stop("No matrices provided") + } + + # If not writing to disk, return the accumulated data frame. + if (writetodisk == FALSE) { + return(df_relpairs) + } else { + # Write any remaining buffered rows. 
+ if (length(write_buffer) > 0) { + utils::write.table(do.call(rbind, write_buffer), + file = rel_pairs_file, + row.names = FALSE, col.names = FALSE, append = TRUE, sep = "," + ) + } + # return(NULL) + } + } else if (legacy) { + # --- Legacy Mode --- + # In legacy mode, convert matrices to the expected symmetric formats. + com2links.og( + rel_pairs_file = rel_pairs_file, + ad_ped_matrix = ad_ped_matrix, + mit_ped_matrix = mit_ped_matrix, + cn_ped_matrix = cn_ped_matrix, + update_rate = update_rate, + verbose = verbose, + outcome_name = outcome_name + ) + return(NULL) + } + + # --- End of Legacy Mode --- + + # Merge and write the parentage matrices + # df <- full_join(mat_ped_matrix %>% arrange(ID), pat_ped_matrix %>% arrange(ID)) + + # write.table(df, file = mapa_id_file, sep = ",", append = FALSE, row.names = FALSE) +} + +#' Convert Pedigree Matrices to Related Pairs File (Legacy) +#' @description +#' This legacy function converts pedigree matrices into a related pairs file. +#' @inheritParams com2links +#' @keywords internal + + +com2links.og <- function( + rel_pairs_file = "dataRelatedPairs.csv", + ad_ped_matrix = NULL, + mit_ped_matrix = mt_ped_matrix, + mt_ped_matrix = NULL, + cn_ped_matrix = NULL, + update_rate = 500, + verbose = FALSE, + outcome_name = "data", + ...) { + # --- Legacy Mode --- + if (verbose) { + message("Using legacy mode") + } + # In legacy mode, convert matrices to the expected symmetric formats. 
+ + # load(paste0(outcome_name,'_dataBiggestCnPedigree.Rdata')) + # biggestCnPed <- methods::as(biggestCnPed, "symmetricMatrix") + # load(paste0(outcome_name,'_dataBiggestPedigree.Rdata')) + # load(paste0(outcome_name,'_dataBiggestMtPedigree.Rdata')) + + # rel_pairs_file <- paste0(outcome_name,'_datacnmitBiggestRelatedPairsTake3.csv') + + biggestMtPed <- mit_ped_matrix + remove(mit_ped_matrix) + biggestCnPed <- methods::as(cn_ped_matrix, "symmetricMatrix") + remove(cn_ped_matrix) + biggestPed <- ad_ped_matrix + remove(ad_ped_matrix) + biggestMtPed@x[biggestMtPed@x > 0] <- 1 + + # Set the output file name. + if (exists("rel_pairs_file")) { + fname <- rel_pairs_file + } else { + fname <- paste0(outcome_name, "_dataBiggestRelatedPairsTake2.csv") + } + # Initialize the output file with headers. + ds <- data.frame( + ID1 = numeric(0), ID2 = numeric(0), + addRel = numeric(0), + mitRel = numeric(0), cnuRel = numeric(0) + ) + + utils::write.table(ds, + file = fname, sep = ",", + append = FALSE, row.names = FALSE + ) + + # Extract IDs from the common nuclear matrix. + ids <- as.numeric(dimnames(biggestCnPed)[[1]]) + + # Extract pointers from the legacy matrices. + newColPos1 <- biggestPed@p + 1L + iss1 <- biggestPed@i + 1L + newColPos2 <- biggestMtPed@p + 1L + iss2 <- biggestMtPed@i + 1L + newColPos3 <- biggestCnPed@p + 1L + iss3 <- biggestCnPed@i + 1L + nc <- ncol(biggestPed) + + # Process each individual. + for (j in 1L:nc) { + ID2 <- ids[j] + ncp1 <- newColPos1[j] + ncp1p <- newColPos1[j + 1L] + cond1 <- ncp1 < ncp1p + if (cond1) { + vv1 <- ncp1:(ncp1p - 1L) + iss1vv <- iss1[vv1] + } + ncp2 <- newColPos2[j] + ncp2p <- newColPos2[j + 1L] + cond2 <- ncp2 < ncp2p + if (cond2) { + vv2 <- ncp2:(ncp2p - 1L) + iss2vv <- iss2[vv2] + } + ncp3 <- newColPos3[j] + ncp3p <- newColPos3[j + 1L] + cond3 <- ncp3 < ncp3p + if (cond3) { + vv3 <- ncp3:(ncp3p - 1L) + iss3vv <- iss3[vv3] + } + + # Merge indices from all three matrices. 
+ u <- sort(igraph::union(igraph::union(if (cond1) { + iss1vv + }, if (cond2) { + iss2vv + }), if (cond3) { + iss3vv + })) + # browser() + if (cond1 || cond2 || cond3) { + ID1 <- ids[u] + tds <- data.frame( + ID1 = ID1, ID2 = ID2, + addRel = 0, mitRel = 0, cnuRel = 0 + ) + if (cond1) { + tds$addRel[u %in% iss1vv] <- biggestPed@x[vv1] + } + if (cond2) { + tds$mitRel[u %in% iss2vv] <- biggestMtPed@x[vv2] + } + if (cond3) { + tds$cnuRel[u %in% iss3vv] <- biggestCnPed@x[vv3] + } + utils::write.table(tds, + file = fname, row.names = FALSE, + col.names = FALSE, append = TRUE, sep = "," + ) + } + if (!(j %% update_rate)) { + cat(paste0("Done with ", j, " of ", nc, "\n")) + } + } + return(NULL) +} diff --git a/R/plotPedigree.R b/R/plotPedigree.R index 051610e6..5b2229cd 100644 --- a/R/plotPedigree.R +++ b/R/plotPedigree.R @@ -25,7 +25,7 @@ plotPedigree <- function(ped, pconnect = .5, ...) { # Standardize column names in the input dataframe - ped <- standardizeColnames(ped) + ped <- standardizeColnames(ped, verbose = verbose) # Define required columns simulated_vars <- c("fam", "ID", "dadID", "momID", "sex") diff --git a/R/readGedcom.R b/R/readGedcom.R index d6858e58..7181fac4 100644 --- a/R/readGedcom.R +++ b/R/readGedcom.R @@ -1,7 +1,6 @@ #' Read a GEDCOM File #' #' This function reads a GEDCOM file and parses it into a structured data frame of individuals. -#' Inspired by https://raw.githubusercontent.com/jjfitz/readgedcom/master/R/read_gedcom.R #' #' @param file_path The path to the GEDCOM file. #' @param add_parents A logical value indicating whether to add parents to the data frame. @@ -9,6 +8,8 @@ #' @param combine_cols A logical value indicating whether to combine columns with duplicate values. #' @param verbose A logical value indicating whether to print messages. #' @param skinny A logical value indicating whether to return a skinny data frame. +#' @param update_rate numeric. 
The rate at which to print progress +#' @param post_process A logical value indicating whether to post-process the data frame. #' @param ... Additional arguments to be passed to the function. #' @return A data frame containing information about individuals, with the following potential columns: #' - `id`: ID of the individual @@ -53,288 +54,432 @@ readGedcom <- function(file_path, remove_empty_cols = TRUE, combine_cols = TRUE, skinny = FALSE, + update_rate = 1000, + post_process = TRUE, ...) { - # Checks - if (!file.exists(file_path)) stop("File does not exist: ", file_path) - if (verbose) { - print(paste("Reading file:", file_path)) - } - file <- data.frame(X1 = readLines(file_path)) - file_length <- nrow(file) - if (verbose) { - print(paste0("File is ", file_length, " lines long")) + # Ensure the file exists and read all lines. + if (!file.exists(file_path)) { + stop("File does not exist: ", file_path) } + if (verbose) message("Reading file: ", file_path) + lines <- readLines(file_path) + total_lines <- length(lines) + if (verbose) message("File is ", total_lines, " lines long") - # Count the number of rows containing specific patterns - num_rows <- countPatternRows(file) + # Count pattern occurrences (pattern_rows remains used in subfunctions) + pattern_rows <- countPatternRows(data.frame(X1 = lines)) # List of variables to initialize - var_names <- list( + all_var_names <- unlist(list( identifiers = c("id", "momID", "dadID"), names = c( - "name", "name_given", "name_given_pieces", - "name_surn", "name_surn_pieces", "name_marriedsurn", "name_nick", "name_npfx", "name_nsfx" + "name", "name_given", "name_given_pieces", "name_surn", "name_surn_pieces", "name_marriedsurn", + "name_nick", "name_npfx", "name_nsfx" ), sex = c("sex"), birth = c("birth_date", "birth_lat", "birth_long", "birth_place"), death = c("death_caus", "death_date", "death_lat", "death_long", "death_place"), attributes = c( - "attribute_caste", "attribute_children", "attribute_description", - 
"attribute_education", "attribute_idnumber", "attribute_marriages", - "attribute_nationality", "attribute_occupation", - "attribute_property", "attribute_religion", "attribute_residence", - "attribute_ssn", "attribute_title" + "attribute_caste", "attribute_children", "attribute_description", "attribute_education", + "attribute_idnumber", "attribute_marriages", "attribute_nationality", "attribute_occupation", + "attribute_property", "attribute_religion", "attribute_residence", "attribute_ssn", + "attribute_title" ), relationships = c("FAMC", "FAMS") + ), use.names = FALSE) + + # Split the file into blocks; each block corresponds to one individual. + blocks <- splitIndividuals(lines, verbose) + + # Parse each individual block into a record (a named list) + records <- lapply(blocks, parseIndividualBlock, + pattern_rows = pattern_rows, + all_var_names = all_var_names, verbose = verbose ) - all_var_names <- unlist(var_names, use.names = FALSE) + # Remove any NULLs (if a block did not contain an individual id) + records <- Filter(Negate(is.null), records) - # Initialize all variables to NA - vars <- stats::setNames(as.list(rep(NA_character_, length(all_var_names))), all_var_names) + if (length(records) == 0) { + warning("No people found in file") + return(NULL) + } - df_temp <- as.data.frame(matrix(nrow = 1, ncol = length(all_var_names))) - names(df_temp) <- all_var_names + # Convert the list of records to a data frame. + df_temp <- do.call(rbind, lapply(records, function(rec) { + as.data.frame(rec, stringsAsFactors = FALSE) + })) + + if (verbose) message("File has ", nrow(df_temp), " people") + + # Run post-processing if requested. 
+ if (post_process) { + if (verbose) message("Post-processing data frame") + df_temp <- postProcessGedcom( + df_temp = df_temp, + remove_empty_cols = remove_empty_cols, + combine_cols = combine_cols, + add_parents = add_parents, + skinny = skinny, + verbose = verbose + ) + } - if (verbose) { - print("Parsing GEDCOM file") + return(df_temp) +} + +# --- SUBFUNCTIONS --- +#' Split GEDCOM Lines into Individual Blocks +#' +#' This function partitions the GEDCOM file (as a vector of lines) into a list of blocks, +#' where each block corresponds to a single individual starting with an "@ INDI" line. +#' +#' @param lines A character vector of lines from the GEDCOM file. +#' @param verbose Logical indicating whether to output progress messages. +#' @return A list of character vectors, each representing one individual. +splitIndividuals <- function(lines, verbose = FALSE) { + indi_idx <- grep("@ INDI", lines) + if (length(indi_idx) == 0) { + return(list()) } - for (i in 1:length(file[1][[1]])) { - tmpv <- file[1][[1]][[i]] - if (grepl("@ INDI", tmpv)) { - line_to_write <- as.data.frame(vars) - df_temp <- rbind(df_temp, line_to_write) + blocks <- list() + for (i in seq_along(indi_idx)) { + start <- indi_idx[i] + end <- if (i < length(indi_idx)) indi_idx[i + 1] - 1 else length(lines) + block <- lines[start:end] + blocks[[length(blocks) + 1]] <- block + } + if (verbose) message("Found ", length(blocks), " individual blocks") + return(blocks) +} + +#' Initialize an Empty Individual Record +#' +#' Creates a named list with all GEDCOM fields set to NA. +#' +#' @param all_var_names A character vector of variable names. +#' @return A named list representing an empty individual record. 
+initializeRecord <- function(all_var_names) { + stats::setNames(as.list(rep(NA_character_, length(all_var_names))), all_var_names) +} - # Reset all variables to NA - vars <- stats::setNames(as.list(rep(NA_character_, length(all_var_names))), all_var_names) +#' Parse a GEDCOM Individual Block +#' +#' Processes a block of GEDCOM lines corresponding to a single individual. +#' +#' @param block A character vector containing the GEDCOM lines for one individual. +#' @param pattern_rows A list with counts of lines matching specific GEDCOM tags. +#' @param all_var_names A character vector of variable names. +#' @param verbose Logical indicating whether to print progress messages. +#' @return A named list representing the parsed record for the individual, or NULL if no ID is found. +#' @keywords internal +parseIndividualBlock <- function(block, pattern_rows, all_var_names, verbose = FALSE) { + record <- initializeRecord(all_var_names) + n_lines <- length(block) + + # Loop through the block by index so that we can look ahead for event details. + i <- 1 + while (i <= n_lines) { + line <- block[i] + + # Process individual identifier (e.g., "@ INDI ...") + if (grepl("@ INDI", line)) { + record$id <- stringr::str_extract(line, "(?<=@.)\\d*(?=@)") + i <- i + 1 + next + } - vars$id <- stringr::str_extract(tmpv, "(?<=@.)\\d*(?=@)") + # Special processing for full name using " NAME" tag. + if (grepl(" NAME", line) && pattern_rows$num_name_rows > 0) { + record <- parseNameLine(line, record) + i <- i + 1 next } - # names - if (num_rows$num_name_rows > 0 && grepl(" NAME", tmpv)) { - vars$name <- extract_info(tmpv, "NAME") - vars$name_given <- stringr::str_extract(vars$name, ".*(?= /)") - vars$name_surn <- stringr::str_extract(vars$name, "(?<=/).*(?=/)") - vars$name <- stringr::str_squish(stringr::str_replace(vars$name, "/", " ")) + # Process birth and death events by consuming multiple lines. 
+ if (grepl(" BIRT", line) && pattern_rows$num_birt_rows > 0) { + record <- processEventLine("birth", block, i, record, pattern_rows) + i <- i + 1 # Skip further processing of this line. next } - # PERSONAL_NAME_PIECES := NAME | NPFX | GIVN | NICK | SPFX | SURN | NSFX - result <- process_tag("GIVN", "name_given_pieces", num_rows, tmpv, vars) - vars <- result$vars - if (result$matched) next - - # npfx := Name Prefix - result <- process_tag("NPFX", "name_npfx", num_rows, tmpv, vars) - vars <- result$vars - if (result$matched) next - - # NICK := Nickname - result <- process_tag("NICK", "name_nick", num_rows, tmpv, vars) - vars <- result$vars - if (result$matched) next - - # surn := Surname - result <- process_tag("SURN", "name_surn_pieces", num_rows, tmpv, vars) - vars <- result$vars - if (result$matched) next - - # nsfx := Name suffix - result <- process_tag("NSFX", "name_nsfx", num_rows, tmpv, vars) - vars <- result$vars - if (result$matched) next - - result <- process_tag("_MARNM", "name_marriedsurn", num_rows, tmpv, vars) - vars <- result$vars - if (result$matched) next - - # Birth event related information - if (num_rows$num_birt_rows > 0 && grepl(" BIRT", tmpv)) { - if (num_rows$num_date_rows > 0 && i + 1 <= file_length) { - vars$birth_date <- extract_info(file[1][[1]][[i + 1]], "DATE") - if (num_rows$num_plac_rows > 0 && i + 2 <= file_length) { - vars$birth_place <- extract_info(file[1][[1]][[i + 2]], "PLAC") - if (num_rows$num_lati_rows > 0 && i + 4 <= file_length) { - vars$birth_lat <- extract_info(file[1][[1]][[i + 4]], "LATI") - if (num_rows$num_long_rows > 0 && i + 5 <= file_length) { - vars$birth_long <- extract_info(file[1][[1]][[i + 5]], "LONG") - } - } - } - } + if (grepl(" DEAT", line) && pattern_rows$num_deat_rows > 0) { + record <- processEventLine("death", block, i, record, pattern_rows) + i <- i + 1 next } - # Death event related information - # the ifs are nested so that there is no need to check if you've already run out of - if 
(num_rows$num_deat_rows > 0 && grepl(" DEAT", tmpv)) { - if (num_rows$num_date_rows > 0 && i + 1 <= file_length) { - vars$death_date <- extract_info(file[1][[1]][[i + 1]], "DATE") - if (num_rows$num_plac_rows > 0 && i + 2 <= file_length) { - vars$death_place <- extract_info(file[1][[1]][[i + 2]], "PLAC") - if (num_rows$num_caus_rows > 0 && i + 3 <= file_length) { - vars$death_caus <- extract_info(file[1][[1]][[i + 3]], "CAUS") - if (num_rows$num_lati_rows > 0 && i + 4 <= file_length) { - vars$death_lat <- extract_info(file[1][[1]][[i + 4]], "LATI") - if (num_rows$num_long_rows > 0 && i + 5 <= file_length) { - vars$death_long <- extract_info(file[1][[1]][[i + 5]], "LONG") - } - } - } - } - } + # Process other tags using common mappings. + # Define mappings for name pieces (if not handled by NAME tag). + name_piece_mappings <- list( + list(tag = "GIVN", field = "name_given_pieces", mode = "replace"), + list(tag = "NPFX", field = "name_npfx", mode = "replace"), + list(tag = "NICK", field = "name_nick", mode = "replace"), + list(tag = "SURN", field = "name_surn_pieces", mode = "replace"), + list(tag = "NSFX", field = "name_nsfx", mode = "replace"), + list(tag = "_MARNM", field = "name_marriedsurn", mode = "replace") + ) + out <- applyTagMappings(line, record, pattern_rows, name_piece_mappings) + if (out$matched) { + record <- out$record + i <- i + 1 next } + # Process attribute tags. 
+ attribute_mappings <- list( + list(tag = "SEX", field = "sex", mode = "replace"), + list(tag = "CAST", field = "attribute_caste", mode = "replace"), + list(tag = "DSCR", field = "attribute_description", mode = "replace"), + list(tag = "EDUC", field = "attribute_education", mode = "replace"), + list(tag = "IDNO", field = "attribute_idnumber", mode = "replace"), + list(tag = "NATI", field = "attribute_nationality", mode = "replace"), + list(tag = "NCHI", field = "attribute_children", mode = "replace"), + list(tag = "NMR", field = "attribute_marriages", mode = "replace"), + list(tag = "OCCU", field = "attribute_occupation", mode = "replace"), + list(tag = "PROP", field = "attribute_property", mode = "replace"), + list(tag = "RELI", field = "attribute_religion", mode = "replace"), + list(tag = "RESI", field = "attribute_residence", mode = "replace"), + list(tag = "SSN", field = "attribute_ssn", mode = "replace"), + list(tag = "TITL", field = "attribute_title", mode = "replace") + ) + out <- applyTagMappings(line, record, pattern_rows, attribute_mappings) + if (out$matched) { + record <- out$record + i <- i + 1 + next + } - # Attribute tags using process_tag() - for (tag_field in list( - c("SEX", "sex"), - - # CAST caste - # g7:CAST The name of an individual’s rank or status in society which is sometimes based on racial or religious differences, or differences in wealth, inherited rank, profession, or occupation. - c("CAST", "attribute_caste"), - - # DSCR physical description - # g7:DSCR The physical characteristics of a person. - c("DSCR", "attribute_description"), - - # EDUC education - # g7:EDUC Indicator of a level of education attained. - c("EDUC", "attribute_education"), - - # IDNO identifying number - # g7:IDNO A number or other string assigned to identify a person within some significant external system. It must have a TYPE substructure to define what kind of identification number is being provided. 
    # Relationship pointers: FAMC/FAMS use a custom extractor to pull the
    # numeric ID out of the "@...@" cross-reference, and use append mode
    # because one person can belong to several families.
    relationship_mappings <- list(
      list(
        tag = "FAMC", field = "FAMC", mode = "append",
        extractor = function(x) stringr::str_extract(x, "(?<=@.)\\d*(?=@)")
      ),
      list(
        tag = "FAMS", field = "FAMS", mode = "append",
        extractor = function(x) stringr::str_extract(x, "(?<=@.)\\d*(?=@)")
      )
    )
    out <- applyTagMappings(line, record, pattern_rows, relationship_mappings)
    if (out$matched) {
      record <- out$record
      i <- i + 1
      next
    }

    # Unrecognized line: just advance to the next one.
    i <- i + 1
  }

  # A record without an ID is unusable downstream; signal that with NULL.
  if (is.na(record$id)) {
    return(NULL)
  }
  return(record)
}

#' Parse a Full Name Line
#'
#' Extracts full name information from a GEDCOM "NAME" line and updates the
#' record accordingly. GEDCOM encloses the surname in slashes, e.g.
#' \code{John /Smith/}.
#'
#' @param line A character string containing the name line.
#' @param record A named list representing the individual's record.
#' @return The updated record with parsed name information.
#' @keywords internal
parseNameLine <- function(line, record) {
  record$name <- extract_info(line, "NAME")
  record$name_given <- stringr::str_extract(record$name, ".*(?= /)")
  record$name_surn <- stringr::str_extract(record$name, "(?<=/).*(?=/)")
  # NOTE(review): str_replace() replaces only the FIRST "/", so the closing
  # slash survives squishing ("John /Smith/" -> "John Smith/"). If both
  # slashes should be removed, this needs str_replace_all() -- confirm.
  record$name <- stringr::str_squish(stringr::str_replace(record$name, "/", " "))
  return(record)
}
#' Process Event Lines (Birth or Death)
#'
#' Extracts event details (date, place, cause, latitude, longitude) from a
#' block of GEDCOM lines by looking ahead from the line holding the event tag:
#' for "birth": DATE on line i+1, PLAC on i+2, LATI on i+4, LONG on i+5;
#' for "death": DATE on line i+1, PLAC on i+2, CAUS on i+3, LATI on i+4,
#' LONG on i+5.
#'
#' @param event A character string indicating the event type ("birth" or "death").
#' @param block A character vector of GEDCOM lines.
#' @param i The current line index where the event tag is found.
#' @param record A named list representing the individual's record.
#' @param pattern_rows A list with counts of GEDCOM tag occurrences.
#'   Accepted for signature compatibility; not consulted in this function.
#' @return The updated record with parsed event information.
#' @keywords internal
processEventLine <- function(event, block, i, record, pattern_rows) {
  n_lines <- length(block)
  if (event == "birth") {
    # Fixed lookahead offsets, bounds-checked so an event tag near the end
    # of the block cannot index past it.
    if (i + 1 <= n_lines) record$birth_date <- extract_info(block[i + 1], "DATE")
    if (i + 2 <= n_lines) record$birth_place <- extract_info(block[i + 2], "PLAC")
    if (i + 4 <= n_lines) record$birth_lat <- extract_info(block[i + 4], "LATI")
    if (i + 5 <= n_lines) record$birth_long <- extract_info(block[i + 5], "LONG")
  } else if (event == "death") {
    # Death additionally expects a CAUS (cause of death) line at i + 3.
    if (i + 1 <= n_lines) record$death_date <- extract_info(block[i + 1], "DATE")
    if (i + 2 <= n_lines) record$death_place <- extract_info(block[i + 2], "PLAC")
    if (i + 3 <= n_lines) record$death_caus <- extract_info(block[i + 3], "CAUS")
    if (i + 4 <= n_lines) record$death_lat <- extract_info(block[i + 4], "LATI")
    if (i + 5 <= n_lines) record$death_long <- extract_info(block[i + 5], "LONG")
  }
  return(record)
}
#' Apply Tag Mappings to a Line
#'
#' Iterates over a list of tag mappings and, at the first mapping whose tag
#' matches the line, updates the record and stops.
#'
#' @param line A character string from the GEDCOM file.
#' @param record A named list representing the individual's record.
#' @param pattern_rows A list with GEDCOM tag counts.
#' @param tag_mappings A list of lists. Each sublist should define:
#'   - \code{tag}: the GEDCOM tag,
#'   - \code{field}: the record field to update,
#'   - \code{mode}: either "replace" or "append" (defaults to "replace"),
#'   - \code{extractor}: (optional) a custom extraction function.
#' @return A list with the updated record (\code{record}) and a logical flag (\code{matched}).
#' @keywords internal
applyTagMappings <- function(line, record, pattern_rows, tag_mappings) {
  for (mapping in tag_mappings) {
    # mapping$extractor is NULL when absent, which process_tag() already
    # treats as "use the default extractor", so it can be passed directly.
    # Guard mode: a mapping without an explicit mode falls back to
    # process_tag()'s default instead of passing NULL through.
    mode <- if (is.null(mapping$mode)) "replace" else mapping$mode
    result <- process_tag(mapping$tag, mapping$field, pattern_rows, line, record,
      extractor = mapping$extractor, mode = mode
    )
    record <- result$vars
    if (result$matched) {
      return(list(record = record, matched = TRUE))
    }
  }
  return(list(record = record, matched = FALSE))
}

#' Extract Information from Line
#'
#' This function extracts information from a line based on a specified type.
#' @param line A character string representing a line from a GEDCOM file.
#' @param type A character string representing the type of information to extract.
#' @return A character string with the extracted information.
#' @keywords internal
extract_info <- function(line, type) {
  # Everything after "<type> " on the line, with surrounding/duplicated
  # whitespace squished away.
  stringr::str_squish(stringr::str_extract(line, paste0("(?<=", type, " ).+")))
}

#' Count GEDCOM Pattern Rows
#'
#' Counts the number of lines in a file (passed as a data frame with column "X1")
#' that match various GEDCOM patterns.
#'
#' @param file A data frame with a column \code{X1} containing GEDCOM lines.
#' @return A list with counts of specific GEDCOM tag occurrences, named
#'   \code{num_<tag>_rows} where \code{<tag>} is the lower-cased tag with all
#'   non-letter characters stripped (e.g. "@ INDI" -> "num_indi_rows",
#'   " _MARNM" -> "num_marnm_rows").
#' @keywords internal
countPatternRows <- function(file) {
  patterns <- c(
    "@ INDI", " NAME", " GIVN", " NPFX", " NICK", " SURN", " NSFX", " _MARNM",
    " BIRT", " DEAT", " SEX", " CAST", " DSCR", " EDUC", " IDNO", " NATI",
    " NCHI", " NMR", " OCCU", " PROP", " RELI", " RESI", " SSN", " TITL",
    " FAMC", " FAMS", " PLAC", " LATI", " LONG", " DATE", " CAUS"
  )
  # vapply (not sapply) guarantees an integer result regardless of input.
  pattern_counts <- vapply(
    patterns,
    function(pat) sum(grepl(pat, file$X1)),
    integer(1)
  )
  # Derive the list names programmatically instead of hand-writing all 31
  # entries; the transformation must stay in sync with process_tag() below.
  keys <- paste0("num_", tolower(gsub("[^A-Za-z]", "", patterns)), "_rows")
  return(stats::setNames(as.list(unname(pattern_counts)), keys))
}

#' Process a GEDCOM Tag
#'
#' Extracts and assigns a value to a specified field in `vars` if the pattern is present.
#' Returns both the updated variable list and a flag indicating whether the tag was matched.
#'
#' @param tag The GEDCOM tag (e.g., "SEX", "CAST", etc.).
#' @param field_name The name of the variable to assign to in `vars`.
#' @param pattern_rows Output from `countPatternRows()`.
#' @param line The GEDCOM line to parse.
#' @param vars The current list of variables to update.
#' @param extractor Optional function used to pull the value out of the line;
#'   defaults to \code{extract_info(line, tag)}.
#' @param mode Either "replace" (overwrite the field) or "append"
#'   (comma-join onto an existing non-NA value).
#' @return A list with updated `vars` and a `matched` flag.
#' @keywords internal
process_tag <- function(tag, field_name, pattern_rows, line, vars,
                        extractor = NULL, mode = "replace") {
  # BUGFIX: strip non-letters when building the count key. The previous
  # paste0("num_", tolower(tag), "_rows") produced "num__marnm_rows" for the
  # "_MARNM" tag (double underscore), which never matched the
  # "num_marnm_rows" key produced by countPatternRows(), so married surnames
  # were silently never extracted.
  count_name <- paste0("num_", gsub("[^a-z]", "", tolower(tag)), "_rows")
  matched <- FALSE
  if (!is.null(pattern_rows[[count_name]]) &&
    pattern_rows[[count_name]] > 0 &&
    grepl(paste0(" ", tag), line)) {
    value <- if (is.null(extractor)) {
      extract_info(line, tag)
    } else {
      extractor(line)
    }
    if (mode == "append" && !is.na(vars[[field_name]])) {
      # Accumulate repeated tags (e.g. multiple FAMS) as a comma list.
      vars[[field_name]] <- paste0(vars[[field_name]], ", ", value)
    } else {
      vars[[field_name]] <- value
    }
    matched <- TRUE
  }
  return(list(vars = vars, matched = matched))
}

#' Post-process GEDCOM Data Frame
#'
#' This function optionally adds parent information, combines duplicate columns,
#' and removes empty columns from the GEDCOM data frame.
#'
#' @param df_temp A data frame produced by \code{readGedcom()}.
#' @param remove_empty_cols Logical indicating whether to remove columns that are entirely missing.
#' @param combine_cols Logical indicating whether to combine columns with duplicate values.
#' @param add_parents Logical indicating whether to add parent information.
#' @param skinny Logical indicating whether to slim down the data frame.
#' @param verbose Logical indicating whether to print progress messages.
#' @return The post-processed data frame.
#' @keywords internal
postProcessGedcom <- function(df_temp,
                              remove_empty_cols = TRUE,
                              combine_cols = TRUE,
                              add_parents = TRUE,
                              skinny = TRUE,
                              verbose = FALSE) {
  # Derive momID/dadID from the FAMC/FAMS relationship columns.
  if (add_parents) {
    if (verbose) message("Processing parents")
    df_temp <- processParents(df_temp, datasource = "gedcom")
  }
  # Merge the NAME-derived columns with the *_pieces columns.
  if (combine_cols) {
    df_temp <- collapseNames(verbose = verbose, df_temp = df_temp)
  }
  if (remove_empty_cols) {
    if (verbose) message("Removing empty columns")
    # Keep only columns that have at least one non-NA value.
    df_temp <- df_temp[, colSums(is.na(df_temp)) < nrow(df_temp)]
  }
  if (skinny) {
    if (verbose) message("Slimming down the data frame")
    # Drops all-NA columns again, then the raw relationship pointers.
    df_temp <- df_temp[, colSums(is.na(df_temp)) < nrow(df_temp)]
    df_temp$FAMC <- NULL
    df_temp$FAMS <- NULL
  }

  return(df_temp)
}
#' Process Parents Information from GEDCOM Data
#'
#' Adds parent IDs to the individuals based on family relationship data.
#'
#' @param df_temp A data frame produced by \code{readGedcom()}.
#' @param datasource Character string indicating the data source ("gedcom" or "wiki").
#' @return The updated data frame with parent IDs added.
#' @keywords internal
processParents <- function(df_temp, datasource) {
  # Pick the columns this data source must provide; unknown sources error.
  required_cols <- switch(datasource,
    gedcom = c("FAMC", "sex", "FAMS"),
    wiki = c("id"),
    stop("Invalid datasource")
  )

  # Bail out (with a warning) rather than failing when columns are missing.
  missing_cols <- setdiff(required_cols, colnames(df_temp))
  if (length(missing_cols) > 0) {
    warning("Missing necessary columns: ", paste(missing_cols, collapse = ", "))
    return(df_temp)
  }

  family_to_parents <- mapFAMS2parents(df_temp)
  if (is.null(family_to_parents) || length(family_to_parents) == 0) {
    return(df_temp)
  }
  mapFAMC2parents(df_temp, family_to_parents)
}

#' Create a Mapping from Family IDs to Parent IDs
#'
#' This function scans the data frame and creates a mapping of family IDs
#' to the corresponding parent IDs.
#'
#' @param df_temp A data frame produced by \code{readGedcom()}.
#' @return A list mapping family IDs to parent information.
mapFAMS2parents <- function(df_temp) {
  if (!all(c("FAMS", "sex") %in% colnames(df_temp))) {
    warning("The data frame does not contain the necessary columns (FAMS, sex)")

#' collapse Names
#'
#' This function combines the `name_given` and `name_given_pieces` columns in a data frame.
#'
#' @inheritParams readGedcom
#' @param df_temp A data frame containing the columns to be combined.
#' @return A data frame with the combined columns.
#' @keywords internal
collapseNames <- function(verbose, df_temp) {
  if (verbose) message("Combining Duplicate Columns")

  # These conditions are scalar (all() returns length 1), so use the
  # short-circuiting `||` rather than the element-wise `|` in an if().
  # Combine `name_given` and `name_given_pieces`.
  if (!all(is.na(df_temp$name_given_pieces)) || !all(is.na(df_temp$name_given))) {
    result <- combine_columns(df_temp$name_given, df_temp$name_given_pieces)
    df_temp$name_given <- result$combined
    # Drop the pieces column only when it merged without conflicts.
    if (!result$retain_col2) df_temp$name_given_pieces <- NULL
  }

  # Combine `name_surn` and `name_surn_pieces`.
  if (!all(is.na(df_temp$name_surn_pieces)) || !all(is.na(df_temp$name_surn))) {
    result <- combine_columns(df_temp$name_surn, df_temp$name_surn_pieces)
    df_temp$name_surn <- result$combined
    if (!result$retain_col2) df_temp$name_surn_pieces <- NULL
  }
  return(df_temp)
}
#' Combine Columns
#'
#' This function combines two columns, handling conflicts and merging non-conflicting data.
combine_columns <- function(col1, col2) {
  # Compare case-insensitively so e.g. "Smith" vs "smith" is not a conflict.
  col1_lower <- stringr::str_to_lower(col1)
  col2_lower <- stringr::str_to_lower(col2)
  conflicts <- !is.na(col1_lower) & !is.na(col2_lower) & col1_lower != col2_lower
  if (any(conflicts)) {
    # Conflicting rows exist: leave both columns untouched and flag it.
    warning("Columns have conflicting values. They were not merged.")
    return(list(combined = col1, retain_col2 = TRUE))
  }
  # No conflicts: prefer col1, filling its gaps from col2.
  list(combined = ifelse(is.na(col1), col2, col1), retain_col2 = FALSE)
}
-#' @keywords internal -#' -countPatternRows <- function(file) { - # Count the number of rows containing specific patterns - pattern_counts <- sapply( - c( - "@ INDI", " NAME", " GIVN", " NPFX", " NICK", " SURN", " NSFX", " _MARNM", - " BIRT", " DEAT", " SEX", " CAST", " DSCR", " EDUC", " IDNO", " NATI", - " NCHI", " NMR", " OCCU", " PROP", " RELI", " RESI", " SSN", " TITL", - " FAMC", " FAMS", " PLAC", " LATI", " LONG", " DATE", " CAUS" - ), - function(pat) sum(grepl(pat, file$X1)) - ) - num_rows <- list( - num_indi_rows = pattern_counts["@ INDI"], - num_name_rows = pattern_counts[" NAME"], - num_givn_rows = pattern_counts[" GIVN"], - num_npfx_rows = pattern_counts[" NPFX"], - num_nick_rows = pattern_counts[" NICK"], - num_surn_rows = pattern_counts[" SURN"], - num_nsfx_rows = pattern_counts[" NSFX"], - num_marnm_rows = pattern_counts[" _MARNM"], - num_birt_rows = pattern_counts[" BIRT"], - num_deat_rows = pattern_counts[" DEAT"], - num_sex_rows = pattern_counts[" SEX"], - num_cast_rows = pattern_counts[" CAST"], - num_dscr_rows = pattern_counts[" DSCR"], - num_educ_rows = pattern_counts[" EDUC"], - num_idno_rows = pattern_counts[" IDNO"], - num_nati_rows = pattern_counts[" NATI"], - num_nchi_rows = pattern_counts[" NCHI"], - num_nmr_rows = pattern_counts[" NMR"], - num_occu_rows = pattern_counts[" OCCU"], - num_prop_rows = pattern_counts[" PROP"], - num_reli_rows = pattern_counts[" RELI"], - num_resi_rows = pattern_counts[" RESI"], - num_ssn_rows = pattern_counts[" SSN"], - num_titl_rows = pattern_counts[" TITL"], - num_famc_rows = pattern_counts[" FAMC"], - num_fams_rows = pattern_counts[" FAMS"], - num_plac_rows = pattern_counts[" PLAC"], - num_lati_rows = pattern_counts[" LATI"], - num_long_rows = pattern_counts[" LONG"], - num_date_rows = pattern_counts[" DATE"], - num_caus_rows = pattern_counts[" CAUS"] - ) - return(num_rows) -} - -#' Process a GEDCOM Tag -#' -#' Extracts and assigns a value to a specified field in `vars` if the pattern is present. 
-#' Returns both the updated variable list and a flag indicating whether the tag was matched. -#' -#' @param tag The GEDCOM tag (e.g., "SEX", "CAST", etc.). -#' @param field_name The name of the variable to assign to in `vars`. -#' @param pattern_rows Output from `countPatternRows()`. -#' @param line The GEDCOM line to parse. -#' @param vars The current list of variables to update. -#' @return A list with updated `vars` and a `matched` flag. -#' @keywords internal -#' -process_tag <- function(tag, field_name, pattern_rows, line, vars, - extractor = NULL, mode = "replace") { - count_name <- paste0("num_", tolower(tag), "_rows") - matched <- FALSE - if (!is.null(pattern_rows[[count_name]]) && - pattern_rows[[count_name]] > 0 && - grepl(paste0(" ", tag), line)) { - value <- if (is.null(extractor)) extract_info(line, tag) else extractor(line) - - if (mode == "append" && !is.na(vars[[field_name]])) { - vars[[field_name]] <- paste0(vars[[field_name]], ", ", value) - } else { - vars[[field_name]] <- value - } - - matched <- TRUE - } - return(list(vars = vars, matched = matched)) -} +# --- Exported Aliases --- +#' @rdname readGedcom +#' @export +readGed <- readGedcom +#' @rdname readGedcom +#' @export +readgedcom <- readGedcom diff --git a/R/readGedcomlegacy.R b/R/readGedcomlegacy.R new file mode 100644 index 00000000..8221197a --- /dev/null +++ b/R/readGedcomlegacy.R @@ -0,0 +1,621 @@ +#' Read a GEDCOM File +#' +#' This function reads a GEDCOM file and parses it into a structured data frame of individuals. +#' Inspired by https://raw.githubusercontent.com/jjfitz/readgedcom/master/R/read_gedcom.R +#' +#' @param file_path The path to the GEDCOM file. +#' @param add_parents A logical value indicating whether to add parents to the data frame. +#' @param remove_empty_cols A logical value indicating whether to remove columns with all missing values. +#' @param combine_cols A logical value indicating whether to combine columns with duplicate values. 
+#' @param verbose A logical value indicating whether to print messages. +#' @param skinny A logical value indicating whether to return a skinny data frame. +#' @param update_rate numeric. The rate at which to print progress +#' @param ... Additional arguments to be passed to the function. +#' @return A data frame containing information about individuals, with the following potential columns: +#' - `id`: ID of the individual +#' - `momID`: ID of the individual's mother +#' - `dadID`: ID of the individual's father +#' - `sex`: Sex of the individual +#' - `name`: Full name of the individual +#' - `name_given`: First name of the individual +#' - `name_surn`: Last name of the individual +#' - `name_marriedsurn`: Married name of the individual +#' - `name_nick`: Nickname of the individual +#' - `name_npfx`: Name prefix +#' - `name_nsfx`: Name suffix +#' - `birth_date`: Birth date of the individual +#' - `birth_lat`: Latitude of the birthplace +#' - `birth_long`: Longitude of the birthplace +#' - `birth_place`: Birthplace of the individual +#' - `death_caus`: Cause of death +#' - `death_date`: Death date of the individual +#' - `death_lat`: Latitude of the place of death +#' - `death_long`: Longitude of the place of death +#' - `death_place`: Place of death of the individual +#' - `attribute_caste`: Caste of the individual +#' - `attribute_children`: Number of children of the individual +#' - `attribute_description`: Description of the individual +#' - `attribute_education`: Education of the individual +#' - `attribute_idnumber`: Identification number of the individual +#' - `attribute_marriages`: Number of marriages of the individual +#' - `attribute_nationality`: Nationality of the individual +#' - `attribute_occupation`: Occupation of the individual +#' - `attribute_property`: Property owned by the individual +#' - `attribute_religion`: Religion of the individual +#' - `attribute_residence`: Residence of the individual +#' - `attribute_ssn`: Social security number of 
#' - `attribute_title`: Title of the individual
#' - `FAMC`: ID(s) of the family where the individual is a child
#' - `FAMS`: ID(s) of the family where the individual is a spouse
#' @keywords internal
readGedcom.legacy <- function(file_path,
                              verbose = FALSE,
                              add_parents = TRUE,
                              remove_empty_cols = TRUE,
                              combine_cols = TRUE,
                              skinny = FALSE,
                              update_rate = 1000,
                              post_process = TRUE,
                              ...) {
  # Checks: fail fast on a missing file.
  if (!file.exists(file_path)) stop("File does not exist: ", file_path)
  if (verbose) {
    print(paste("Reading file:", file_path))
  }
  # Raw GEDCOM lines as a one-column data frame (column X1).
  file <- data.frame(X1 = readLines(file_path))
  file_length <- nrow(file)
  if (verbose) {
    print(paste0("File is ", file_length, " lines long"))
  }

  # Count the number of rows containing specific patterns, so handlers for
  # tags that never occur in this file can be skipped cheaply.
  num_rows <- countPatternRows.legacy(file)

  # List of variables to initialize, grouped by theme.
  var_names <- list(
    identifiers = c("id", "momID", "dadID"),
    names = c(
      "name", "name_given", "name_given_pieces",
      "name_surn", "name_surn_pieces", "name_marriedsurn",
      "name_nick", "name_npfx", "name_nsfx"
    ),
    sex = c("sex"),
    birth = c("birth_date", "birth_lat", "birth_long", "birth_place"),
    death = c(
      "death_caus", "death_date",
      "death_lat", "death_long", "death_place"
    ),
    attributes = c(
      "attribute_caste", "attribute_children", "attribute_description",
      "attribute_education", "attribute_idnumber", "attribute_marriages",
      "attribute_nationality", "attribute_occupation",
      "attribute_property", "attribute_religion", "attribute_residence",
      "attribute_ssn", "attribute_title"
    ),
    relationships = c("FAMC", "FAMS")
  )

  all_var_names <- unlist(var_names, use.names = FALSE)

  # Initialize all per-person variables to NA.
  vars <- stats::setNames(
    as.list(rep(NA_character_, length(all_var_names))),
    all_var_names
  )

  df_temp <- as.data.frame(matrix(nrow = 1, ncol = length(all_var_names)))
  names(df_temp) <- all_var_names

  if (verbose) {
    print("Parsing GEDCOM file")
  }
  for (i in 1:length(file[1][[1]])) {
    tmpv <- file[1][[1]][[i]]

    # New individual record: flush the previous person, then reset.
    if (grepl("@ INDI", tmpv)) {
      line_to_write <- as.data.frame(vars)
      df_temp <- rbind(df_temp, line_to_write)

      # Reset all variables to NA for the new person.
      vars <- stats::setNames(as.list(rep(
        NA_character_,
        length(all_var_names)
      )), all_var_names)

      vars$id <- stringr::str_extract(tmpv, "(?<=@.)\\d*(?=@)")
      next
    }

    # Full name line: GEDCOM wraps the surname in slashes ("John /Smith/").
    if (num_rows$num_name_rows > 0 && grepl(" NAME", tmpv)) {
      vars$name <- extract_info.legacy(tmpv, "NAME")
      vars$name_given <- stringr::str_extract(vars$name, ".*(?= /)")
      vars$name_surn <- stringr::str_extract(vars$name, "(?<=/).*(?=/)")
      vars$name <- stringr::str_squish(stringr::str_replace(vars$name, "/", " "))
      next
    }
    # PERSONAL_NAME_PIECES := NAME | NPFX | GIVN | NICK | SPFX | SURN | NSFX
    result <- process_tag.legacy("GIVN", "name_given_pieces", num_rows, tmpv, vars)
    vars <- result$vars
    if (result$matched) next

    # NPFX := name prefix
    result <- process_tag.legacy("NPFX", "name_npfx", num_rows, tmpv, vars)
    vars <- result$vars
    if (result$matched) next

    # NICK := nickname
    result <- process_tag.legacy("NICK", "name_nick", num_rows, tmpv, vars)
    vars <- result$vars
    if (result$matched) next

    # SURN := surname
    result <- process_tag.legacy("SURN", "name_surn_pieces", num_rows, tmpv, vars)
    vars <- result$vars
    if (result$matched) next

    # NSFX := name suffix
    result <- process_tag.legacy("NSFX", "name_nsfx", num_rows, tmpv, vars)
    vars <- result$vars
    if (result$matched) next

    # _MARNM := married surname (vendor extension tag)
    result <- process_tag.legacy("_MARNM", "name_marriedsurn", num_rows, tmpv, vars)
    vars <- result$vars
    if (result$matched) next

    # Birth event related information. The ifs are nested so a later
    # lookahead is only attempted when the earlier ones were possible.
    if (num_rows$num_birt_rows > 0 && grepl(" BIRT", tmpv)) {
      if (num_rows$num_date_rows > 0 && i + 1 <= file_length) {
        vars$birth_date <- extract_info.legacy(file[1][[1]][[i + 1]], "DATE")
        if (num_rows$num_plac_rows > 0 && i + 2 <= file_length) {
          vars$birth_place <- extract_info.legacy(file[1][[1]][[i + 2]], "PLAC")
          if (num_rows$num_lati_rows > 0 && i + 4 <= file_length) {
            vars$birth_lat <- extract_info.legacy(file[1][[1]][[i + 4]], "LATI")
            if (num_rows$num_long_rows > 0 && i + 5 <= file_length) {
              vars$birth_long <- extract_info.legacy(file[1][[1]][[i + 5]], "LONG")
            }
          }
        }
      }
      next
    }

    # Death event related information; same nesting strategy as birth,
    # with CAUS (cause of death) additionally expected at i + 3.
    if (num_rows$num_deat_rows > 0 && grepl(" DEAT", tmpv)) {
      if (num_rows$num_date_rows > 0 && i + 1 <= file_length) {
        vars$death_date <- extract_info.legacy(file[1][[1]][[i + 1]], "DATE")
        if (num_rows$num_plac_rows > 0 && i + 2 <= file_length) {
          vars$death_place <- extract_info.legacy(file[1][[1]][[i + 2]], "PLAC")
          if (num_rows$num_caus_rows > 0 && i + 3 <= file_length) {
            vars$death_caus <- extract_info.legacy(file[1][[1]][[i + 3]], "CAUS")
            if (num_rows$num_lati_rows > 0 && i + 4 <= file_length) {
              vars$death_lat <- extract_info.legacy(file[1][[1]][[i + 4]], "LATI")
              if (num_rows$num_long_rows > 0 && i + 5 <= file_length) {
                vars$death_long <- extract_info.legacy(file[1][[1]][[i + 5]], "LONG")
              }
            }
          }
        }
      }
      next
    }


    # Attribute tags using process_tag.legacy()
    for (tag_field in list(
      c("SEX", "sex"),
      # CAST: rank or status in society (g7:CAST).
      c("CAST", "attribute_caste"),
      # DSCR: physical description of a person (g7:DSCR).
      c("DSCR", "attribute_description"),
      # EDUC: level of education attained (g7:EDUC).
      c("EDUC", "attribute_education"),
      # IDNO: identifying number in an external system (g7:IDNO).
      c("IDNO", "attribute_idnumber"),
      # NATI: national heritage or origin (g7:NATI).
      c("NATI", "attribute_nationality"),
      # NCHI: number of children, all marriages (g7:INDI-NCHI).
      c("NCHI", "attribute_children"),
      # NMR: number of marriages (g7:NMR).
      c("NMR", "attribute_marriages"),
      # OCCU: occupation (g7:OCCU).
      c("OCCU", "attribute_occupation"),
      # PROP: property such as real estate (g7:PROP).
      c("PROP", "attribute_property"),
      # RELI: religious affiliation (g7:INDI-RELI).
      c("RELI", "attribute_religion"),
      # RESI: address or place of residence (g7:INDI-RESI).
      c("RESI", "attribute_residence"),
      # SSN: US social security number, a type of IDNO (g7:SSN).
      c("SSN", "attribute_ssn"),
      # TITL: formal designation such as royalty titles (g7:INDI-TITL).
      c("TITL", "attribute_title")
    )) {
      result <- process_tag.legacy(tag_field[1], tag_field[2], num_rows, tmpv, vars)
      vars <- result$vars
      # NOTE(review): this `next` advances the INNER tag loop, not the outer
      # line loop, so the FAMC/FAMS handlers below still run for a matched
      # attribute line. Harmless but wasteful -- confirm intent.
      if (result$matched) next
    }

    # Relationship data. FAMC (g7:INDI-FAMC): the family in which an
    # individual appears as a child. The extractor pulls the numeric ID out
    # of the "@...@" cross-reference; append mode accumulates repeats.
    result <- process_tag.legacy("FAMC", "FAMC", num_rows, tmpv, vars,
      extractor = function(x) stringr::str_extract(x, "(?<=@.)\\d*(?=@)"),
      mode = "append"
    )
    vars <- result$vars
    if (result$matched) next

    # FAMS (g7:FAMS): the family in which an individual appears as a partner.
    result <- process_tag.legacy("FAMS", "FAMS", num_rows, tmpv, vars,
      extractor = function(x) stringr::str_extract(x, "(?<=@.)\\d*(?=@)"),
      mode = "append"
    )
    vars <- result$vars
    if (result$matched) next

    if (verbose && i %% update_rate == 0) {
      cat("Processed", i, "lines\n")
    }
  }
  # Flush the final person: the loop only writes a person when it sees the
  # NEXT "@ INDI" line, so the last record must be written here.
  line_to_write <- as.data.frame(vars)
  df_temp <- rbind(df_temp, line_to_write)
  df_temp <- df_temp[!is.na(df_temp$id), ]

  if (verbose) {
    print(paste0("File has ", nrow(df_temp), " people"))
  }
  if (nrow(df_temp) == 0) {
    warning("No people found in file")
    return(NULL)
  }
  if (nrow(df_temp) != num_rows$num_indi_rows) {
    warning("The number of people found in the processed file does not match the number of individuals raw data")
  }

  if (post_process) {
    if (verbose) {
      print("Post-processing data frame")
    }
    df_temp <- postProcessGedcom.legacy(
      df_temp = df_temp,
      remove_empty_cols = remove_empty_cols,
      combine_cols = combine_cols,
      add_parents = add_parents,
      skinny = skinny,
      verbose = verbose
    )
  }

  return(df_temp)
}

#' Post-process GEDCOM Data Frame
#'
#' @inheritParams readGedcom.legacy
#' @inheritParams mapFAMS2parents.legacy
#' @return A data frame with processed information.
+ +postProcessGedcom.legacy <- function(df_temp, + remove_empty_cols = TRUE, + combine_cols = TRUE, + add_parents = TRUE, + skinny = TRUE, + verbose = FALSE) { + # Add mom and dad ids + if (add_parents) { + if (verbose) { + print("Processing parents") + } + df_temp <- processParents.legacy(df_temp, datasource = "gedcom") + } + + if (combine_cols) { + df_temp <- collapseNames.legacy(verbose = verbose, df_temp = df_temp) + } + + if (remove_empty_cols) { + # Remove empty columns + if (verbose) { + print("Removing empty columns") + } + df_temp <- df_temp[, colSums(is.na(df_temp)) < nrow(df_temp)] + } + if (skinny) { + if (verbose) { + print("Slimming down the data frame") + } + df_temp <- df_temp[, colSums(is.na(df_temp)) < nrow(df_temp)] + df_temp$FAMC <- NULL + df_temp$FAMS <- NULL + } + + return(df_temp) +} + +#' Create a mapping of family IDs to parent IDs +#' +#' This function creates a mapping from family IDs to the IDs of the parents. +#' +#' @param df_temp A data frame containing information about individuals. +#' @return A list mapping family IDs to parent IDs. 
+#' @keywords internal +#' +mapFAMS2parents.legacy <- function(df_temp) { + if (!all(c("FAMS", "sex") %in% colnames(df_temp))) { + warning("The data frame does not contain the necessary columns (FAMS, sex)") + return(NULL) + } + family_to_parents <- list() + for (i in 1:nrow(df_temp)) { + if (!is.na(df_temp$FAMS[i])) { + fams_ids <- unlist(strsplit(df_temp$FAMS[i], ", ")) + for (fams_id in fams_ids) { + if (!is.null(family_to_parents[[fams_id]])) { + if (df_temp$sex[i] == "M") { + family_to_parents[[fams_id]]$father <- df_temp$id[i] + } else if (df_temp$sex[i] == "F") { + family_to_parents[[fams_id]]$mother <- df_temp$id[i] + } + } else { + family_to_parents[[fams_id]] <- list() + if (df_temp$sex[i] == "M") { + family_to_parents[[fams_id]]$father <- df_temp$id[i] + } else if (df_temp$sex[i] == "F") { + family_to_parents[[fams_id]]$mother <- df_temp$id[i] + } + } + } + } + } + return(family_to_parents) +} + +#' Assign momID and dadID based on family mapping +#' +#' This function assigns mother and father IDs to individuals in the data frame +#' based on the mapping of family IDs to parent IDs. +#' +#' @param df_temp A data frame containing individual information. +#' @param family_to_parents A list mapping family IDs to parent IDs. +#' @return A data frame with added momID and dadID columns. 
+#' @keywords internal +mapFAMC2parents.legacy <- function(df_temp, family_to_parents) { + df_temp$momID <- NA_character_ + df_temp$dadID <- NA_character_ + for (i in 1:nrow(df_temp)) { + if (!is.na(df_temp$FAMC[i])) { + famc_ids <- unlist(strsplit(df_temp$FAMC[i], ", ")) + for (famc_id in famc_ids) { + if (!is.null(family_to_parents[[famc_id]])) { + if (!is.null(family_to_parents[[famc_id]]$father)) { + df_temp$dadID[i] <- family_to_parents[[famc_id]]$father + } + if (!is.null(family_to_parents[[famc_id]]$mother)) { + df_temp$momID[i] <- family_to_parents[[famc_id]]$mother + } + } + } + } + } + return(df_temp) +} + +#' Process parents information +#' +#' This function processes the dataframe to add momID and dadID columns. +#' +#' @param df_temp A data frame containing information about individuals. +#' @return A data frame with added momID and dadID columns. +#' @keywords internal +processParents.legacy <- function(df_temp, datasource) { + # Ensure required columns are present + if (datasource == "gedcom") { + required_cols <- c("FAMC", "sex", "FAMS") + } else if (datasource == "wiki") { + required_cols <- c("id") + } else { + stop("Invalid datasource") + } + + if (!all(required_cols %in% colnames(df_temp))) { + missing_cols <- setdiff(required_cols, colnames(df_temp)) + warning("Missing necessary columns: ", paste(missing_cols, collapse = ", ")) + return(df_temp) + } + + family_to_parents <- mapFAMS2parents.legacy(df_temp) + if (is.null(family_to_parents) || length(family_to_parents) == 0) { + return(df_temp) + } + df_temp <- mapFAMC2parents.legacy(df_temp, family_to_parents) + return(df_temp) +} + + + +#' Extract Information from Line +#' +#' This function extracts information from a line based on a specified type. +#' @param line A character string representing a line from a GEDCOM file. +#' @param type A character string representing the type of information to extract. +#' @return A character string with the extracted information. 
+#' @keywords internal +extract_info.legacy <- function(line, type) { + stringr::str_squish(stringr::str_extract(line, paste0("(?<=", type, " ).+"))) +} + +#' Combine Columns +#' +#' This function combines two columns, handling conflicts and merging non-conflicting data. +#' @param col1 The first column to combine. +#' @param col2 The second column to combine. +#' @return A list with the combined column and a flag indicating if the second column should be retained. +#' @keywords internal +# Helper function to check for conflicts and merge columns +combine_columns <- function(col1, col2) { + col1_lower <- stringr::str_to_lower(col1) + col2_lower <- stringr::str_to_lower(col2) + + # Check if there are any conflicting non-NA values + conflicts <- !is.na(col1_lower) & !is.na(col2_lower) & col1_lower != col2_lower + + if (any(conflicts)) { + warning("Columns have conflicting values. They were not merged.") + return(list(combined = col1, retain_col2 = TRUE)) # Indicate to retain col2 + } else { + combined <- ifelse(is.na(col1), col2, col1) + return(list(combined = combined, retain_col2 = FALSE)) + } +} + +#' Check for Pattern Rows +#' +#' This function counts the number of rows containing specific patterns. +#' @param file A data frame containing the GEDCOM file. +#' @return A list with the number of rows containing each pattern. 
+#' @keywords internal +#' +countPatternRows.legacy <- function(file) { + # Count the number of rows containing specific patterns + pattern_counts <- sapply( + c( + "@ INDI", " NAME", " GIVN", " NPFX", " NICK", " SURN", " NSFX", " _MARNM", + " BIRT", " DEAT", " SEX", " CAST", " DSCR", " EDUC", " IDNO", " NATI", + " NCHI", " NMR", " OCCU", " PROP", " RELI", " RESI", " SSN", " TITL", + " FAMC", " FAMS", " PLAC", " LATI", " LONG", " DATE", " CAUS" + ), + function(pat) sum(grepl(pat, file$X1)) + ) + num_rows <- list( + num_indi_rows = pattern_counts["@ INDI"], + num_name_rows = pattern_counts[" NAME"], + num_givn_rows = pattern_counts[" GIVN"], + num_npfx_rows = pattern_counts[" NPFX"], + num_nick_rows = pattern_counts[" NICK"], + num_surn_rows = pattern_counts[" SURN"], + num_nsfx_rows = pattern_counts[" NSFX"], + num_marnm_rows = pattern_counts[" _MARNM"], + num_birt_rows = pattern_counts[" BIRT"], + num_deat_rows = pattern_counts[" DEAT"], + num_sex_rows = pattern_counts[" SEX"], + num_cast_rows = pattern_counts[" CAST"], + num_dscr_rows = pattern_counts[" DSCR"], + num_educ_rows = pattern_counts[" EDUC"], + num_idno_rows = pattern_counts[" IDNO"], + num_nati_rows = pattern_counts[" NATI"], + num_nchi_rows = pattern_counts[" NCHI"], + num_nmr_rows = pattern_counts[" NMR"], + num_occu_rows = pattern_counts[" OCCU"], + num_prop_rows = pattern_counts[" PROP"], + num_reli_rows = pattern_counts[" RELI"], + num_resi_rows = pattern_counts[" RESI"], + num_ssn_rows = pattern_counts[" SSN"], + num_titl_rows = pattern_counts[" TITL"], + num_famc_rows = pattern_counts[" FAMC"], + num_fams_rows = pattern_counts[" FAMS"], + num_plac_rows = pattern_counts[" PLAC"], + num_lati_rows = pattern_counts[" LATI"], + num_long_rows = pattern_counts[" LONG"], + num_date_rows = pattern_counts[" DATE"], + num_caus_rows = pattern_counts[" CAUS"] + ) + return(num_rows) +} + +#' Process a GEDCOM Tag +#' +#' Extracts and assigns a value to a specified field in `vars` if the pattern is present. 
+#' Returns both the updated variable list and a flag indicating whether the tag was matched. +#' +#' @param tag The GEDCOM tag (e.g., "SEX", "CAST", etc.). +#' @param field_name The name of the variable to assign to in `vars`. +#' @param pattern_rows Output from `countPatternRows.legacy()`. +#' @param line The GEDCOM line to parse. +#' @param vars The current list of variables to update. +#' @return A list with updated `vars` and a `matched` flag. +#' @keywords internal +#' +process_tag.legacy <- function(tag, field_name, pattern_rows, line, vars, + extractor = NULL, mode = "replace") { + count_name <- paste0("num_", tolower(tag), "_rows") + matched <- FALSE + if (!is.null(pattern_rows[[count_name]]) && + pattern_rows[[count_name]] > 0 && + grepl(paste0(" ", tag), line)) { + value <- if (is.null(extractor)) extract_info.legacy(line, tag) else extractor(line) + + if (mode == "append" && !is.na(vars[[field_name]])) { + vars[[field_name]] <- paste0(vars[[field_name]], ", ", value) + } else { + vars[[field_name]] <- value + } + + matched <- TRUE + } + return(list(vars = vars, matched = matched)) +} + +#' Collapse Names +#' +#' This function combines the `name_given` and `name_given_pieces` columns in a data frame. +#' +#' @inheritParams readGedcom.legacy +#' @param df_temp A data frame containing the columns to be combined. 
+#' @keywords internal +collapseNames.legacy <- function(verbose, df_temp) { + if (verbose) { + print("Combining Duplicate Columns") + } + # need to check if any values aren't NA in name_given_pieces and name_surn_pieces + # Combine `name_given` and `name_given_pieces` + + # Combine `name_given` and `name_given_pieces` + if (!all(is.na(df_temp$name_given_pieces)) | !all(is.na(df_temp$name_given))) { + result <- combine_columns(df_temp$name_given, df_temp$name_given_pieces) + df_temp$name_given <- result$combined + if (!result$retain_col2) { + df_temp$name_given_pieces <- NULL + } + } + + # Combine `name_surn` and `name_surn_pieces` + if (!all(is.na(df_temp$name_surn_pieces)) | !all(is.na(df_temp$name_surn))) { + result <- combine_columns(df_temp$name_surn, df_temp$name_surn_pieces) + df_temp$name_surn <- result$combined + if (!result$retain_col2) { + df_temp$name_surn_pieces <- NULL + } + } + return(df_temp) +} diff --git a/R/simulatePedigree.R b/R/simulatePedigree.R index d50b7732..65f35ff8 100644 --- a/R/simulatePedigree.R +++ b/R/simulatePedigree.R @@ -24,7 +24,6 @@ buildWithinGenerations <- function(sizeGens, marR, sexR, Ngen) { df_Ngen$sex <- determineSex(idGen = idGen, sexR = sexR) - # print(paste("tiger",i)) # The first generation if (i == 1) { @@ -181,11 +180,9 @@ buildBetweenGenerations <- function(df_Fam, Ngen, sizeGens, verbose, marR, sexR, # count the number of couples in the i th gen countCouple <- (nrow(df_Ngen) - sum(is.na(df_Ngen$spID))) * .5 - # Now, assign couple IDs for the current generation df_Ngen <- assignCoupleIds(df_Ngen) - # get the number of linked female and male children after excluding the single children # get a vector of single person id in the ith generation IdSingle <- df_Ngen$id[is.na(df_Ngen$spID)] @@ -194,9 +191,13 @@ buildBetweenGenerations <- function(df_Fam, Ngen, sizeGens, verbose, marR, sexR, SingleM <- sum(df_Ngen$sex == "M" & is.na(df_Ngen$spID)) CoupleM <- N_LinkedMale - SingleM - df_Fam[df_Fam$gen == i, ] <- 
markPotentialChildren(df_Ngen = df_Ngen, i = i, Ngen = Ngen, sizeGens = sizeGens, CoupleF = CoupleF) - - + df_Fam[df_Fam$gen == i, ] <- markPotentialChildren( + df_Ngen = df_Ngen, + i = i, + Ngen = Ngen, + sizeGens = sizeGens, + CoupleF = CoupleF + ) if (verbose) { print( "Step 2.2: mark a group of potential parents in the i-1 th generation" @@ -251,7 +252,6 @@ buildBetweenGenerations <- function(df_Fam, Ngen, sizeGens, verbose, marR, sexR, # generate link kids to the couples random_numbers <- adjustKidsPerCouple(nMates = sum(df_Ngen$ifparent) / 2, kpc = kpc, rd_kpc = rd_kpc) - # cat("final random numbers",random_numbers, "\n") # cat("mean",sum(random_numbers)/length(random_numbers), "\n") # create two vectors for maId and paId; replicate the ids to match the same length as IdOfp @@ -343,6 +343,7 @@ buildBetweenGenerations <- function(df_Fam, Ngen, sizeGens, verbose, marR, sexR, #' @param balancedSex Not fully developed yet. Always \code{TRUE} in the current version. #' @param balancedMar Not fully developed yet. Always \code{TRUE} in the current version. #' @param verbose logical If TRUE, print progress through stages of algorithm +#' @param ... Additional arguments to be passed to other functions. #' @return A \code{data.frame} with each row representing a simulated individual. The columns are as follows: #' \itemize{ @@ -390,8 +391,6 @@ simulatePedigree <- function(kpc = 3, sizeGens = sizeGens, verbose = verbose, marR = marR, sexR = sexR, kpc = kpc, rd_kpc = rd_kpc ) - - df_Fam <- df_Fam[, 1:7] df_Fam <- df_Fam[!(is.na(df_Fam$pat) & is.na(df_Fam$mat) & is.na(df_Fam$spID)), ] colnames(df_Fam)[c(2, 4, 5)] <- c("ID", "dadID", "momID") @@ -408,3 +407,10 @@ simulatePedigree <- function(kpc = 3, # print(df_Fam) return(df_Fam) } + +#' @rdname simulatePedigree +#' @export +SimPed <- function(...) { # nolint: object_name_linter. + warning("The 'SimPed' function is deprecated. Please use 'simulatePedigree' instead.") + simulatePedigree(...) 
+} diff --git a/R/summarizePedigree.R b/R/summarizePedigree.R index 6c6564f3..6cee4e5e 100644 --- a/R/summarizePedigree.R +++ b/R/summarizePedigree.R @@ -39,7 +39,7 @@ summarizePedigrees <- function(ped, famID = "famID", personID = "ID", matID = "matID", patID = "patID", type = c("fathers", "mothers", "families"), byr = NULL, include_founder = FALSE, founder_sort_var = NULL, - nbiggest = 5, noldest = 5, skip_var = NULL, + nbiggest = 5, noldest = nbiggest, skip_var = NULL, five_num_summary = FALSE, network_checks = FALSE, verbose = FALSE) { # Fast Fails @@ -123,15 +123,11 @@ summarizePedigrees <- function(ped, famID = "famID", personID = "ID", ) # Find the originating member for each line if (include_founder) { - if (verbose) message("Finding originating members for families...") - originating_member_family <- findFounder(ped_dt, + family_summary_dt <- summarizeFounder( + verbose = verbose, ped_dt = ped_dt, group_var = famID, - sort_var = founder_sort_var - ) - # Merge summary statistics with originating members for additional information - family_summary_dt <- merge(family_summary_dt, - originating_member_family, - by = famID, suffixes = c("", "_founder") + sort_var = founder_sort_var, + foo_summary_dt = family_summary_dt ) } output$family_summary <- family_summary_dt @@ -146,14 +142,11 @@ summarizePedigrees <- function(ped, famID = "famID", personID = "ID", five_num_summary = five_num_summary ) if (include_founder) { - if (verbose) message("Finding originating members for matrilineal lines...") - originating_member_maternal <- findFounder(ped_dt, + maternal_summary_dt <- summarizeFounder( + verbose = verbose, ped_dt = ped_dt, group_var = matID, - sort_var = founder_sort_var - ) - maternal_summary_dt <- merge(maternal_summary_dt, - originating_member_maternal, - by = matID, suffixes = c("", "_founder") + sort_var = founder_sort_var, + foo_summary_dt = maternal_summary_dt ) } output$maternal_summary <- maternal_summary_dt @@ -167,14 +160,11 @@ summarizePedigrees <- 
function(ped, famID = "famID", personID = "ID", five_num_summary = five_num_summary ) if (include_founder) { - if (verbose) message("Finding originating members for patrilineal lines...") - originating_member_paternal <- findFounder(ped_dt, + paternal_summary_dt <- summarizeFounder( + verbose = verbose, ped_dt = ped_dt, group_var = patID, - sort_var = founder_sort_var - ) - paternal_summary_dt <- merge(paternal_summary_dt, - originating_member_paternal, - by = patID, suffixes = c("", "_founder") + sort_var = founder_sort_var, + foo_summary_dt = paternal_summary_dt ) } @@ -245,12 +235,11 @@ summarizePedigrees <- function(ped, famID = "famID", personID = "ID", ) } } - return(output) } -# Function to calculate summary statistics for all numeric variables +#' Function to calculate summary statistics for all numeric variables #' This function calculates summary statistics for all numeric variables in a data.table. It is supposed to be used internally by the \code{summarize_pedigree} function. #' @inheritParams summarizePedigrees #' @param data A data.table containing the pedigree data. @@ -298,8 +287,8 @@ calculateSummaryDT <- function(data, group_var, skip_var, return(summary_stats) } -# Function to find the originating member for each line - +#' Function to find the originating member for each line +#' #' This function finds the originating member for each line in a pedigree. It is supposed to be used internally by the \code{summarize_pedigree} function. #' @inheritParams summarizePedigrees #' @param sort_var A character string specifying the column name to sort by. @@ -312,7 +301,29 @@ findFounder <- function(data, group_var, sort_var) { data[order(get(sort_var)), .SD[1], by = group_var] } +#' Function to summarize the originating members for each line +#' +#' This function summarizes the originating members for each line in a pedigree. It is supposed to be used internally by the \code{summarize_pedigree} function. 
+#' +#' @inheritParams summarizePedigrees +#' @inheritParams findFounder +#' +#' @keywords internal +summarizeFounder <- function(ped_dt, group_var, sort_var, foo_summary_dt, verbose) { + if (verbose) message(paste0("Finding originating members for ", group_var)) + originating_member_foo <- findFounder( + data = ped_dt, + group_var = group_var, + sort_var = sort_var + ) + # Merge summary statistics with originating members for additional information + foo_summary_dt <- merge(foo_summary_dt, + originating_member_foo, + by = group_var, suffixes = c("", "_founder") + ) + return(foo_summary_dt) +} #' Summarize the maternal lines in a pedigree #' @inheritParams summarizePedigrees #' @seealso [summarizePedigrees ()] @@ -340,6 +351,9 @@ summarizeMatrilines <- function(ped, famID = "famID", personID = "ID", ) } + + + #' Summarize the paternal lines in a pedigree #' @inheritParams summarizePedigrees #' @seealso [summarizePedigrees ()] @@ -420,3 +434,19 @@ findBiggest <- function(foo_summary_dt, nbiggest, n_foo) { )]) return(biggest_foo) } + +#' @rdname summarizePedigrees +#' @export +summarisePedigrees <- summarizePedigrees + +#' @rdname summarizeFamilies +#' @export +summariseFamilies <- summarizeFamilies + +#' @rdname summarizeMatrilines +#' @export +summariseMatrilines <- summarizeMatrilines + +#' @rdname summarizePatrilines +#' @export +summarisePatrilines <- summarizePatrilines diff --git a/R/tweakPedigree.R b/R/tweakPedigree.R index 2367fe21..d617851b 100644 --- a/R/tweakPedigree.R +++ b/R/tweakPedigree.R @@ -18,7 +18,7 @@ makeTwins <- function(ped, ID_twin1 = NA_integer_, ID_twin2 = NA_integer_, gen_t "fam", "ID", "gen", "dadID", "momID", "spID", "sex" ), collapse = "")) { - ped <- standardizeColnames(ped) + ped <- standardizeColnames(ped, verbose = verbose) if (verbose) { cat("The input pedigree is not in the same format as the output of simulatePedigree\n") } @@ -131,7 +131,7 @@ makeInbreeding <- function(ped, c("fam", "ID", "gen", "dadID", "momID", "spID", 
"sex"), collapse = "" )) { - ped <- standardizeColnames(ped) + ped <- standardizeColnames(ped, verbose = verbose) if (verbose) { cat("The input pedigree is not in the same format as the output of simulatePedigree\n") } @@ -255,7 +255,7 @@ dropLink <- function(ped, if (!is.na(ID_drop)) { ped[ped$ID %in% ID_drop, c("dadID", "momID")] <- NA_integer_ } else { - warning("No individual is dropped from his/her parents.") + warning("No individual is dropped from their parents.") } } else { ped[ped$ID == ID_drop, c("dadID", "momID")] <- NA_integer_ diff --git a/data-raw/benchged.R b/data-raw/benchged.R new file mode 100644 index 00000000..18a9fc47 --- /dev/null +++ b/data-raw/benchged.R @@ -0,0 +1,31 @@ +library(microbenchmark) +library(Matrix) +# library(BGmisc) +# data("hazard") +library(tidyverse) + + + +# Run benchmarking for "loop" and "indexed" methods in ped2com() +benchmark_results <- microbenchmark( + reg = { + readGedcom("data-raw/royal92.ged") + }, + alpha = { + readGedcom.alpha("data-raw/royal92.ged") + }, + times = 5 # Run each method 100 times +) + +summary(benchmark_results) + +lm(benchmark_results$time ~ benchmark_results$expr) %>% + summary() +# Print benchmark results +print(benchmark_results) + +# Optional: Save results to CSV for later analysis +write.csv(summary(benchmark_results), + "benchmark_results.csv", + row.names = FALSE +) diff --git a/data-raw/benchmark.R b/data-raw/benchmark.R index 8c4c90b5..2add0e4f 100644 --- a/data-raw/benchmark.R +++ b/data-raw/benchmark.R @@ -2,7 +2,7 @@ library(microbenchmark) library(Matrix) # library(BGmisc) # data("hazard") - +library(tidyverse) # make big data set.seed(15) @@ -10,54 +10,307 @@ Ngen <- 5 kpc <- 5 sexR <- .50 marR <- .7 -ped <- simulatePedigree(kpc = kpc, Ngen = Ngen, sexR = sexR, marR = marR) - -# Define parameters -component <- "additive" # Change this to test different components -saveable <- FALSE # Disable saving to avoid disk I/O slowing down benchmarking -resume <- FALSE # Disable resume to 
ensure full fresh runs -save_path <- "checkpoint/" -verbose <- FALSE # Turn off verbose for cleaner output -update_rate <- 100 -save_rate_parlist <- 1000 - -# Run benchmarking for "loop" and "indexed" methods in ped2com() -benchmark_results <- microbenchmark( - loop = { - ped2com( - ped = ped, - component = component, - adjacency_method = "loop", # Test "loop" method - saveable = saveable, - resume = resume, - save_path = save_path, - verbose = verbose, - update_rate = update_rate, - save_rate_parlist = save_rate_parlist - ) - }, - indexed = { - ped2com( - ped = ped, - component = component, - adjacency_method = "indexed", # Test "indexed" method - saveable = saveable, - resume = resume, - save_path = save_path, - verbose = verbose, - update_rate = update_rate, - save_rate_parlist = save_rate_parlist +ped <- simulatePedigree(kpc = kpc, Ngen = Ngen, sexR = sexR, marR = marR) %>% + mutate( + fam = "fam 1" + ) +set.seed(151) +Ngen <- 5 +marR <- .8 +ped2 <- simulatePedigree(kpc = kpc, Ngen = Ngen, sexR = sexR, marR = marR) %>% + mutate( + fam = "fam 2", + ID = ID + max(ped2$ID, na.rm = TRUE), + momID = momID + max(ped$ID, na.rm = TRUE), + dadID = dadID + max(ped$ID, na.rm = TRUE), + spID = spID + max(ped$ID, na.rm = TRUE) + ) +set.seed(1151) +kpc <- 8 +Ngen <- 6 +ped3 <- simulatePedigree(kpc = kpc, Ngen = Ngen, sexR = sexR, marR = marR) %>% + mutate( + fam = "fam 3", + ID = ID + max(ped2$ID, na.rm = TRUE), + momID = momID + max(ped2$ID, na.rm = TRUE), + dadID = dadID + max(ped2$ID, na.rm = TRUE), + spID = spID + max(ped2$ID, na.rm = TRUE) + ) +ped3 <- ped3 %>% + mutate( + fam = "fam 4", + ID = ID + max(ped3$ID, na.rm = TRUE), + momID = momID + max(ped3$ID, na.rm = TRUE), + dadID = dadID + max(ped3$ID, na.rm = TRUE), + spID = spID + max(ped3$ID, na.rm = TRUE) + ) %>% + rbind(ped3) + +set.seed(1151) +kpc <- 2 +Ngen <- 10 +ped4 <- simulatePedigree(kpc = kpc, Ngen = Ngen, sexR = sexR, marR = marR) %>% + mutate( + fam = "fam 5", + ID = ID + max(ped3$ID, na.rm = TRUE), + 
momID = momID + max(ped3$ID, na.rm = TRUE), + dadID = dadID + max(ped3$ID, na.rm = TRUE), + spID = spID + max(ped3$ID, na.rm = TRUE) + ) + + +ped <- rbind(ped, ped2) +ped <- rbind(ped, ped3) +ped <- rbind(ped, ped4) + +if (TRUE) { + # Define parameters + component <- "common nuclear" # "additive" # Change this to test different components + saveable <- FALSE # Disable saving to avoid disk I/O slowing down benchmarking + resume <- FALSE # Disable resume to ensure full fresh runs + save_path <- "checkpoint/" + verbose <- FALSE # Turn off verbose for cleaner output + update_rate <- 100 + save_rate_parlist <- 1000 + # method_approach <- 1 + # Run benchmarking for "loop" and "indexed" methods in ped2com() + benchmark_results <- microbenchmark( + # loop_big = { + # ped2com( + # ped = ped, + # component = component, + # adjacency_method = "loop", # Test "loop" method + # saveable = saveable, + # resume = resume, + # save_path = save_path, + # verbose = verbose, + # update_rate = update_rate, + # save_rate_parlist = save_rate_parlist + # ) + # }, + indexed_big = { + ped2com( + ped = ped, + component = component, + adjacency_method = "indexed", # Test "indexed" method + saveable = saveable, + resume = resume, + save_path = save_path, + verbose = verbose, + update_rate = update_rate, + save_rate_parlist = save_rate_parlist + ) + }, + direct4_big = { + ped2com( + ped = ped, + component = component, + adjacency_method = "direct", # Test "indexed" method + saveable = saveable, + resume = resume, + save_path = save_path, + verbose = verbose, + update_rate = update_rate, + method_approach = 4, + save_rate_parlist = save_rate_parlist + ) + }, + direct2_big = { + ped2com( + ped = ped, + component = component, + adjacency_method = "direct", # Test "indexed" method + saveable = saveable, + resume = resume, + save_path = save_path, + verbose = verbose, + update_rate = update_rate, + method_approach = 2, + save_rate_parlist = save_rate_parlist + ) + }, + direct5_big = { + ped2com( + 
ped = ped, + component = component, + adjacency_method = "direct", # Test "indexed" method + saveable = saveable, + resume = resume, + save_path = save_path, + verbose = verbose, + update_rate = update_rate, + method_approach = 5, + save_rate_parlist = save_rate_parlist + ) + }, + # loop = { + # ped2com( + # ped = ped2, + # component = component, + # adjacency_method = "loop", # Test "loop" method + # saveable = saveable, + # resume = resume, + # save_path = save_path, + # verbose = verbose, + # update_rate = update_rate, + # save_rate_parlist = save_rate_parlist + # + # ) + # }, + indexed = { + ped2com( + ped = ped2, + component = component, + adjacency_method = "indexed", # Test "indexed" method + saveable = saveable, + resume = resume, + save_path = save_path, + verbose = verbose, + update_rate = update_rate, + save_rate_parlist = save_rate_parlist + ) + }, + direct4 = { + ped2com( + ped = ped2, + component = component, + adjacency_method = "direct", # Test "indexed" method + saveable = saveable, + resume = resume, + save_path = save_path, + verbose = verbose, + update_rate = update_rate, + method_approach = 4, + save_rate_parlist = save_rate_parlist + ) + }, + direct2 = { + ped2com( + ped = ped2, + component = component, + adjacency_method = "direct", # Test "indexed" method + saveable = saveable, + resume = resume, + save_path = save_path, + verbose = verbose, + update_rate = update_rate, + method_approach = 2, + save_rate_parlist = save_rate_parlist + ) + }, + direct5 = { + ped2com( + ped = ped2, + component = component, + adjacency_method = "direct", # Test "indexed" method + saveable = saveable, + resume = resume, + save_path = save_path, + verbose = verbose, + update_rate = update_rate, + method_approach = 5, + save_rate_parlist = save_rate_parlist + ) + }, + times = 10 # Run each method 100 times + ) + + + summary(benchmark_results) + + df_plot <- benchmark_results %>% mutate( + size = case_when( + expr %in% c("loop", "indexed", "direct4", "direct2", 
"direct5") ~ "small", + expr %in% c("loop_big", "indexed_big", "direct4_big", "direct2_big", "direct5_big") ~ "big" + ), + method = case_when( + expr %in% c("loop", "loop_big") ~ "loop", + expr %in% c("indexed", "indexed_big") ~ "indexed", + expr %in% c("direct4", "direct4_big") ~ "direct4", + expr %in% c("direct2", "direct2_big") ~ "direct2", + expr %in% c("direct5", "direct5_big") ~ "direct5" ) - }, - times = 100 # Run each method 100 times -) + ) # %>% + + # set indexed as reference level + df_plot$method <- factor(df_plot$method, levels = c("indexed", "loop", "direct2", "direct4", "direct5")) + df_plot$size <- factor(df_plot$size, levels = c("small", "big")) + + + lm(time ~ method * size, data = df_plot) %>% + summary() %>% + print() + + + p <- ggplot(df_plot, aes(x = method, y = time)) + + geom_boxplot(aes(fill = size), alpha = 0.5) + + labs( + title = "Benchmarking Results", + x = "Method", + y = "Time (seconds)" + ) + + theme_minimal() + + theme(axis.text.x = element_text(angle = 45, hjust = 1)) + # Print benchmark results + + p + print(benchmark_results) + + # Optional: Save results to CSV for later analysis + write.csv(summary(benchmark_results), + "benchmark_results.csv", + row.names = FALSE + ) + # Print benchmark +} -# Print benchmark +if (FALSE) { + verbose <- FALSE + ad_ped_matrix <- ped2com(ped, component = "additive", adjacency_method = "direct", sparse = TRUE) + mit_ped_matrix <- ped2com(ped, component = "mitochondrial", adjacency_method = "direct", sparse = TRUE) + cn_ped_matrix <- ped2com(ped, component = "common nuclear", adjacency_method = "indexed", sparse = TRUE) + benchmark_results <- microbenchmark( + beta = { + com2links.beta( + ad_ped_matrix = ad_ped_matrix, + mit_ped_matrix = mit_ped_matrix, + cn_ped_matrix = cn_ped_matrix, + writetodisk = TRUE, + verbose = verbose + ) + file.remove("dataRelatedPairs.csv") + }, regular = { + com2links( + ad_ped_matrix = ad_ped_matrix, + mit_ped_matrix = mit_ped_matrix, + cn_ped_matrix = cn_ped_matrix, + 
writetodisk = TRUE, + verbose = verbose + ) + file.remove("dataRelatedPairs.csv") + }, legacy = { + com2links( + ad_ped_matrix = ad_ped_matrix, + mit_ped_matrix = mit_ped_matrix, + cn_ped_matrix = cn_ped_matrix, + verbose = verbose, + legacy = TRUE + ) + file.remove("dataRelatedPairs.csv") + }, + times = 100 # Run each method 100 times + ) + summary(benchmark_results) -# Print benchmark results -print(benchmark_results) + lm(benchmark_results$time ~ benchmark_results$expr) %>% + summary() + # Print benchmark results + print(benchmark_results) -# Optional: Save results to CSV for later analysis -write.csv(summary(benchmark_results), - "benchmark_results.csv", row.names = FALSE) + # Optional: Save results to CSV for later analysis + write.csv(summary(benchmark_results), + "benchmark_results.csv", + row.names = FALSE + ) +} diff --git a/data-raw/df_inbreeding.R b/data-raw/df_inbreeding.R index 9af10ff0..430830ff 100644 --- a/data-raw/df_inbreeding.R +++ b/data-raw/df_inbreeding.R @@ -8,5 +8,5 @@ inbreeding <- raw ## # data processing -#write.csv(inbreeding, "data-raw/inbreeding.csv", row.names = FALSE) +# write.csv(inbreeding, "data-raw/inbreeding.csv", row.names = FALSE) usethis::use_data(inbreeding, overwrite = TRUE, compress = "xz") diff --git a/data-raw/df_potter.R b/data-raw/df_potter.R index aa039c81..d1f5faf3 100644 --- a/data-raw/df_potter.R +++ b/data-raw/df_potter.R @@ -45,9 +45,11 @@ potter <- data.frame( "Molly Weasley", "Lucy Weasley" ), - gen = c(1, 1, 1, 1, 1, 2, 2, 2, 1, 1, 2, 2, - 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, - 3, 3, 3, 3, 3, 3), + gen = c( + 1, 1, 1, 1, 1, 2, 2, 2, 1, 1, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3 + ), momID = c( 101, 101, 103, 103, NA, 3, 4, 10, NA, NA, 10, 10, 10, 10, 10, 10, NA, 105, 105, NA, diff --git a/data/royal92.rda b/data/royal92.rda index 4a10b174..678261bd 100644 Binary files a/data/royal92.rda and b/data/royal92.rda differ diff --git a/man/SimPed.Rd b/man/SimPed.Rd deleted file mode 100644 index 
79dd1f96..00000000 --- a/man/SimPed.Rd +++ /dev/null @@ -1,33 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/helpGeneric.R -\name{SimPed} -\alias{SimPed} -\title{SimPed (Deprecated)} -\usage{ -SimPed(...) -} -\arguments{ -\item{...}{Arguments to be passed to `simulatePedigree`.} -} -\value{ -The same result as calling `simulatePedigree`. -} -\description{ -When calling this function, a warning will be issued about its deprecation. -} -\details{ -This function is a wrapper around the new `simulatePedigree` function. -`SimPed` has been deprecated, and it's advised to use `simulatePedigree` directly. -} -\examples{ -\dontrun{ -# This is an example of the deprecated function: -SimPed(...) -# It is recommended to use: -simulatePedigree(...) -} -} -\seealso{ -\code{\link{simulatePedigree}} for the updated function. -} -\keyword{deprecated} diff --git a/man/addRowlessParents.Rd b/man/addRowlessParents.Rd new file mode 100644 index 00000000..83132805 --- /dev/null +++ b/man/addRowlessParents.Rd @@ -0,0 +1,18 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/checkParents.R +\name{addRowlessParents} +\alias{addRowlessParents} +\title{Add addRowlessParents} +\usage{ +addRowlessParents(ped, verbose, validation_results) +} +\arguments{ +\item{ped}{A dataframe representing the pedigree data with columns 'ID', 'dadID', and 'momID'.} + +\item{verbose}{A logical flag indicating whether to print progress and validation messages to the console.} + +\item{validation_results}{validation results} +} +\description{ +This function adds parents who appear in momID or dadID but are missing from ID +} diff --git a/man/applyTagMappings.Rd b/man/applyTagMappings.Rd new file mode 100644 index 00000000..ae08a548 --- /dev/null +++ b/man/applyTagMappings.Rd @@ -0,0 +1,27 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/readGedcom.R +\name{applyTagMappings} +\alias{applyTagMappings} 
+\title{Apply Tag Mappings to a Line} +\usage{ +applyTagMappings(line, record, pattern_rows, tag_mappings) +} +\arguments{ +\item{line}{A character string from the GEDCOM file.} + +\item{record}{A named list representing the individual's record.} + +\item{pattern_rows}{A list with GEDCOM tag counts.} + +\item{tag_mappings}{A list of lists. Each sublist should define: +- \code{tag}: the GEDCOM tag, +- \code{field}: the record field to update, +- \code{mode}: either "replace" or "append", +- \code{extractor}: (optional) a custom extraction function.} +} +\value{ +A list with the updated record (\code{record}) and a logical flag (\code{matched}). +} +\description{ +Iterates over a list of tag mappings and, if a tag matches the line, updates the record. +} diff --git a/man/assignCoupleIds.Rd b/man/assignCoupleIDs.Rd similarity index 88% rename from man/assignCoupleIds.Rd rename to man/assignCoupleIDs.Rd index 7f379f64..95165581 100644 --- a/man/assignCoupleIds.Rd +++ b/man/assignCoupleIDs.Rd @@ -1,9 +1,12 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/helpPedigree.R -\name{assignCoupleIds} +\name{assignCoupleIDs} +\alias{assignCoupleIDs} \alias{assignCoupleIds} \title{Assign Couple IDs} \usage{ +assignCoupleIDs(df_Ngen) + assignCoupleIds(df_Ngen) } \arguments{ diff --git a/man/allGens.Rd b/man/calcAllGens.Rd similarity index 91% rename from man/allGens.Rd rename to man/calcAllGens.Rd index 6bc6d9e0..66a89c25 100644 --- a/man/allGens.Rd +++ b/man/calcAllGens.Rd @@ -1,10 +1,13 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/calculateFamilySize.R -\name{allGens} +\name{calcAllGens} +\alias{calcAllGens} \alias{allGens} \title{allGens A function to calculate the number of individuals in each generation. 
This is a supporting function for \code{simulatePedigree}.} \usage{ +calcAllGens(kpc, Ngen, marR) + allGens(kpc, Ngen, marR) } \arguments{ diff --git a/man/famSizeCal.Rd b/man/calcFamilySize.Rd similarity index 90% rename from man/famSizeCal.Rd rename to man/calcFamilySize.Rd index e51f95ee..a0128d0c 100644 --- a/man/famSizeCal.Rd +++ b/man/calcFamilySize.Rd @@ -1,10 +1,13 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/calculateFamilySize.R -\name{famSizeCal} +\name{calcFamilySize} +\alias{calcFamilySize} \alias{famSizeCal} \title{famSizeCal A function to calculate the total number of individuals in a pedigree given parameters. This is a supporting function for function \code{simulatePedigree}} \usage{ +calcFamilySize(kpc, Ngen, marR) + famSizeCal(kpc, Ngen, marR) } \arguments{ diff --git a/man/sizeAllGens.Rd b/man/calcFamilySizeByGen.Rd similarity index 86% rename from man/sizeAllGens.Rd rename to man/calcFamilySizeByGen.Rd index e477c19e..ae3e5e88 100644 --- a/man/sizeAllGens.Rd +++ b/man/calcFamilySizeByGen.Rd @@ -1,10 +1,13 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/calculateFamilySize.R -\name{sizeAllGens} +\name{calcFamilySizeByGen} +\alias{calcFamilySizeByGen} \alias{sizeAllGens} \title{sizeAllGens An internal supporting function for \code{simulatePedigree}.} \usage{ +calcFamilySizeByGen(kpc, Ngen, marR) + sizeAllGens(kpc, Ngen, marR) } \arguments{ diff --git a/man/calculateRelatedness.Rd b/man/calculateRelatedness.Rd index b9a8f067..6d932e63 100644 --- a/man/calculateRelatedness.Rd +++ b/man/calculateRelatedness.Rd @@ -2,6 +2,7 @@ % Please edit documentation in R/computeRelatedness.R \name{calculateRelatedness} \alias{calculateRelatedness} +\alias{related_coef} \title{Calculate Relatedness Coefficient} \usage{ calculateRelatedness( @@ -18,6 +19,8 @@ calculateRelatedness( denom_m = FALSE, ... ) + +related_coef(...) 
} \arguments{ \item{generations}{Number of generations back of common ancestors the pair share.} diff --git a/man/calculateSummaryDT.Rd b/man/calculateSummaryDT.Rd index 45e67d3f..cb403b04 100644 --- a/man/calculateSummaryDT.Rd +++ b/man/calculateSummaryDT.Rd @@ -2,7 +2,8 @@ % Please edit documentation in R/summarizePedigree.R \name{calculateSummaryDT} \alias{calculateSummaryDT} -\title{This function calculates summary statistics for all numeric variables in a data.table. It is supposed to be used internally by the \code{summarize_pedigree} function.} +\title{Function to calculate summary statistics for all numeric variables +This function calculates summary statistics for all numeric variables in a data.table. It is supposed to be used internally by the \code{summarize_pedigree} function.} \usage{ calculateSummaryDT(data, group_var, skip_var, five_num_summary = FALSE) } @@ -20,6 +21,7 @@ the minimum, median, and maximum values.} A data.table containing the summary statistics for all numeric variables. } \description{ +Function to calculate summary statistics for all numeric variables This function calculates summary statistics for all numeric variables in a data.table. It is supposed to be used internally by the \code{summarize_pedigree} function. 
} \keyword{internal} diff --git a/man/checkIDuniqueness.Rd b/man/checkIDuniqueness.Rd new file mode 100644 index 00000000..6c1aebf6 --- /dev/null +++ b/man/checkIDuniqueness.Rd @@ -0,0 +1,19 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/checkIDs.R +\name{checkIDuniqueness} +\alias{checkIDuniqueness} +\title{Check for duplicated individual IDs} +\usage{ +checkIDuniqueness(ped, verbose = FALSE) +} +\arguments{ +\item{ped}{A dataframe representing the pedigree data with columns `ID`, `dadID`, and `momID`.} + +\item{verbose}{A logical flag indicating whether to print progress and validation messages to the console.} +} +\value{ +A list containing the results of the check +} +\description{ +This function checks for duplicated individual IDs in a pedigree. +} diff --git a/man/checkWithinRowDuplicates.Rd b/man/checkWithinRowDuplicates.Rd new file mode 100644 index 00000000..4797306c --- /dev/null +++ b/man/checkWithinRowDuplicates.Rd @@ -0,0 +1,19 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/checkIDs.R +\name{checkWithinRowDuplicates} +\alias{checkWithinRowDuplicates} +\title{Check for within-row duplicates (self-parents, same mom/dad)} +\usage{ +checkWithinRowDuplicates(ped, verbose = FALSE) +} +\arguments{ +\item{ped}{A dataframe representing the pedigree data with columns `ID`, `dadID`, and `momID`.} + +\item{verbose}{A logical flag indicating whether to print progress and validation messages to the console.} +} +\value{ +A list containing the results of the check +} +\description{ +This function checks for within-row duplicates in a pedigree. 
+} diff --git a/man/collapseNames.Rd b/man/collapseNames.Rd new file mode 100644 index 00000000..b3a0c514 --- /dev/null +++ b/man/collapseNames.Rd @@ -0,0 +1,19 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/readGedcom.R +\name{collapseNames} +\alias{collapseNames} +\title{collapse Names} +\usage{ +collapseNames(verbose, df_temp) +} +\arguments{ +\item{verbose}{A logical value indicating whether to print messages.} + +\item{df_temp}{A data frame containing the columns to be combined.} +} +\value{ +A data frame with the combined columns. +} +\description{ +This function combines the `name_given` and `name_given_pieces` columns in a data frame. +} diff --git a/man/collapseNames.legacy.Rd b/man/collapseNames.legacy.Rd new file mode 100644 index 00000000..dbf12d7c --- /dev/null +++ b/man/collapseNames.legacy.Rd @@ -0,0 +1,17 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/readGedcomlegacy.R +\name{collapseNames.legacy} +\alias{collapseNames.legacy} +\title{collapse Names} +\usage{ +collapseNames.legacy(verbose, df_temp) +} +\arguments{ +\item{verbose}{A logical value indicating whether to print messages.} + +\item{df_temp}{A data frame containing the columns to be combined.} +} +\description{ +This function combines the `name_given` and `name_given_pieces` columns in a data frame. +} +\keyword{internal} diff --git a/man/com2links.Rd b/man/com2links.Rd index 9dae2f09..dc846645 100644 --- a/man/com2links.Rd +++ b/man/com2links.Rd @@ -18,6 +18,7 @@ com2links( legacy = FALSE, outcome_name = "data", drop_upper_triangular = TRUE, + include_all_links_1ped = FALSE, ... ) } @@ -48,6 +49,8 @@ com2links( \item{drop_upper_triangular}{Logical. If TRUE, drops the upper triangular portion of the matrix.} +\item{include_all_links_1ped}{Logical. If TRUE, includes all links in the output. 
(Default is true when only one ped is provided)} + \item{...}{Additional arguments to be passed to \code{\link{com2links}}} } \value{ diff --git a/man/com2links.legacy.Rd b/man/com2links.legacy.Rd index 757117a1..4467dc85 100644 --- a/man/com2links.legacy.Rd +++ b/man/com2links.legacy.Rd @@ -1,8 +1,8 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/makeLinks.R +% Please edit documentation in R/makeLinkslegacy.R \name{com2links.legacy} \alias{com2links.legacy} -\title{Convert Pedigree Matrices to Related Pairs File (Legacy)} +\title{Convert Sparse Relationship Matrices to Kinship Links} \usage{ com2links.legacy( rel_pairs_file = "dataRelatedPairs.csv", @@ -10,9 +10,14 @@ com2links.legacy( mit_ped_matrix = mt_ped_matrix, mt_ped_matrix = NULL, cn_ped_matrix = NULL, - update_rate = 500, + write_buffer_size = 1000, + update_rate = 1000, + gc = TRUE, + writetodisk = TRUE, verbose = FALSE, + legacy = FALSE, outcome_name = "data", + drop_upper_triangular = TRUE, ... ) } @@ -27,14 +32,25 @@ com2links.legacy( \item{cn_ped_matrix}{Matrix of common nuclear relatedness coefficients.} +\item{write_buffer_size}{Number of related pairs to write to disk at a time.} + \item{update_rate}{Numeric. Frequency (in iterations) at which progress messages are printed.} +\item{gc}{Logical. If TRUE, performs garbage collection via \code{\link{gc}} to free memory.} + +\item{writetodisk}{Logical. If TRUE, writes the related pairs to disk; if FALSE, returns a data frame.} + \item{verbose}{Logical. If TRUE, prints progress messages.} +\item{legacy}{Logical. If TRUE, uses the legacy branch of the function.} + \item{outcome_name}{Character string representing the outcome name (used in file naming).} +\item{drop_upper_triangular}{Logical. If TRUE, drops the upper triangular portion of the matrix.} + \item{...}{Additional arguments to be passed to \code{\link{com2links}}} } \description{ -This legacy function converts pedigree matrices into a related pairs file. 
+Convert Sparse Relationship Matrices to Kinship Links } +\keyword{internal} diff --git a/man/com2links.og.Rd b/man/com2links.og.Rd new file mode 100644 index 00000000..e51c53cd --- /dev/null +++ b/man/com2links.og.Rd @@ -0,0 +1,41 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/makeLinkslegacy.R +\name{com2links.og} +\alias{com2links.og} +\title{Convert Pedigree Matrices to Related Pairs File (Legacy)} +\usage{ +com2links.og( + rel_pairs_file = "dataRelatedPairs.csv", + ad_ped_matrix = NULL, + mit_ped_matrix = mt_ped_matrix, + mt_ped_matrix = NULL, + cn_ped_matrix = NULL, + update_rate = 500, + verbose = FALSE, + outcome_name = "data", + ... +) +} +\arguments{ +\item{rel_pairs_file}{File path to write related pairs to (CSV format).} + +\item{ad_ped_matrix}{Matrix of additive genetic relatedness coefficients.} + +\item{mit_ped_matrix}{Matrix of mitochondrial relatedness coefficients. Alias: \code{mt_ped_matrix}.} + +\item{mt_ped_matrix}{Matrix of mitochondrial relatedness coefficients.} + +\item{cn_ped_matrix}{Matrix of common nuclear relatedness coefficients.} + +\item{update_rate}{Numeric. Frequency (in iterations) at which progress messages are printed.} + +\item{verbose}{Logical. If TRUE, prints progress messages.} + +\item{outcome_name}{Character string representing the outcome name (used in file naming).} + +\item{...}{Additional arguments to be passed to \code{\link{com2links}}} +} +\description{ +This legacy function converts pedigree matrices into a related pairs file. 
+} +\keyword{internal} diff --git a/man/combine_columns.Rd b/man/combine_columns.Rd index 43554b0b..9ab750ef 100644 --- a/man/combine_columns.Rd +++ b/man/combine_columns.Rd @@ -1,9 +1,11 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/readGedcom.R +% Please edit documentation in R/readGedcom.R, R/readGedcomlegacy.R \name{combine_columns} \alias{combine_columns} \title{Combine Columns} \usage{ +combine_columns(col1, col2) + combine_columns(col1, col2) } \arguments{ @@ -12,9 +14,13 @@ combine_columns(col1, col2) \item{col2}{The second column to combine.} } \value{ +A list with the combined column and a flag indicating if the second column should be retained. + A list with the combined column and a flag indicating if the second column should be retained. } \description{ +This function combines two columns, handling conflicts and merging non-conflicting data. + This function combines two columns, handling conflicts and merging non-conflicting data. } \keyword{internal} diff --git a/man/compute_parent_adjacency.Rd b/man/compute_parent_adjacency.Rd index 9cd4311c..f6364808 100644 --- a/man/compute_parent_adjacency.Rd +++ b/man/compute_parent_adjacency.Rd @@ -7,18 +7,19 @@ compute_parent_adjacency( ped, component, - adjacency_method = "indexed", + adjacency_method = "direct", saveable, resume, save_path, - verbose, - lastComputed, + verbose = FALSE, + lastComputed = 0, nr, checkpoint_files, update_rate, parList, lens, save_rate_parlist, + adjBeta_method = NULL, ... ) } @@ -27,7 +28,7 @@ compute_parent_adjacency( \item{component}{character. Which component of the pedigree to return. See Details.} -\item{adjacency_method}{character. The method to use for computing the adjacency matrix. Options are "loop" or "indexed"} +\item{adjacency_method}{character. The method to use for computing the adjacency matrix. Options are "loop", "indexed", direct or beta} \item{saveable}{logical. 
If TRUE, save the intermediate results to disk} @@ -51,6 +52,8 @@ compute_parent_adjacency( \item{save_rate_parlist}{numeric. The rate at which to save the intermediate results by parent list. If NULL, defaults to save_rate*1000} +\item{adjBeta_method}{numeric The method to use for computing the building the adjacency_method matrix when using the "beta" build} + \item{...}{additional arguments to be passed to \code{\link{ped2com}}} } \description{ diff --git a/man/countPatternRows.Rd b/man/countPatternRows.Rd index f9100912..3fe3c3af 100644 --- a/man/countPatternRows.Rd +++ b/man/countPatternRows.Rd @@ -2,17 +2,17 @@ % Please edit documentation in R/readGedcom.R \name{countPatternRows} \alias{countPatternRows} -\title{Check for Pattern Rows} +\title{Count GEDCOM Pattern Rows} \usage{ countPatternRows(file) } \arguments{ -\item{file}{A data frame containing the GEDCOM file.} +\item{file}{A data frame with a column \code{X1} containing GEDCOM lines.} } \value{ -A list with the number of rows containing each pattern. +A list with counts of specific GEDCOM tag occurrences. } \description{ -This function counts the number of rows containing specific patterns. +Counts the number of lines in a file (passed as a data frame with column "X1") +that match various GEDCOM patterns. } -\keyword{internal} diff --git a/man/countPatternRows.legacy.Rd b/man/countPatternRows.legacy.Rd new file mode 100644 index 00000000..c55ccf85 --- /dev/null +++ b/man/countPatternRows.legacy.Rd @@ -0,0 +1,18 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/readGedcomlegacy.R +\name{countPatternRows.legacy} +\alias{countPatternRows.legacy} +\title{Check for Pattern Rows} +\usage{ +countPatternRows.legacy(file) +} +\arguments{ +\item{file}{A data frame containing the GEDCOM file.} +} +\value{ +A list with the number of rows containing each pattern. +} +\description{ +This function counts the number of rows containing specific patterns. 
+} +\keyword{internal} diff --git a/man/determineSex.Rd b/man/determineSex.Rd index c98644f6..39711ada 100644 --- a/man/determineSex.Rd +++ b/man/determineSex.Rd @@ -4,12 +4,16 @@ \alias{determineSex} \title{Determine Sex of Offspring} \usage{ -determineSex(idGen, sexR) +determineSex(idGen, sexR, code_male = "M", code_female = "F") } \arguments{ \item{idGen}{Vector of IDs for the generation.} \item{sexR}{Numeric value indicating the sex ratio (proportion of males).} + +\item{code_male}{The value to use for males. Default is "M"} + +\item{code_female}{The value to use for females. Default is "F"} } \value{ Vector of sexes ("M" for male, "F" for female) for the offspring. diff --git a/man/extract_info.legacy.Rd b/man/extract_info.legacy.Rd new file mode 100644 index 00000000..23ac5bd8 --- /dev/null +++ b/man/extract_info.legacy.Rd @@ -0,0 +1,20 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/readGedcomlegacy.R +\name{extract_info.legacy} +\alias{extract_info.legacy} +\title{Extract Information from Line} +\usage{ +extract_info.legacy(line, type) +} +\arguments{ +\item{line}{A character string representing a line from a GEDCOM file.} + +\item{type}{A character string representing the type of information to extract.} +} +\value{ +A character string with the extracted information. +} +\description{ +This function extracts information from a line based on a specified type. +} +\keyword{internal} diff --git a/man/findFounder.Rd b/man/findFounder.Rd index a4cf70bb..19bfcb1f 100644 --- a/man/findFounder.Rd +++ b/man/findFounder.Rd @@ -2,7 +2,7 @@ % Please edit documentation in R/summarizePedigree.R \name{findFounder} \alias{findFounder} -\title{This function finds the originating member for each line in a pedigree. 
It is supposed to be used internally by the \code{summarize_pedigree} function.} +\title{Function to find the originating member for each line} \usage{ findFounder(data, group_var, sort_var) } diff --git a/man/inferRelatedness.Rd b/man/inferRelatedness.Rd index dba7a0ed..1dd8b17c 100644 --- a/man/inferRelatedness.Rd +++ b/man/inferRelatedness.Rd @@ -2,9 +2,12 @@ % Please edit documentation in R/computeRelatedness.R \name{inferRelatedness} \alias{inferRelatedness} +\alias{relatedness} \title{Infer Relatedness Coefficient} \usage{ inferRelatedness(obsR, aceA = 0.9, aceC = 0, sharedC = 0) + +relatedness(...) } \arguments{ \item{obsR}{Numeric. Observed correlation between the two groups. Must be between -1 and 1.} @@ -14,6 +17,8 @@ inferRelatedness(obsR, aceA = 0.9, aceC = 0, sharedC = 0) \item{aceC}{Numeric. Proportion of variance attributable to shared environmental variance. Must be between 0 and 1. Default is 0.} \item{sharedC}{Numeric. Proportion of shared environment shared between the two individuals. Must be between 0 (no shared environment) and 1 (completely shared environment). Default is 0.} + +\item{...}{Further named arguments that may be passed to another function.} } \value{ Numeric. The calculated relatedness coefficient (`est_r`). diff --git a/man/initializeRecord.Rd b/man/initializeRecord.Rd new file mode 100644 index 00000000..3c0c08ca --- /dev/null +++ b/man/initializeRecord.Rd @@ -0,0 +1,17 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/readGedcom.R +\name{initializeRecord} +\alias{initializeRecord} +\title{Initialize an Empty Individual Record} +\usage{ +initializeRecord(all_var_names) +} +\arguments{ +\item{all_var_names}{A character vector of variable names.} +} +\value{ +A named list representing an empty individual record. +} +\description{ +Creates a named list with all GEDCOM fields set to NA. 
+} diff --git a/man/evenInsert.Rd b/man/insertEven.Rd similarity index 93% rename from man/evenInsert.Rd rename to man/insertEven.Rd index 2dae39c1..7fc08138 100644 --- a/man/evenInsert.Rd +++ b/man/insertEven.Rd @@ -1,10 +1,13 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/insertEven.R -\name{evenInsert} +\name{insertEven} +\alias{insertEven} \alias{evenInsert} \title{evenInsert A function to insert m elements evenly into a length n vector.} \usage{ +insertEven(m, n, verbose = FALSE) + evenInsert(m, n, verbose = FALSE) } \arguments{ diff --git a/man/mapFAMC2parents.legacy.Rd b/man/mapFAMC2parents.legacy.Rd new file mode 100644 index 00000000..f3166e4f --- /dev/null +++ b/man/mapFAMC2parents.legacy.Rd @@ -0,0 +1,21 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/readGedcomlegacy.R +\name{mapFAMC2parents.legacy} +\alias{mapFAMC2parents.legacy} +\title{Assign momID and dadID based on family mapping} +\usage{ +mapFAMC2parents.legacy(df_temp, family_to_parents) +} +\arguments{ +\item{df_temp}{A data frame containing individual information.} + +\item{family_to_parents}{A list mapping family IDs to parent IDs.} +} +\value{ +A data frame with added momID and dad_ID columns. +} +\description{ +This function assigns mother and father IDs to individuals in the data frame +based on the mapping of family IDs to parent IDs. 
+} +\keyword{internal} diff --git a/man/mapFAMS2parents.Rd b/man/mapFAMS2parents.Rd index 25d5a9f3..10cfb4e4 100644 --- a/man/mapFAMS2parents.Rd +++ b/man/mapFAMS2parents.Rd @@ -2,17 +2,17 @@ % Please edit documentation in R/readGedcom.R \name{mapFAMS2parents} \alias{mapFAMS2parents} -\title{Create a mapping of family IDs to parent IDs} +\title{Create a Mapping from Family IDs to Parent IDs} \usage{ mapFAMS2parents(df_temp) } \arguments{ -\item{df_temp}{A data frame containing information about individuals.} +\item{df_temp}{A data frame produced by \code{readGedcom()}.} } \value{ -A list mapping family IDs to parent IDs. +A list mapping family IDs to parent information. } \description{ -This function creates a mapping from family IDs to the IDs of the parents. +This function scans the data frame and creates a mapping of family IDs +to the corresponding parent IDs. } -\keyword{internal} diff --git a/man/mapFAMS2parents.legacy.Rd b/man/mapFAMS2parents.legacy.Rd new file mode 100644 index 00000000..798af515 --- /dev/null +++ b/man/mapFAMS2parents.legacy.Rd @@ -0,0 +1,18 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/readGedcomlegacy.R +\name{mapFAMS2parents.legacy} +\alias{mapFAMS2parents.legacy} +\title{Create a mapping of family IDs to parent IDs} +\usage{ +mapFAMS2parents.legacy(df_temp) +} +\arguments{ +\item{df_temp}{A data frame containing information about individuals.} +} +\value{ +A list mapping family IDs to parent IDs. +} +\description{ +This function creates a mapping from family IDs to the IDs of the parents. 
+} +\keyword{internal} diff --git a/man/nullToNA.Rd b/man/null2NA.Rd similarity index 86% rename from man/nullToNA.Rd rename to man/null2NA.Rd index 4bccb4b3..cb6d7571 100644 --- a/man/nullToNA.Rd +++ b/man/null2NA.Rd @@ -1,9 +1,12 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/helpGeneric.R -\name{nullToNA} +\name{null2NA} +\alias{null2NA} \alias{nullToNA} \title{nullToNA} \usage{ +null2NA(x) + nullToNA(x) } \arguments{ diff --git a/man/parseIndividualBlock.Rd b/man/parseIndividualBlock.Rd new file mode 100644 index 00000000..8f58554b --- /dev/null +++ b/man/parseIndividualBlock.Rd @@ -0,0 +1,24 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/readGedcom.R +\name{parseIndividualBlock} +\alias{parseIndividualBlock} +\title{Parse a GEDCOM Individual Block} +\usage{ +parseIndividualBlock(block, pattern_rows, all_var_names, verbose = FALSE) +} +\arguments{ +\item{block}{A character vector containing the GEDCOM lines for one individual.} + +\item{pattern_rows}{A list with counts of lines matching specific GEDCOM tags.} + +\item{all_var_names}{A character vector of variable names.} + +\item{verbose}{Logical indicating whether to print progress messages.} +} +\value{ +A named list representing the parsed record for the individual, or NULL if no ID is found. +} +\description{ +Processes a block of GEDCOM lines corresponding to a single individual. 
+} +\keyword{internal} diff --git a/man/parseNameLine.Rd b/man/parseNameLine.Rd new file mode 100644 index 00000000..44490d60 --- /dev/null +++ b/man/parseNameLine.Rd @@ -0,0 +1,19 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/readGedcom.R +\name{parseNameLine} +\alias{parseNameLine} +\title{Parse a Full Name Line} +\usage{ +parseNameLine(line, record) +} +\arguments{ +\item{line}{A character string containing the name line.} + +\item{record}{A named list representing the individual's record.} +} +\value{ +The updated record with parsed name information. +} +\description{ +Extracts full name information from a GEDCOM "NAME" line and updates the record accordingly. +} diff --git a/man/ped2add.Rd b/man/ped2add.Rd index 79803b21..c2179e99 100644 --- a/man/ped2add.Rd +++ b/man/ped2add.Rd @@ -42,7 +42,7 @@ ped2add( \item{transpose_method}{character. The method to use for computing the transpose. Options are "tcrossprod", "crossprod", or "star"} -\item{adjacency_method}{character. The method to use for computing the adjacency matrix. Options are "loop" or "indexed"} +\item{adjacency_method}{character. The method to use for computing the adjacency matrix. Options are "loop", "indexed", direct or beta} \item{saveable}{logical. If TRUE, save the intermediate results to disk} diff --git a/man/ped2cn.Rd b/man/ped2cn.Rd index 82d25902..c738d13b 100644 --- a/man/ped2cn.Rd +++ b/man/ped2cn.Rd @@ -16,7 +16,7 @@ ped2cn( saveable = FALSE, resume = FALSE, save_rate = 5, - adjacency_method = "indexed", + adjacency_method = "direct", save_rate_gen = save_rate, save_rate_parlist = 1000 * save_rate, save_path = "checkpoint/", @@ -48,7 +48,7 @@ ped2cn( \item{save_rate}{numeric. The rate at which to save the intermediate results} -\item{adjacency_method}{character. The method to use for computing the adjacency matrix. Options are "loop" or "indexed"} +\item{adjacency_method}{character. The method to use for computing the adjacency matrix. 
Options are "loop", "indexed", direct or beta} \item{save_rate_gen}{numeric. The rate at which to save the intermediate results by generation. If NULL, defaults to save_rate} diff --git a/man/ped2com.Rd b/man/ped2com.Rd index 27f632f5..58c0fc47 100644 --- a/man/ped2com.Rd +++ b/man/ped2com.Rd @@ -14,7 +14,7 @@ ped2com( flatten.diag = FALSE, standardize.colnames = TRUE, transpose_method = "tcrossprod", - adjacency_method = "indexed", + adjacency_method = "direct", isChild_method = "classic", saveable = FALSE, resume = FALSE, @@ -23,6 +23,7 @@ ped2com( save_rate_parlist = 1e+05 * save_rate, update_rate = 100, save_path = "checkpoint/", + adjBeta_method = NULL, ... ) } @@ -47,7 +48,7 @@ ped2com( \item{transpose_method}{character. The method to use for computing the transpose. Options are "tcrossprod", "crossprod", or "star"} -\item{adjacency_method}{character. The method to use for computing the adjacency matrix. Options are "loop" or "indexed"} +\item{adjacency_method}{character. The method to use for computing the adjacency matrix. Options are "loop", "indexed", direct or beta} \item{isChild_method}{character. The method to use for computing the isChild matrix. Options are "classic" or "partialparent"} @@ -65,6 +66,8 @@ ped2com( \item{save_path}{character. The path to save the checkpoint files} +\item{adjBeta_method}{numeric The method to use for computing the building the adjacency_method matrix when using the "beta" build} + \item{...}{additional arguments to be passed to \code{\link{ped2com}}} } \description{ diff --git a/man/ped2mit.Rd b/man/ped2mit.Rd index d11480d1..c19e9ba7 100644 --- a/man/ped2mit.Rd +++ b/man/ped2mit.Rd @@ -43,7 +43,7 @@ ped2mit( \item{transpose_method}{character. The method to use for computing the transpose. Options are "tcrossprod", "crossprod", or "star"} -\item{adjacency_method}{character. The method to use for computing the adjacency matrix. Options are "loop" or "indexed"} +\item{adjacency_method}{character. 
The method to use for computing the adjacency matrix. Options are "loop", "indexed", direct or beta} \item{saveable}{logical. If TRUE, save the intermediate results to disk} diff --git a/man/postProcessGedcom.Rd b/man/postProcessGedcom.Rd new file mode 100644 index 00000000..62bbc50e --- /dev/null +++ b/man/postProcessGedcom.Rd @@ -0,0 +1,35 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/readGedcom.R +\name{postProcessGedcom} +\alias{postProcessGedcom} +\title{Post-process GEDCOM Data Frame} +\usage{ +postProcessGedcom( + df_temp, + remove_empty_cols = TRUE, + combine_cols = TRUE, + add_parents = TRUE, + skinny = TRUE, + verbose = FALSE +) +} +\arguments{ +\item{df_temp}{A data frame produced by \code{readGedcom()}.} + +\item{remove_empty_cols}{Logical indicating whether to remove columns that are entirely missing.} + +\item{combine_cols}{Logical indicating whether to combine columns with duplicate values.} + +\item{add_parents}{Logical indicating whether to add parent information.} + +\item{skinny}{Logical indicating whether to slim down the data frame.} + +\item{verbose}{Logical indicating whether to print progress messages.} +} +\value{ +The post-processed data frame. +} +\description{ +This function optionally adds parent information, combines duplicate columns, +and removes empty columns from the GEDCOM data frame. 
+} diff --git a/man/postProcessGedcom.legacy.Rd b/man/postProcessGedcom.legacy.Rd new file mode 100644 index 00000000..060b9367 --- /dev/null +++ b/man/postProcessGedcom.legacy.Rd @@ -0,0 +1,34 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/readGedcomlegacy.R +\name{postProcessGedcom.legacy} +\alias{postProcessGedcom.legacy} +\title{Post-process GEDCOM Data Frame} +\usage{ +postProcessGedcom.legacy( + df_temp, + remove_empty_cols = TRUE, + combine_cols = TRUE, + add_parents = TRUE, + skinny = TRUE, + verbose = FALSE +) +} +\arguments{ +\item{df_temp}{A data frame containing information about individuals.} + +\item{remove_empty_cols}{A logical value indicating whether to remove columns with all missing values.} + +\item{combine_cols}{A logical value indicating whether to combine columns with duplicate values.} + +\item{add_parents}{A logical value indicating whether to add parents to the data frame.} + +\item{skinny}{A logical value indicating whether to return a skinny data frame.} + +\item{verbose}{A logical value indicating whether to print messages.} +} +\value{ +A data frame with processed information. 
+} +\description{ +Post-process GEDCOM Data Frame +} diff --git a/man/processEventLine.Rd b/man/processEventLine.Rd new file mode 100644 index 00000000..d4cff3d3 --- /dev/null +++ b/man/processEventLine.Rd @@ -0,0 +1,27 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/readGedcom.R +\name{processEventLine} +\alias{processEventLine} +\title{Process Event Lines (Birth or Death)} +\usage{ +processEventLine(event, block, i, record, pattern_rows) +} +\arguments{ +\item{event}{A character string indicating the event type ("birth" or "death").} + +\item{block}{A character vector of GEDCOM lines.} + +\item{i}{The current line index where the event tag is found.} + +\item{record}{A named list representing the individual's record.} + +\item{pattern_rows}{A list with counts of GEDCOM tag occurrences.} +} +\value{ +The updated record with parsed event information. +} +\description{ +Extracts event details (e.g., date, place, cause, latitude, longitude) from a block of GEDCOM lines. +For "birth": expect DATE on line i+1, PLAC on i+2, LATI on i+4, LONG on i+5. +For "death": expect DATE on line i+1, PLAC on i+2, CAUS on i+3, LATI on i+4, LONG on i+5. +} diff --git a/man/processParents.Rd b/man/processParents.Rd index 9aa205a7..aec5deb3 100644 --- a/man/processParents.Rd +++ b/man/processParents.Rd @@ -2,17 +2,18 @@ % Please edit documentation in R/readGedcom.R \name{processParents} \alias{processParents} -\title{Process parents information} +\title{Process Parents Information from GEDCOM Data} \usage{ processParents(df_temp, datasource) } \arguments{ -\item{df_temp}{A data frame containing information about individuals.} +\item{df_temp}{A data frame produced by \code{readGedcom()}.} + +\item{datasource}{Character string indicating the data source ("gedcom" or "wiki").} +} \value{ -A data frame with added momID and dadID columns. +The updated data frame with parent IDs added. 
} \description{ -This function processes the dataframe to add momID and dadID columns. +Adds parent IDs to the individuals based on family relationship data. } -\keyword{internal} diff --git a/man/processParents.legacy.Rd b/man/processParents.legacy.Rd new file mode 100644 index 00000000..774663aa --- /dev/null +++ b/man/processParents.legacy.Rd @@ -0,0 +1,18 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/readGedcomlegacy.R +\name{processParents.legacy} +\alias{processParents.legacy} +\title{Process parents information} +\usage{ +processParents.legacy(df_temp, datasource) +} +\arguments{ +\item{df_temp}{A data frame containing information about individuals.} +} +\value{ +A data frame with added momID and dadID columns. +} +\description{ +This function processes the dataframe to add momID and dadID columns. +} +\keyword{internal} diff --git a/man/process_one.Rd b/man/process_one.Rd new file mode 100644 index 00000000..0d57de2b --- /dev/null +++ b/man/process_one.Rd @@ -0,0 +1,45 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/makeLinks.R +\name{process_one} +\alias{process_one} +\title{Convert Sparse Relationship Matrices to Kinship Links for one Matrix} +\usage{ +process_one( + matrix, + rel_name, + ids, + nc, + rel_pairs_file, + writetodisk, + write_buffer_size, + drop_upper_triangular, + update_rate, + verbose, + gc, + include_all_links = TRUE, + ... +) +} +\arguments{ +\item{rel_pairs_file}{File path to write related pairs to (CSV format).} + +\item{writetodisk}{Logical. If TRUE, writes the related pairs to disk; if FALSE, returns a data frame.} + +\item{write_buffer_size}{Number of related pairs to write to disk at a time.} + +\item{drop_upper_triangular}{Logical. If TRUE, drops the upper triangular portion of the matrix.} + +\item{update_rate}{Numeric. Frequency (in iterations) at which progress messages are printed.} + +\item{verbose}{Logical. 
If TRUE, prints progress messages.} + +\item{gc}{Logical. If TRUE, performs garbage collection via \code{\link{gc}} to free memory.} + +\item{include_all_links}{Logical. If TRUE, all links are included in the output.} + +\item{...}{Additional arguments to be passed to \code{\link{com2links}}} +} +\description{ +Convert Sparse Relationship Matrices to Kinship Links for one Matrix +} +\keyword{internal} diff --git a/man/process_tag.legacy.Rd b/man/process_tag.legacy.Rd new file mode 100644 index 00000000..f31cbad6 --- /dev/null +++ b/man/process_tag.legacy.Rd @@ -0,0 +1,35 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/readGedcomlegacy.R +\name{process_tag.legacy} +\alias{process_tag.legacy} +\title{Process a GEDCOM Tag} +\usage{ +process_tag.legacy( + tag, + field_name, + pattern_rows, + line, + vars, + extractor = NULL, + mode = "replace" +) +} +\arguments{ +\item{tag}{The GEDCOM tag (e.g., "SEX", "CAST", etc.).} + +\item{field_name}{The name of the variable to assign to in `vars`.} + +\item{pattern_rows}{Output from `countPatternRows()`.} + +\item{line}{The GEDCOM line to parse.} + +\item{vars}{The current list of variables to update.} +} +\value{ +A list with updated `vars` and a `matched` flag. +} +\description{ +Extracts and assigns a value to a specified field in `vars` if the pattern is present. +Returns both the updated variable list and a flag indicating whether the tag was matched. +} +\keyword{internal} diff --git a/man/readGedcom.Rd b/man/readGedcom.Rd index 78d45773..d9701427 100644 --- a/man/readGedcom.Rd +++ b/man/readGedcom.Rd @@ -2,6 +2,8 @@ % Please edit documentation in R/readGedcom.R \name{readGedcom} \alias{readGedcom} +\alias{readGed} +\alias{readgedcom} \title{Read a GEDCOM File} \usage{ readGedcom( @@ -11,6 +13,32 @@ readGedcom( remove_empty_cols = TRUE, combine_cols = TRUE, skinny = FALSE, + update_rate = 1000, + post_process = TRUE, + ... 
+) + +readGed( + file_path, + verbose = FALSE, + add_parents = TRUE, + remove_empty_cols = TRUE, + combine_cols = TRUE, + skinny = FALSE, + update_rate = 1000, + post_process = TRUE, + ... +) + +readgedcom( + file_path, + verbose = FALSE, + add_parents = TRUE, + remove_empty_cols = TRUE, + combine_cols = TRUE, + skinny = FALSE, + update_rate = 1000, + post_process = TRUE, ... ) } @@ -27,6 +55,10 @@ readGedcom( \item{skinny}{A logical value indicating whether to return a skinny data frame.} +\item{update_rate}{numeric. The rate at which to print progress} + +\item{post_process}{A logical value indicating whether to post-process the data frame.} + \item{...}{Additional arguments to be passed to the function.} } \value{ @@ -69,5 +101,4 @@ A data frame containing information about individuals, with the following potent } \description{ This function reads a GEDCOM file and parses it into a structured data frame of individuals. -Inspired by https://raw.githubusercontent.com/jjfitz/readgedcom/master/R/read_gedcom.R } diff --git a/man/readGedcom.legacy.Rd b/man/readGedcom.legacy.Rd new file mode 100644 index 00000000..e4cf2957 --- /dev/null +++ b/man/readGedcom.legacy.Rd @@ -0,0 +1,78 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/readGedcomlegacy.R +\name{readGedcom.legacy} +\alias{readGedcom.legacy} +\title{Read a GEDCOM File} +\usage{ +readGedcom.legacy( + file_path, + verbose = FALSE, + add_parents = TRUE, + remove_empty_cols = TRUE, + combine_cols = TRUE, + skinny = FALSE, + update_rate = 1000, + post_process = TRUE, + ... 
+) +} +\arguments{ +\item{file_path}{The path to the GEDCOM file.} + +\item{verbose}{A logical value indicating whether to print messages.} + +\item{add_parents}{A logical value indicating whether to add parents to the data frame.} + +\item{remove_empty_cols}{A logical value indicating whether to remove columns with all missing values.} + +\item{combine_cols}{A logical value indicating whether to combine columns with duplicate values.} + +\item{skinny}{A logical value indicating whether to return a skinny data frame.} + +\item{update_rate}{numeric. The rate at which to print progress} + +\item{...}{Additional arguments to be passed to the function.} +} +\value{ +A data frame containing information about individuals, with the following potential columns: +- `id`: ID of the individual +- `momID`: ID of the individual's mother +- `dadID`: ID of the individual's father +- `sex`: Sex of the individual +- `name`: Full name of the individual +- `name_given`: First name of the individual +- `name_surn`: Last name of the individual +- `name_marriedsurn`: Married name of the individual +- `name_nick`: Nickname of the individual +- `name_npfx`: Name prefix +- `name_nsfx`: Name suffix +- `birth_date`: Birth date of the individual +- `birth_lat`: Latitude of the birthplace +- `birth_long`: Longitude of the birthplace +- `birth_place`: Birthplace of the individual +- `death_caus`: Cause of death +- `death_date`: Death date of the individual +- `death_lat`: Latitude of the place of death +- `death_long`: Longitude of the place of death +- `death_place`: Place of death of the individual +- `attribute_caste`: Caste of the individual +- `attribute_children`: Number of children of the individual +- `attribute_description`: Description of the individual +- `attribute_education`: Education of the individual +- `attribute_idnumber`: Identification number of the individual +- `attribute_marriages`: Number of marriages of the individual +- `attribute_nationality`: Nationality of the 
individual +- `attribute_occupation`: Occupation of the individual +- `attribute_property`: Property owned by the individual +- `attribute_religion`: Religion of the individual +- `attribute_residence`: Residence of the individual +- `attribute_ssn`: Social security number of the individual +- `attribute_title`: Title of the individual +- `FAMC`: ID(s) of the family where the individual is a child +- `FAMS`: ID(s) of the family where the individual is a spouse +} +\description{ +This function reads a GEDCOM file and parses it into a structured data frame of individuals. +Inspired by https://raw.githubusercontent.com/jjfitz/readgedcom/master/R/read_gedcom.R +} +\keyword{internal} diff --git a/man/related_coef.Rd b/man/related_coef.Rd deleted file mode 100644 index 11112982..00000000 --- a/man/related_coef.Rd +++ /dev/null @@ -1,33 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/helpGeneric.R -\name{related_coef} -\alias{related_coef} -\title{related_coef (Deprecated)} -\usage{ -related_coef(...) -} -\arguments{ -\item{...}{Arguments to be passed to `calculateRelatedness`.} -} -\value{ -The same result as calling `calculateRelatedness`. -} -\description{ -When calling this function, a warning will be issued about its deprecation. -} -\details{ -This function is a wrapper around the new `calculateRelatedness` function. -`related_coef` has been deprecated, and it's advised to use `calculateRelatedness` directly. -} -\examples{ -\dontrun{ -# This is an example of the deprecated function: -related_coef(...) -# It is recommended to use: -calculateRelatedness(...) -} -} -\seealso{ -\code{\link{calculateRelatedness}} for the updated function. 
-} -\keyword{deprecated} diff --git a/man/relatedness.Rd b/man/relatedness.Rd deleted file mode 100644 index f84740df..00000000 --- a/man/relatedness.Rd +++ /dev/null @@ -1,33 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/helpGeneric.R -\name{relatedness} -\alias{relatedness} -\title{relatedness (Deprecated)} -\usage{ -relatedness(...) -} -\arguments{ -\item{...}{Arguments to be passed to `inferRelatedness`.} -} -\value{ -The same result as calling `inferRelatedness`. -} -\description{ -When calling this function, a warning will be issued about its deprecation. -} -\details{ -This function is a wrapper around the new `inferRelatedness` function. -`relatedness` has been deprecated, and it's advised to use `inferRelatedness` directly. -} -\examples{ -\dontrun{ -# This is an example of the deprecated function: -relatedness(...) -# It is recommended to use: -inferRelatedness(...) -} -} -\seealso{ -\code{\link{inferRelatedness}} for the updated function. 
-} -\keyword{deprecated} diff --git a/man/repairIDs.Rd b/man/repairIDs.Rd index 2ce49f7a..cc37d78f 100644 --- a/man/repairIDs.Rd +++ b/man/repairIDs.Rd @@ -7,9 +7,9 @@ repairIDs(ped, verbose = FALSE) } \arguments{ -\item{ped}{A pedigree object} +\item{ped}{A dataframe representing the pedigree data with columns `ID`, `dadID`, and `momID`.} -\item{verbose}{A logical indicating whether to print progress messages} +\item{verbose}{A logical flag indicating whether to print progress and validation messages to the console.} } \value{ A corrected pedigree diff --git a/man/simulatePedigree.Rd b/man/simulatePedigree.Rd index a2e07b23..b7c28650 100644 --- a/man/simulatePedigree.Rd +++ b/man/simulatePedigree.Rd @@ -2,6 +2,7 @@ % Please edit documentation in R/simulatePedigree.R \name{simulatePedigree} \alias{simulatePedigree} +\alias{SimPed} \title{Simulate Pedigrees This function simulates "balanced" pedigrees based on a group of parameters: 1) k - Kids per couple; @@ -19,6 +20,8 @@ simulatePedigree( balancedMar = TRUE, verbose = FALSE ) + +SimPed(...) } \arguments{ \item{kpc}{Number of kids per couple. An integer >= 2 that determines how many kids each fertilized mated couple will have in the pedigree. Default value is 3. Returns an error when kpc equals 1.} @@ -36,6 +39,8 @@ simulatePedigree( \item{balancedMar}{Not fully developed yet. Always \code{TRUE} in the current version.} \item{verbose}{logical If TRUE, print progress through stages of algorithm} + +\item{...}{Additional arguments to be passed to other functions.} } \value{ A \code{data.frame} with each row representing a simulated individual. 
The columns are as follows: diff --git a/man/splitIndividuals.Rd b/man/splitIndividuals.Rd new file mode 100644 index 00000000..dcb9a7e8 --- /dev/null +++ b/man/splitIndividuals.Rd @@ -0,0 +1,20 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/readGedcom.R +\name{splitIndividuals} +\alias{splitIndividuals} +\title{Split GEDCOM Lines into Individual Blocks} +\usage{ +splitIndividuals(lines, verbose = FALSE) +} +\arguments{ +\item{lines}{A character vector of lines from the GEDCOM file.} + +\item{verbose}{Logical indicating whether to output progress messages.} +} +\value{ +A list of character vectors, each representing one individual. +} +\description{ +This function partitions the GEDCOM file (as a vector of lines) into a list of blocks, +where each block corresponds to a single individual starting with an "@ INDI" line. +} diff --git a/man/summarizeFamilies.Rd b/man/summarizeFamilies.Rd index 8903eb93..2cb50761 100644 --- a/man/summarizeFamilies.Rd +++ b/man/summarizeFamilies.Rd @@ -2,6 +2,7 @@ % Please edit documentation in R/summarizePedigree.R \name{summarizeFamilies} \alias{summarizeFamilies} +\alias{summariseFamilies} \title{Summarize the families in a pedigree} \usage{ summarizeFamilies( @@ -21,6 +22,24 @@ summarizeFamilies( five_num_summary = FALSE, verbose = FALSE ) + +summariseFamilies( + ped, + famID = "famID", + personID = "ID", + momID = "momID", + dadID = "dadID", + matID = "matID", + patID = "patID", + byr = NULL, + founder_sort_var = NULL, + include_founder = FALSE, + nbiggest = 5, + noldest = 5, + skip_var = NULL, + five_num_summary = FALSE, + verbose = FALSE +) } \arguments{ \item{ped}{a pedigree dataset. 
Needs ID, momID, and dadID columns} diff --git a/man/summarizeFounder.Rd b/man/summarizeFounder.Rd new file mode 100644 index 00000000..703834e3 --- /dev/null +++ b/man/summarizeFounder.Rd @@ -0,0 +1,17 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/summarizePedigree.R +\name{summarizeFounder} +\alias{summarizeFounder} +\title{Function to summarize the originating members for each line} +\usage{ +summarizeFounder(ped_dt, group_var, sort_var, foo_summary_dt, verbose) +} +\arguments{ +\item{sort_var}{A character string specifying the column name to sort by.} + +\item{verbose}{Logical, if TRUE, print progress messages.} +} +\description{ +This function summarizes the originating members for each line in a pedigree. It is supposed to be used internally by the \code{summarize_pedigree} function. +} +\keyword{internal} diff --git a/man/summarizeMatrilines.Rd b/man/summarizeMatrilines.Rd index 2890b622..577204f6 100644 --- a/man/summarizeMatrilines.Rd +++ b/man/summarizeMatrilines.Rd @@ -2,6 +2,7 @@ % Please edit documentation in R/summarizePedigree.R \name{summarizeMatrilines} \alias{summarizeMatrilines} +\alias{summariseMatrilines} \title{Summarize the maternal lines in a pedigree} \usage{ summarizeMatrilines( @@ -21,6 +22,24 @@ summarizeMatrilines( five_num_summary = FALSE, verbose = FALSE ) + +summariseMatrilines( + ped, + famID = "famID", + personID = "ID", + momID = "momID", + dadID = "dadID", + matID = "matID", + patID = "patID", + byr = NULL, + include_founder = FALSE, + founder_sort_var = NULL, + nbiggest = 5, + noldest = 5, + skip_var = NULL, + five_num_summary = FALSE, + verbose = FALSE +) } \arguments{ \item{ped}{a pedigree dataset. 
Needs ID, momID, and dadID columns} diff --git a/man/summarizePatrilines.Rd b/man/summarizePatrilines.Rd index aed89bcd..27fd9494 100644 --- a/man/summarizePatrilines.Rd +++ b/man/summarizePatrilines.Rd @@ -2,6 +2,7 @@ % Please edit documentation in R/summarizePedigree.R \name{summarizePatrilines} \alias{summarizePatrilines} +\alias{summarisePatrilines} \title{Summarize the paternal lines in a pedigree} \usage{ summarizePatrilines( @@ -21,6 +22,24 @@ summarizePatrilines( five_num_summary = FALSE, verbose = FALSE ) + +summarisePatrilines( + ped, + famID = "famID", + personID = "ID", + momID = "momID", + dadID = "dadID", + matID = "matID", + patID = "patID", + byr = NULL, + founder_sort_var = NULL, + include_founder = FALSE, + nbiggest = 5, + noldest = 5, + skip_var = NULL, + five_num_summary = FALSE, + verbose = FALSE +) } \arguments{ \item{ped}{a pedigree dataset. Needs ID, momID, and dadID columns} diff --git a/man/summarizePedigrees.Rd b/man/summarizePedigrees.Rd index b298d326..a4f6a6e9 100644 --- a/man/summarizePedigrees.Rd +++ b/man/summarizePedigrees.Rd @@ -2,6 +2,7 @@ % Please edit documentation in R/summarizePedigree.R \name{summarizePedigrees} \alias{summarizePedigrees} +\alias{summarisePedigrees} \title{Summarize Pedigree Data} \usage{ summarizePedigrees( @@ -17,7 +18,27 @@ summarizePedigrees( include_founder = FALSE, founder_sort_var = NULL, nbiggest = 5, - noldest = 5, + noldest = nbiggest, + skip_var = NULL, + five_num_summary = FALSE, + network_checks = FALSE, + verbose = FALSE +) + +summarisePedigrees( + ped, + famID = "famID", + personID = "ID", + momID = "momID", + dadID = "dadID", + matID = "matID", + patID = "patID", + type = c("fathers", "mothers", "families"), + byr = NULL, + include_founder = FALSE, + founder_sort_var = NULL, + nbiggest = 5, + noldest = nbiggest, skip_var = NULL, five_num_summary = FALSE, network_checks = FALSE, diff --git a/man/try_na.Rd b/man/tryNA.Rd similarity index 89% rename from man/try_na.Rd rename to man/tryNA.Rd 
index 0f8fddeb..388fa600 100644 --- a/man/try_na.Rd +++ b/man/tryNA.Rd @@ -1,9 +1,12 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/helpGeneric.R -\name{try_na} +\name{tryNA} +\alias{tryNA} \alias{try_na} \title{modified tryCatch function} \usage{ +tryNA(x) + try_na(x) } \arguments{ diff --git a/tests/testthat/test-checkParents.R b/tests/testthat/test-checkParents.R index 21660c4b..50916289 100644 --- a/tests/testthat/test-checkParents.R +++ b/tests/testthat/test-checkParents.R @@ -22,6 +22,6 @@ test_that("checksif single parents found correctly in ASOIAF dataset", { single_moms <- length(df_asoiaf$id[is.na(df_asoiaf$dadID) & !is.na(df_asoiaf$momID)]) expect_equal(single_moms, length(results$missing_fathers)) expect_equal(single_dads, length(results$missing_mothers)) - repaired_df <- checkParentIDs(df_asoiaf, verbose = FALSE, repair = TRUE) + repaired_df <- checkParentIDs(df_asoiaf, verbose = FALSE, repair = TRUE, parentswithoutrow = TRUE) expect_equal(nrow(repaired_df), nrow(df_asoiaf) + single_moms + single_dads) }) diff --git a/tests/testthat/test-computeRelatedness.R b/tests/testthat/test-computeRelatedness.R index 05fb7e62..192df30b 100644 --- a/tests/testthat/test-computeRelatedness.R +++ b/tests/testthat/test-computeRelatedness.R @@ -28,11 +28,13 @@ test_that("calculateRelatedness function with empirical", { # Test 7: empirical divide by zero test_that("calculateH handles divide by zero for empirical", { -expect_error( - calculateRelatedness(generations = 2, - empirical = TRUE, total_a = 0, - total_m = 0)) - + expect_error( + calculateRelatedness( + generations = 2, + empirical = TRUE, total_a = 0, + total_m = 0 + ) + ) }) test_that("inferRelatedness performs as expected", { result <- inferRelatedness(0, aceA = .9, aceC = 0, sharedC = 0) @@ -105,4 +107,3 @@ test_that("calculateH stops for illegal coefficients", { "The observed correlations should be between -1 and 1" ) }) - diff --git a/tests/testthat/test-convertPedigree.R 
b/tests/testthat/test-convertPedigree.R index 11dfbeca..a011c334 100644 --- a/tests/testthat/test-convertPedigree.R +++ b/tests/testthat/test-convertPedigree.R @@ -297,14 +297,26 @@ test_that("adjacency_method 'indexed', 'loop', and direct produce the same resu test_that("adjacency_method 'indexed', 'loop', and direct produce the same results for common nuclear matrix", { data(hazard) tolerance <- 1e-10 - + adjBeta_method_1 <- 1 + adjBeta_method_2 <- 4 + adjBeta_method_3 <- 5 # common nuclear ped_common_indexed <- ped2com(hazard, component = "common nuclear", adjacency_method = "indexed") ped_common_loop <- ped2com(hazard, component = "common nuclear", adjacency_method = "loop") - # ped_common_direct <- ped2com(hazard, component = "common nuclear", adjacency_method = "direct") + ped_common_direct <- ped2com(hazard, component = "common nuclear", adjacency_method = "direct") + ped_common_adjBeta_1 <- ped2com(hazard, component = "common nuclear", adjacency_method = "beta", adjBeta_method = adjBeta_method_2) + ped_common_adjBeta_2 <- ped2com(hazard, component = "common nuclear", adjacency_method = "beta", adjBeta_method = adjBeta_method_3) + expect_equal(ped_common_indexed, ped_common_loop, tolerance = tolerance) - # expect_equal(ped_common_loop, ped_common_direct, tolerance = tolerance) - # expect_equal(ped_common_indexed, ped_common_direct, tolerance = tolerance) + expect_equal(ped_common_loop, ped_common_direct, tolerance = tolerance) + expect_equal(ped_common_indexed, ped_common_direct, tolerance = tolerance) + expect_equal(ped_common_loop, ped_common_adjBeta_1, tolerance = tolerance) + expect_equal(ped_common_indexed, ped_common_adjBeta_1, tolerance = tolerance) + expect_equal(ped_common_loop, ped_common_adjBeta_2, tolerance = tolerance) + expect_equal(ped_common_indexed, ped_common_adjBeta_2, tolerance = tolerance) + expect_equal(ped_common_direct, ped_common_adjBeta_1, tolerance = tolerance) + expect_equal(ped_common_direct, ped_common_adjBeta_2, tolerance = 
tolerance) + expect_equal(ped_common_adjBeta_1, ped_common_adjBeta_2, tolerance = tolerance) }) diff --git a/tests/testthat/test-makeLinks.R b/tests/testthat/test-makeLinks.R index e8dfe99e..05fc58fd 100644 --- a/tests/testthat/test-makeLinks.R +++ b/tests/testthat/test-makeLinks.R @@ -7,10 +7,9 @@ test_that("com2links handles missing matrices properly", { - test_that("com2links rejects invalid matrix types", { fake_matrix <- data.frame(A = c(1, 2), B = c(3, 4)) - expect_error(com2links(ad_ped_matrix = fake_matrix), "The 'ad_ped_matrix' must be a matrix or dgCMatrix.") + expect_error(com2links(ad_ped_matrix = fake_matrix), "The 'ad_ped_matrix' must be a matrix or generalMatrix") }) test_that("com2links produces correct output with a single relationship matrix (hazard dataset)", { @@ -25,6 +24,18 @@ test_that("com2links produces correct output with a single relationship matrix ( expect_true(all(result$addRel >= 0)) # Relatedness values should be non-negative }) +test_that("com2links produces correct output with cn_ped_matrix", { + data(ASOIAF) + cn_ped_matrix <- ped2cn(ASOIAF, sparse = TRUE) + + result <- com2links(cn_ped_matrix = cn_ped_matrix, writetodisk = FALSE) + + expect_true(is.data.frame(result)) + expect_true(all(c("ID1", "ID2", "cnuRel") %in% colnames(result))) + expect_equal(ncol(result), 3) # Expect ID1, ID2, and addRel + expect_true(all(result$cnRel >= 0)) # Relatedness values should be non-negative +}) + test_that("com2links produces correct output with mt_ped_matrix", { data(hazard) mit_ped_matrix <- ped2mit(hazard, sparse = TRUE) @@ -34,7 +45,7 @@ test_that("com2links produces correct output with mt_ped_matrix", { expect_true(is.data.frame(result)) expect_true(all(c("ID1", "ID2", "mitRel") %in% colnames(result))) expect_equal(ncol(result), 3) # Expect ID1, ID2, and addRel - expect_true(all(result$addRel >= 0)) # Relatedness values should be non-negative + expect_true(all(result$mitRel %in% c(0, 1))) # Mitochondrial should be binary }) 
test_that("com2links processes multiple matrices correctly (hazard dataset)", { @@ -53,7 +64,46 @@ test_that("com2links processes multiple matrices correctly (hazard dataset)", { expect_true(all(result$cnuRel >= 0)) }) +test_that("com2links processes creates same length for cn with 3, 2, and 1 matrices are used", { + data(hazard) + ad_ped_matrix <- ped2add(hazard, sparse = TRUE) + mit_ped_matrix <- ped2mit(hazard, sparse = TRUE) + cn_ped_matrix <- ped2cn(hazard, sparse = TRUE) + result3 <- com2links(ad_ped_matrix = ad_ped_matrix, mit_ped_matrix = mit_ped_matrix, cn_ped_matrix = cn_ped_matrix, writetodisk = FALSE) + + expect_true(is.data.frame(result3)) + expect_true(all(c("ID1", "ID2", "addRel", "mitRel", "cnuRel") %in% colnames(result3))) + expect_equal(ncol(result3), 5) # Expect ID1, ID2, addRel, mitRel, and cnuRel + expect_true(all(result3$addRel >= 0)) + expect_true(all(result3$mitRel %in% c(0, 1))) # Mitochondrial should be binary + expect_true(all(result3$cnuRel >= 0)) + + result2 <- com2links(ad_ped_matrix = ad_ped_matrix, cn_ped_matrix = cn_ped_matrix, writetodisk = FALSE) + expect_true(is.data.frame(result2)) + expect_true(all(c("ID1", "ID2", "addRel", "cnuRel") %in% colnames(result2))) + expect_equal(ncol(result2), 4) # Expect ID1, ID2, addRel, and cnuRel + expect_true(all(result2$addRel >= 0)) + expect_true(all(result2$cnuRel >= 0)) + + expect_equal(result3$cnuRel, result2$cnuRel) + + result1 <- com2links(cn_ped_matrix = cn_ped_matrix, writetodisk = FALSE) + result1_legacy <- com2links.legacy(cn_ped_matrix = cn_ped_matrix, writetodisk = FALSE) + expect_true(is.data.frame(result1)) + expect_true(is.data.frame(result1_legacy)) + expect_true(all(c("ID1", "ID2", "cnuRel") %in% colnames(result1))) + expect_true(all(c("ID1", "ID2", "cnuRel") %in% colnames(result1_legacy))) + expect_equal(ncol(result1), 3) # Expect ID1, ID2, and cnuRel + expect_equal(ncol(result1_legacy), 3) # Expect ID1, ID2, and cnuRel + expect_true(all(result1$cnuRel >= 0)) + 
expect_true(all(result1_legacy$cnuRel >= 0)) + expect_equal(result3$cnuRel[result3$cnuRel == 1], result1$cnuRel[result1$cnuRel == 1]) + expect_equal(result3$cnuRel[result3$cnuRel == 1], result1_legacy$cnuRel[result1_legacy$cnuRel == 1]) + expect_equal(result2$cnuRel[result2$cnuRel == 1], result1$cnuRel[result1$cnuRel == 1]) + expect_equal(result2$cnuRel[result2$cnuRel == 1], result1_legacy$cnuRel[result1_legacy$cnuRel == 1]) + expect_equal(result1$cnuRel[result1$cnuRel == 1], result1_legacy$cnuRel[result1_legacy$cnuRel == 1]) +}) test_that("com2links written version matchs", { data(hazard) ad_ped_matrix <- ped2com(hazard, component = "additive", adjacency_method = "direct", sparse = TRUE) @@ -95,7 +145,7 @@ test_that("com2links legacy works", { mit_ped_matrix <- ped2com(hazard, component = "mitochondrial", adjacency_method = "direct", sparse = TRUE) cn_ped_matrix <- ped2com(hazard, component = "common nuclear", adjacency_method = "indexed", sparse = TRUE) - resultlegacy <- com2links( + resultlegacy <- com2links.legacy( ad_ped_matrix = ad_ped_matrix, mit_ped_matrix = mit_ped_matrix, cn_ped_matrix = cn_ped_matrix, legacy = TRUE @@ -108,22 +158,112 @@ test_that("com2links legacy works", { expect_true(all(c("ID1", "ID2", "addRel", "mitRel", "cnuRel") %in% colnames(written_data))) - result <- com2links( + + result_beta <- com2links( + ad_ped_matrix = ad_ped_matrix, + mit_ped_matrix = mit_ped_matrix, cn_ped_matrix = cn_ped_matrix, + writetodisk = FALSE + ) + + expect_true(is.data.frame(result_beta)) + expect_true(all(c("ID1", "ID2", "addRel", "mitRel", "cnuRel") %in% colnames(result_beta))) + + + result <- com2links.legacy( ad_ped_matrix = ad_ped_matrix, mit_ped_matrix = mit_ped_matrix, cn_ped_matrix = cn_ped_matrix, writetodisk = FALSE ) expect_true(is.data.frame(result)) + expect_true(all(c("ID1", "ID2", "addRel", "mitRel", "cnuRel") %in% colnames(result))) # Drop row names to avoid mismatches in expect_equal rownames(result) <- NULL rownames(written_data) <- NULL + 
rownames(result_beta) <- NULL # Final comparison between written versions expect_equal(written_data, result) + expect_equal(result_beta, result) }) +test_that("com2links beta works", { + data(hazard) + ad_ped_matrix <- ped2com(hazard, component = "additive", adjacency_method = "direct", sparse = TRUE) + mit_ped_matrix <- ped2com(hazard, component = "mitochondrial", adjacency_method = "direct", sparse = TRUE) + cn_ped_matrix <- ped2com(hazard, component = "common nuclear", adjacency_method = "indexed", sparse = TRUE) + + # compare 2 + result_beta <- com2links( + ad_ped_matrix = ad_ped_matrix, + mit_ped_matrix = mit_ped_matrix, + writetodisk = FALSE + ) + + expect_true(is.data.frame(result_beta)) + expect_true(all(c("ID1", "ID2", "addRel", "mitRel") %in% colnames(result_beta))) + + + result <- com2links.legacy( + ad_ped_matrix = ad_ped_matrix, + mit_ped_matrix = mit_ped_matrix, + writetodisk = FALSE + ) + + expect_true(is.data.frame(result)) + expect_true(all(c("ID1", "ID2", "addRel", "mitRel") %in% colnames(result))) + # Drop row names to avoid mismatches in expect_equal + rownames(result) <- NULL + rownames(result_beta) <- NULL + + # Final comparison between versions + expect_equal(result_beta, result) + + + # write to disk + result_disk <- com2links( + ad_ped_matrix = ad_ped_matrix, + mit_ped_matrix = mit_ped_matrix, + writetodisk = TRUE + ) + expect_true(file.exists("dataRelatedPairs.csv")) + written_data <- read.csv("dataRelatedPairs.csv") + # remove the file + expect_true(file.remove("dataRelatedPairs.csv")) + + expect_true(all(c("ID1", "ID2", "addRel", "mitRel") %in% colnames(written_data))) + rownames(written_data) <- NULL + expect_equal(result_beta, written_data) + expect_equal(result, written_data) + # compare 1 + + result_beta <- com2links( + mit_ped_matrix = mit_ped_matrix, + writetodisk = FALSE + ) + + expect_true(is.data.frame(result_beta)) + expect_true(all(c("ID1", "ID2", "mitRel") %in% colnames(result_beta))) + + + result <- com2links( + 
mit_ped_matrix = mit_ped_matrix, + writetodisk = FALSE + ) + + expect_true(is.data.frame(result)) + expect_true(all(c("ID1", "ID2", "mitRel") %in% colnames(result))) + # Drop row names to avoid mismatches in expect_equal + rownames(result) <- NULL + rownames(result_beta) <- NULL + + # Final comparison between versions + expect_equal(result_beta, result) +}) + + + test_that("com2links correctly handles missing matrices", { data(hazard) @@ -131,9 +271,10 @@ test_that("com2links correctly handles missing matrices", { expect_error( com2links(ad_ped_matrix = NULL, mit_ped_matrix = NULL, cn_ped_matrix = NULL), - "At least one of 'ad_ped_matrix', 'mit_ped_matrix', or 'cn_ped_matrix' must be provided." ) + "At least one of 'ad_ped_matrix', 'mit_ped_matrix', or 'cn_ped_matrix' must be provided." + ) - expect_error(com2links(ad_ped_matrix = hazard), "The 'ad_ped_matrix' must be a matrix or dgCMatrix.") + expect_error(com2links(ad_ped_matrix = hazard), "The 'ad_ped_matrix' must be a matrix or generalMatrix") }) test_that("com2links correctly processes inbreeding dataset", { @@ -178,14 +319,15 @@ test_that("com2links handles large batch writing correctly", { sexR <- 0.5 df_fam <- simulatePedigree(kpc = kpc, Ngen = Ngen, sexR = sexR, marR = marR) - ad_ped_matrix <- ped2add(df_fam, sparse = TRUE) + cn_ped_matrix <- ped2cn(df_fam, sparse = TRUE) temp_file <- tempfile(fileext = ".csv") - com2links(ad_ped_matrix = ad_ped_matrix, rel_pairs_file = temp_file, writetodisk = TRUE, verbose = TRUE) + com2links(cn_ped_matrix = cn_ped_matrix, rel_pairs_file = temp_file, writetodisk = TRUE, verbose = TRUE) expect_true(file.exists(temp_file)) written_data <- read.csv(temp_file) - expect_true(nrow(written_data) > 1000) # Ensuring batch writing logic works + expect_true(nrow(written_data) == 155) # Ensuring batch writing logic works + expect_true(file.remove(temp_file)) }) test_that("com2links garbage collection does not affect output, using two components", { diff --git 
a/tests/testthat/test-plotPedigree.R b/tests/testthat/test-plotPedigree.R index 4944bcdb..ce4697c4 100644 --- a/tests/testthat/test-plotPedigree.R +++ b/tests/testthat/test-plotPedigree.R @@ -32,3 +32,16 @@ test_that("pedigree plots correctly with affected variables", { # file.remove("Rplots.pdf") }) # file.remove("Rplots.pdf") + +test_that("pedigree errs when affected variables named", { + data(inbreeding) + + expect_error(plotPedigree(data, verbose = TRUE, affected = "affected")) +}) + + +test_that("pedigree plots multiple families", { + data(inbreeding) + + expect_output(plotPedigree(inbreeding, verbose = TRUE)) +}) diff --git a/tests/testthat/test-readPedigrees.R b/tests/testthat/test-readPedigrees.R index 1f098697..6adb4687 100644 --- a/tests/testthat/test-readPedigrees.R +++ b/tests/testthat/test-readPedigrees.R @@ -180,44 +180,54 @@ test_that("if file does not exist, readGedcom throws an error", { -# readWikifamilytree - -test_that("readWikifamilytree reads a string correctly", { - # Create a temporary WikiFamilyTree file for testing - # Example usage - family_tree_text <- "{{familytree/start |summary=I have a brother Joe and a little sister: my mom married my dad, and my dad's parents were Grandma and Grandpa; they had another child, Aunt Daisy.}} -{{familytree | | | | GMa |~|y|~| GPa | | GMa=Gladys|GPa=Sydney}} -{{familytree | | | | | | | |)|-|-|-|.| }} -{{familytree | | | MOM |y| DAD | |DAISY| MOM=Mom|DAD=Dad|DAISY=[[Daisy Duke]]}} -{{familytree | |,|-|-|-|+|-|-|-|.| | | }} -{{familytree | JOE | | ME | | SIS | | | JOE=My brother Joe|ME='''Me!'''|SIS=My little sister}} -{{familytree/end}}" - - temp_file <- tempfile(fileext = ".txt") - writeLines(family_tree_text, temp_file) +test_that("readGedcom parses death event correctly", { + # Test that a GEDCOM file with a death event is parsed correctly. 
+ gedcom_content <- c( + "0 @I1@ INDI", + "1 NAME John /Doe/", + "1 SEX M", + "1 DEAT", + "2 DATE 31 DEC 2000", + "2 PLAC Lastplace", + "2 CAUS Old age", + "2 LATI 12.3456", + "2 LONG -65.4321" + ) + temp_file <- tempfile(fileext = ".ged") + writeLines(gedcom_content, temp_file) + df <- readGedcom(temp_file, verbose = TRUE) - result <- readWikifamilytree(text = family_tree_text) - result2 <- readWikifamilytree(file_path = temp_file) + expect_true("death_date" %in% colnames(df)) + expect_true("death_place" %in% colnames(df)) + expect_true("death_caus" %in% colnames(df)) + expect_true("death_lat" %in% colnames(df)) + expect_true("death_long" %in% colnames(df)) - expect_equal( - result$summary, - "I have a brother Joe and a little sister: my mom married my dad, and my dad's parents were Grandma and Grandpa; they had another child, Aunt Daisy." - ) + expect_equal(df$death_date[1], "31 DEC 2000") + expect_equal(df$death_place[1], "Lastplace") + expect_equal(df$death_caus[1], "Old age") + expect_equal(df$death_lat[1], "12.3456") + expect_equal(df$death_long[1], "-65.4321") - expect_equal( - result2$summary, - "I have a brother Joe and a little sister: my mom married my dad, and my dad's parents were Grandma and Grandpa; they had another child, Aunt Daisy." - ) + unlink(temp_file) }) +test_that("readGedcom handles incomplete individual records gracefully", { + # Test that an individual record missing a NAME line is handled without error. + gedcom_content <- c( + "0 @I1@ INDI", + "1 SEX M" + # No NAME or BIRT information. 
+ ) + temp_file <- tempfile(fileext = ".ged") + writeLines(gedcom_content, temp_file) -# read E:/Dropbox/Lab/Research/Projects/2024/BGMiscJoss/BGmisc_main/data-raw/Targaryen tree Dance.txt + df <- readGedcom(temp_file, verbose = TRUE) -#test_that("readWikifamilytree reads a file correctly", { - # Create a temporary WikiFamilyTree file for testing - # Example usage -# family_tree_file_path <- "data-raw/Targaryen tree Dance.txt" # system.file("extdata", "Targaryen tree Dance.txt", package = "BGmisc") + # Expect one record with missing name fields. + expect_equal(nrow(df), 1) + expect_true(is.null(df$name[1])) - # result <- readWikifamilytree(file_path=family_tree_file_path) -#}) + unlink(temp_file) +}) diff --git a/tests/testthat/test-readWikiTree.R b/tests/testthat/test-readWikiTree.R new file mode 100644 index 00000000..166539d9 --- /dev/null +++ b/tests/testthat/test-readWikiTree.R @@ -0,0 +1,41 @@ +# readWikifamilytree + +test_that("readWikifamilytree reads a string correctly", { + # Create a temporary WikiFamilyTree file for testing + # Example usage + family_tree_text <- "{{familytree/start |summary=I have a brother Joe and a little sister: my mom married my dad, and my dad's parents were Grandma and Grandpa; they had another child, Aunt Daisy.}} +{{familytree | | | | GMa |~|y|~| GPa | | GMa=Gladys|GPa=Sydney}} +{{familytree | | | | | | | |)|-|-|-|.| }} +{{familytree | | | MOM |y| DAD | |DAISY| MOM=Mom|DAD=Dad|DAISY=[[Daisy Duke]]}} +{{familytree | |,|-|-|-|+|-|-|-|.| | | }} +{{familytree | JOE | | ME | | SIS | | | JOE=My brother Joe|ME='''Me!'''|SIS=My little sister}} +{{familytree/end}}" + + temp_file <- tempfile(fileext = ".txt") + writeLines(family_tree_text, temp_file) + + + result <- readWikifamilytree(text = family_tree_text) + result2 <- readWikifamilytree(file_path = temp_file) + + expect_equal( + result$summary, + "I have a brother Joe and a little sister: my mom married my dad, and my dad's parents were Grandma and Grandpa; they had another child, 
Aunt Daisy." + ) + + expect_equal( + result2$summary, + "I have a brother Joe and a little sister: my mom married my dad, and my dad's parents were Grandma and Grandpa; they had another child, Aunt Daisy." + ) +}) + + +# read E:/Dropbox/Lab/Research/Projects/2024/BGMiscJoss/BGmisc_main/data-raw/Targaryen tree Dance.txt + +# test_that("readWikifamilytree reads a file correctly", { +# Create a temporary WikiFamilyTree file for testing +# Example usage +# family_tree_file_path <- "data-raw/Targaryen tree Dance.txt" # system.file("extdata", "Targaryen tree Dance.txt", package = "BGmisc") + +# result <- readWikifamilytree(file_path=family_tree_file_path) +# }) diff --git a/tests/testthat/test-summarizePedigrees.R b/tests/testthat/test-summarizePedigrees.R index 61dae665..3fd62464 100644 --- a/tests/testthat/test-summarizePedigrees.R +++ b/tests/testthat/test-summarizePedigrees.R @@ -7,7 +7,13 @@ test_that("Counts the correct number people", { expect_equal(result_observed, result_expected) }) - +# Test: SummarizeFamilies is used when SummariseFamilies +test_that("SummarizeFamilies works like SummariseFamilies", { + df <- ped2fam(potter, famID = "newFamID", personID = "personID") + df_summarized <- summarizeFamilies(df, famID = "newFamID", personID = "personID") + df_summarised <- summariseFamilies(df, famID = "newFamID", personID = "personID") + expect_equal(df_summarised, df_summarized) +}) # Test Case 2: Multiple families test_that("summarizeFamilies() works with multiple families", { df <- ped2fam(inbreeding, famID = "newFamID", personID = "ID") @@ -53,9 +59,11 @@ test_that("summarizeMatrilines() works", { nbiggest <- 2 df <- ped2fam(potter, famID = "newFamID", personID = "personID") %>% ped2maternal(personID = "personID") - df_summarized <- summarizeMatrilines(df, famID = "newFamID", - personID = "personID", - nbiggest = nbiggest) + df_summarized <- summarizeMatrilines(df, + famID = "newFamID", + personID = "personID", + nbiggest = nbiggest + ) # is the total count 
from the family summary the same as the raw data? result_observed <- sum(df_summarized$maternal_summary$count) result_expected <- nrow(potter) @@ -70,15 +78,23 @@ test_that("summarizeMatrilines() works", { result_observed <- nrow(df_summarized$biggest_maternal) expect_equal(result_observed, nbiggest) }) - +# Test: SummarizeMatrilines is used when SummariseMatrilines +test_that("SummarizeMatrilines works like SummariseMatrilines", { + df <- ped2fam(potter, famID = "newFamID", personID = "personID") + df_summarized <- summarizeMatrilines(df, famID = "newFamID", personID = "personID") + df_summarised <- summariseMatrilines(df, famID = "newFamID", personID = "personID") + expect_equal(df_summarised, df_summarized) +}) # Test Case 5: Does this function work for summarizePatrilines test_that("summarizePatrilines() works", { nbiggest <- 4 df <- ped2fam(potter, famID = "newFamID", personID = "personID") %>% ped2paternal(personID = "personID") - df_summarized <- summarizePatrilines(df, famID = "newFamID", - personID = "personID", - nbiggest = nbiggest) + df_summarized <- summarizePatrilines(df, + famID = "newFamID", + personID = "personID", + nbiggest = nbiggest + ) # is the total count from the family summary the same as the raw data? 
result_observed <- sum(df_summarized$paternal_summary$count) result_expected <- nrow(potter) @@ -94,6 +110,13 @@ test_that("summarizePatrilines() works", { expect_equal(result_observed, nbiggest) }) +# Test: summarizePatrilines is used when SummarisePatrilines +test_that("summarizePatrilines works like SummarisePatrilines", { + df <- ped2fam(potter, famID = "newFamID", personID = "personID") + df_summarized <- summarizePatrilines(df, famID = "newFamID", personID = "personID") + df_summarised <- summarisePatrilines(df, famID = "newFamID", personID = "personID") + expect_equal(df_summarised, df_summarized) +}) # Test Case 6: Handling of missing values in critical columns test_that("summarizePedigrees() handles missing values correctly", { df <- data.frame( @@ -111,7 +134,7 @@ test_that("summarizePedigrees() handles missing values correctly", { }) # Test Case 7: When all variables are skipped -test_that("summarizePedigrees works when all numeric variables are skipped",{ +test_that("summarizePedigrees works when all numeric variables are skipped", { df <- data.frame( ID = 1:5, momID = c(NA, 1, 1, NA, 4), @@ -126,11 +149,16 @@ test_that("summarizePedigrees works when all numeric variables are skipped",{ # Test Case 8: Handling invalid column names test_that("summarizePedigrees() throws error on invalid column names", { - df <- data.frame(ID = 1:5, momID = c(NA, 1, 1, NA, 4), - dadID = c(NA, 2, 2, NA, 5), famID = c(1, 1, 1, 2, 2)) + df <- data.frame( + ID = 1:5, momID = c(NA, 1, 1, NA, 4), + dadID = c(NA, 2, 2, NA, 5), famID = c(1, 1, 1, 2, 2) + ) expect_error(summarizePedigrees(df, byr = "unknown_column")) }) + + + # Test Case 9: Handling empty dataset # test_that("summarizePedigrees() handles empty dataset gracefully", { # df <- data.frame(ID = integer(), momID = integer(), dadID = integer(), famID = integer()) @@ -149,3 +177,11 @@ test_that("summarizePedigrees() works for single-entry pedigree", { expect_equal(nrow(df_summarized$family_summary), 1) 
expect_equal(df_summarized$oldest_families$byr_mean, 1920) }) + +# Test: summarizePedigrees is used when SummarisePedigrees +test_that("SummarizePedigrees works like SummarisePedigrees", { + df <- ped2fam(potter, famID = "newFamID", personID = "personID") + df_summarized <- summarizePedigrees(df, famID = "newFamID", personID = "personID") + df_summarised <- summarisePedigrees(df, famID = "newFamID", personID = "personID") + expect_equal(df_summarised, df_summarized) +}) diff --git a/vignettes/ASOIAF.Rmd b/vignettes/ASOIAF.Rmd index f9dceaa0..c83d9241 100644 --- a/vignettes/ASOIAF.Rmd +++ b/vignettes/ASOIAF.Rmd @@ -18,14 +18,13 @@ In this vignette, we use the `BGmisc` package to reconstruct the *ASOIAF* pedigr We begin by loading the required libraries and examining the structure of the built-in `ASOIAF` pedigree. -```{r} +```{r echo=TRUE, message=FALSE, warning=FALSE} library(BGmisc) library(tidyverse) data(ASOIAF) ``` - -The ASOIAF dataset includes character IDs, names, sex codes, and parent identifiers for a subset of characters drawn from the *A Song of Ice and Fire* canon. +The ASOIAF dataset includes character IDs, names, family identifiers, and parent identifiers for a subset of characters drawn from the *A Song of Ice and Fire* canon. ```{r} head(ASOIAF) @@ -163,8 +162,8 @@ This code creates new IDs for individuals with one known parent and a missing ot ## Visualize the Pedigree -```{r} -# fixParents(id=df_got$ID, dadid=df_got$dadID, momid=df_got$momID, sex=df_got$sex, missid = NA) +We can now visualize the repaired pedigree using the `plotPedigree()` function. This function generates a plot of the pedigree, with individuals colored based on their affected status. In this case, we highlight Jon and Daenerys as "affected" individuals. Otherwise they would be difficult to distinguish from the rest of the pedigree. 
+```{r, message=FALSE, warning=FALSE} plotPedigree(df_repaired, affected = df_repaired$affected, verbose = FALSE) ``` diff --git a/vignettes/ASOIAF.html b/vignettes/ASOIAF.html index 6762cd1f..4066c186 100644 --- a/vignettes/ASOIAF.html +++ b/vignettes/ASOIAF.html @@ -359,37 +359,12 @@

Load Packages and Data

We begin by loading the required libraries and examining the structure of the built-in ASOIAF pedigree.

library(BGmisc)
-library(tidyverse)
-
## ── Attaching core tidyverse packages ────── tidyverse 2.0.0 ──
-## âś” dplyr     1.1.4     âś” readr     2.1.5
-## âś” forcats   1.0.0     âś” stringr   1.5.1
-## âś” ggplot2   3.5.1     âś” tibble    3.2.1
-## âś” lubridate 1.9.4     âś” tidyr     1.3.1
-## âś” purrr     1.0.4     
-## ── Conflicts ──────────────────────── tidyverse_conflicts() ──
-## âś– dplyr::between()     masks BGmisc::between()
-## âś– dplyr::filter()      masks stats::filter()
-## âś– dplyr::first()       masks BGmisc::first()
-## âś– lubridate::hour()    masks BGmisc::hour()
-## âś– lubridate::isoweek() masks BGmisc::isoweek()
-## âś– dplyr::lag()         masks stats::lag()
-## âś– dplyr::last()        masks BGmisc::last()
-## âś– lubridate::mday()    masks BGmisc::mday()
-## âś– lubridate::minute()  masks BGmisc::minute()
-## âś– lubridate::month()   masks BGmisc::month()
-## âś– lubridate::quarter() masks BGmisc::quarter()
-## âś– lubridate::second()  masks BGmisc::second()
-## âś– purrr::transpose()   masks BGmisc::transpose()
-## âś– lubridate::wday()    masks BGmisc::wday()
-## âś– lubridate::week()    masks BGmisc::week()
-## âś– lubridate::yday()    masks BGmisc::yday()
-## âś– lubridate::year()    masks BGmisc::year()
-## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
-
data(ASOIAF)
-

The ASOIAF dataset includes character IDs, names, sex codes, and -parent identifiers for a subset of characters drawn from the A Song -of Ice and Fire canon.

-
head(ASOIAF)
+library(tidyverse) +data(ASOIAF) +

The ASOIAF dataset includes character IDs, names, family identifiers, +and parent identifiers for a subset of characters drawn from the A +Song of Ice and Fire canon.

+
head(ASOIAF)
##   id famID momID dadID          name sex
 ## 1  1     1    NA    NA   Walder Frey   M
 ## 2  2     1    NA    NA   Perra Royce   F
@@ -404,11 +379,11 @@ 

Prepare and Validate Sex Codes

calculationss and visualization. We use checkSex() to inspect the sex variable, repairing inconsistencies programmatically.

-
df_got <- checkSex(ASOIAF,
-  code_male = 1,
-  code_female = 0,
-  verbose = FALSE, repair = TRUE
-)
+
df_got <- checkSex(ASOIAF,
+  code_male = 1,
+  code_female = 0,
+  verbose = FALSE, repair = TRUE
+)

Compute Relatedness Matrices

@@ -434,25 +409,25 @@

Compute Relatedness Matrices

  • We set sparse = FALSE to return full (dense) matrices rather than compressed sparse formats.

  • -
    add <- ped2com(df_got,
    -  isChild_method = "partialparent",
    -  component = "additive",
    -  adjacency_method = "direct",
    -  sparse = TRUE
    -)
    -
    -mt <- ped2com(df_got,
    -  isChild_method = "partialparent",
    -  component = "mitochondrial",
    -  adjacency_method = "direct",
    -  sparse = TRUE
    -)
    -
    -cn <- ped2cn(df_got,
    -  isChild_method = "partialparent",
    -  adjacency_method = "indexed",
    -  sparse = TRUE
    -)
    +
    add <- ped2com(df_got,
    +  isChild_method = "partialparent",
    +  component = "additive",
    +  adjacency_method = "direct",
    +  sparse = TRUE
    +)
    +
    +mt <- ped2com(df_got,
    +  isChild_method = "partialparent",
    +  component = "mitochondrial",
    +  adjacency_method = "direct",
    +  sparse = TRUE
    +)
    +
    +cn <- ped2cn(df_got,
    +  isChild_method = "partialparent",
    +  adjacency_method = "indexed",
    +  sparse = TRUE
    +)

    Convert to Pairwise Format

    @@ -460,15 +435,12 @@

    Convert to Pairwise Format

    long-format tables using com2links(). This function returns a dataframe where each row represents a unique pair of individuals, including their additive and common nuclear coefficients.

    -
    df_links <- com2links(
    -  writetodisk = FALSE,
    -  ad_ped_matrix = add, cn_ped_matrix = cn, mit_ped_matrix = mt,
    -  drop_upper_triangular = TRUE
    -) # %>%
    -
    ## 'as(<dsCMatrix>, "dgCMatrix")' is deprecated.
    -## Use 'as(., "generalMatrix")' instead.
    -## See help("Deprecated") and help("Matrix-deprecated").
    -
    #  filter(ID1 != ID2)
    +
    df_links <- com2links(
    +  writetodisk = FALSE,
    +  ad_ped_matrix = add, cn_ped_matrix = cn, mit_ped_matrix = mt,
    +  drop_upper_triangular = TRUE
    +) # %>%
    +#  filter(ID1 != ID2)

    The function can return the entire matrix or just the lower triangular part, which is often sufficient for our purposes. Setting drop_upper_triangular = TRUE ensures we only retain one @@ -480,21 +452,21 @@

    Locate Jon and Daenerys

    We next identify the rows in the pairwise relatedness table that correspond to Jon Snow and Daenerys Targaryen. First, we retrieve their individual IDs:

    -
    # Find the IDs of Jon Snow and Daenerys Targaryen
    -
    -jon_id <- df_got %>%
    -  filter(name == "Jon Snow") %>%
    -  pull(ID)
    -
    -dany_id <- df_got %>%
    -  filter(name == "Daenerys Targaryen") %>%
    -  pull(ID)
    +
    # Find the IDs of Jon Snow and Daenerys Targaryen
    +
    +jon_id <- df_got %>%
    +  filter(name == "Jon Snow") %>%
    +  pull(ID)
    +
    +dany_id <- df_got %>%
    +  filter(name == "Daenerys Targaryen") %>%
    +  pull(ID)

    Then we isolate their dyad:

    -
    jon_dany_row <- df_links %>%
    -  filter(ID1 == jon_id | ID2 == jon_id) %>%
    -  filter(ID1 %in% dany_id | ID2 %in% dany_id)
    -
    -jon_dany_row
    +
    jon_dany_row <- df_links %>%
    +  filter(ID1 == jon_id | ID2 == jon_id) %>%
    +  filter(ID1 %in% dany_id | ID2 %in% dany_id)
    +
    +jon_dany_row
    ##   ID1 ID2     addRel mitRel cnuRel
     ## 1 206 211 0.31274414      0      0
     ## 2 211 304 0.01953125      0      0
    @@ -519,18 +491,18 @@

    Plotting the Pedigree with Incomplete Parental Information

    To facilitate plotting, we check for individuals with one known parent but a missing other. For those cases, we assign a placeholder ID to the missing parent.

    -
    df_repaired <- checkParentIDs(df_got,
    -  addphantoms = TRUE,
    -  repair = TRUE,
    -  parentswithoutrow = FALSE,
    -  repairsex = FALSE
    -) %>% mutate(
    -  fam = 1,
    -  affected = case_when(
    -    ID %in% c(jon_id, dany_id, "365") ~ 1,
    -    TRUE ~ 0
    -  )
    -)
    +
    df_repaired <- checkParentIDs(df_got,
    +  addphantoms = TRUE,
    +  repair = TRUE,
    +  parentswithoutrow = FALSE,
    +  repairsex = FALSE
    +) %>% mutate(
    +  fam = 1,
    +  affected = case_when(
    +    ID %in% c(jon_id, dany_id, "365") ~ 1,
    +    TRUE ~ 0
    +  )
    +)
    ## REPAIR IN EARLY ALPHA

    This code creates new IDs for individuals with one known parent and a missing other. It checks if either momID or @@ -540,9 +512,13 @@

    Plotting the Pedigree with Incomplete Parental Information

    Visualize the Pedigree

    -
    # fixParents(id=df_got$ID, dadid=df_got$dadID, momid=df_got$momID, sex=df_got$sex, missid = NA)
    -
    -plotPedigree(df_repaired, affected = df_repaired$affected, verbose = FALSE)
    +

    We can now visualize the repaired pedigree using the +plotPedigree() function. This function generates a plot of +the pedigree, with individuals colored based on their affected status. +In this case, we highlight Jon and Daenerys as “affected” individuals. +Otherwise they would be difficult to distinguish from the rest of the +pedigree.

    +
    plotPedigree(df_repaired, affected = df_repaired$affected, verbose = FALSE)

    ## Did not plot the following people: 85 88 125 142 228 229 258 259 274 275 305 336 357 381 388 405 409 418 420 424 428 451 487
    ## named list()
    diff --git a/vignettes/partial.Rmd b/vignettes/partial.Rmd index 45fb7ce6..50009cfc 100644 --- a/vignettes/partial.Rmd +++ b/vignettes/partial.Rmd @@ -49,11 +49,13 @@ We compute the additive genetic relationship matrix using both the classic and p ped_add_partial_complete <- ped2com(df, isChild_method = "partialparent", component = "additive", - adjacency_method = "direct" + adjacency_method = "direct", + sparse = FALSE ) ped_add_classic_complete <- ped2com(df, isChild_method = "classic", - component = "additive", adjacency_method = "direct" + component = "additive", adjacency_method = "direct", + sparse = FALSE ) ``` @@ -67,22 +69,31 @@ library(corrplot) corrplot(as.matrix(ped_add_classic_complete), method = "color", type = "lower", col.lim = c(0, 1), - is.corr = FALSE, title = "Additive component - Classic method" + is.corr = FALSE, title = "Additive component - Classic method", + order = "hclust", + tl.pos = "l", tl.col = "black", tl.srt = 5, tl.cex = 0.2, + col = COL1("Reds", 100), mar = c(0, 0, 2, 0) ) + corrplot(as.matrix(ped_add_partial_complete), method = "color", type = "lower", col.lim = c(0, 1), - is.corr = FALSE, title = "Additive component - Partial parent method" + is.corr = FALSE, title = "Additive component - Partial parent method", + order = "hclust", + tl.pos = "l", tl.col = "black", tl.srt = 5, tl.cex = 0.2, + col = COL1("Reds", 100), mar = c(0, 0, 2, 0) ) ``` To verify this, we subtract one matrix from the other and calculate RMSE. The difference should be numerically zero. Indeed, it is `r sqrt(mean((ped_add_classic_complete-ped_add_partial_complete)^2))`. 
-```{r} -corrplot(as.matrix(ped_add_classic_complete - ped_add_partial_complete), +```{r,warning=FALSE} +corrplot((as.matrix(ped_add_classic_complete) - as.matrix(ped_add_partial_complete)), method = "color", type = "lower", col.lim = c(0, 1), - is.corr = FALSE + is.corr = FALSE, order = "hclust", + tl.pos = "l", tl.col = "black", tl.srt = 5, tl.cex = 0.2, + col = COL1("Reds", 100), mar = c(0, 0, 2, 0) ) ``` @@ -101,12 +112,14 @@ df$momID[df$ID == 4] <- NA ped_add_partial_mom <- ped_add_partial <- ped2com(df, isChild_method = "partialparent", component = "additive", - adjacency_method = "direct" + adjacency_method = "direct", + sparse = FALSE ) ped_add_classic_mom <- ped_add_classic <- ped2com(df, isChild_method = "classic", - component = "additive", adjacency_method = "direct" + component = "additive", adjacency_method = "direct", + sparse = FALSE ) ``` @@ -119,12 +132,17 @@ The resulting additive matrices reflect this difference. The RMSE between the tw ```{r} corrplot(as.matrix(ped_add_classic), method = "color", type = "lower", col.lim = c(0, 1), - is.corr = FALSE, title = "Classic (mother removed)" + is.corr = FALSE, title = "Classic (mother removed)", + order = "hclust", + tl.pos = "l", tl.col = "black", tl.srt = 5, tl.cex = 0.2, + col = COL1("Reds", 100), mar = c(0, 0, 2, 0) ) - corrplot(as.matrix(ped_add_partial), method = "color", type = "lower", col.lim = c(0, 1), - is.corr = FALSE, title = "Partial (mother removed)" + is.corr = FALSE, title = "Partial (mother removed)", + order = "hclust", + tl.pos = "l", tl.col = "black", tl.srt = 5, tl.cex = 0.2, + col = COL1("Reds", 100), mar = c(0, 0, 2, 0) ) ``` @@ -132,16 +150,19 @@ corrplot(as.matrix(ped_add_partial), We quantify the overall matrix difference: ```{r} -sqrt(mean((ped_add_classic - ped_add_partial)^2)) +sqrt(mean((as.matrix(ped_add_classic) - as.matrix(ped_add_partial))^2)) ``` Next, we compare each method to the matrix from the complete pedigree. 
This evaluates how much each method deviates from the correct additive structure. ```{r} -corrplot(as.matrix(ped_add_classic_complete - ped_add_classic), +corrplot(as.matrix(ped_add_classic_complete) - as.matrix(ped_add_classic), method = "color", type = "lower", col.lim = c(0, 1), - is.corr = FALSE + is.corr = FALSE, + order = "hclust", + tl.pos = "l", tl.col = "black", tl.srt = 5, tl.cex = 0.2, + col = COL1("Reds", 100), mar = c(0, 0, 2, 0) ) sqrt(mean((ped_add_classic_complete - ped_add_classic)^2)) @@ -152,7 +173,10 @@ The RMSE between the true additive component and the classic method is `r sqrt(m ```{r} corrplot(as.matrix(ped_add_classic_complete - ped_add_partial), method = "color", type = "lower", col.lim = c(0, 1), - is.corr = FALSE + is.corr = FALSE, + order = "hclust", + tl.pos = "l", tl.col = "black", tl.srt = 5, tl.cex = 0.2, + col = COL1("Reds", 100), mar = c(0, 0, 2, 0) ) sqrt(mean((ped_add_classic_complete - ped_add_partial)^2)) @@ -190,17 +214,23 @@ ped_add_classic_dad <- ped_add_classic <- ped2com(df, ``` -As we can see, the two matrices are different. The RMSE between the two matrices is `r sqrt(mean((ped_add_classic-ped_add_partial)^2))`. +As we can see, the two matrices are different. The RMSE between the two matrices is `r sqrt(mean((as.matrix(ped_add_classic)-as.matrix(ped_add_partial))^2))`. 
```{r} corrplot(as.matrix(ped_add_classic_dad), method = "color", type = "lower", col.lim = c(0, 1), - is.corr = FALSE, title = "Classic (father removed)" + is.corr = FALSE, title = "Classic (father removed)", + order = "hclust", + tl.pos = "l", tl.col = "black", tl.srt = 5, tl.cex = 0.2, + col = COL1("Reds", 100), mar = c(0, 0, 2, 0) ) corrplot(as.matrix(ped_add_partial_dad), method = "color", type = "lower", col.lim = c(0, 1), - is.corr = FALSE, title = "Partial (father removed)" + is.corr = FALSE, title = "Partial (father removed)", + order = "hclust", + tl.pos = "l", tl.col = "black", tl.srt = 5, tl.cex = 0.2, + col = COL1("Reds", 100), mar = c(0, 0, 2, 0) ) ``` @@ -210,7 +240,10 @@ Again, we compare to the true matrix from the complete pedigree: ```{r} corrplot(as.matrix(ped_add_classic_complete - ped_add_classic), method = "color", type = "lower", col.lim = c(0, 1), - is.corr = FALSE + is.corr = FALSE, + order = "hclust", + tl.pos = "l", tl.col = "black", tl.srt = 5, tl.cex = 0.2, + col = COL1("Reds", 100), mar = c(0, 0, 2, 0) ) sqrt(mean((ped_add_classic_complete - ped_add_classic)^2)) @@ -220,7 +253,10 @@ sqrt(mean((ped_add_classic_complete - ped_add_classic)^2)) ```{r} corrplot(as.matrix(ped_add_classic_complete - ped_add_partial), method = "color", type = "lower", col.lim = c(0, 1), - is.corr = FALSE + is.corr = FALSE, + order = "hclust", + tl.pos = "l", tl.col = "black", tl.srt = 5, tl.cex = 0.2, + col = COL1("Reds", 100), mar = c(0, 0, 2, 0) ) sqrt(mean((ped_add_classic_complete - ped_add_partial)^2)) @@ -362,27 +398,42 @@ fam1 <- inbreeding_list[[1]] corrplot(as.matrix(fam1$ped_add_classic_complete), method = "color", type = "lower", col.lim = c(0, 1), - is.corr = FALSE, title = "Classic - Complete" + is.corr = FALSE, title = "Classic - Complete", + order = "hclust", + tl.pos = "l", tl.col = "black", tl.srt = 5, tl.cex = 0.2, + col = COL1("Reds", 100), mar = c(0, 0, 2, 0) ) corrplot(as.matrix(fam1$ped_add_classic_mom), method = "color", type = "lower", 
col.lim = c(0, 1), - is.corr = FALSE, title = "Classic - Mom Missing" + is.corr = FALSE, title = "Classic - Mom Missing", + order = "hclust", + tl.pos = "l", tl.col = "black", tl.srt = 5, tl.cex = 0.2, + col = COL1("Reds", 100), mar = c(0, 0, 2, 0) ) corrplot(as.matrix(fam1$ped_add_partial_mom), method = "color", type = "lower", col.lim = c(0, 1), - is.corr = FALSE, title = "Partial - Mom Missing" + is.corr = FALSE, title = "Partial - Mom Missing", + order = "hclust", + tl.pos = "l", tl.col = "black", tl.srt = 5, tl.cex = 0.2, + col = COL1("Reds", 100), mar = c(0, 0, 2, 0) ) corrplot(as.matrix(fam1$ped_add_classic_dad), method = "color", type = "lower", col.lim = c(0, 1), - is.corr = FALSE, title = "Classic - Dad Missing" + is.corr = FALSE, title = "Classic - Dad Missing", + order = "hclust", + tl.pos = "l", tl.col = "black", tl.srt = 5, tl.cex = 0.2, + col = COL1("Reds", 100), mar = c(0, 0, 2, 0) ) corrplot(as.matrix(fam1$ped_add_partial_dad), method = "color", type = "lower", col.lim = c(0, 1), - is.corr = FALSE, title = "Partial - Dad Missing" + is.corr = FALSE, title = "Partial - Dad Missing", + order = "hclust", + tl.pos = "l", tl.col = "black", tl.srt = 5, tl.cex = 0.2, + col = COL1("Reds", 100), mar = c(0, 0, 2, 0) ) ``` @@ -392,22 +443,34 @@ To visualize the differences from the true matrix: ```{r} corrplot(as.matrix(fam1$ped_add_classic_complete - fam1$ped_add_classic_mom), method = "color", type = "lower", col.lim = c(0, 1), - is.corr = FALSE, title = "Classic Mom Diff from Complete" + is.corr = FALSE, title = "Classic Mom Diff from Complete", + order = "hclust", + tl.pos = "l", tl.col = "black", tl.srt = 5, tl.cex = 0.2, + col = COL1("Reds", 100), mar = c(0, 0, 2, 0) ) corrplot(as.matrix(fam1$ped_add_classic_complete - fam1$ped_add_partial_mom), method = "color", type = "lower", col.lim = c(0, 1), - is.corr = FALSE, title = "Partial Mom Diff from Complete" + is.corr = FALSE, title = "Partial Mom Diff from Complete", + order = "hclust", + tl.pos = "l", 
tl.col = "black", tl.srt = 5, tl.cex = 0.2, + col = COL1("Reds", 100), mar = c(0, 0, 2, 0) ) corrplot(as.matrix(fam1$ped_add_classic_complete - fam1$ped_add_classic_dad), method = "color", type = "lower", col.lim = c(0, 1), - is.corr = FALSE, title = "Classic Dad Diff from Complete" + is.corr = FALSE, title = "Classic Dad Diff from Complete", + order = "hclust", + tl.pos = "l", tl.col = "black", tl.srt = 5, tl.cex = 0.2, + col = COL1("Reds", 100), mar = c(0, 0, 2, 0) ) corrplot(as.matrix(fam1$ped_add_classic_complete - fam1$ped_add_partial_dad), method = "color", type = "lower", col.lim = c(0, 1), - is.corr = FALSE, title = "Partial Dad Diff from Complete" + is.corr = FALSE, title = "Partial Dad Diff from Complete", + order = "hclust", + tl.pos = "l", tl.col = "black", tl.srt = 5, tl.cex = 0.2, + col = COL1("Reds", 100), mar = c(0, 0, 2, 0) ) ``` diff --git a/vignettes/partial.html b/vignettes/partial.html index aaa1fbec..51f5d3f5 100644 --- a/vignettes/partial.html +++ b/vignettes/partial.html @@ -377,12 +377,14 @@

    Hazard Data Example

    ped_add_partial_complete <- ped2com(df,
       isChild_method = "partialparent",
       component = "additive",
    -  adjacency_method = "direct"
    -)
    -ped_add_classic_complete <- ped2com(df,
    -  isChild_method = "classic",
    -  component = "additive", adjacency_method = "direct"
    -)
    + adjacency_method = "direct", + sparse = FALSE +) +ped_add_classic_complete <- ped2com(df, + isChild_method = "classic", + component = "additive", adjacency_method = "direct", + sparse = FALSE +)

    The following plots display the full additive matrices. These matrices should be identical.

    This can be confirmed visually and numerically.

    @@ -392,25 +394,31 @@

    Hazard Data Example

    corrplot(as.matrix(ped_add_classic_complete), method = "color", type = "lower", col.lim = c(0, 1), - is.corr = FALSE, title = "Additive component - Classic method" -) -

    + is.corr = FALSE, title = "Additive component - Classic method", + order = "hclust", + tl.pos = "l", tl.col = "black", tl.srt = 5, tl.cex = 0.2, + col = COL1("Reds", 100), mar = c(0, 0, 2, 0) +) +

    
    -corrplot(as.matrix(ped_add_partial_complete),
    -  method = "color", type = "lower", col.lim = c(0, 1),
    -  is.corr = FALSE, title = "Additive component - Partial parent method"
    -)
    -

    + +corrplot(as.matrix(ped_add_partial_complete), + method = "color", type = "lower", col.lim = c(0, 1), + is.corr = FALSE, title = "Additive component - Partial parent method", + order = "hclust", + tl.pos = "l", tl.col = "black", tl.srt = 5, tl.cex = 0.2, + col = COL1("Reds", 100), mar = c(0, 0, 2, 0) +) +

    To verify this, we subtract one matrix from the other and calculate RMSE. The difference should be numerically zero. Indeed, it is 0.

    -
    corrplot(as.matrix(ped_add_classic_complete - ped_add_partial_complete),
    +
    corrplot((as.matrix(ped_add_classic_complete) - as.matrix(ped_add_partial_complete)),
       method = "color", type = "lower", col.lim = c(0, 1),
    -  is.corr = FALSE
    -)
    -#> Warning in corrplot(as.matrix(ped_add_classic_complete -
    -#> ped_add_partial_complete), : col.lim interval too wide, please set a suitable
    -#> value
    -

    + is.corr = FALSE, order = "hclust", + tl.pos = "l", tl.col = "black", tl.srt = 5, tl.cex = 0.2, + col = COL1("Reds", 100), mar = c(0, 0, 2, 0) +)
    +

    Introducing Missingness: Remove a Parent

    @@ -420,13 +428,15 @@

    Introducing Missingness: Remove a Parent

    ped_add_partial_mom <- ped_add_partial <- ped2com(df,
       isChild_method = "partialparent",
       component = "additive",
    -  adjacency_method = "direct"
    -)
    -
    -ped_add_classic_mom <- ped_add_classic <- ped2com(df,
    -  isChild_method = "classic",
    -  component = "additive", adjacency_method = "direct"
    -)
    + adjacency_method = "direct", + sparse = FALSE +) + +ped_add_classic_mom <- ped_add_classic <- ped2com(df, + isChild_method = "classic", + component = "additive", adjacency_method = "direct", + sparse = FALSE +)

    The two methods now treat individual 4 differently in the parent adjacency matrix. The classic method applies a fixed contribution because one parent remains. The partial parent method inflates the @@ -436,26 +446,34 @@

    Introducing Missingness: Remove a Parent

    between the two matrices is 0.009811.

    corrplot(as.matrix(ped_add_classic),
       method = "color", type = "lower", col.lim = c(0, 1),
    -  is.corr = FALSE, title = "Classic (mother removed)"
    -)
    -

    -
    
    -corrplot(as.matrix(ped_add_partial),
    -  method = "color", type = "lower", col.lim = c(0, 1),
    -  is.corr = FALSE, title = "Partial (mother removed)"
    -)
    -

    + is.corr = FALSE, title = "Classic (mother removed)", + order = "hclust", + tl.pos = "l", tl.col = "black", tl.srt = 5, tl.cex = 0.2, + col = COL1("Reds", 100), mar = c(0, 0, 2, 0) +) +

    +
    corrplot(as.matrix(ped_add_partial),
    +  method = "color", type = "lower", col.lim = c(0, 1),
    +  is.corr = FALSE, title = "Partial (mother removed)",
    +  order = "hclust",
    +  tl.pos = "l", tl.col = "black", tl.srt = 5, tl.cex = 0.2,
    +  col = COL1("Reds", 100), mar = c(0, 0, 2, 0)
    +)
    +

    We quantify the overall matrix difference:

    -
    sqrt(mean((ped_add_classic - ped_add_partial)^2))
    +
    sqrt(mean((as.matrix(ped_add_classic) - as.matrix(ped_add_partial))^2))
     #> [1] 0.009811047

    Next, we compare each method to the matrix from the complete pedigree. This evaluates how much each method deviates from the correct additive structure.

    -
    corrplot(as.matrix(ped_add_classic_complete - ped_add_classic),
    +
    corrplot(as.matrix(ped_add_classic_complete) - as.matrix(ped_add_classic),
       method = "color", type = "lower", col.lim = c(0, 1),
    -  is.corr = FALSE
    -)
    -

    + is.corr = FALSE, + order = "hclust", + tl.pos = "l", tl.col = "black", tl.srt = 5, tl.cex = 0.2, + col = COL1("Reds", 100), mar = c(0, 0, 2, 0) +)
    +

    
     sqrt(mean((ped_add_classic_complete - ped_add_classic)^2))
     #> [1] 0.02991371
    @@ -463,9 +481,12 @@

    Introducing Missingness: Remove a Parent

    is 0.0299137.

    corrplot(as.matrix(ped_add_classic_complete - ped_add_partial),
       method = "color", type = "lower", col.lim = c(0, 1),
    -  is.corr = FALSE
    -)
    -

    + is.corr = FALSE, + order = "hclust", + tl.pos = "l", tl.col = "black", tl.srt = 5, tl.cex = 0.2, + col = COL1("Reds", 100), mar = c(0, 0, 2, 0) +)
    +

    
     sqrt(mean((ped_add_classic_complete - ped_add_partial)^2))
     #> [1] 0.02825904
    @@ -499,32 +520,48 @@

    Removing the Father Instead

    two matrices is 0.009811.

    corrplot(as.matrix(ped_add_classic_dad),
       method = "color", type = "lower", col.lim = c(0, 1),
    -  is.corr = FALSE, title = "Classic (father removed)"
    -)
    -

    + is.corr = FALSE, title = "Classic (father removed)", + order = "hclust", + tl.pos = "l", tl.col = "black", tl.srt = 5, tl.cex = 0.2, + col = COL1("Reds", 100), mar = c(0, 0, 2, 0) +) +

    
     corrplot(as.matrix(ped_add_partial_dad),
       method = "color", type = "lower", col.lim = c(0, 1),
    -  is.corr = FALSE, title = "Partial (father removed)"
    -)
    -

    + is.corr = FALSE, title = "Partial (father removed)", + order = "hclust", + tl.pos = "l", tl.col = "black", tl.srt = 5, tl.cex = 0.2, + col = COL1("Reds", 100), mar = c(0, 0, 2, 0) +) +

    Again, we compare to the true matrix from the complete pedigree:

    corrplot(as.matrix(ped_add_classic_complete - ped_add_classic),
       method = "color", type = "lower", col.lim = c(0, 1),
    -  is.corr = FALSE
    -)
    -

    + is.corr = FALSE, + order = "hclust", + tl.pos = "l", tl.col = "black", tl.srt = 5, tl.cex = 0.2, + col = COL1("Reds", 100), mar = c(0, 0, 2, 0) +) +

    
     sqrt(mean((ped_add_classic_complete - ped_add_classic)^2))
    -#> [1] 0.02991371
    +#> Warning in mean.default((ped_add_classic_complete - ped_add_classic)^2): +#> argument is not numeric or logical: returning NA +#> [1] NA
    corrplot(as.matrix(ped_add_classic_complete - ped_add_partial),
       method = "color", type = "lower", col.lim = c(0, 1),
    -  is.corr = FALSE
    -)
    -

    + is.corr = FALSE, + order = "hclust", + tl.pos = "l", tl.col = "black", tl.srt = 5, tl.cex = 0.2, + col = COL1("Reds", 100), mar = c(0, 0, 2, 0) +) +

    
     sqrt(mean((ped_add_classic_complete - ped_add_partial)^2))
    -#> [1] 0.02825904
    +#> Warning in mean.default((ped_add_classic_complete - ped_add_partial)^2): +#> argument is not numeric or logical: returning NA +#> [1] NA

    The partial parent method again yields a matrix closer to the full-data version.

    @@ -638,7 +675,85 @@

    Inbreeding Dataset: Family-Level Evaluation

    ped_add_partial_mom = ped_add_partial_mom, ped_add_classic_mom = ped_add_classic_mom ) -} +} +#> Warning in mean.default((ped_add_classic_complete - ped_add_partial_dad)^2): +#> argument is not numeric or logical: returning NA +#> Warning in mean.default((ped_add_classic_complete - ped_add_classic_dad)^2): +#> argument is not numeric or logical: returning NA +#> Warning in mean.default((ped_add_classic_complete - ped_add_partial_mom)^2): +#> argument is not numeric or logical: returning NA +#> Warning in mean.default((ped_add_classic_complete - ped_add_classic_mom)^2): +#> argument is not numeric or logical: returning NA +#> Warning in mean.default((ped_add_classic_complete - ped_add_partial_mom)^2): +#> argument is not numeric or logical: returning NA +#> Warning in mean.default((ped_add_classic_complete - ped_add_partial_dad)^2): +#> argument is not numeric or logical: returning NA +#> Warning in mean.default((ped_add_classic_complete - ped_add_classic_dad)^2): +#> argument is not numeric or logical: returning NA +#> Warning in mean.default((ped_add_classic_complete - ped_add_partial_mom)^2): +#> argument is not numeric or logical: returning NA +#> Warning in mean.default((ped_add_classic_complete - ped_add_classic_mom)^2): +#> argument is not numeric or logical: returning NA +#> Warning in mean.default((ped_add_classic_complete - ped_add_partial_mom)^2): +#> argument is not numeric or logical: returning NA +#> Warning in mean.default((ped_add_classic_complete - ped_add_partial_dad)^2): +#> argument is not numeric or logical: returning NA +#> Warning in mean.default((ped_add_classic_complete - ped_add_classic_dad)^2): +#> argument is not numeric or logical: returning NA +#> Warning in mean.default((ped_add_classic_complete - ped_add_partial_mom)^2): +#> argument is not numeric or logical: returning NA +#> Warning in mean.default((ped_add_classic_complete - ped_add_classic_mom)^2): +#> argument is not numeric or logical: returning NA +#> Warning in 
mean.default((ped_add_classic_complete - ped_add_partial_mom)^2): +#> argument is not numeric or logical: returning NA +#> Warning in mean.default((ped_add_classic_complete - ped_add_partial_dad)^2): +#> argument is not numeric or logical: returning NA +#> Warning in mean.default((ped_add_classic_complete - ped_add_classic_dad)^2): +#> argument is not numeric or logical: returning NA +#> Warning in mean.default((ped_add_classic_complete - ped_add_partial_mom)^2): +#> argument is not numeric or logical: returning NA +#> Warning in mean.default((ped_add_classic_complete - ped_add_classic_mom)^2): +#> argument is not numeric or logical: returning NA +#> Warning in mean.default((ped_add_classic_complete - ped_add_partial_mom)^2): +#> argument is not numeric or logical: returning NA +#> Warning in mean.default((ped_add_classic_complete - ped_add_partial_dad)^2): +#> argument is not numeric or logical: returning NA +#> Warning in mean.default((ped_add_classic_complete - ped_add_classic_dad)^2): +#> argument is not numeric or logical: returning NA +#> Warning in mean.default((ped_add_classic_complete - ped_add_partial_mom)^2): +#> argument is not numeric or logical: returning NA +#> Warning in mean.default((ped_add_classic_complete - ped_add_classic_mom)^2): +#> argument is not numeric or logical: returning NA +#> Warning in mean.default((ped_add_classic_complete - ped_add_partial_mom)^2): +#> argument is not numeric or logical: returning NA +#> Warning in mean.default((ped_add_classic_complete - ped_add_partial_dad)^2): +#> argument is not numeric or logical: returning NA +#> Warning in mean.default((ped_add_classic_complete - ped_add_classic_dad)^2): +#> argument is not numeric or logical: returning NA +#> Warning in mean.default((ped_add_classic_complete - ped_add_partial_mom)^2): +#> argument is not numeric or logical: returning NA +#> Warning in mean.default((ped_add_classic_complete - ped_add_classic_mom)^2): +#> argument is not numeric or logical: returning NA +#> 
Warning in mean.default((ped_add_classic_complete - ped_add_partial_mom)^2): +#> argument is not numeric or logical: returning NA +#> Warning in mean.default((ped_add_classic_complete - ped_add_partial_dad)^2): +#> argument is not numeric or logical: returning NA +#> Warning in mean.default((ped_add_classic_complete - ped_add_classic_dad)^2): +#> argument is not numeric or logical: returning NA +#> Warning in mean.default((ped_add_classic_complete - ped_add_partial_mom)^2): +#> argument is not numeric or logical: returning NA +#> Warning in mean.default((ped_add_classic_complete - ped_add_classic_mom)^2): +#> argument is not numeric or logical: returning NA +#> Warning in mean.default((ped_add_classic_complete - ped_add_partial_mom)^2): +#> argument is not numeric or logical: returning NA +#> Warning in mean.default((ped_add_classic_complete - ped_add_partial_dad)^2): +#> argument is not numeric or logical: returning NA +#> Warning in mean.default((ped_add_classic_complete - ped_add_classic_dad)^2): +#> argument is not numeric or logical: returning NA +#> Warning in mean.default((ped_add_classic_complete - ped_add_partial_mom)^2): +#> argument is not numeric or logical: returning NA +#> Warning in mean.default((ped_add_classic_complete - ped_add_classic_mom)^2): +#> argument is not numeric or logical: returning NA

    Example: Family 1

    To understand what these matrices look like, we visualize them for @@ -651,57 +766,84 @@

    Example: Family 1

    corrplot(as.matrix(fam1$ped_add_classic_complete), method = "color", type = "lower", col.lim = c(0, 1), - is.corr = FALSE, title = "Classic - Complete" -)
    -

    + is.corr = FALSE, title = "Classic - Complete", + order = "hclust", + tl.pos = "l", tl.col = "black", tl.srt = 5, tl.cex = 0.2, + col = COL1("Reds", 100), mar = c(0, 0, 2, 0) +) +

    
     corrplot(as.matrix(fam1$ped_add_classic_mom),
       method = "color", type = "lower", col.lim = c(0, 1),
    -  is.corr = FALSE, title = "Classic - Mom Missing"
    -)
    -

    + is.corr = FALSE, title = "Classic - Mom Missing", + order = "hclust", + tl.pos = "l", tl.col = "black", tl.srt = 5, tl.cex = 0.2, + col = COL1("Reds", 100), mar = c(0, 0, 2, 0) +) +

    
     corrplot(as.matrix(fam1$ped_add_partial_mom),
       method = "color", type = "lower", col.lim = c(0, 1),
    -  is.corr = FALSE, title = "Partial - Mom Missing"
    -)
    -

    + is.corr = FALSE, title = "Partial - Mom Missing", + order = "hclust", + tl.pos = "l", tl.col = "black", tl.srt = 5, tl.cex = 0.2, + col = COL1("Reds", 100), mar = c(0, 0, 2, 0) +) +

    
     corrplot(as.matrix(fam1$ped_add_classic_dad),
       method = "color", type = "lower", col.lim = c(0, 1),
    -  is.corr = FALSE, title = "Classic - Dad Missing"
    -)
    -

    + is.corr = FALSE, title = "Classic - Dad Missing", + order = "hclust", + tl.pos = "l", tl.col = "black", tl.srt = 5, tl.cex = 0.2, + col = COL1("Reds", 100), mar = c(0, 0, 2, 0) +) +

    
     corrplot(as.matrix(fam1$ped_add_partial_dad),
       method = "color", type = "lower", col.lim = c(0, 1),
    -  is.corr = FALSE, title = "Partial - Dad Missing"
    -)
    -

    + is.corr = FALSE, title = "Partial - Dad Missing", + order = "hclust", + tl.pos = "l", tl.col = "black", tl.srt = 5, tl.cex = 0.2, + col = COL1("Reds", 100), mar = c(0, 0, 2, 0) +) +

    To visualize the differences from the true matrix:

    corrplot(as.matrix(fam1$ped_add_classic_complete - fam1$ped_add_classic_mom),
       method = "color", type = "lower", col.lim = c(0, 1),
    -  is.corr = FALSE, title = "Classic Mom Diff from Complete"
    -)
    -

    + is.corr = FALSE, title = "Classic Mom Diff from Complete", + order = "hclust", + tl.pos = "l", tl.col = "black", tl.srt = 5, tl.cex = 0.2, + col = COL1("Reds", 100), mar = c(0, 0, 2, 0) +) +

    
     corrplot(as.matrix(fam1$ped_add_classic_complete - fam1$ped_add_partial_mom),
       method = "color", type = "lower", col.lim = c(0, 1),
    -  is.corr = FALSE, title = "Partial Mom Diff from Complete"
    -)
    -

    + is.corr = FALSE, title = "Partial Mom Diff from Complete", + order = "hclust", + tl.pos = "l", tl.col = "black", tl.srt = 5, tl.cex = 0.2, + col = COL1("Reds", 100), mar = c(0, 0, 2, 0) +) +

    
     corrplot(as.matrix(fam1$ped_add_classic_complete - fam1$ped_add_classic_dad),
       method = "color", type = "lower", col.lim = c(0, 1),
    -  is.corr = FALSE, title = "Classic Dad Diff from Complete"
    -)
    -

    + is.corr = FALSE, title = "Classic Dad Diff from Complete", + order = "hclust", + tl.pos = "l", tl.col = "black", tl.srt = 5, tl.cex = 0.2, + col = COL1("Reds", 100), mar = c(0, 0, 2, 0) +) +

    
     corrplot(as.matrix(fam1$ped_add_classic_complete - fam1$ped_add_partial_dad),
       method = "color", type = "lower", col.lim = c(0, 1),
    -  is.corr = FALSE, title = "Partial Dad Diff from Complete"
    -)
    -

    + is.corr = FALSE, title = "Partial Dad Diff from Complete", + order = "hclust", + tl.pos = "l", tl.col = "black", tl.srt = 5, tl.cex = 0.2, + col = COL1("Reds", 100), mar = c(0, 0, 2, 0) +) +

    These plots show how each method responds to missing data, and whether it maintains consistency with the complete pedigree. We observe that the partial parent method typically introduces smaller deviations. @@ -728,13 +870,14 @@

    Summary

    results %>%
       select(RMSE_diff_mom, RMSE_diff_dad) %>%
       summary()
    -#>  RMSE_diff_mom      RMSE_diff_dad     
    -#>  Min.   :0.001222   Min.   :0.001222  
    -#>  1st Qu.:0.001869   1st Qu.:0.002036  
    -#>  Median :0.002538   Median :0.002520  
    -#>  Mean   :0.005763   Mean   :0.005786  
    -#>  3rd Qu.:0.005625   3rd Qu.:0.005625  
    -#>  Max.   :0.024221   Max.   :0.024221
    +#> RMSE_diff_mom RMSE_diff_dad +#> Min. : NA Min. : NA +#> 1st Qu.: NA 1st Qu.: NA +#> Median : NA Median : NA +#> Mean :NaN Mean :NaN +#> 3rd Qu.: NA 3rd Qu.: NA +#> Max. : NA Max. : NA +#> NA's :8 NA's :8

In all families, both RMSE_diff_mom and RMSE_diff_dad are positive—indicating that the classic method produces larger errors relative to the partial method. This @@ -742,9 +885,9 @@

    Summary

    father.

    To verify this directly:

    mean(results$RMSE_diff_mom > 0, na.rm = TRUE)
    -#> [1] 1
    +#> [1] NaN
     mean(results$RMSE_diff_dad > 0, na.rm = TRUE)
    -#> [1] 1
    +#> [1] NaN

    These proportions show how often the partial method produces a lower RMSE across the dataset. This confirms the earlier findings: when pedigree data are incomplete, the partial parent method more faithfully @@ -756,13 +899,14 @@

    Summary

    -max_R_partial_dad, -max_R_classic_mom, -max_R_partial_mom, -max_R_classic ) %>% summary() -#> RMSE_partial_dad RMSE_partial_mom RMSE_classic_dad RMSE_classic_mom -#> Min. :0.04773 Min. :0.04773 Min. :0.04895 Min. :0.04895 -#> 1st Qu.:0.05570 1st Qu.:0.05349 1st Qu.:0.05774 1st Qu.:0.05555 -#> Median :0.06206 Median :0.06899 Median :0.06457 Median :0.07158 -#> Mean :0.07545 Mean :0.07686 Mean :0.08124 Mean :0.08262 -#> 3rd Qu.:0.08237 3rd Qu.:0.08323 3rd Qu.:0.08866 3rd Qu.:0.08866 -#> Max. :0.15547 Max. :0.15547 Max. :0.17969 Max. :0.17969 +#> RMSE_partial_dad RMSE_partial_mom RMSE_classic_dad RMSE_classic_mom +#> Min. : NA Min. : NA Min. : NA Min. : NA +#> 1st Qu.: NA 1st Qu.: NA 1st Qu.: NA 1st Qu.: NA +#> Median : NA Median : NA Median : NA Median : NA +#> Mean :NaN Mean :NaN Mean :NaN Mean :NaN +#> 3rd Qu.: NA 3rd Qu.: NA 3rd Qu.: NA 3rd Qu.: NA +#> Max. : NA Max. : NA Max. : NA Max. : NA +#> NA's :8 NA's :8 NA's :8 NA's :8