diff --git a/.gitignore b/.gitignore index 8faabc57..90e6d24a 100644 --- a/.gitignore +++ b/.gitignore @@ -1,18 +1,23 @@ -.Rproj.user +*.ASOIAF.ged +*.Rproj +*.knit.md .DS_Store -data-raw/ASOIAF.ged -R/.Rhistory +.Rdata .Rhistory -paper/paper.html +.Rproj.user +.httr-oauth +.quarto +.vscode/launch.json /Meta/ -*.knit.md -vignettes/articles/paper.html BGmisc.code-workspace -tests/testthat/Rplots.pdf -*.ASOIAF.ged -ASOIAF.ged -*.Rproj - -.vscode/launch.json -dataRelatedPairs_new2.csv +R/.Rhistory +benchmark_results.csv +data-raw/ASOIAF.ged data-raw/ASOIAF_040725.ged +dataRelatedPairs.csv +dataRelatedPairs_new2.csv +paper/paper.html +tests/testthat/Rplots.pdf +vignettes/articles/paper.html +.vscode/settings.json +data-raw/logo_orange.png diff --git a/DESCRIPTION b/DESCRIPTION index b35bfadd..a81d4686 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: BGmisc Title: An R Package for Extended Behavior Genetics Analysis -Version: 1.3.5.1 +Version: 1.4.0 Authors@R: c( person("S. 
Mason", "Garrison", , "garrissm@wfu.edu", role = c("aut", "cre"), comment = c(ORCID = "0000-0002-4804-6003")), diff --git a/NAMESPACE b/NAMESPACE index 47496f84..d739dd51 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,7 +1,10 @@ # Generated by roxygen2: do not edit by hand export(SimPed) -export(allGens) +export(assignCoupleIDs) +export(buildTreeGrid) +export(calcAllGens) +export(calcFamilySize) export(calculateRelatedness) export(checkIDs) export(checkParentIDs) @@ -9,17 +12,17 @@ export(checkPedigreeNetwork) export(checkSex) export(com2links) export(comp2vech) +export(computeParentAdjacency) export(createGenDataFrame) export(dropLink) export(evenInsert) -export(extractSummaryText) -export(famSizeCal) export(fitComponentModel) +export(getWikiTreeSummary) export(identifyComponentModel) export(inferRelatedness) +export(insertEven) export(makeInbreeding) export(makeTwins) -export(parseTree) export(ped2add) export(ped2ce) export(ped2cn) @@ -30,18 +33,24 @@ export(ped2maternal) export(ped2mit) export(ped2paternal) export(plotPedigree) +export(readGed) export(readGedcom) export(readWikifamilytree) +export(readgedcom) export(recodeSex) export(related_coef) -export(relatedness) export(repairSex) export(resample) export(simulatePedigree) +export(summariseFamilies) +export(summariseMatrilines) +export(summarisePatrilines) +export(summarisePedigrees) export(summarizeFamilies) export(summarizeMatrilines) export(summarizePatrilines) export(summarizePedigrees) +export(traceTreePaths) export(vech) import(data.table) import(kinship2) diff --git a/NEWS.md b/NEWS.md index f0fd33f4..7d508905 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,7 +1,14 @@ -# BGmisc 1.3.6 +# BGmisc 1.4.0 * revived checkParents function to check for handling phantom parents and missing parents * added tests for checkParents function * added GoT analysis +* reduced complexity of com2links, summarizePedigree, and checkIDs with the use of subfunctions +* allow verbose argument to be passed to standardizeColnames +* 
list SimPed and related_coef as aliases for functions +* harmonizing function names like calcFamilySize from famSizeCal +* implemented adjBeta function to evaluate an alternative build method +* reorganize file names to be more consistent +* harmonized famID # BGmisc 1.3.5.1 * Setting the default for the `sparse` argument in `ped2com()` to TRUE diff --git a/R/buildComponent.R b/R/buildComponent.R new file mode 100644 index 00000000..0372e5fc --- /dev/null +++ b/R/buildComponent.R @@ -0,0 +1,661 @@ +#' Take a pedigree and turn it into a relatedness matrix +#' @param ped a pedigree dataset. Needs ID, momID, and dadID columns +#' @param component character. Which component of the pedigree to return. See Details. +#' @param max.gen the maximum number of generations to compute +#' (e.g., only up to 4th degree relatives). The default is 25. However it can be set to infinity. +#' `Inf` uses as many generations as there are in the data. +#' @param sparse logical. If TRUE, use and return sparse matrices from Matrix package +#' @param verbose logical. If TRUE, print progress through stages of algorithm +#' @param update_rate numeric. The rate at which to print progress +#' @param gc logical. If TRUE, do frequent garbage collection via \code{\link{gc}} to save memory +#' @param saveable logical. If TRUE, save the intermediate results to disk +#' @param save_rate numeric. The rate at which to save the intermediate results +#' @param save_rate_gen numeric. The rate at which to save the intermediate results by generation. If NULL, defaults to save_rate +#' @param save_rate_parlist numeric. The rate at which to save the intermediate results by parent list. If NULL, defaults to save_rate*1000 +#' @param resume logical. If TRUE, resume from a checkpoint +#' @param save_path character. The path to save the checkpoint files +#' @param flatten.diag logical. If TRUE, overwrite the diagonal of the final relatedness matrix with ones +#' @param standardize.colnames logical. 
If TRUE, standardize the column names of the pedigree dataset +#' @param transpose_method character. The method to use for computing the transpose. Options are "tcrossprod", "crossprod", or "star" +#' @param adjacency_method character. The method to use for computing the adjacency matrix. Options are "loop", "indexed", "direct", or "beta" +#' @param isChild_method character. The method to use for computing the isChild matrix. Options are "classic" or "partialparent" +#' @param adjBeta_method numeric. The method to use for building the adjacency matrix when using the "beta" build +#' @param ... additional arguments to be passed to \code{\link{ped2com}} +#' @details The algorithms and methodologies used in this function are further discussed and exemplified in the vignette titled "examplePedigreeFunctions". For more advanced scenarios and detailed explanations, consult this vignette. +#' @export +#' +ped2com <- function(ped, component, + max.gen = 25, + sparse = TRUE, + verbose = FALSE, + gc = FALSE, + flatten.diag = FALSE, + standardize.colnames = TRUE, + transpose_method = "tcrossprod", + adjacency_method = "direct", + isChild_method = "classic", + saveable = FALSE, + resume = FALSE, + save_rate = 5, + save_rate_gen = save_rate, + save_rate_parlist = 100000 * save_rate, + update_rate = 100, + save_path = "checkpoint/", + adjBeta_method = NULL, + ...) 
{ + #------ + # Check inputs + #------ + + config <- list( + verbose = verbose, + saveable = saveable, + resume = resume, + save_path = save_path, + max.gen = max.gen, + sparse = sparse, + flatten.diag = flatten.diag, + standardize.colnames = standardize.colnames, + transpose_method = transpose_method, + adjacency_method = adjacency_method, + isChild_method = isChild_method, + save_rate = save_rate, + save_rate_gen = save_rate_gen, + save_rate_parlist = save_rate_parlist, + update_rate = update_rate, + gc = gc, + component = component, + adjBeta_method = adjBeta_method, + nr = nrow(ped) + ) + + + #------ + # Checkpointing + #------ + if (config$saveable || config$resume) { # prepare checkpointing + if (config$verbose) cat("Preparing checkpointing...\n") + + checkpoint_files <- initializeCheckpoint(config) # initialize checkpoint files + } + #------ + # Validation/Preparation + #------ + + # Validate the 'component' argument and match it against predefined choices + config$component <- match.arg(tolower(config$component), + choices = c( + "generation", + "additive", + "common nuclear", + "mitochondrial" + ) + ) + + transpose_method_options <- c( + "tcrossprod", "crossprod", "star", + "tcross.alt.crossprod", "tcross.alt.star" + ) + if (!config$transpose_method %in% transpose_method_options) { + stop(paste0( + "Invalid method specified. Choose from ", + paste(transpose_method_options, collapse = ", "), "." + )) + } + + + if (!config$adjacency_method %in% c("indexed", "loop", "direct", "beta")) { + stop("Invalid method specified. 
Choose from 'indexed', 'loop', 'direct', or 'beta'.") + } + + # standardize colnames + if (config$standardize.colnames) { + ped <- standardizeColnames(ped, verbose = config$verbose) + } + + # Load final result if computation was completed + if (config$resume && file.exists(checkpoint_files$final_matrix)) { + if (config$verbose) cat("Loading final computed matrix...\n") + return(readRDS(checkpoint_files$final_matrix)) + } + + + #------ + # Algorithm + #------ + + # Get the number of rows in the pedigree dataset, representing the size of the family + # nr <- nrow(ped) + + # Print the family size if verbose is TRUE + if (config$verbose) { + cat(paste0("Family Size = ", config$nr, "\n")) + } + + # Step 1: Construct parent-child adjacency matrix + + ## A. Resume from Checkpoint if Needed + ## Initialize variables + list_of_adjacencies <- .loadOrComputeParList( + checkpoint_files = checkpoint_files, + ped = ped, + config = config + ) + + + ## B. Resume loop from the next uncomputed index + + + # Construct sparse matrix + # Garbage collection if gc is TRUE + if (config$gc) { + gc() + } + + # Assign parent values based on the component type + parVal <- .assignParentValue(component = config$component) + + # Construct sparse matrix + # Initialize adjacency matrix for parent-child relationships + isPar <- .loadOrComputeIsPar( + iss = list_of_adjacencies$iss, + jss = list_of_adjacencies$jss, + parVal = parVal, + ped = ped, + checkpoint_files = checkpoint_files, + config = config + ) + if (config$verbose) { + cat("Completed first degree relatives (adjacency)\n") + } + + # isPar is the adjacency matrix. 
'A' matrix from RAM + + if (config$component %in% c("common nuclear")) { + Matrix::diag(isPar) <- 1 + if (config$sparse == FALSE) { + isPar <- as.matrix(isPar) + } + return(isPar) + } + + # isChild is the 'S' matrix from RAM + isChild <- .loadOrComputeIsChild( + ped = ped, + checkpoint_files = checkpoint_files, + config = config + ) + # --- Step 2: Compute Relatedness Matrix --- + + + if (config$resume && file.exists(checkpoint_files$r_checkpoint) && file.exists(checkpoint_files$gen_checkpoint) && file.exists(checkpoint_files$mtSum_checkpoint) && file.exists(checkpoint_files$newIsPar_checkpoint) && + file.exists(checkpoint_files$count_checkpoint) + ) { + if (config$verbose) cat("Resuming: Loading previous computation...\n") + r <- readRDS(checkpoint_files$r_checkpoint) + gen <- readRDS(checkpoint_files$gen_checkpoint) + mtSum <- readRDS(checkpoint_files$mtSum_checkpoint) + newIsPar <- readRDS(checkpoint_files$newIsPar_checkpoint) + count <- readRDS(checkpoint_files$count_checkpoint) + } else { + r <- Matrix::Diagonal(x = 1, n = config$nr) + gen <- rep(1, config$nr) + mtSum <- sum(r, na.rm = TRUE) + newIsPar <- isPar + count <- 0 + } + maxCount <- config$max.gen + 1 + if (config$verbose) { + cat("About to do RAM path tracing\n") + } + + # r is I + A + A^2 + ... 
= (I-A)^-1 from RAM + # could trim, here + while (mtSum != 0 && count < maxCount) { + r <- r + newIsPar + gen <- gen + (Matrix::rowSums(newIsPar) > 0) + newIsPar <- newIsPar %*% isPar + mtSum <- sum(newIsPar) + count <- count + 1 + if (config$verbose) { + cat(paste0("Completed ", count - 1, " degree relatives\n")) + } + # Save progress every save_rate iterations + if (config$saveable && (count %% save_rate_gen == 0)) { + saveRDS(r, file = checkpoint_files$r_checkpoint) + saveRDS(gen, file = checkpoint_files$gen_checkpoint) + saveRDS(newIsPar, file = checkpoint_files$newIsPar_checkpoint) + saveRDS(mtSum, file = checkpoint_files$mtSum_checkpoint) + saveRDS(count, file = checkpoint_files$count_checkpoint) + } + if (config$gc && config$nr > 1000000) { + gc() + } # extra gc if large + } + # compute rsq <- r %*% sqrt(diag(isChild)) + # compute rel <- tcrossprod(rsq) + if (config$gc) { + rm(isPar, newIsPar) + gc() + } + if (config$saveable) { + saveRDS(r, file = checkpoint_files$ram_checkpoint) + } + + if (config$component == "generation") { # no need to do the rest + return(gen) + } else { + if (config$verbose) { + cat("Completed RAM path tracing\n") + } + } + + # --- Step 3: I-A inverse times diagonal multiplication --- + r2 <- .loadOrComputeInverseDiagonal( + r = r, + isChild = isChild, + checkpoint_files = checkpoint_files, + config = config + ) + + # --- Step 4: T crossproduct --- + + if (config$resume && file.exists(checkpoint_files$tcrossprod_checkpoint) && config$component != "generation") { + if (config$verbose) cat("Resuming: Loading tcrossprod...\n") + r <- readRDS(checkpoint_files$tcrossprod_checkpoint) + } else { + r <- .computeTranspose(r2 = r2, transpose_method = transpose_method, verbose = config$verbose) + if (config$saveable) { + saveRDS(r, file = checkpoint_files$tcrossprod_checkpoint) + } + } + + if (config$component == "mitochondrial") { + r@x <- rep(1, length(r@x)) + # Assign 1 to all nonzero elements for mitochondrial component + } + + if 
(config$sparse == FALSE) { + r <- as.matrix(r) + } + if (config$flatten.diag) { # flattens diagonal if you don't want to deal with inbreeding + diag(r) <- 1 + } + if (config$saveable) { + saveRDS(r, file = checkpoint_files$final_matrix) + } + return(r) +} + +#' Take a pedigree and turn it into an additive genetics relatedness matrix +#' @inheritParams ped2com +#' @inherit ped2com details +#' @export +#' +ped2add <- function(ped, max.gen = 25, sparse = TRUE, verbose = FALSE, + gc = FALSE, + flatten.diag = FALSE, standardize.colnames = TRUE, + transpose_method = "tcrossprod", + adjacency_method = "direct", + saveable = FALSE, + resume = FALSE, + save_rate = 5, + save_rate_gen = save_rate, + save_rate_parlist = 100000 * save_rate, + save_path = "checkpoint/", + ...) { + ped2com( + ped = ped, + max.gen = max.gen, + sparse = sparse, + verbose = verbose, + gc = gc, + component = "additive", + flatten.diag = flatten.diag, + standardize.colnames = standardize.colnames, + transpose_method = transpose_method, + adjacency_method = adjacency_method, + saveable = saveable, + resume = resume, + save_rate_gen = save_rate_gen, + save_rate_parlist = save_rate_parlist, + save_path = save_path + ) +} + +#' Take a pedigree and turn it into a mitochondrial relatedness matrix +#' @inheritParams ped2com +#' @inherit ped2com details +#' @export +#' @aliases ped2mt +#' +ped2mit <- ped2mt <- function(ped, max.gen = 25, + sparse = TRUE, + verbose = FALSE, gc = FALSE, + flatten.diag = FALSE, + standardize.colnames = TRUE, + transpose_method = "tcrossprod", + adjacency_method = "direct", + saveable = FALSE, + resume = FALSE, + save_rate = 5, + save_rate_gen = save_rate, + save_rate_parlist = 100000 * save_rate, + save_path = "checkpoint/", + ...) 
{ + ped2com( + ped = ped, + max.gen = max.gen, + sparse = sparse, + verbose = verbose, + gc = gc, + component = "mitochondrial", + flatten.diag = flatten.diag, + standardize.colnames = standardize.colnames, + transpose_method = transpose_method, + adjacency_method = adjacency_method, + saveable = saveable, + resume = resume, + save_rate_gen = save_rate_gen, + save_rate_parlist = save_rate_parlist, + save_path = save_path, + ... + ) +} + +#' Take a pedigree and turn it into a common nuclear environmental relatedness matrix +#' @inheritParams ped2com +#' @inherit ped2com details +#' @export +#' +ped2cn <- function(ped, max.gen = 25, sparse = TRUE, verbose = FALSE, + gc = FALSE, flatten.diag = FALSE, + standardize.colnames = TRUE, + transpose_method = "tcrossprod", + saveable = FALSE, + resume = FALSE, + save_rate = 5, + adjacency_method = "direct", + save_rate_gen = save_rate, + save_rate_parlist = 1000 * save_rate, + save_path = "checkpoint/", + ...) { + ped2com( + ped = ped, + max.gen = max.gen, + sparse = sparse, + verbose = verbose, + gc = gc, + component = "common nuclear", + adjacency_method = adjacency_method, + flatten.diag = flatten.diag, + standardize.colnames = standardize.colnames, + transpose_method = transpose_method, + saveable = saveable, + resume = resume, + save_rate_gen = save_rate_gen, + save_rate_parlist = save_rate_parlist, + save_path = save_path, + ... + ) +} +#' Take a pedigree and turn it into an extended environmental relatedness matrix +#' @inheritParams ped2com +#' @inherit ped2com details +#' @export +#' +ped2ce <- function(ped, ...) 
{ + matrix(1, nrow = nrow(ped), ncol = nrow(ped), dimnames = list(ped$ID, ped$ID)) +} + + +#' Compute the transpose multiplication for the relatedness matrix +#' @inheritParams ped2com +#' @inherit ped2com details +#' @param r2 a relatedness matrix +#' +.computeTranspose <- function(r2, transpose_method = "tcrossprod", verbose = FALSE) { + valid_methods <- c( + "tcrossprod", "crossprod", "star", + "tcross.alt.crossprod", "tcross.alt.star" + ) + if (!transpose_method %in% valid_methods) { + stop("Invalid method specified. Choose from 'tcrossprod', 'crossprod', 'star', 'tcross.alt.crossprod', or 'tcross.alt.star'.") + } + + # Map aliases to core methods + alias_map <- c( + "tcross.alt.crossprod" = "crossprod", + "tcross.alt.star" = "star" + ) + + if (transpose_method %in% names(alias_map)) { + method_normalized <- alias_map[[transpose_method]] + } else { + method_normalized <- transpose_method + } + + result <- switch(method_normalized, + "tcrossprod" = { + if (verbose) cat("Doing tcrossprod\n") + Matrix::tcrossprod(r2) + }, + "crossprod" = { + if (verbose) cat("Doing tcrossprod using crossprod(t(.))\n") + crossprod(t(as.matrix(r2))) + }, + "star" = { + if (verbose) cat("Doing tcrossprod using %*% t(.)\n") + r2 %*% t(as.matrix(r2)) + } + ) + + return(result) +} + +#' Initialize checkpoint files +#' @inheritParams ped2com +#' @keywords internal + +initializeCheckpoint <- function(config = list( + verbose = FALSE, + saveable = FALSE, + resume = FALSE, + save_path = "checkpoint/" + )) { + # Define checkpoint files + # Ensure save path exists + if (config$saveable && !dir.exists(config$save_path)) { + if (config$verbose) cat("Creating save path...\n") + dir.create(config$save_path, recursive = TRUE) + } else if (config$resume && !dir.exists(config$save_path)) { + stop("Cannot resume from checkpoint. 
Save path does not exist.") + } + + checkpoint_files <- list( + parList = file.path(config$save_path, "parList.rds"), + lens = file.path(config$save_path, "lens.rds"), + isPar = file.path(config$save_path, "isPar.rds"), + iss = file.path(config$save_path, "iss.rds"), + jss = file.path(config$save_path, "jss.rds"), + isChild = file.path(config$save_path, "isChild.rds"), + r_checkpoint = file.path(config$save_path, "r_checkpoint.rds"), + gen_checkpoint = file.path(config$save_path, "gen_checkpoint.rds"), + newIsPar_checkpoint = file.path(config$save_path, "newIsPar_checkpoint.rds"), + mtSum_checkpoint = file.path(config$save_path, "mtSum_checkpoint.rds"), + ram_checkpoint = file.path(config$save_path, "ram_checkpoint.rds"), + r2_checkpoint = file.path(config$save_path, "r2_checkpoint.rds"), + tcrossprod_checkpoint = file.path(config$save_path, "tcrossprod_checkpoint.rds"), + count_checkpoint = file.path(config$save_path, "count_checkpoint.rds"), + final_matrix = file.path(config$save_path, "final_matrix.rds") + ) + + return(checkpoint_files) +} + +#' Assign parent values based on component type +#' @inheritParams ped2com +.assignParentValue <- function(component) { + # Set parent values depending on the component type + if (component %in% c("generation", "additive")) { + parVal <- .5 + } else if (component %in% c("common nuclear", "mitochondrial")) { + parVal <- 1 + } else { + stop("Don't know how to set parental value") + } + return(parVal) +} + +#' Load or compute a checkpoint +#' @param file The file path to load the checkpoint from. +#' @param compute_fn The function to compute the checkpoint if it doesn't exist. +#' @param config A list containing configuration parameters such as `resume`, `verbose`, and `saveable`. +#' @param message_resume Optional message to display when resuming from a checkpoint. +#' @param message_compute Optional message to display when computing the checkpoint. +#' @return The loaded or computed checkpoint. 
+#' @keywords internal +loadOrComputeCheckpoint <- function(file, compute_fn, config, message_resume = NULL, message_compute = NULL) { + if (config$resume && file.exists(file)) { + if (config$verbose && !is.null(message_resume)) cat(message_resume) + return(readRDS(file)) + } else { + if (config$verbose && !is.null(message_compute)) cat(message_compute) + result <- compute_fn() + if (config$saveable) saveRDS(result, file = file) + return(result) + } +} + +#' Load or compute the isPar matrix +#' @inheritParams loadOrComputeCheckpoint +#' @inheritParams ped2com +#' @param iss The row indices of the sparse matrix. +#' @param jss The column indices of the sparse matrix. +#' @param parVal The value to assign to the non-zero elements of the sparse matrix. +#' @param ped The pedigree dataset. +#' @param checkpoint_files A list of checkpoint file paths. +#' +#' @keywords internal +.loadOrComputeIsPar <- function(iss, jss, parVal, ped, checkpoint_files, config) { + isPar <- loadOrComputeCheckpoint( + file = checkpoint_files$isPar, + compute_fn = function() { + Matrix::sparseMatrix( + i = iss, j = jss, x = parVal, + dims = c(config$nr, config$nr), + dimnames = list(ped$ID, ped$ID) + ) + }, + config = config, + message_resume = "Resuming: Loading adjacency matrix...\n", + message_compute = "Initializing adjacency matrix...\n" + ) + + return(isPar) +} + +#' Load or compute the isChild matrix +#' @inheritParams loadOrComputeCheckpoint +#' @inheritParams ped2com +#' @param checkpoint_files A list of checkpoint file paths. 
+#' +#' @keywords internal + +.loadOrComputeIsChild <- function(ped, checkpoint_files, config) { + isChild <- loadOrComputeCheckpoint( + file = checkpoint_files$isChild, + compute_fn = function() isChild(isChild_method = config$isChild_method, ped = ped), + config = config, + message_resume = "Resuming: Loading isChild matrix...\n", + message_compute = "Computing isChild matrix...\n" + ) + + return(isChild) +} + +.loadOrComputeInverseDiagonal <- function(r, isChild, checkpoint_files, config) { + r2 <- loadOrComputeCheckpoint( + file = checkpoint_files$r2_checkpoint, + compute_fn = function() { + r %*% Matrix::Diagonal(x = sqrt(isChild), n = config$nr) + }, + config = config, + message_resume = "Resuming: Loading I-A inverse...\n", + message_compute = "Doing I-A inverse times diagonal multiplication\n" + ) + if (config$gc) { + rm(r, isChild) + gc() + } + return(r2) +} + + + +#' parent-child adjacency data +#' @inheritParams loadOrComputeCheckpoint +#' @inheritParams ped2com +#' @param checkpoint_files A list of checkpoint file paths. +#' @param config A list containing configuration parameters such as `resume`, `verbose`, and `saveable`. +#' @param parList A list of parent-child adjacency data. +#' @param lens A vector of lengths for each parent-child relationship. +#' @keywords internal + +#' @return A list containing the parent-child adjacency data either loaded from a checkpoint or initialized. 
+#' + +.loadOrComputeParList <- function(checkpoint_files, config, ped = NULL, parList = NULL, lens = NULL) { + if (config$resume && file.exists(checkpoint_files$parList) && file.exists(checkpoint_files$lens)) { + if (config$verbose) cat("Resuming: Loading parent-child adjacency data...\n") + parList <- readRDS(checkpoint_files$parList) + lens <- readRDS(checkpoint_files$lens) + computed_indices <- which(!sapply(parList, is.null)) + lastComputed <- if (length(computed_indices) > 0) max(computed_indices) else 0 + if (config$verbose) cat("Resuming from iteration", lastComputed + 1, "\n") + } else { + ## Initialize variables + parList <- vector("list", config$nr) + lens <- integer(config$nr) + lastComputed <- 0 + if (config$verbose) cat("Building parent adjacency matrix...\n") + } + + if (config$resume && file.exists(checkpoint_files$iss) && file.exists(checkpoint_files$jss)) { # fix to check actual + if (config$verbose) cat("Resuming: Constructed matrix...\n") + jss <- readRDS(checkpoint_files$jss) + iss <- readRDS(checkpoint_files$iss) + list_of_adjacencies <- list(iss = iss, jss = jss) + } else { + if (config$verbose) cat("Computing parent-child adjacency matrix...\n") + list_of_adjacencies <- computeParentAdjacency( + ped = ped, + save_rate_parlist = config$save_rate_parlist, + checkpoint_files = checkpoint_files, + component = config$component, + adjacency_method = config$adjacency_method, # adjacency_method, + saveable = config$saveable, + resume = config$resume, + save_path = config$save_path, + update_rate = config$update_rate, + verbose = config$verbose, + lastComputed = lastComputed, + config = config, + parList = parList, + lens = lens, + adjBeta_method = config$adjBeta_method + ) + + # Construct sparse matrix + + + if (config$verbose) { + cat("Constructed sparse matrix\n") + } + if (config$saveable) { + saveRDS(list_of_adjacencies$jss, file = checkpoint_files$jss) + saveRDS(list_of_adjacencies$iss, file = checkpoint_files$iss) + } + } + 
return(list_of_adjacencies) +} diff --git a/R/calculateFamilySize.R b/R/calculateFamilySize.R index 51365d85..c346cb66 100644 --- a/R/calculateFamilySize.R +++ b/R/calculateFamilySize.R @@ -1,11 +1,11 @@ -#' allGens +#' calcAllGens #' A function to calculate the number of individuals in each generation. This is a supporting function for \code{simulatePedigree}. #' @param kpc Number of kids per couple (integer >= 2). #' @param Ngen Number of generations (integer >= 1). #' @param marR Mating rate (numeric value ranging from 0 to 1). #' @return Returns a vector containing the number of individuals in every generation. #' @export -allGens <- function(kpc, Ngen, marR) { +calcAllGens <- function(kpc, Ngen, marR) { # Check if the number of generations is valid if (Ngen < 1) { stop("The number of generations should be an integer greater or equal than 1") @@ -23,14 +23,15 @@ allGens <- function(kpc, Ngen, marR) { } return(allGens) } +#' @rdname calcAllGens +allGens <- calcAllGens - -#' sizeAllGens +#' calcFamilySizeByGen #' An internal supporting function for \code{simulatePedigree}. -#' @inheritParams allGens +#' @inheritParams calcAllGens #' @return Returns a vector including the number of individuals in every generation. -sizeAllGens <- function(kpc, Ngen, marR) { +calcFamilySizeByGen <- function(kpc, Ngen, marR) { Nmid <- Ngen - 2 midGens <- numeric(length = Nmid) @@ -46,14 +47,15 @@ sizeAllGens <- function(kpc, Ngen, marR) { # print(allGens) return(allGens) } +#' @rdname calcFamilySizeByGen +sizeAllGens <- calcFamilySizeByGen - -#' famSizeCal +#' calcFamilySize #' A function to calculate the total number of individuals in a pedigree given parameters. This is a supporting function for function \code{simulatePedigree} -#' @inheritParams allGens +#' @inheritParams calcAllGens #' @return Returns a numeric value indicating the total pedigree size. 
#' @export -famSizeCal <- function(kpc, Ngen, marR) { +calcFamilySize <- function(kpc, Ngen, marR) { if (Ngen < 1) { stop("The number of generations should be an integer greater than or equal to 1") } else if (Ngen == 1) { @@ -71,3 +73,7 @@ famSizeCal <- function(kpc, Ngen, marR) { } return(size) } + +#' @rdname calcFamilySize +#' +famSizeCal <- calcFamilySize diff --git a/R/computeRelatedness.R b/R/calculateRelatedness.R similarity index 93% rename from R/computeRelatedness.R rename to R/calculateRelatedness.R index 2533c16f..da7336e3 100644 --- a/R/computeRelatedness.R +++ b/R/calculateRelatedness.R @@ -64,6 +64,14 @@ calculateRelatedness <- function( return(coef) } + +#' @rdname calculateRelatedness +#' @export +related_coef <- function(...) { + warning("The 'related_coef' function is deprecated. Please use 'calculateRelatedness' instead.") + calculateRelatedness(...) +} + #' Infer Relatedness Coefficient #' #' @description @@ -79,6 +87,7 @@ calculateRelatedness <- function( #' @param aceA Numeric. Proportion of variance attributable to additive genetic variance. Must be between 0 and 1. Default is 0.9. #' @param aceC Numeric. Proportion of variance attributable to shared environmental variance. Must be between 0 and 1. Default is 0. #' @param sharedC Numeric. Proportion of shared environment shared between the two individuals. Must be between 0 (no shared environment) and 1 (completely shared environment). Default is 0. +#' @param ... Further named arguments that may be passed to another function. #' #' @return #' Numeric. The calculated relatedness coefficient (`est_r`). @@ -97,6 +106,13 @@ inferRelatedness <- function(obsR, aceA = .9, aceC = 0, sharedC = 0) { return(calc_r) } +#' @rdname inferRelatedness +#' @keywords internal +relatedness <- function(...) { + warning("The 'relatedness' function is deprecated. Please use 'inferRelatedness' instead.") + inferRelatedness(...) 
+} + #' Falconer's Formula #' #' @description @@ -133,7 +149,6 @@ calculateH <- function(r1, r2, obsR1, obsR2) { message("Your scale might be reverse coded because you have negative correlations. Please check your data. ") } - # Calculate heritability estimates (H^2) for all pairs heritability_estimates <- (obsR1 - obsR2) / (r1 - r2) @@ -145,7 +160,5 @@ calculateH <- function(r1, r2, obsR1, obsR2) { if (any(heritability_estimates > 1)) { warning("Some calculated heritability values are greater than 1, which may suggest overestimation or errors in the observed correlations or relatedness coefficients.") } - - return(heritability_estimates) } diff --git a/R/checkIDs.R b/R/checkIDs.R index b9da9a6d..b687f4c7 100644 --- a/R/checkIDs.R +++ b/R/checkIDs.R @@ -17,7 +17,7 @@ #' @export checkIDs <- function(ped, verbose = FALSE, repair = FALSE) { # Standardize column names in the input dataframe - ped <- standardizeColnames(ped) + ped <- standardizeColnames(ped, verbose = verbose) # Initialize a list to store validation results validation_results <- list() @@ -28,91 +28,15 @@ checkIDs <- function(ped, verbose = FALSE, repair = FALSE) { } # Identify non-unique IDs - duplicated_ids <- ped$ID[duplicated(ped$ID) | duplicated(ped$ID, fromLast = TRUE)] - + id_check <- checkIDuniqueness(ped = ped, verbose = verbose) - # Update the validation_results list - if (length(duplicated_ids) > 0) { - if (verbose) { - cat(paste0(length(duplicated_ids), " non-unique IDs found.\n")) - } - validation_results$all_unique_ids <- FALSE - validation_results$total_non_unique_ids <- length(duplicated_ids) - validation_results$non_unique_ids <- unique(duplicated_ids) - } else { - if (verbose) { - cat("All IDs are unique.\n") - } - validation_results$all_unique_ids <- TRUE - validation_results$total_non_unique_ids <- 0 - validation_results$non_unique_ids <- NULL - } if (verbose) { cat("Step 2: Checking for within row duplicats...\n") - cat("Is own father?\n") - } - is_own_father <- ped$ID[ped$ID == 
ped$dadID & !is.na(ped$dadID)] - if (verbose) { - cat("Is own mother?\n") - } - is_own_mother <- ped$ID[ped$ID == ped$momID & !is.na(ped$momID)] - if (verbose) { - cat("Is mother father?\n") } - duplicated_parents <- ped$ID[ped$dadID == ped$momID & !is.na(ped$dadID) & !is.na(ped$momID)] + row_check <- checkWithinRowDuplicates(ped = ped, verbose = verbose) - # get the total number of within row duplicates - validation_results$total_own_father <- length(is_own_father) - validation_results$total_own_mother <- length(is_own_mother) - validation_results$total_duplicated_parents <- length(duplicated_parents) - validation_results$total_within_row_duplicates <- sum(length(is_own_father), length(is_own_mother), length(duplicated_parents)) - # Update the validation_results list + validation_results <- c(id_check, row_check) - if (validation_results$total_within_row_duplicates > 0) { - if (verbose) { - cat(paste0( - validation_results$total_within_row_duplicates, - " within row duplicates found.\n" - )) - } - validation_results$within_row_duplicates <- TRUE - if (validation_results$total_own_father > 0) { - validation_results$is_own_father_ids <- unique(is_own_father) - if (verbose) { - cat(paste0( - validation_results$total_own_father, - " individuals are their own fathers.\n" - )) - } - } - if (validation_results$total_own_mother > 0) { - validation_results$is_own_mother_ids <- unique(is_own_mother) - if (verbose) { - cat(paste0( - validation_results$total_own_mother, - " individuals are their own mothers.\n" - )) - } - } - if (validation_results$total_duplicated_parents > 0) { - validation_results$duplicated_parents_ids <- unique(duplicated_parents) - if (verbose) { - cat(paste0( - validation_results$total_duplicated_parents, - " individuals have the same mother and father.\n" - )) - } - } - } else { - if (verbose) { - cat("No within row duplicates found.\n") - } - validation_results$within_row_duplicates <- FALSE - validation_results$total_within_row_duplicates <- 0 - 
validation_results$is_own_father_ids <- NULL - validation_results$is_own_mother_ids <- NULL - validation_results$duplicated_parents_ids <- NULL - } if (verbose) { cat("Validation Results:\n") print(validation_results) @@ -163,9 +87,82 @@ checkIDs <- function(ped, verbose = FALSE, repair = FALSE) { #' Repair Missing IDs #' #' This function repairs missing IDs in a pedigree. -#' @param ped A pedigree object -#' @param verbose A logical indicating whether to print progress messages +#' @inheritParams checkIDs #' @return A corrected pedigree repairIDs <- function(ped, verbose = FALSE) { checkIDs(ped = ped, verbose = verbose, repair = TRUE) } + +#' Check for duplicated individual IDs +#' +#' This function checks for duplicated individual IDs in a pedigree. +#' +#' @inheritParams checkIDs +#' @return A list containing the results of the check +#' +checkIDuniqueness <- function(ped, verbose = FALSE) { + # Identify non-unique IDs + + duplicated_ids <- ped$ID[duplicated(ped$ID) | duplicated(ped$ID, fromLast = TRUE)] + + if (verbose) { + if (length(duplicated_ids) > 0) { + cat(length(duplicated_ids), " non-unique IDs found.\n") + } else { + cat("All IDs are unique.\n") + } + } + + # Update the validation_results list + list( + all_unique_ids = length(duplicated_ids) == 0, + total_non_unique_ids = length(duplicated_ids), + non_unique_ids = if (length(duplicated_ids) > 0) unique(duplicated_ids) else NULL + ) +} + + + +#' Check for within-row duplicates (self-parents, same mom/dad) +#' +#' This function checks for within-row duplicates in a pedigree. +#' +#' @inheritParams checkIDs +#' @return A list containing the results of the check +#' +checkWithinRowDuplicates <- function(ped, verbose = FALSE) { + # is the individual their own father or mother? + is_own_father <- ped$ID[ped$ID == ped$dadID & !is.na(ped$dadID)] + is_own_mother <- ped$ID[ped$ID == ped$momID & !is.na(ped$momID)] + + # is mother and father the same? 
+ duplicated_parents <- ped$ID[ + ped$dadID == ped$momID & + !is.na(ped$dadID) & !is.na(ped$momID) + ] + + # get the total number of within row duplicates + total <- length(is_own_father) + length(is_own_mother) + length(duplicated_parents) + + if (verbose) { + if (total > 0) { + cat(total, " within row duplicates found.\n") + if (length(is_own_father) > 0) cat(length(is_own_father), " individuals are their own fathers.\n") + if (length(is_own_mother) > 0) cat(length(is_own_mother), " individuals are their own mothers.\n") + if (length(duplicated_parents) > 0) cat(length(duplicated_parents), " individuals have the same mother and father.\n") + } else { + cat("No within row duplicates found.\n") + } + } + # Update the validation_results list + list( + total_own_father = length(is_own_father), + total_own_mother = length(is_own_mother), + total_duplicated_parents = length(duplicated_parents), + total_within_row_duplicates = total, + within_row_duplicates = total > 0, + is_own_father_ids = if (length(is_own_father) > 0) unique(is_own_father) else NULL, + is_own_mother_ids = if (length(is_own_mother) > 0) unique(is_own_mother) else NULL, + duplicated_parents_ids = if (length(duplicated_parents) > 0) unique(duplicated_parents) else NULL + ) +} diff --git a/R/checkParents.R b/R/checkParents.R index b03a5926..a5635bdc 100644 --- a/R/checkParents.R +++ b/R/checkParents.R @@ -13,6 +13,7 @@ #' @param addphantoms A logical flag indicating whether to add phantom parents for missing parent IDs. #' @param parentswithoutrow A logical flag indicating whether to add parents without a row in the pedigree. #' +#' #' @return Depending on the value of `repair`, either a list containing validation results or a repaired dataframe is returned. 
#' @examples #' \dontrun{ @@ -23,12 +24,9 @@ checkParentIDs <- function(ped, verbose = FALSE, repair = FALSE, repairsex = repair, addphantoms = repair, - parentswithoutrow = repair - ) { + parentswithoutrow = repair) { # Standardize column names in the input dataframe - ped_og <- ped - ped <- standardizeColnames(ped) - + ped <- standardizeColnames(ped, verbose = verbose) # Initialize a list to store validation results validation_results <- list() @@ -37,6 +35,7 @@ checkParentIDs <- function(ped, verbose = FALSE, repair = FALSE, cat("Step 1: Checking for single parents...\n") } + # Identify missing fathers and mothers missing_fathers <- ped$ID[which(is.na(ped$dadID) & !is.na(ped$momID))] missing_mothers <- ped$ID[which(!is.na(ped$dadID) & is.na(ped$momID))] @@ -74,83 +73,33 @@ checkParentIDs <- function(ped, verbose = FALSE, repair = FALSE, cat("All parents are listed in the pedigree.\n") } } - validation_results$missing_parents <- validation_results$single_parents&length(rowless_parents) > 0 + validation_results$missing_parents <- validation_results$single_parents & length(rowless_parents) > 0 if (verbose) { cat("Step 2: Determining the if moms are the same sex and dads are same sex\n") } # Determine modal sex values for moms and dads - mom_sexes <- ped$sex[ped$ID %in% ped$momID] - dad_sexes <- ped$sex[ped$ID %in% ped$dadID] - - # Determine the most frequent sex for moms and dads - most_frequent_sex_mom <- names(sort(table(mom_sexes), decreasing = TRUE))[1] - most_frequent_sex_dad <- names(sort(table(dad_sexes), decreasing = TRUE))[1] - - # are all moms/dads the same sex? 
- validation_results$mom_sex <- unique(mom_sexes) - validation_results$dad_sex <- unique(dad_sexes) - - # Store the most frequent sex for moms and dads - if (is.numeric(ped$sex)) { - validation_results$female_var <- as.numeric(most_frequent_sex_mom) - validation_results$male_var <- as.numeric(most_frequent_sex_dad) - } else if (is.character(ped$sex) | is.factor(ped$sex)) { - validation_results$female_var <- most_frequent_sex_mom - validation_results$male_var <- most_frequent_sex_dad - } else { - print("You should never see this. If you do, then you have a problem with the data type of the sex variable") - } - - # verbose - if (length(validation_results$mom_sex) == 1) { - if (verbose) { - cat(paste0( - "All moms are '", - validation_results$female_var, - "'.\n" - )) - } - validation_results$female_moms <- TRUE - } else { - validation_results$female_moms <- FALSE - } - - if (length(validation_results$dad_sex) == 1) { - if (verbose) { - cat(paste0( - "All dads are '", - validation_results$male_var, - "'.\n" - )) - } - validation_results$male_dads <- TRUE - } else { - validation_results$male_dads <- FALSE - } - # Check for inconsistent sex - wrong_sex_moms <- ped$ID[which(ped$sex[ped$ID %in% ped$momID] != validation_results$female_var)] - wrong_sex_dads <- ped$ID[which(ped$sex[ped$ID %in% ped$dadID] != validation_results$male_var)] - - validation_results$wrong_sex_moms <- wrong_sex_moms - validation_results$wrong_sex_dads <- wrong_sex_dads - - if (verbose) { - if (length(wrong_sex_moms) > 0) { - cat("Some individuals listed as moms are not coded as", validation_results$female_var, "\n") - } - if (length(wrong_sex_dads) > 0) { - cat("Some individuals listed as dads are not coded as", validation_results$male_var, "\n") - } - } + mom_results <- checkParentSex(ped, parent_col = "momID", verbose = verbose) + dad_results <- checkParentSex(ped, parent_col = "dadID", verbose = verbose) + + validation_results$mom_sex <- mom_results$unique_sexes + validation_results$dad_sex <- 
dad_results$unique_sexes + validation_results$female_var <- mom_results$modal_sex + validation_results$male_var <- dad_results$modal_sex + validation_results$wrong_sex_moms <- mom_results$inconsistent_parents + validation_results$wrong_sex_dads <- dad_results$inconsistent_parents + validation_results$female_moms <- mom_results$all_same_sex + validation_results$male_dads <- dad_results$all_same_sex # Are any parents in both momID and dadID? momdad <- intersect(ped$dadID, ped$momID) - if (!is.na(momdad)&&length(momdad) > 0) { + if (length(momdad) > 0 && !is.na(momdad)) { validation_results$parents_in_both <- momdad if (verbose) { - cat(paste("Some individuals appear in both momID and dadID roles.\n", - "These individuals are:\n")) + cat(paste( + "Some individuals appear in both momID and dadID roles.\n", + "These individuals are:\n" + )) print(momdad) } } @@ -162,12 +111,12 @@ checkParentIDs <- function(ped, verbose = FALSE, repair = FALSE, print(validation_results) } return(validation_results) - } else{ - if (verbose) { - cat("Validation Results:\n") - print(validation_results) - cat("Step 3: Attempting to repair missing parents...\n") - } + } else { + if (verbose) { + cat("Validation Results:\n") + print(validation_results) + cat("Step 3: Attempting to repair missing parents...\n") + } cat("REPAIR IN EARLY ALPHA\n") # Initialize a list to track changes made during repair @@ -177,32 +126,47 @@ checkParentIDs <- function(ped, verbose = FALSE, repair = FALSE, phantom_dads_added = c(), phantom_moms_added = c() ) - if(repairsex){ - # Fix sex of existing parents if wrong - mom_indices <- match(ped$momID, ped$ID) - dad_indices <- match(ped$dadID, ped$ID) - - - - if (!is.na(validation_results$female_var)) { - corrected_moms <- ped$ID[mom_indices[!is.na(mom_indices)]] - ped$sex[mom_indices[!is.na(mom_indices)]] <- validation_results$female_var - changes$corrected_mom_sex <- corrected_moms - if (verbose && length(corrected_moms) > 0) { - cat("Corrected sex of moms for:", 
paste(corrected_moms, collapse = ", "), "\n") + if (repairsex) { + # Fix sex of existing parents if wrong + mom_indices <- match(ped$momID, ped$ID) + dad_indices <- match(ped$dadID, ped$ID) + + + + if (length(validation_results$female_var) > 0 && !is.na(validation_results$female_var)) { + corrected_moms <- ped$ID[mom_indices[!is.na(mom_indices)]] + ped$sex[mom_indices[!is.na(mom_indices)]] <- validation_results$female_var + changes$corrected_mom_sex <- corrected_moms + if (verbose && length(corrected_moms) > 0) { + cat("Corrected sex of moms for:", paste(corrected_moms, collapse = ", "), "\n") + } + } else { + corrected_moms <- ped$ID[mom_indices[!is.na(mom_indices)]] + ped$sex[mom_indices[!is.na(mom_indices)]] <- 0 + + changes$corrected_mom_sex <- corrected_moms + if (verbose && length(corrected_moms) > 0) { + cat("Corrected sex of moms for:", paste(corrected_moms, collapse = ", "), "\n") + } } - } - if (!is.na(validation_results$male_var)) { - corrected_dads <- ped$ID[dad_indices[!is.na(dad_indices)]] - ped$sex[dad_indices[!is.na(dad_indices)]] <- validation_results$male_var - changes$corrected_dad_sex <- corrected_dads - if (verbose && length(corrected_dads) > 0) { - cat("Corrected sex of dads for:", paste(corrected_dads, collapse = ", "), "\n") + if (length(validation_results$male_var) > 0 && !is.na(validation_results$male_var)) { + corrected_dads <- ped$ID[dad_indices[!is.na(dad_indices)]] + ped$sex[dad_indices[!is.na(dad_indices)]] <- validation_results$male_var + changes$corrected_dad_sex <- corrected_dads + if (verbose && length(corrected_dads) > 0) { + cat("Corrected sex of dads for:", paste(corrected_dads, collapse = ", "), "\n") + } + } else { + corrected_dads <- ped$ID[dad_indices[!is.na(dad_indices)]] + ped$sex[dad_indices[!is.na(dad_indices)]] <- 1 + changes$corrected_dad_sex <- corrected_dads + if (verbose && length(corrected_dads) > 0) { + cat("Corrected sex of dads for:", paste(corrected_dads, collapse = ", "), "\n") + } } } - } -} -if 
(addphantoms){ + } + if (addphantoms) { # Generate new IDs newIDbase <- if (is.numeric(ped$ID)) max(ped$ID, na.rm = TRUE) + 1 else paste0("phantom-", seq_len(nrow(ped))) new_entries <- data.frame() @@ -217,7 +181,7 @@ if (addphantoms){ new_entry$ID <- new_id new_entry$dadID <- NA new_entry$momID <- NA - new_entry$sex <- validation_results$male_var + new_entry$sex <- if (length(validation_results$male_var) > 0 && !is.na(validation_results$male_var)) validation_results$male_var else 1 new_entries <- rbind(new_entries, new_entry) } @@ -230,7 +194,7 @@ if (addphantoms){ new_entry$ID <- new_id new_entry$dadID <- NA new_entry$momID <- NA - new_entry$sex <- validation_results$female_var + new_entry$sex <- if (length(validation_results$female_var) > 0 && !is.na(validation_results$female_var)) validation_results$female_var else 0 new_entries <- rbind(new_entries, new_entry) } @@ -248,10 +212,39 @@ if (addphantoms){ if (verbose && length(changes$phantom_moms_added) > 0) { cat("Added phantom moms for:", paste(changes$phantom_moms_added, collapse = ", "), "\n") } + } + # add parents who appear in momID or dadID but are missing from ID + if (parentswithoutrow) { + # Add parents who appear in momID or dadID but are missing from ID + ped <- addRowlessParents(ped = ped, verbose = verbose, validation_results = validation_results) + } + + if (verbose) { + cat("Changes Made:\n") + print(changes) + } + return(ped) +} +#' Repair Parent IDs +#' +#' This function repairs parent IDs in a pedigree. 
+#' @inheritParams checkParentIDs +#' @inherit checkParentIDs details +#' @return A corrected pedigree +repairParentIDs <- function(ped, verbose = FALSE) { + checkParentIDs(ped = ped, verbose = verbose, repair = TRUE) } - # add phantom parents - if(parentswithoutrow){ + +#' Add addRowlessParents +#' +#' This function adds parents who appear in momID or dadID but are missing from ID +#' @inheritParams checkParentIDs +#' @param validation_results validation results + +addRowlessParents <- function(ped, verbose, validation_results) { # Add parents who appear in momID or dadID but are missing from ID + new_entries <- data.frame() + listed_parents <- unique(c(ped$momID, ped$dadID)) listed_parents <- listed_parents[!is.na(listed_parents)] @@ -280,22 +273,11 @@ if (addphantoms){ new_row$sex <- inferred_sex new_entries <- rbind(new_entries, new_row) } - } - ped <- merge(ped, new_entries, all = TRUE) - } + ped <- merge(ped, new_entries, all = TRUE) if (verbose) { - cat("Changes Made:\n") - print(changes) + cat("Added phantom parents for:", paste(new_entries$ID, collapse = ", "), "\n") } - return(ped) -} -#' Repair Parent IDs -#' -#' This function repairs parent IDs in a pedigree. -#' @inheritParams checkParentIDs -#' @inherit checkParentIDs details -#' @return A corrected pedigree -repairParentIDs <- function(ped, verbose = FALSE) { - checkParentIDs(ped = ped, verbose = verbose, repair = TRUE) + } + return(ped) } diff --git a/R/checkPedigree.R b/R/checkPedigree.R index 1ebe6afa..3c0af179 100644 --- a/R/checkPedigree.R +++ b/R/checkPedigree.R @@ -13,8 +13,10 @@ #' @return List containing detailed validation results. 
#' @examples #' \dontrun{ -#' results <- checkPedigreeNetwork(ped, personID = "ID", -#' momID = "momID", dadID = "dadID", verbose = TRUE) +#' results <- checkPedigreeNetwork(ped, +#' personID = "ID", +#' momID = "momID", dadID = "dadID", verbose = TRUE +#' ) #' } #' @export checkPedigreeNetwork <- function(ped, personID = "ID", momID = "momID", dadID = "dadID", verbose = FALSE) { diff --git a/R/checkSex.R b/R/checkSex.R index d220dec6..d2fc9fd5 100644 --- a/R/checkSex.R +++ b/R/checkSex.R @@ -26,6 +26,8 @@ #' @param code_female The current code used to represent females in the 'sex' column. If both are NULL, no recoding is performed. #' @param verbose A logical flag indicating whether to print progress and validation messages to the console. #' @param repair A logical flag indicating whether to attempt repairs on the sex coding. +#' @param momID The column name for maternal IDs. Default is "momID". +#' @param dadID The column name for paternal IDs. Default is "dadID". #' #' @return Depending on the value of `repair`, either a list containing validation results or a repaired dataframe is returned. 
#' @examples @@ -35,9 +37,11 @@ #' } #' @export #' -checkSex <- function(ped, code_male = NULL, code_female = NULL, verbose = FALSE, repair = FALSE) { +checkSex <- function(ped, code_male = NULL, code_female = NULL, verbose = FALSE, repair = FALSE, + momID = "momID", + dadID = "dadID") { # Standardize column names in the input dataframe - ped <- standardizeColnames(ped) + ped <- standardizeColnames(ped, verbose = verbose) # TO DO: bypass the rest of the function if recode_only is TRUE @@ -53,36 +57,33 @@ checkSex <- function(ped, code_male = NULL, code_female = NULL, verbose = FALSE, validation_results$sex_unique <- unique(ped$sex) validation_results$sex_length <- length(unique(ped$sex)) if (verbose) { - cat(paste0(validation_results$sex_length, " unique values found.\n")) - cat(paste0("Unique values: ", paste0(validation_results$sex_unique, collapse = ", "), "\n")) + cat(validation_results$sex_length, " unique sex codes found: ", paste(validation_results$sex_unique, collapse = ", "), "\n") } - # Are there multiple sexes/genders in the list of dads and moms? - table_sex_dad <- sort(table(ped$sex[ped$ID %in% ped$dadID]), decreasing = TRUE) - table_sex_mom <- sort(table(ped$sex[ped$ID %in% ped$momID]), decreasing = TRUE) - validation_results$all_sex_dad <- names(table_sex_dad) - validation_results$all_sex_mom <- names(table_sex_mom) - validation_results$most_frequent_sex_dad <- validation_results$all_sex_dad[1] - validation_results$most_frequent_sex_mom <- validation_results$all_sex_mom[1] + # Are there multiple sexes/genders in the list of dads and moms? 
- # List ids for dads that are female, moms that are male - if (length(validation_results$all_sex_dad) > 1) { - df_dads <- ped[ped$ID %in% ped$dadID, ] - validation_results$ID_female_dads <- df_dads$ID[df_dads$sex != validation_results$most_frequent_sex_dad] - validation_results$ID_child_female_dads <- ped$ID[ped$dadID %in% validation_results$ID_female_dads] - remove(df_dads) - } - if (length(validation_results$all_sex_mom) > 1) { - df_moms <- ped[ped$ID %in% ped$momID, ] - validation_results$ID_male_moms <- df_moms$ID[df_moms$sex != validation_results$most_frequent_sex_mom] - validation_results$ID_child_male_moms <- ped$ID[ped$momID %in% validation_results$ID_male_moms] - remove(df_moms) - } + dad_results <- checkParentSex(ped, parent_col = dadID, verbose = verbose) + mom_results <- checkParentSex(ped, parent_col = momID, verbose = verbose) + + validation_results$all_sex_dad <- dad_results$unique_sexes + validation_results$all_sex_mom <- mom_results$unique_sexes + validation_results$most_frequent_sex_dad <- dad_results$modal_sex + validation_results$most_frequent_sex_mom <- mom_results$modal_sex + validation_results$ID_female_dads <- dad_results$inconsistent_parents + validation_results$ID_child_female_dads <- dad_results$inconsistent_children + validation_results$ID_male_moms <- mom_results$inconsistent_parents + validation_results$ID_child_male_moms <- mom_results$inconsistent_children - if (repair) { + if (repair == FALSE) { if (verbose) { + cat("Checks Made:\n") + print(validation_results) + } + return(validation_results) + } else { + if (verbose == TRUE) { cat("Step 2: Attempting to repair sex coding...\n") } # Initialize a list to track changes made during repair @@ -107,13 +108,6 @@ checkSex <- function(ped, code_male = NULL, code_female = NULL, verbose = FALSE, print(changes) } return(ped) - } else { - if (verbose) { - cat("Checks Made:\n") - print(validation_results) - } - - return(validation_results) } } @@ -193,3 +187,95 @@ recodeSex <- function( } 
return(ped) } + + + +#' Check Parental Role Sex Consistency +#' +#' Validates sex coding consistency for a given parental role (momID or dadID). +#' +#' @param ped Pedigree dataframe. +#' @param parent_col The column name for parent IDs ("momID" or "dadID"). +#' @param sex_col The column name for sex coding. Default is "sex". +#' @param verbose Logical, whether to print messages. +#' +#' +#' @return A list containing role, unique sex codes, modal sex, inconsistent parents, and linked children. +checkParentSex <- function(ped, parent_col, sex_col = "sex", verbose = FALSE) { + parent_ids <- ped[[parent_col]] + parent_rows <- ped[ped$ID %in% parent_ids, ] + + if (nrow(parent_rows) == 0) { + if (verbose) cat(paste0("No individuals found in role: ", parent_col, "\n")) + return(list( + role = parent_col, + unique_sexes = NULL, + modal_sex = NA, + all_same_sex = NA, + inconsistent_parents = NULL, + inconsistent_children = NULL + )) + } + + + # Are there multiple sexes/genders in the list of dads and moms? + parent_sexes <- parent_rows[[sex_col]] + unique_sexes <- unique(parent_sexes) + + # are all moms/dads the same sex? 
+ all_same_sex <- length(unique_sexes) == 1 + + # Store the most frequent sex for moms and dads + modal_sex <- names(sort(table(parent_sexes), decreasing = TRUE))[1] + + # Type coercion based on ped$sex type + if (is.numeric(ped[[sex_col]])) { + modal_sex <- as.numeric(modal_sex) + } + + # List ids for dads that are female, moms that are male + inconsistent_parents <- parent_rows$ID[parent_rows[[sex_col]] != modal_sex] + + child_col <- parent_col + inconsistent_children <- ped$ID[ped[[child_col]] %in% inconsistent_parents] + + + if (verbose) { + cat(paste0("Role: ", parent_col, "\n")) + cat(length(unique_sexes), " unique sex codes found: ", paste(unique_sexes, collapse = ", "), "\n") + cat("Modal sex code: ", modal_sex, "\n") + + if (all_same_sex) { + cat("All parents consistently coded.\n") + } else { + cat(length(inconsistent_parents), " parents have inconsistent sex coding.\n") + } + } + + return(list( + role = parent_col, + unique_sexes = unique_sexes, + modal_sex = modal_sex, + all_same_sex = all_same_sex, + inconsistent_parents = inconsistent_parents, + inconsistent_children = inconsistent_children + )) +} + +#' Get the Modal Value of a Vector +#' +#' This function calculates the modal value of a vector, which is the most frequently occurring value. +#' If the vector is empty or contains only NA values, it returns NA. +#' +#' @param x A vector of values. +#' +#' @return The modal value of the vector. If the vector is empty or contains only NA values, returns NA. 
+#' @keywords internal +.getModalValue <- function(x) { + if (length(stats::na.omit(x)) == 0) { + return(NA) + } + freq_table <- sort(table(x), decreasing = TRUE) + modal <- names(freq_table)[1] + return(modal) +} diff --git a/R/cleanPedigree.R b/R/cleanPedigree.R index ffd8054e..46a3a64a 100644 --- a/R/cleanPedigree.R +++ b/R/cleanPedigree.R @@ -13,12 +13,12 @@ standardizeColnames <- function(df, verbose = FALSE) { # Internal mapping of standardized names to possible variants mapping <- list( - "fam" = "^(?:fam(?:ily)?[\\.\\-_]?(?:id)?)", + "famID" = "^(?:fam(?:ily)?[\\.\\-_]?(?:id)?)", "ID" = "^(?:i(?:d$|ndiv(?:idual)?)|p(?:erson)?[\\.\\-_]?id)", "gen" = "^(?:gen(?:s|eration)?)", - "dadID" = "^(?:d(?:ad)?id|paid|fatherid|pid[\\.\\-_]?fath[er]*)", + "dadID" = "^(?:d(?:ad)?id|paid|fatherid|pid[\\.\\-_]?fath[er]*|sire)", "patID" = "^(?:dat[\\.\\-_]?id|pat[\\.\\-_]?id|paternal[\\.\\-_]?(?:id)?)", - "momID" = "^(?:m(?:om|a|other)?[\\.\\-_]?id|pid[\\.\\-_]?moth[er]*)", + "momID" = "^(?:m(?:om|a|other)?[\\.\\-_]?id|pid[\\.\\-_]?moth[er]*|dame)", "matID" = "^(?:mat[\\.\\-_]?id|maternal[\\.\\-_]?(?:id)?)", "spID" = "^(?:s(?:pt)?id|spouse[\\.\\-_]?(?:id)?|partner[\\.\\-_]?(?:id)?|husb(?:and)?[\\.\\-_]?id|wife[\\.\\-_]?(?:id)?|pid[\\.\\-_]?spouse1?)", "twinID" = "^(?:twin[\\.\\-_]?(?:id)?)", @@ -60,7 +60,7 @@ standardizeColnames <- function(df, verbose = FALSE) { # check_sex = TRUE, # check_parents = TRUE, # verbose = FALSE) { -# corrected_ped <- ped <- standardizeColnames(ped) +# corrected_ped <- ped <- standardizeColnames(ped, verbose = verbose) # if (verbose) { # print("Repairing pedigree...") # } diff --git a/R/constructAdjacency.R b/R/constructAdjacency.R new file mode 100644 index 00000000..74145337 --- /dev/null +++ b/R/constructAdjacency.R @@ -0,0 +1,532 @@ +.adjLoop <- function(ped, component, saveable, resume, + save_path, verbose, lastComputed, + checkpoint_files, update_rate, + parList, lens, save_rate_parlist, config, + ...) 
{ + # Loop through each individual in the pedigree + # Build the adjacency matrix for parent-child relationships + # Is person in column j the parent of the person in row i? .5 for yes, 0 for no. + ped$momID <- as.numeric(ped$momID) + ped$dadID <- as.numeric(ped$dadID) + ped$ID <- as.numeric(ped$ID) + + for (i in (lastComputed + 1):config$nr) { + x <- ped[i, , drop = FALSE] + # Handle parentage according to the 'component' specified + if (component %in% c("generation", "additive")) { + # Code for 'generation' and 'additive' components + # Checks if is mom of ID or is dad of ID + xID <- as.numeric(x["ID"]) + sMom <- (xID == ped$momID) + sDad <- (xID == ped$dadID) + val <- sMom | sDad + val[is.na(val)] <- FALSE + } else if (component %in% c("common nuclear")) { + # Code for 'common nuclear' component + # IDs have the Same mom and Same dad + sMom <- (as.numeric(x["momID"]) == ped$momID) + sMom[is.na(sMom)] <- FALSE + sDad <- (as.numeric(x["dadID"]) == ped$dadID) + sDad[is.na(sDad)] <- FALSE + val <- sMom & sDad + } else if (component %in% c("mitochondrial")) { + # Code for 'mitochondrial' component + val <- (as.numeric(x["ID"]) == ped$momID) + val[is.na(val)] <- FALSE + } else { + stop("Unknown relatedness component requested") + } + # Storing the indices of the parent-child relationships + # Keep track of indices only, and then initialize a single sparse matrix + wv <- which(val) + parList[[i]] <- wv + lens[i] <- length(wv) + # Print progress if verbose is TRUE + if (verbose && (i %% update_rate == 0)) { + cat(paste0("Done with ", i, " of ", config$nr, "\n")) + } + # Checkpointing every save_rate iterations + if (saveable && (i %% save_rate_parlist == 0)) { + saveRDS(parList, file = checkpoint_files$parList) + saveRDS(lens, file = checkpoint_files$lens) + if (verbose) cat("Checkpointed parlist saved at iteration", i, "\n") + } + } + jss <- rep(1L:config$nr, times = lens) + iss <- unlist(parList) + list_of_adjacency <- list(iss = iss, jss = jss) + 
return(list_of_adjacency) +} + +.adjIndexed <- function(ped, component, saveable, resume, + save_path, verbose, lastComputed, + checkpoint_files, update_rate, + parList, lens, save_rate_parlist, config) { + # Loop through each individual in the pedigree + # Build the adjacency matrix for parent-child relationships + # Is person in column j the parent of the person in row i? .5 for yes, 0 for no. + + # Convert IDs + ped$ID <- as.numeric(ped$ID) + ped$momID <- as.numeric(ped$momID) + ped$dadID <- as.numeric(ped$dadID) + + # parent-child lookup + mom_index <- match(ped$momID, ped$ID, nomatch = 0) + dad_index <- match(ped$dadID, ped$ID, nomatch = 0) + + for (i in (lastComputed + 1):config$nr) { + if (component %in% c("generation", "additive")) { + sMom <- (mom_index == i) + sDad <- (dad_index == i) + val <- sMom | sDad + } else if (component %in% c("common nuclear")) { + # Code for 'common nuclear' component + # IDs have the Same mom and Same dad + sMom <- (ped$momID[i] == ped$momID) + sMom[is.na(sMom)] <- FALSE + sDad <- (ped$dadID[i] == ped$dadID) + sDad[is.na(sDad)] <- FALSE + val <- sMom & sDad + } else if (component %in% c("mitochondrial")) { + val <- (mom_index == i) + } else { + stop("Unknown relatedness component requested") + } + + val[is.na(val)] <- FALSE + parList[[i]] <- which(val) + lens[i] <- length(parList[[i]]) + + # Print progress if verbose is TRUE + if (verbose && (i %% update_rate == 0)) { + cat(paste0("Done with ", i, " of ", config$nr, "\n")) + } + + # Checkpointing every save_rate iterations + if (saveable && (i %% save_rate_parlist == 0)) { + saveRDS(parList, file = checkpoint_files$parList) + saveRDS(lens, file = checkpoint_files$lens) + if (verbose) cat("Checkpointed parlist saved at iteration", i, "\n") + } + } + jss <- rep(1L:config$nr, times = lens) + iss <- unlist(parList) + list_of_adjacency <- list(iss = iss, jss = jss) + return(list_of_adjacency) +} + +.adjDirect <- function(ped, component, saveable, resume, + save_path, verbose, 
lastComputed, + checkpoint_files, update_rate, + parList, lens, save_rate_parlist, config, + ...) { + # Loop through each individual in the pedigree + # Build the adjacency matrix for parent-child relationships + # Is person in column j the parent of the person in row i? .5 for yes, 0 for no. + uniID <- ped$ID # live dangerously without sort(unique(ped$ID)) + ped$ID <- as.numeric(factor(ped$ID, levels = uniID)) + ped$momID <- as.numeric(factor(ped$momID, levels = uniID)) + ped$dadID <- as.numeric(factor(ped$dadID, levels = uniID)) + + if (component %in% c("generation", "additive")) { + mIDs <- stats::na.omit(data.frame(rID = ped$ID, cID = ped$momID)) + dIDs <- stats::na.omit(data.frame(rID = ped$ID, cID = ped$dadID)) + iss <- c(mIDs$rID, dIDs$rID) + jss <- c(mIDs$cID, dIDs$cID) + } else if (component %in% c("common nuclear")) { + # message("Common Nuclear component is not yet implemented for direct method. Using index method.\n") + + # 1) Create a logical mask for only known parents + mask <- !is.na(ped$momID) & !is.na(ped$dadID) + + # 2) Create a single hash label for each known (momID, dadID) pair + base <- max(ped$ID, na.rm = TRUE) + 1L + pairCode <- ped$momID[mask] + base * ped$dadID[mask] + + # 3) Factor that label => each row with the same (mom,dad) gets the same integer code + childVec <- which(mask) + + # 4) Group children by pairCode, so each group is "all children with that (mom,dad)" + groupList <- split(childVec, pairCode) + + # 5) For each group with >1 children, form pairwise adjacency (i->j) for i != j + iss_list <- vector("list", length(groupList)) + jss_list <- vector("list", length(groupList)) + counter <- 1 + + for (g in groupList) { + k <- length(g) + if (k > 1) { + # We'll form all k^2 combos, then remove the diagonal i=j + # rep() calls faster than expand.grid + + # v = each child repeated k times + # w = entire group repeated once for each child + v <- rep(g, each = k) # row index + w <- rep(g, times = k) # col index + + keep <- (v != w) # 
remove diagonal where v == w + iss_list[[counter]] <- v[keep] + jss_list[[counter]] <- w[keep] + counter <- counter + 1 + } + } + + + iss <- unlist(iss_list, use.names = FALSE) + jss <- unlist(jss_list, use.names = FALSE) + } else if (component %in% c("mitochondrial")) { + mIDs <- stats::na.omit(data.frame(rID = ped$ID, cID = ped$momID)) + iss <- c(mIDs$rID) + jss <- c(mIDs$cID) + } else { + stop("Unknown relatedness component requested") + } + list_of_adjacency <- list( + iss = iss, + jss = jss + ) + return(list_of_adjacency) +} + + + +.adjBeta <- function(ped, component, + adjBeta_method = 5, + parList = NULL, + lastComputed = 0, + lens = NULL, + saveable = FALSE, + resume = FALSE, + save_path = NULL, + verbose = FALSE, + save_rate_parlist = NULL, + update_rate = NULL, + checkpoint_files = NULL, + config, + ...) { # 1) Pairwise compare mother IDs + if (adjBeta_method == 1) { + # gets slow when data are bigger. much slower than indexed + momMatch <- outer(ped$momID, ped$momID, FUN = "==") + momMatch[is.na(momMatch)] <- FALSE + + # 2) Pairwise compare father IDs + dadMatch <- outer(ped$dadID, ped$dadID, FUN = "==") + dadMatch[is.na(dadMatch)] <- FALSE + + # 3) Sibling adjacency if both mom & dad match + adj <- momMatch & dadMatch + + # 4) Extract indices where adj[i,j] is TRUE + w <- which(adj, arr.ind = TRUE) + # iss <- w[, 1] + # jss <- w[, 2] + # + list_of_adjacency <- list( + iss = w[, 1], + jss = w[, 2] + ) + } else if (adjBeta_method == 2) { + # 1) Create a logical mask for known parents + mask <- !is.na(ped$momID) & !is.na(ped$dadID) + + # 2) Create a single string label for each known (momID, dadID) pair + pairLabel <- paste0(ped$momID[mask], "_", ped$dadID[mask]) + + # 3) Factor that label => each row with the same (mom,dad) gets the same integer code + # This is "creating a new ID" for each unique parent pair + pairCode <- match(pairLabel, unique(pairLabel)) + + # childVec are the row indices in 'ped' that have known parents + childVec <- which(mask) # 
length(childVec) = sum(mask) + + # 4) Group children by pairCode, so each group is "all children with that (mom,dad)" + groupList <- split(childVec, pairCode) + + # 5) For each group with >1 children, form pairwise adjacency i->j + iss_list <- list() + jss_list <- list() + counter <- 1 + + for (g in groupList) { + if (length(g) > 1) { + combos <- expand.grid(g, g, KEEP.OUT.ATTRS = FALSE) + combos <- combos[combos[, 1] != combos[, 2], , drop = FALSE] + iss_list[[counter]] <- combos[, 1] + jss_list[[counter]] <- combos[, 2] + counter <- counter + 1 + } + } + # iss <- unlist(iss_list, use.names = FALSE) + # jss <- unlist(jss_list, use.names = FALSE) + + list_of_adjacency <- list( + iss = unlist(iss_list, use.names = FALSE), + jss = unlist(jss_list, use.names = FALSE) + ) + } else if (adjBeta_method == 3) { + # nr <- nrow(ped) + # terrible + # Define a scalar-checking function: + f_check <- function(i, j) { + # i, j are each single integers + # Return one boolean: do they share both parents? + !is.na(ped$momID[i]) && !is.na(ped$dadID[i]) && + !is.na(ped$momID[j]) && !is.na(ped$dadID[j]) && + (ped$momID[i] == ped$momID[j]) && + (ped$dadID[i] == ped$dadID[j]) + } + + # Vectorize it so outer() will produce an nr x nr matrix + vf_check <- Vectorize(f_check) + + # Now outer() calls vf_check(...) 
in a way that yields scalar results + adj <- outer(seq_len(config$nr), seq_len(config$nr), FUN = vf_check) + + # Extract which cells of adj are TRUE + w <- which(adj, arr.ind = TRUE) + # iss <- w[, 1] + # jss <- w[, 2] + + list_of_adjacency <- list( + iss = iss <- w[, 1], + jss = jss <- w[, 2] + ) + } else if (adjBeta_method == 4) { + # 1) Create a logical mask for known parents + mask <- !is.na(ped$momID) & !is.na(ped$dadID) + + # 2) Create a single string label for each known (momID, dadID) pair + pairLabel <- paste0(ped$momID[mask], "_", ped$dadID[mask]) + + # 3) Factor that label => each row with the same (mom,dad) gets the same integer code + pairCode <- match(pairLabel, unique(pairLabel)) + + # childVec are the row indices in 'ped' that have known parents + childVec <- which(mask) + + # 4) Group children by pairCode, so each group is "all children with that (mom,dad)" + groupList <- split(childVec, pairCode) + + # 5) For each group with >1 children, form pairwise adjacency (i->j) for i != j + iss_list <- vector("list", length(groupList)) + jss_list <- vector("list", length(groupList)) + counter <- 1 + + for (g in groupList) { + k <- length(g) + if (k > 1) { + # We'll form all k^2 combos, then remove the diagonal i=j + # Instead of expand.grid, do rep() calls: + + # v = each child repeated k times + # w = entire group repeated once for each child + v <- rep(g, each = k) # row index + w <- rep(g, times = k) # col index + + keep <- (v != w) # remove diagonal where v == w + iss_list[[counter]] <- v[keep] + jss_list[[counter]] <- w[keep] + counter <- counter + 1 + } + } + + list_of_adjacency <- list( + iss = unlist(iss_list, use.names = FALSE), + jss = unlist(jss_list, use.names = FALSE) + ) + } else if (adjBeta_method == 5) { + # 1) Create a logical mask for known parents + mask <- !is.na(ped$momID) & !is.na(ped$dadID) + + # 2) Create a single hash label for each known (momID, dadID) pair + # pairLabel <- paste0(ped$momID[mask], "_", ped$dadID[mask]) + base <- 
max(ped$ID, na.rm = TRUE) + 1L + pairCode <- ped$momID[mask] + base * ped$dadID[mask] + + # 3) Factor that label => each row with the same (mom,dad) gets the same integer code + childVec <- which(mask) + + # 4) Group children by pairCode, so each group is "all children with that (mom,dad)" + groupList <- split(childVec, pairCode) + + # 5) For each group with >1 children, form pairwise adjacency (i->j) for i != j + iss_list <- vector("list", length(groupList)) + jss_list <- vector("list", length(groupList)) + counter <- 1 + + for (g in groupList) { + k <- length(g) + if (k > 1) { + # We'll form all k^2 combos, then remove the diagonal i=j + # Instead of expand.grid, do rep() calls: + + # v = each child repeated k times + # w = entire group repeated once for each child + v <- rep(g, each = k) # row index + w <- rep(g, times = k) # col index + + keep <- (v != w) # remove diagonal where v == w + iss_list[[counter]] <- v[keep] + jss_list[[counter]] <- w[keep] + counter <- counter + 1 + } + } + + list_of_adjacency <- list( + iss = unlist(iss_list, use.names = FALSE), + jss = unlist(jss_list, use.names = FALSE) + ) + } else { + list_of_adjacency <- .adjIndexed( + ped = ped, component = component, + saveable = saveable, resume = resume, + save_path = save_path, verbose = verbose, + lastComputed = lastComputed, config = config, + checkpoint_files = checkpoint_files, + update_rate = update_rate, parList = parList, + lens = lens, save_rate_parlist = save_rate_parlist + ) + } + return(list_of_adjacency) +} + + +#' Compute Parent Adjacency Matrix with Multiple Approaches +#' @inheritParams ped2com +#' @inherit ped2com details +#' @param config a configuration list that passes parameters to the function +#' @param lastComputed the last computed index +#' @param parList a list of parent-child relationships +#' @param lens a vector of the lengths of the parent-child relationships +#' @param checkpoint_files a list of checkpoint files +#' @param update_rate the rate at which to 
update the progress +#' +#' @export +computeParentAdjacency <- function(ped, component, + adjacency_method = "direct", + saveable, resume, + save_path, + verbose = FALSE, + lastComputed = 0, + checkpoint_files, + update_rate, + parList, lens, save_rate_parlist, + adjBeta_method = NULL, + config, + ...) { + if (!adjacency_method %in% c("loop", "indexed", "direct", "beta")) { + stop("Invalid method specified. Choose from 'loop', 'direct', 'indexed', or 'beta'.") + } + # For loop/indexed/direct: skip if already complete + if (adjacency_method != "beta" && lastComputed >= config$nr) { + list_of_adjacency <- NULL + } else { + list_of_adjacency <- switch(adjacency_method, + "loop" = { + # Original version + .adjLoop( + ped = ped, + component = component, + saveable = saveable, + resume = resume, + save_path = save_path, + verbose = verbose, + lastComputed = lastComputed, + checkpoint_files = checkpoint_files, + update_rate = update_rate, + parList = parList, + lens = lens, + save_rate_parlist = save_rate_parlist, + config = config, + ... + ) + }, + "indexed" = { + # Garrison version + .adjIndexed( + ped = ped, + component = component, + saveable = saveable, + resume = resume, + save_path = save_path, + verbose = verbose, + lastComputed = lastComputed, + checkpoint_files = checkpoint_files, + update_rate = update_rate, + parList = parList, + lens = lens, + save_rate_parlist = save_rate_parlist, + config = config, + ... + ) + }, + "direct" = { + # Hunter version + .adjDirect( + ped = ped, + component = component, + saveable = saveable, + resume = resume, + save_path = save_path, + verbose = verbose, + lastComputed = lastComputed, + checkpoint_files = checkpoint_files, + update_rate = update_rate, + parList = parList, + lens = lens, + save_rate_parlist = save_rate_parlist, + config = config, + ... 
+ ) + }, + "beta" = { + .adjBeta( + ped = ped, + adjBeta_method = adjBeta_method, + component = component, + saveable = saveable, + resume = resume, + save_path = save_path, + verbose = verbose, + lastComputed = lastComputed, + checkpoint_files = checkpoint_files, + update_rate = update_rate, + parList = parList, + lens = lens, + save_rate_parlist = save_rate_parlist, + config = config, + ... + ) + } + ) + } + if (saveable) { + saveRDS(parList, file = checkpoint_files$parList) + saveRDS(lens, file = checkpoint_files$lens) + if (verbose) { + cat("Final checkpoint saved for adjacency matrix.\n") + } + } + return(list_of_adjacency) +} + + +#' Determine isChild Status, isChild is the 'S' matrix from RAM +#' @param isChild_method method to determine isChild status +#' @param ped pedigree data frame +#' @return isChild 'S' matrix +#' + +isChild <- function(isChild_method, ped) { + if (isChild_method == "partialparent") { + isChild <- apply(ped[, c("momID", "dadID")], 1, function(x) { + .5 + .25 * sum(is.na(x)) # 2 parents -> .5, 1 parent -> .75, 0 parents -> 1 + }) + } else { + isChild <- apply(ped[, c("momID", "dadID")], 1, function(x) { + 2^(-!all(is.na(x))) + }) + } +} diff --git a/R/convertPedigree.R b/R/convertPedigree.R deleted file mode 100644 index 257bc02f..00000000 --- a/R/convertPedigree.R +++ /dev/null @@ -1,725 +0,0 @@ -#' Take a pedigree and turn it into a relatedness matrix -#' @param ped a pedigree dataset. Needs ID, momID, and dadID columns -#' @param component character. Which component of the pedigree to return. See Details. -#' @param max.gen the maximum number of generations to compute -#' (e.g., only up to 4th degree relatives). The default is 25. However it can be set to infinity. -#' `Inf` uses as many generations as there are in the data. -#' @param sparse logical. If TRUE, use and return sparse matrices from Matrix package -#' @param verbose logical. If TRUE, print progress through stages of algorithm -#' @param update_rate numeric. 
The rate at which to print progress -#' @param gc logical. If TRUE, do frequent garbage collection via \code{\link{gc}} to save memory -#' @param saveable logical. If TRUE, save the intermediate results to disk -#' @param save_rate numeric. The rate at which to save the intermediate results -#' @param save_rate_gen numeric. The rate at which to save the intermediate results by generation. If NULL, defaults to save_rate -#' @param save_rate_parlist numeric. The rate at which to save the intermediate results by parent list. If NULL, defaults to save_rate*1000 -#' @param resume logical. If TRUE, resume from a checkpoint -#' @param save_path character. The path to save the checkpoint files -#' @param flatten.diag logical. If TRUE, overwrite the diagonal of the final relatedness matrix with ones -#' @param standardize.colnames logical. If TRUE, standardize the column names of the pedigree dataset -#' @param transpose_method character. The method to use for computing the transpose. Options are "tcrossprod", "crossprod", or "star" -#' @param adjacency_method character. The method to use for computing the adjacency matrix. Options are "loop" or "indexed" -#' @param isChild_method character. The method to use for computing the isChild matrix. Options are "classic" or "partialparent" -#' @param ... additional arguments to be passed to \code{\link{ped2com}} -#' @details The algorithms and methodologies used in this function are further discussed and exemplified in the vignette titled "examplePedigreeFunctions". For more advanced scenarios and detailed explanations, consult this vignette. 
-#' @export -#' -ped2com <- function(ped, component, - max.gen = 25, - sparse = TRUE, - verbose = FALSE, - gc = FALSE, - flatten.diag = FALSE, - standardize.colnames = TRUE, - transpose_method = "tcrossprod", - adjacency_method = "indexed", - isChild_method = "classic", - saveable = FALSE, - resume = FALSE, - save_rate = 5, - save_rate_gen = save_rate, - save_rate_parlist = 100000 * save_rate, - update_rate = 100, - save_path = "checkpoint/", - ...) { - #------ - # Checkpointing - #------ - if (saveable || resume) { # prepare checkpointing - if (verbose) cat("Preparing checkpointing...\n") - # Ensure save path exists - if (saveable && !dir.exists(save_path)) { - if (verbose) cat("Creating save path...\n") - dir.create(save_path, recursive = TRUE) - } else if (resume && !dir.exists(save_path)) { - stop("Cannot resume from checkpoint. Save path does not exist.") - } - - # Define checkpoint files - checkpoint_files <- list( - parList = file.path(save_path, "parList.rds"), - lens = file.path(save_path, "lens.rds"), - isPar = file.path(save_path, "isPar.rds"), - iss = file.path(save_path, "iss.rds"), - jss = file.path(save_path, "jss.rds"), - isChild = file.path(save_path, "isChild.rds"), - r_checkpoint = file.path(save_path, "r_checkpoint.rds"), - gen_checkpoint = file.path(save_path, "gen_checkpoint.rds"), - newIsPar_checkpoint = file.path(save_path, "newIsPar_checkpoint.rds"), - mtSum_checkpoint = file.path(save_path, "mtSum_checkpoint.rds"), - r2_checkpoint = file.path(save_path, "r2_checkpoint.rds"), - tcrossprod_checkpoint = file.path(save_path, "tcrossprod_checkpoint.rds"), - count_checkpoint = file.path(save_path, "count_checkpoint.rds"), - final_matrix = file.path(save_path, "final_matrix.rds") - ) - } - #------ - # Validation/Preparation - #------ - - # Validate the 'component' argument and match it against predefined choices - component <- match.arg(tolower(component), - choices = c( - "generation", - "additive", - "common nuclear", - "mitochondrial" - ) - ) 
- - if (!transpose_method %in% c("tcrossprod", "crossprod", "star", "tcross.alt.crossprod", "tcross.alt.star")) { - stop("Invalid method specified. Choose from 'tcrossprod', 'crossprod', or 'star' or 'tcross.alt.crossprod' or 'tcross.alt.star'.") - } - if (!adjacency_method %in% c("indexed", "loop", "direct")) { - stop("Invalid method specified. Choose from 'indexed', 'loop', or 'direct'.") - } - - # standardize colnames - if (standardize.colnames) { - ped <- standardizeColnames(ped) - } - - # Load final result if computation was completed - if (resume && file.exists(checkpoint_files$final_matrix)) { - if (verbose) cat("Loading final computed matrix...\n") - return(readRDS(checkpoint_files$final_matrix)) - } - - - #------ - # Algorithm - #------ - - # Get the number of rows in the pedigree dataset, representing the size of the family - nr <- nrow(ped) - - # Print the family size if verbose is TRUE - if (verbose) { - cat(paste0("Family Size = ", nr, "\n")) - } - - # Step 1: Construct parent-child adjacency matrix - ## A. Resume from Checkpoint if Needed - if (resume && file.exists(checkpoint_files$parList) && file.exists(checkpoint_files$lens)) { - if (verbose) cat("Resuming: Loading parent-child adjacency data...\n") - parList <- readRDS(checkpoint_files$parList) - lens <- readRDS(checkpoint_files$lens) - computed_indices <- which(!sapply(parList, is.null)) - lastComputed <- if (length(computed_indices) > 0) max(computed_indices) else 0 - if (verbose) cat("Resuming from iteration", lastComputed + 1, "\n") - } else { - ## Initialize variables - parList <- vector("list", nr) - lens <- integer(nr) - lastComputed <- 0 - - if (verbose) cat("Building parent adjacency matrix...\n") - } - - - ## B. 
Resume loop from the next uncomputed index - - if (verbose) cat("Computing parent-child adjacency matrix...\n") - # Construct sparse matrix - if (resume && file.exists(checkpoint_files$iss) && file.exists(checkpoint_files$jss)) { # fix to check actual - if (verbose) cat("Resuming: Constructed matrix...\n") - jss <- readRDS(checkpoint_files$jss) - iss <- readRDS(checkpoint_files$iss) - list_of_adjacencies <- list(iss = iss, jss = jss) - } else { - list_of_adjacencies <- compute_parent_adjacency( - ped = ped, - save_rate_parlist = save_rate_parlist, - checkpoint_files = checkpoint_files, - component = component, - adjacency_method = adjacency_method, # adjacency_method, - saveable = saveable, - resume = resume, - save_path = save_path, - update_rate = update_rate, - verbose = verbose, - lastComputed = lastComputed, - nr = nr, - parList = parList, - lens = lens - ) - - # Construct sparse matrix - iss <- list_of_adjacencies$iss - jss <- list_of_adjacencies$jss - - if (verbose) { - cat("Constructed sparse matrix\n") - } - if (saveable) { - saveRDS(jss, file = checkpoint_files$jss) - saveRDS(iss, file = checkpoint_files$iss) - } - # Garbage collection if gc is TRUE - if (gc) { - rm(parList, lens, list_of_adjacencies) - gc() - } - } - # Set parent values depending on the component type - if (component %in% c("generation", "additive")) { - parVal <- .5 - } else if (component %in% c("common nuclear", "mitochondrial")) { - parVal <- 1 - } else { - stop("Don't know how to set parental value") - } - # Construct sparse matrix - if (resume && file.exists(checkpoint_files$isPar)) { - if (verbose) cat("Resuming: Loading adjacency matrix...\n") - isPar <- readRDS(checkpoint_files$isPar) - } else { - # Initialize adjacency matrix for parent-child relationships - isPar <- Matrix::sparseMatrix( - i = iss, - j = jss, - x = parVal, - dims = c(nr, nr), - dimnames = list(ped$ID, ped$ID) - ) - if (verbose) { - cat("Completed first degree relatives (adjacency)\n") - } - if (saveable) { - 
saveRDS(isPar, file = checkpoint_files$isPar) - } - } - - # isPar is the adjacency matrix. 'A' matrix from RAM - if (component %in% c("common nuclear")) { - Matrix::diag(isPar) <- 1 - if (sparse==FALSE) { - isPar <- as.matrix(isPar) - } - return(isPar) - } - - if (resume && file.exists(checkpoint_files$isChild)) { - if (verbose) cat("Resuming: Loading isChild matrix...\n") - isChild <- readRDS(checkpoint_files$isChild) - } else { - # isChild is the 'S' matrix from RAM - - if (isChild_method == "partialparent") { - isChild <- apply(ped[, c("momID", "dadID")], 1, function(x) { - .5 + .25 * sum(is.na(x)) # 2 parents -> .5, 1 parent -> .75, 0 parents -> 1 - }) - } else { - isChild <- apply(ped[, c("momID", "dadID")], 1, function(x) { - 2^(-!all(is.na(x))) - }) - } - if (saveable) { - saveRDS(isChild, file = checkpoint_files$isChild) - } - } - # --- Step 2: Compute Relatedness Matrix --- - if (resume && file.exists(checkpoint_files$r_checkpoint) && file.exists(checkpoint_files$gen_checkpoint) && file.exists(checkpoint_files$mtSum_checkpoint) && file.exists(checkpoint_files$newIsPar_checkpoint) && - file.exists(checkpoint_files$count_checkpoint) - ) { - if (verbose) cat("Resuming: Loading previous computation...\n") - r <- readRDS(checkpoint_files$r_checkpoint) - gen <- readRDS(checkpoint_files$gen_checkpoint) - mtSum <- readRDS(checkpoint_files$mtSum_checkpoint) - newIsPar <- readRDS(checkpoint_files$newIsPar_checkpoint) - count <- readRDS(checkpoint_files$count_checkpoint) - } else { - r <- Matrix::Diagonal(x = 1, n = nr) - gen <- rep(1, nr) - mtSum <- sum(r, na.rm = TRUE) - newIsPar <- isPar - count <- 0 - } - maxCount <- max.gen + 1 - if (verbose) { - cat("About to do RAM path tracing\n") - } - - # r is I + A + A^2 + ... 
= (I-A)^-1 from RAM - # could trim, here - while (mtSum != 0 && count < maxCount) { - r <- r + newIsPar - gen <- gen + (Matrix::rowSums(newIsPar) > 0) - newIsPar <- newIsPar %*% isPar - mtSum <- sum(newIsPar) - count <- count + 1 - if (verbose) { - cat(paste0("Completed ", count - 1, " degree relatives\n")) - } - # Save progress every save_rate iterations - if (saveable && (count %% save_rate_gen == 0)) { - saveRDS(r, file = checkpoint_files$r_checkpoint) - saveRDS(gen, file = checkpoint_files$gen_checkpoint) - saveRDS(newIsPar, file = checkpoint_files$newIsPar_checkpoint) - saveRDS(mtSum, file = checkpoint_files$mtSum_checkpoint) - saveRDS(count, file = checkpoint_files$count_checkpoint) - } - } - # compute rsq <- r %*% sqrt(diag(isChild)) - # compute rel <- tcrossprod(rsq) - if (gc) { - rm(isPar, newIsPar) - gc() - } - - if (component == "generation") { # no need to do the rest - return(gen) - } else { - if (verbose) { - cat("Completed RAM path tracing\n") - } - } - - # --- Step 3: I-A inverse times diagonal multiplication --- - if (resume && file.exists(checkpoint_files$r2_checkpoint)) { - if (verbose) cat("Resuming: Loading I-A inverse...\n") - r2 <- readRDS(checkpoint_files$r2_checkpoint) - } else { - if (verbose) { - cat("Doing I-A inverse times diagonal multiplication\n") - } - r2 <- r %*% Matrix::Diagonal(x = sqrt(isChild), n = nr) - if (gc) { - rm(r, isChild) - gc() - } - if (saveable) { - saveRDS(r2, file = checkpoint_files$r2_checkpoint) - } - } - - # --- Step 4: T crossproduct --- - - if (resume && file.exists(checkpoint_files$tcrossprod_checkpoint) && component != "generation") { - if (verbose) cat("Resuming: Loading tcrossprod...\n") - r <- readRDS(checkpoint_files$tcrossprod_checkpoint) - } else { - r <- .computeTranspose(r2 = r2, transpose_method = transpose_method, verbose = verbose) - if (saveable) { - saveRDS(r, file = checkpoint_files$tcrossprod_checkpoint) - } - } - - - if (component == "mitochondrial") { - r@x <- rep(1, length(r@x)) - # Assign 
1 to all nonzero elements for mitochondrial component - } - - if (sparse==FALSE) { - r <- as.matrix(r) - } - if (flatten.diag) { # flattens diagonal if you don't want to deal with inbreeding - diag(r) <- 1 - } - if (saveable) { - saveRDS(r, file = checkpoint_files$final_matrix) - } - return(r) -} - -#' Take a pedigree and turn it into an additive genetics relatedness matrix -#' @inheritParams ped2com -#' @inherit ped2com details -#' @export -#' -ped2add <- function(ped, max.gen = 25, sparse = TRUE, verbose = FALSE, - gc = FALSE, - flatten.diag = FALSE, standardize.colnames = TRUE, - transpose_method = "tcrossprod", - adjacency_method = "direct", - saveable = FALSE, - resume = FALSE, - save_rate = 5, - save_rate_gen = save_rate, - save_rate_parlist = 1000 * save_rate, - save_path = "checkpoint/", - ...) { - ped2com( - ped = ped, - max.gen = max.gen, - sparse = sparse, - verbose = verbose, - gc = gc, - component = "additive", - flatten.diag = flatten.diag, - standardize.colnames = standardize.colnames, - transpose_method = transpose_method, - adjacency_method = adjacency_method, - saveable = saveable, - resume = resume, - save_rate_gen = save_rate_gen, - save_rate_parlist = save_rate_parlist, - save_path = save_path - ) -} - -#' Take a pedigree and turn it into a mitochondrial relatedness matrix -#' @inheritParams ped2com -#' @inherit ped2com details -#' @export -#' @aliases ped2mt -#' -ped2mit <- ped2mt <- function(ped, max.gen = 25, - sparse = TRUE, - verbose = FALSE, gc = FALSE, - flatten.diag = FALSE, - standardize.colnames = TRUE, - transpose_method = "tcrossprod", - adjacency_method = "direct", - saveable = FALSE, - resume = FALSE, - save_rate = 5, - save_rate_gen = save_rate, - save_rate_parlist = 1000 * save_rate, - save_path = "checkpoint/", - ...) 
{ - ped2com( - ped = ped, - max.gen = max.gen, - sparse = sparse, - verbose = verbose, - gc = gc, - component = "mitochondrial", - flatten.diag = flatten.diag, - standardize.colnames = standardize.colnames, - transpose_method = transpose_method, - adjacency_method = adjacency_method, - saveable = saveable, - resume = resume, - save_rate_gen = save_rate_gen, - save_rate_parlist = save_rate_parlist, - save_path = save_path - ) -} - -#' Take a pedigree and turn it into a common nuclear environmental relatedness matrix -#' @inheritParams ped2com -#' @inherit ped2com details -#' @export -#' -ped2cn <- function(ped, max.gen = 25, sparse = TRUE, verbose = FALSE, - gc = FALSE, flatten.diag = FALSE, - standardize.colnames = TRUE, - transpose_method = "tcrossprod", - saveable = FALSE, - resume = FALSE, - save_rate = 5, - adjacency_method = "indexed", - save_rate_gen = save_rate, - save_rate_parlist = 1000 * save_rate, - save_path = "checkpoint/", - ...) { - ped2com( - ped = ped, - max.gen = max.gen, - sparse = sparse, - verbose = verbose, - gc = gc, - component = "common nuclear", - adjacency_method = adjacency_method, - flatten.diag = flatten.diag, - standardize.colnames = standardize.colnames, - transpose_method = transpose_method, - saveable = saveable, - resume = resume, - save_rate_gen = save_rate_gen, - save_rate_parlist = save_rate_parlist, - save_path = save_path - ) -} -#' Take a pedigree and turn it into an extended environmental relatedness matrix -#' @inheritParams ped2com -#' @inherit ped2com details -#' @export -#' -ped2ce <- function(ped, - ...) 
{ - matrix(1, nrow = nrow(ped), ncol = nrow(ped), dimnames = list(ped$ID, ped$ID)) -} - - -#' Compute the transpose multiplication for the relatedness matrix -#' @inheritParams ped2com -#' @inherit ped2com details -#' @param r2 a relatedness matrix -#' -.computeTranspose <- function(r2, transpose_method = "tcrossprod", verbose = FALSE) { - if (!transpose_method %in% c("tcrossprod", "crossprod", "star", "tcross.alt.crossprod", "tcross.alt.star")) { - stop("Invalid method specified. Choose from 'tcrossprod', 'crossprod', or 'star'.") - } - if (transpose_method %in% c("crossprod", "tcross.alt.crossprod")) { - if (verbose) cat("Doing alt tcrossprod crossprod t \n") - return(crossprod(t(as.matrix(r2)))) - } else if (transpose_method %in% c("star", "tcross.alt.star")) { - if (verbose) cat("Doing alt tcrossprod %*% t \n") - return(r2 %*% t(as.matrix(r2))) - } else { - if (verbose) cat("Doing tcrossprod\n") - return(Matrix::tcrossprod(r2)) - } -} - -.adjLoop <- function(ped, component, saveable, resume, - save_path, verbose, lastComputed, - nr, checkpoint_files, update_rate, - parList, lens, save_rate_parlist, - ...) { - # Loop through each individual in the pedigree - # Build the adjacency matrix for parent-child relationships - # Is person in column j the parent of the person in row i? .5 for yes, 0 for no. 
- ped$momID <- as.numeric(ped$momID) - ped$dadID <- as.numeric(ped$dadID) - ped$ID <- as.numeric(ped$ID) - - for (i in (lastComputed + 1):nr) { - x <- ped[i, , drop = FALSE] - # Handle parentage according to the 'component' specified - if (component %in% c("generation", "additive")) { - # Code for 'generation' and 'additive' components - # Checks if is mom of ID or is dad of ID - xID <- as.numeric(x["ID"]) - sMom <- (xID == ped$momID) - sDad <- (xID == ped$dadID) - val <- sMom | sDad - val[is.na(val)] <- FALSE - } else if (component %in% c("common nuclear")) { - # Code for 'common nuclear' component - # IDs have the Same mom and Same dad - sMom <- (as.numeric(x["momID"]) == ped$momID) - sMom[is.na(sMom)] <- FALSE - sDad <- (as.numeric(x["dadID"]) == ped$dadID) - sDad[is.na(sDad)] <- FALSE - val <- sMom & sDad - } else if (component %in% c("mitochondrial")) { - # Code for 'mitochondrial' component - val <- (as.numeric(x["ID"]) == ped$momID) - val[is.na(val)] <- FALSE - } else { - stop("Unknown relatedness component requested") - } - # Storing the indices of the parent-child relationships - # Keep track of indices only, and then initialize a single sparse matrix - wv <- which(val) - parList[[i]] <- wv - lens[i] <- length(wv) - # Print progress if verbose is TRUE - if (verbose && (i %% update_rate == 0)) { - cat(paste0("Done with ", i, " of ", nr, "\n")) - } - # Checkpointing every save_rate iterations - if (saveable && (i %% save_rate_parlist == 0)) { - saveRDS(parList, file = checkpoint_files$parList) - saveRDS(lens, file = checkpoint_files$lens) - if (verbose) cat("Checkpointed parlist saved at iteration", i, "\n") - } - } - jss <- rep(1L:nr, times = lens) - iss <- unlist(parList) - list_of_adjacency <- list(iss = iss, jss = jss) - return(list_of_adjacency) -} - -.adjIndexed <- function(ped, component, saveable, resume, - save_path, verbose, lastComputed, - nr, checkpoint_files, update_rate, - parList, lens, save_rate_parlist, - ...) 
{ - # Loop through each individual in the pedigree - # Build the adjacency matrix for parent-child relationships - # Is person in column j the parent of the person in row i? .5 for yes, 0 for no. - - # Convert IDs - ped$ID <- as.numeric(ped$ID) - ped$momID <- as.numeric(ped$momID) - ped$dadID <- as.numeric(ped$dadID) - - # parent-child lookup - mom_index <- match(ped$momID, ped$ID, nomatch = 0) - dad_index <- match(ped$dadID, ped$ID, nomatch = 0) - - for (i in (lastComputed + 1):nr) { - if (component %in% c("generation", "additive")) { - sMom <- (mom_index == i) - sDad <- (dad_index == i) - val <- sMom | sDad - } else if (component %in% c("common nuclear")) { - # Code for 'common nuclear' component - # IDs have the Same mom and Same dad - sMom <- (ped$momID[i] == ped$momID) - sMom[is.na(sMom)] <- FALSE - sDad <- (ped$dadID[i] == ped$dadID) - sDad[is.na(sDad)] <- FALSE - val <- sMom & sDad - } else if (component %in% c("mitochondrial")) { - val <- (mom_index == i) - } else { - stop("Unknown relatedness component requested") - } - - val[is.na(val)] <- FALSE - parList[[i]] <- which(val) - lens[i] <- length(parList[[i]]) - - # Print progress if verbose is TRUE - if (verbose && (i %% update_rate == 0)) { - cat(paste0("Done with ", i, " of ", nr, "\n")) - } - - # Checkpointing every save_rate iterations - if (saveable && (i %% save_rate_parlist == 0)) { - saveRDS(parList, file = checkpoint_files$parList) - saveRDS(lens, file = checkpoint_files$lens) - if (verbose) cat("Checkpointed parlist saved at iteration", i, "\n") - } - } - jss <- rep(1L:nr, times = lens) - iss <- unlist(parList) - list_of_adjacency <- list(iss = iss, jss = jss) - return(list_of_adjacency) -} - -.adjDirect <- function(ped, component, saveable, resume, - save_path, verbose, lastComputed, - nr, checkpoint_files, update_rate, - parList, lens, save_rate_parlist, - ...) 
{ - # Loop through each individual in the pedigree - # Build the adjacency matrix for parent-child relationships - # Is person in column j the parent of the person in row i? .5 for yes, 0 for no. - uniID <- ped$ID # live dangerously without sort(unique(ped$ID)) - ped$ID <- as.numeric(factor(ped$ID, levels = uniID)) - ped$momID <- as.numeric(factor(ped$momID, levels = uniID)) - ped$dadID <- as.numeric(factor(ped$dadID, levels = uniID)) - - if (component %in% c("generation", "additive")) { - mIDs <- stats::na.omit(data.frame(rID = ped$ID, cID = ped$momID)) - dIDs <- stats::na.omit(data.frame(rID = ped$ID, cID = ped$dadID)) - iss <- c(mIDs$rID, dIDs$rID) - jss <- c(mIDs$cID, dIDs$cID) - } else if (component %in% c("common nuclear")) { - stop("Common Nuclear component is not yet implemented for direct method. Use index method.\n") - # change to warning and call indexed version - } else if (component %in% c("mitochondrial")) { - mIDs <- stats::na.omit(data.frame(rID = ped$ID, cID = ped$momID)) - iss <- c(mIDs$rID) - jss <- c(mIDs$cID) - } else { - stop("Unknown relatedness component requested") - } - list_of_adjacency <- list( - iss = iss, - jss = jss - ) - return(list_of_adjacency) -} - -#' Compute Parent Adjacency Matrix with Multiple Approaches -#' @inheritParams ped2com -#' @inherit ped2com details -#' @param nr the number of rows in the pedigree dataset -#' @param lastComputed the last computed index -#' @param parList a list of parent-child relationships -#' @param lens a vector of the lengths of the parent-child relationships -#' @param checkpoint_files a list of checkpoint files - -compute_parent_adjacency <- function(ped, component, - adjacency_method = "indexed", - saveable, resume, - save_path, verbose, - lastComputed, nr, checkpoint_files, update_rate, - parList, lens, save_rate_parlist, - ...) 
{ - if (adjacency_method == "loop") { - if (lastComputed < nr) { # Original version - list_of_adjacency <- .adjLoop( - ped = ped, - component = component, - saveable = saveable, - resume = resume, - save_path = save_path, - verbose = verbose, - lastComputed = lastComputed, - nr = nr, - checkpoint_files = checkpoint_files, - update_rate = update_rate, - parList = parList, - lens = lens, - save_rate_parlist = save_rate_parlist, - ... - ) - } - } else if (adjacency_method == "indexed") { # Garrison version - if (lastComputed < nr) { - list_of_adjacency <- .adjIndexed( - ped = ped, - component = component, - saveable = saveable, - resume = resume, - save_path = save_path, - verbose = verbose, - lastComputed = lastComputed, - nr = nr, - checkpoint_files = checkpoint_files, - update_rate = update_rate, - parList = parList, - lens = lens, - save_rate_parlist = save_rate_parlist, - ... - ) - } - } else if (adjacency_method == "direct") { # Hunter version - if (lastComputed < nr) { - list_of_adjacency <- .adjDirect( - ped = ped, - component = component, - saveable = saveable, - resume = resume, - save_path = save_path, - verbose = verbose, - lastComputed = lastComputed, - nr = nr, - checkpoint_files = checkpoint_files, - update_rate = update_rate, - parList = parList, - lens = lens, - save_rate_parlist = save_rate_parlist, - ... - ) - } - } else { - stop("Invalid method specified. 
Choose from 'loop', 'direct', or 'indexed'.") - } - if (saveable) { - saveRDS(parList, file = checkpoint_files$parList) - saveRDS(lens, file = checkpoint_files$lens) - if (verbose) { - cat("Final checkpoint saved for adjacency matrix.\n") - } - } - return(list_of_adjacency) -} diff --git a/R/helpGeneric.R b/R/helpGeneric.R index e8ef381c..4e1d8ca9 100644 --- a/R/helpGeneric.R +++ b/R/helpGeneric.R @@ -24,10 +24,9 @@ rmvn <- function(n, sigma) { #' @param x vector of any length #' @return replaces null values in a vector to NA #' -nullToNA <- function(x) { +null2NA <- function(x) { if (length(x) == 0) { x <- NA - # Handle case when x is a list } else if (is.list(x)) { for (i in seq_along(x)) { @@ -39,6 +38,9 @@ nullToNA <- function(x) { return(x) } +#' @rdname null2NA +#' +nullToNA <- null2NA #' modified tryCatch function #' @@ -46,10 +48,14 @@ nullToNA <- function(x) { #' @keywords internal #' @return Fuses the nullToNA function with efunc #' -try_na <- function(x) { - nullToNA(tryCatch(x, error = efunc)) +tryNA <- function(x) { + null2NA(tryCatch(x, error = efunc)) } +#' @rdname tryNA +#' @keywords internal +#' +try_na <- tryNA #' Compute the null space of a matrix #' #' @param M a matrix of which the null space is desired @@ -97,73 +103,3 @@ resample <- function(x, ...) { } x[sample.int(length(x), ...)] } - - -#' SimPed (Deprecated) -#' -#' This function is a wrapper around the new `simulatePedigree` function. -#' `SimPed` has been deprecated, and it's advised to use `simulatePedigree` directly. -#' -#' @param ... Arguments to be passed to `simulatePedigree`. -#' @return The same result as calling `simulatePedigree`. -#' @seealso \code{\link{simulatePedigree}} for the updated function. -#' @description When calling this function, a warning will be issued about its deprecation. -#' @keywords deprecated -#' @examples -#' \dontrun{ -#' # This is an example of the deprecated function: -#' SimPed(...) -#' # It is recommended to use: -#' simulatePedigree(...) 
-#' } -#' @export -SimPed <- function(...) { # nolint: object_name_linter. - warning("The 'SimPed' function is deprecated. Please use 'simulatePedigree' instead.") - simulatePedigree(...) -} - -#' related_coef (Deprecated) -#' -#' This function is a wrapper around the new `calculateRelatedness` function. -#' `related_coef` has been deprecated, and it's advised to use `calculateRelatedness` directly. -#' -#' @param ... Arguments to be passed to `calculateRelatedness`. -#' @return The same result as calling `calculateRelatedness`. -#' @seealso \code{\link{calculateRelatedness}} for the updated function. -#' @description When calling this function, a warning will be issued about its deprecation. -#' @keywords deprecated -#' @examples -#' \dontrun{ -#' # This is an example of the deprecated function: -#' related_coef(...) -#' # It is recommended to use: -#' calculateRelatedness(...) -#' } -#' @export -related_coef <- function(...) { - warning("The 'related_coef' function is deprecated. Please use 'calculateRelatedness' instead.") - calculateRelatedness(...) -} - -#' relatedness (Deprecated) -#' -#' This function is a wrapper around the new `inferRelatedness` function. -#' `relatedness` has been deprecated, and it's advised to use `inferRelatedness` directly. -#' -#' @param ... Arguments to be passed to `inferRelatedness`. -#' @return The same result as calling `inferRelatedness`. -#' @seealso \code{\link{inferRelatedness}} for the updated function. -#' @description When calling this function, a warning will be issued about its deprecation. -#' @keywords deprecated -#' @examples -#' \dontrun{ -#' # This is an example of the deprecated function: -#' relatedness(...) -#' # It is recommended to use: -#' inferRelatedness(...) -#' } -#' @export -relatedness <- function(...) { - warning("The 'relatedness' function is deprecated. Please use 'inferRelatedness' instead.") - inferRelatedness(...) 
-} diff --git a/R/helpPedigree.R b/R/helpPedigree.R index 7b6bb4be..87263fb9 100644 --- a/R/helpPedigree.R +++ b/R/helpPedigree.R @@ -39,15 +39,17 @@ createGenDataFrame <- function(sizeGens, genIndex, idGen) { #' #' @param idGen Vector of IDs for the generation. #' @param sexR Numeric value indicating the sex ratio (proportion of males). +#' @param code_male The value to use for males. Default is "M" +#' @param code_female The value to use for females. Default is "F" #' @return Vector of sexes ("M" for male, "F" for female) for the offspring. #' @importFrom stats runif -determineSex <- function(idGen, sexR) { +determineSex <- function(idGen, sexR, code_male = "M", code_female = "F") { if (runif(1) > .5) { - sexVec1 <- rep("M", floor(length(idGen) * sexR)) - sexVec2 <- rep("F", length(idGen) - length(sexVec1)) + sexVec1 <- rep(code_male, floor(length(idGen) * sexR)) + sexVec2 <- rep(code_female, length(idGen) - length(sexVec1)) } else { - sexVec1 <- rep("F", floor(length(idGen) * (1 - sexR))) - sexVec2 <- rep("M", length(idGen) - length(sexVec1)) + sexVec1 <- rep(code_female, floor(length(idGen) * (1 - sexR))) + sexVec2 <- rep(code_male, length(idGen) - length(sexVec1)) } sexVec <- sample(c(sexVec1, sexVec2)) return(sexVec) @@ -60,7 +62,8 @@ determineSex <- function(idGen, sexR) { #' #' @param df_Ngen The dataframe for the current generation, including columns for individual IDs and spouse IDs. #' @return The input dataframe augmented with a 'coupleId' column, where each mated pair has a unique identifier. 
-assignCoupleIds <- function(df_Ngen) { +#' @export +assignCoupleIDs <- function(df_Ngen) { df_Ngen$coupleId <- NA_character_ # Initialize the coupleId column with NAs usedCoupleIds <- character() # Initialize an empty character vector to track used IDs @@ -86,6 +89,10 @@ assignCoupleIds <- function(df_Ngen) { return(df_Ngen) } + +#' @rdname assignCoupleIDs +assignCoupleIds <- assignCoupleIDs + #' Generate or Adjust Number of Kids per Couple Based on Mating Rate #' #' This function generates or adjusts the number of kids per couple in a generation diff --git a/R/insertEven.R b/R/insertEven.R index ba2d55e9..573bd8c9 100644 --- a/R/insertEven.R +++ b/R/insertEven.R @@ -12,7 +12,7 @@ #' @export #' @seealso \code{\link{SimPed}} for the main function that uses this supporting function. -evenInsert <- function(m, n, verbose = FALSE) { +insertEven <- function(m, n, verbose = FALSE) { if (length(m) > length(n)) { temp <- m m <- n @@ -36,3 +36,7 @@ evenInsert <- function(m, n, verbose = FALSE) { return(vec) } + +#' @rdname insertEven +#' @export +evenInsert <- insertEven diff --git a/R/makeLinks.R b/R/makeLinks.R index 9517b533..37c44274 100644 --- a/R/makeLinks.R +++ b/R/makeLinks.R @@ -17,10 +17,12 @@ #' @param legacy Logical. If TRUE, uses the legacy branch of the function. #' @param outcome_name Character string representing the outcome name (used in file naming). #' @param drop_upper_triangular Logical. If TRUE, drops the upper triangular portion of the matrix. +#' @param include_all_links_1ped Logical. If TRUE, includes all links in the output. (Default is true when only one ped is provided) #' @param ... Additional arguments to be passed to \code{\link{com2links}} #' #' @return A data frame of related pairs if \code{writetodisk} is FALSE; otherwise, writes the results to disk. 
-#' @export +#' @export com2links + com2links <- function( rel_pairs_file = "dataRelatedPairs.csv", ad_ped_matrix = NULL, @@ -38,462 +40,268 @@ com2links <- function( legacy = FALSE, outcome_name = "data", drop_upper_triangular = TRUE, + include_all_links_1ped = FALSE, ...) { - # Non-legacy mode processing - - if (!legacy) { - # --- Input Validations and Preprocessing --- - - # Ensure that at least one relationship matrix is provided. - if (is.null(ad_ped_matrix) && is.null(mit_ped_matrix) && is.null(cn_ped_matrix)) { - stop("At least one of 'ped_matrix', 'mit_ped_matrix', or 'cn_ped_matrix' must be provided.") - } - # Validate and convert ad_ped_matrix to a sparse dgCMatrix if provided. - if (!is.null(ad_ped_matrix)) { - if (!inherits(ad_ped_matrix, c("matrix", "dgCMatrix", "dsCMatrix"))) { - stop("The 'ad_ped_matrix' must be a matrix or dgCMatrix.") - } - # convert to sparse - if (!inherits(ad_ped_matrix, "dgCMatrix")) { - ad_ped_matrix <- methods::as(ad_ped_matrix, "dgCMatrix") - } - } - - # Validate and convert cn_ped_matrix to a sparse dgCMatrix if provided. - if (!is.null(cn_ped_matrix)) { - if (!inherits(cn_ped_matrix, c("matrix", "dgCMatrix", "dsCMatrix"))) { - stop("The 'cn_ped_matrix' must be a matrix or dgCMatrix.") - } - # convert to sparse - if (!inherits(cn_ped_matrix, "dgCMatrix")) { - cn_ped_matrix <- methods::as(cn_ped_matrix, "dgCMatrix") - } - # Ensure CN matrix is symmetric. - cn_ped_matrix <- methods::as(cn_ped_matrix, "symmetricMatrix") - } - - # Validate and process mit_ped_matrix: convert and ensure binary values. 
- if (!is.null(mit_ped_matrix)) { - if (!inherits(mit_ped_matrix, c("matrix", "dgCMatrix", "dsCMatrix"))) { - stop("The 'mit_ped_matrix' must be a matrix or dgCMatrix.") - } - if (!inherits(mit_ped_matrix, "dgCMatrix")) { - mit_ped_matrix <- methods::as(mit_ped_matrix, "symmetricMatrix") - } - # Ensure mitochondrial matrix values are binary (0/1) - mit_ped_matrix@x[mit_ped_matrix@x > 0] <- 1 - } + # --- Input Validations and Preprocessing --- - # --- Build IDs and Prepare Matrix Pointers --- - - # Extract individual IDs from the first available matrix. - ids <- NULL - if (!is.null(cn_ped_matrix)) { - ids <- as.numeric(dimnames(cn_ped_matrix)[[1]]) - nc <- ncol(cn_ped_matrix) - } else if (!is.null(ad_ped_matrix)) { - ids <- as.numeric(dimnames(ad_ped_matrix)[[1]]) - nc <- ncol(ad_ped_matrix) - } else if (!is.null(mit_ped_matrix)) { - ids <- as.numeric(dimnames(mit_ped_matrix)[[1]]) - nc <- ncol(mit_ped_matrix) - } - - if (is.null(ids)) { - stop("Could not extract IDs from the provided matrices.") - } + # Ensure that at least one relationship matrix is provided. + if (is.null(ad_ped_matrix) && is.null(mit_ped_matrix) && is.null(cn_ped_matrix)) { + stop("At least one of 'ad_ped_matrix', 'mit_ped_matrix', or 'cn_ped_matrix' must be provided.") + } + # Validate and convert ad_ped_matrix to a sparse dgCMatrix if provided. + if (!is.null(ad_ped_matrix)) { + ad_ped_matrix <- validate_and_convert_matrix( + mat = ad_ped_matrix, + name = "ad_ped_matrix" + ) + } - # Count how many matrices are provided. - sum_nulls <- sum(!is.null(ad_ped_matrix), - !is.null(mit_ped_matrix), - !is.null(cn_ped_matrix), - na.rm = TRUE + # Validate and convert cn_ped_matrix to a sparse dgCMatrix if provided. + if (!is.null(cn_ped_matrix)) { + cn_ped_matrix <- validate_and_convert_matrix( + mat = cn_ped_matrix, + name = "cn_ped_matrix", + ensure_symmetric = TRUE ) - if (verbose) { - print(sum_nulls) - } + } - # Extract the internal pointers (p, i, and x slots) for each provided matrix. 
- if (!is.null(ad_ped_matrix)) { - ad_ped_p <- ad_ped_matrix@p + 1L - ad_ped_i <- ad_ped_matrix@i + 1L - ad_ped_x <- ad_ped_matrix@x - } - if (!is.null(mit_ped_matrix)) { - mt_p <- mit_ped_matrix@p + 1L - mt_i <- mit_ped_matrix@i + 1L - mt_x <- mit_ped_matrix@x - } - if (!is.null(cn_ped_matrix)) { - cn_p <- cn_ped_matrix@p + 1L - cn_i <- cn_ped_matrix@i + 1L - cn_x <- cn_ped_matrix@x - } + # Validate and process mit_ped_matrix: convert and ensure binary values. + if (!is.null(mit_ped_matrix)) { + mit_ped_matrix <- validate_and_convert_matrix( + mat = mit_ped_matrix, + name = "mit_ped_matrix", force_binary = TRUE, + ensure_symmetric = TRUE + ) + } - # --- Process Based on the Number of Provided Matrices --- - # --- Case: All Three Matrices Provided --- - if (sum_nulls == 3) { - # Set pointers for all three matrices. - newColPos1 <- ad_ped_p - iss1 <- ad_ped_i - x1 <- ad_ped_x + # --- Build IDs and Prepare Matrix Pointers --- - newColPos2 <- mt_p - iss2 <- mt_i - x2 <- mt_x + # Extract individual IDs from the first available matrix. + ids <- NULL - newColPos3 <- cn_p - iss3 <- cn_i - x3 <- cn_x - # Define relationship column names. - relNames <- c("addRel", "mitRel", "cnuRel") - # Optionally remove the original pointers to free memory. 
- if (gc == TRUE) { - remove(ad_ped_p, ad_ped_i, ad_ped_x, mt_p, mt_i, mt_x, cn_p, cn_i, cn_x) - } - if (verbose) { - message("All 3 matrix is present") - } + if (!is.null(cn_ped_matrix)) { + ids <- as.numeric(dimnames(cn_ped_matrix)[[1]]) + nc <- ncol(cn_ped_matrix) + } else if (!is.null(ad_ped_matrix)) { + ids <- as.numeric(dimnames(ad_ped_matrix)[[1]]) + nc <- ncol(ad_ped_matrix) + } else if (!is.null(mit_ped_matrix)) { + ids <- as.numeric(dimnames(mit_ped_matrix)[[1]]) + nc <- ncol(mit_ped_matrix) + } - # File names - # rel_pairs_file <- paste0(outcome_name, "_dataRelatedPairs.csv") - # mapa_id_file <- paste0(outcome_name, "_data_mapaID.csv") + if (is.null(ids)) { + stop("Could not extract IDs from the provided matrices.") + } - # Initialize the related pairs file with headers. - df_relpairs <- data.frame( - ID1 = numeric(0), ID2 = numeric(0) - ) - df_relpairs[[relNames[1]]] <- numeric(0) - df_relpairs[[relNames[2]]] <- numeric(0) - df_relpairs[[relNames[3]]] <- numeric(0) - - # Write the headers to the related pairs file. - if (writetodisk == TRUE) { - utils::write.table( - df_relpairs, - file = rel_pairs_file, sep = ",", append = FALSE, row.names = FALSE - ) + # --- matrix_case construction and switch dispatch --- + matrix_case <- paste(sort(c( + if (!is.null(ad_ped_matrix)) "ad" else NULL, + if (!is.null(mit_ped_matrix)) "mt" else NULL, + if (!is.null(cn_ped_matrix)) "cn" else NULL + )), collapse = "-") - # Prepare an empty buffer for batching writes. - write_buffer <- list() - remove(df_relpairs) - } + if (verbose) { + print(matrix_case) + } - # Loop over each column (individual) in the matrix. 
- for (j in 1L:nc) { - ID2 <- ids[j] - - # Extract column indices for the 1st component - ncp1 <- newColPos1[j] - ncp1p <- newColPos1[j + 1L] - cond1 <- ncp1 < ncp1p - if (cond1) { - vv1 <- ncp1:(ncp1p - 1L) - iss1vv <- iss1[vv1] - } - # Extract indices for the 2nd component - ncp2 <- newColPos2[j] - ncp2p <- newColPos2[j + 1L] - cond2 <- ncp2 < ncp2p - if (cond2) { - vv2 <- ncp2:(ncp2p - 1L) - iss2vv <- iss2[vv2] - } + switch(matrix_case, + "ad" = process_one( + matrix = ad_ped_matrix, + rel_name = "addRel", + ids = ids, + nc = nc, + rel_pairs_file = rel_pairs_file, + writetodisk = writetodisk, + write_buffer_size = write_buffer_size, + drop_upper_triangular = drop_upper_triangular, + update_rate = update_rate, + verbose = verbose, + gc = gc, + include_all_links = include_all_links_1ped, + ... + ), + "mt" = process_one( + matrix = mit_ped_matrix, + rel_name = "mitRel", + ids = ids, + nc = nc, + rel_pairs_file = rel_pairs_file, + writetodisk = writetodisk, + write_buffer_size = write_buffer_size, + drop_upper_triangular = drop_upper_triangular, + update_rate = update_rate, + verbose = verbose, + gc = gc, + include_all_links = include_all_links_1ped, + ... + ), + "cn" = process_one( + matrix = cn_ped_matrix, + rel_name = "cnuRel", + ids = ids, + nc = nc, + rel_pairs_file = rel_pairs_file, + writetodisk = writetodisk, + write_buffer_size = write_buffer_size, + drop_upper_triangular = drop_upper_triangular, + update_rate = update_rate, + verbose = verbose, + gc = gc, + include_all_links = include_all_links_1ped, + ... + ), + "ad-mt" = process_two( + matrix1 = ad_ped_matrix, + name1 = "addRel", + matrix2 = mit_ped_matrix, + name2 = "mitRel", + ids = ids, + nc = nc, + rel_pairs_file = rel_pairs_file, + writetodisk = writetodisk, + write_buffer_size = write_buffer_size, + drop_upper_triangular = drop_upper_triangular, + update_rate = update_rate, + verbose = verbose, + gc = gc, + ... 
+ ), + "ad-cn" = process_two( + matrix1 = ad_ped_matrix, + name1 = "addRel", + matrix2 = cn_ped_matrix, + name2 = "cnuRel", + ids = ids, + nc = nc, + rel_pairs_file = rel_pairs_file, + writetodisk = writetodisk, + write_buffer_size = write_buffer_size, + drop_upper_triangular = drop_upper_triangular, + update_rate = update_rate, + verbose = verbose, + gc = gc, + ... + ), + "cn-mt" = process_two( + matrix1 = cn_ped_matrix, + name1 = "cnuRel", + matrix2 = mit_ped_matrix, + name2 = "mitRel", + ids = ids, + nc = nc, + rel_pairs_file = rel_pairs_file, + writetodisk = writetodisk, + write_buffer_size = write_buffer_size, + drop_upper_triangular = drop_upper_triangular, + update_rate = update_rate, + verbose = verbose, + gc = gc, + ... + ), + "ad-cn-mt" = process_all_three( + mat1 = ad_ped_matrix, + name1 = "addRel", + mat2 = mit_ped_matrix, + name2 = "mitRel", + mat3 = cn_ped_matrix, + name3 = "cnuRel", + ids = ids, + nc = nc, + rel_pairs_file = rel_pairs_file, + writetodisk = writetodisk, + write_buffer_size = write_buffer_size, + drop_upper_triangular = drop_upper_triangular, + update_rate = update_rate, + verbose = verbose, + gc = gc, + ... + ), + stop("Unsupported matrix combination") + ) +} +#' Convert Sparse Relationship Matrices to Kinship Links for one Matrix +#' @inheritParams com2links +#' @param include_all_links Logical. If TRUE, all links are included in the output. +#' @keywords internal - # Extract indices for the 3rd component - ncp3 <- newColPos3[j] - ncp3p <- newColPos3[j + 1L] - cond3 <- ncp3 < ncp3p - if (cond3) { - vv3 <- ncp3:(ncp3p - 1L) - iss3vv <- iss3[vv3] - } - # Create a unique, sorted set of row indices from all provided matrices. - u <- sort(igraph::union(igraph::union(if (cond1) { - iss1vv - }, if (cond2) { - iss2vv - }), if (cond3) { - iss3vv - })) - - # If any relationships exist for this individual, build the related pairs. 
- if (cond1 || cond2 || cond3) { - ID1 <- ids[u] - tds <- data.frame(ID1 = ID1, ID2 = ID2) - tds[[relNames[1]]] <- 0 - tds[[relNames[2]]] <- 0 - tds[[relNames[3]]] <- 0 - - # Assign the relationship coefficients from each matrix. - if (cond1) { - tds[u %in% iss1vv, relNames[1]] <- x1[vv1] - } - if (cond2) { - tds[u %in% iss2vv, relNames[2]] <- x2[vv2] - } - if (cond3) { - tds[u %in% iss3vv, relNames[3]] <- x3[vv3] - } - # Optionally drop upper-triangular entries. - if (drop_upper_triangular == TRUE) { - tds <- tds[tds$ID1 <= tds$ID2, ] # or < if you want strictly lower triangle - } +process_one <- function(matrix, rel_name, ids, nc, rel_pairs_file, writetodisk, + write_buffer_size, drop_upper_triangular, update_rate, verbose, gc, + include_all_links = TRUE, ...) { + if (include_all_links == FALSE) { + # Extract pointers and indices from the matrix. + newColPos <- matrix@p + 1L + iss <- matrix@i + 1L + x <- matrix@x - # Write the batch to disk or accumulate in the data frame. - if (nrow(tds) > 0) { - if (writetodisk == TRUE) { - write_buffer[[length(write_buffer) + 1]] <- tds - - if (length(write_buffer) >= write_buffer_size) { # Write in batches - utils::write.table(do.call(rbind, write_buffer), - file = rel_pairs_file, - row.names = FALSE, col.names = FALSE, append = TRUE, sep = "," - ) - write_buffer <- list() - } - } else { - df_relpairs <- rbind(df_relpairs, tds) - } - } - } - if (verbose) { - if (!(j %% update_rate)) { - cat(paste0("Done with ", j, " of ", nc, "\n")) - } - } - } - } else if (sum_nulls == 2) { - # --- Case: Two Matrices Provided --- - # Set pointers and relationship names based on which matrix is missing. 
- - if (is.null(ad_ped_matrix)) { - newColPos1 <- mt_p - iss1 <- mt_i - x1 <- mt_x - newColPos2 <- cn_p - iss2 <- cn_i - x2 <- cn_x - relNames <- c("mitRel", "cnuRel") - if (gc == TRUE) { - remove(mt_p, mt_i, mt_x, cn_p, cn_i, cn_x) - } - } - if (is.null(mit_ped_matrix)) { - newColPos1 <- ad_ped_p - iss1 <- ad_ped_i - x1 <- ad_ped_x - newColPos2 <- cn_p - iss2 <- cn_i - x2 <- cn_x - relNames <- c("addRel", "cnuRel") - if (gc == TRUE) { - remove(ad_ped_p, ad_ped_i, ad_ped_x, cn_p, cn_i, cn_x) - } - } - if (is.null(cn_ped_matrix)) { - newColPos1 <- ad_ped_p - iss1 <- ad_ped_i - x1 <- ad_ped_x - newColPos2 <- mt_p - iss2 <- mt_i - x2 <- mt_x - relNames <- c("addRel", "mitRel") - if (gc == TRUE) { - remove(ad_ped_p, ad_ped_i, ad_ped_x, mt_p, mt_i, mt_x) - } - } + # Initialize the related pairs file with headers. + df_relpairs <- initialize_empty_df(relNames = rel_name) - # Initialize the related pairs file with the appropriate headers. - df_relpairs <- data.frame( - ID1 = numeric(0), ID2 = numeric(0) + if (writetodisk == TRUE) { + utils::write.table( + df_relpairs, + file = rel_pairs_file, sep = ",", append = FALSE, row.names = FALSE ) - df_relpairs[[relNames[1]]] <- numeric(0) - df_relpairs[[relNames[2]]] <- numeric(0) - if (writetodisk == TRUE) { - utils::write.table( - df_relpairs, - file = rel_pairs_file, sep = ",", append = FALSE, row.names = FALSE - ) - # initial buffer - write_buffer <- list() - remove(df_relpairs) - } - # Process each column to extract relationships. - for (j in 1L:nc) { - ID2 <- ids[j] - - # Extract indices from the first matrix. - ncp1 <- newColPos1[j] - ncp1p <- newColPos1[j + 1L] - cond1 <- ncp1 < ncp1p - if (cond1) { - vv1 <- ncp1:(ncp1p - 1L) - iss1vv <- iss1[vv1] - } - # Extract indices from the second matrix. - ncp2 <- newColPos2[j] - ncp2p <- newColPos2[j + 1L] - cond2 <- ncp2 < ncp2p - if (cond2) { - vv2 <- ncp2:(ncp2p - 1L) - iss2vv <- iss2[vv2] - } + # Prepare an empty buffer for batching writes. 
+ write_buffer <- list() + remove(df_relpairs) + } - # Merge the indices from both matrices. - u <- sort(igraph::union(if (cond1) { - iss1vv - }, if (cond2) { - iss2vv - })) - - # Create related pairs if relationships are found. - if (cond1 || cond2) { - ID1 <- ids[u] - tds <- data.frame(ID1 = ID1, ID2 = ID2) - tds[[relNames[1]]] <- 0 - tds[[relNames[2]]] <- 0 - - if (cond1) { - tds[u %in% iss1vv, relNames[1]] <- x1[vv1] - } - if (cond2) { - tds[u %in% iss2vv, relNames[2]] <- x2[vv2] - } - if (drop_upper_triangular == TRUE) { - tds <- tds[tds$ID1 <= tds$ID2, ] # or < if you want strictly lower triangle - } + # Process each column in the matrix. + for (j in 1L:nc) { + ID2 <- ids[j] - # Write the batch to disk or accumulate in the data frame. - if (nrow(tds) > 0) { - if (writetodisk == TRUE) { - write_buffer[[length(write_buffer) + 1]] <- tds - - if (length(write_buffer) >= write_buffer_size) { # Write in batches - utils::write.table(do.call(rbind, write_buffer), - file = rel_pairs_file, - row.names = FALSE, col.names = FALSE, append = TRUE, sep = "," - ) - write_buffer <- list() - } - } else { - df_relpairs <- rbind(df_relpairs, tds) - } - } - } - if (verbose) { - if (!(j %% update_rate)) { - cat(paste0("Done with ", j, " of ", nc, "\n")) - } - } - } - } else if (sum_nulls == 1) { - # --- Case: Only One Matrix Provided --- - if (verbose) { - message("Only one matrix is present") - } - if (!is.null(ad_ped_matrix)) { - newColPos1 <- ad_ped_p - iss1 <- ad_ped_i - x1 <- ad_ped_x - relNames <- c("addRel") - if (gc == TRUE) { - remove(ad_ped_p, ad_ped_i, ad_ped_x) - } - } - if (!is.null(mit_ped_matrix)) { - newColPos1 <- mt_p - iss1 <- mt_i - x1 <- mt_x - relNames <- c("mitRel") - if (gc == TRUE) { - remove(mt_p, mt_i, mt_x) - } - } - if (!is.null(cn_ped_matrix)) { - newColPos1 <- cn_p - iss1 <- cn_i - x1 <- cn_x - relNames <- c("cnuRel") - if (gc == TRUE) { - remove(cn_p, cn_i, cn_x) - } + # Extract column indices + ncp <- newColPos[j] + ncpp <- newColPos[j + 1L] + cond 
<- ncp < ncpp + if (cond) { + vv <- ncp:(ncpp - 1L) + issvv <- iss[vv] } - # Initialize the related pairs file. - df_relpairs <- data.frame( - ID1 = numeric(0), ID2 = numeric(0) - ) - df_relpairs[[relNames[1]]] <- numeric(0) - if (writetodisk == TRUE) { - utils::write.table( - df_relpairs, - file = rel_pairs_file, sep = ",", append = FALSE, row.names = FALSE - ) - - # initial buffer - write_buffer <- list() - - remove(df_relpairs) - } + # Create a unique set of row indices. + u <- sort(issvv) - # Process each column. - for (j in 1L:nc) { - ID2 <- ids[j] - # Extract column indices - ncp1 <- newColPos1[j] - ncp1p <- newColPos1[j + 1L] - cond1 <- ncp1 < ncp1p - if (cond1) { - vv1 <- ncp1:(ncp1p - 1L) - iss1vv <- iss1[vv1] + # If any relationships exist for this individual, build the related pairs. + if (cond) { + # Create a data frame with unique pairs. + ID1 <- ids[u] + tds <- data.frame(ID1 = ID1, ID2 = ID2) + tds[[rel_name]] <- 0 + if (cond) { + tds[u %in% issvv, rel_name] <- x[vv] + } + if (drop_upper_triangular == TRUE) { + tds <- tds[tds$ID1 <= tds$ID2, ] # or < if you want strictly lower triangle } - # Use the indices from the single matrix. - u <- sort(iss1vv) - - if (cond1) { - ID1 <- ids[u] - tds <- data.frame(ID1 = ID1, ID2 = ID2) - tds[[relNames[1]]] <- 0 - - if (cond1) { - tds[u %in% iss1vv, relNames[1]] <- x1[vv1] - } - if (drop_upper_triangular == TRUE) { - tds <- tds[tds$ID1 <= tds$ID2, ] # or < if you want strictly lower triangle - } - - # Write the batch to disk or accumulate in the data frame. - if (nrow(tds) > 0) { - if (writetodisk == TRUE) { - write_buffer[[length(write_buffer) + 1]] <- tds - - if (length(write_buffer) >= write_buffer_size) { # Write in batches - utils::write.table(do.call(rbind, write_buffer), - file = rel_pairs_file, - row.names = FALSE, col.names = FALSE, append = TRUE, sep = "," - ) - write_buffer <- list() - } - } else { - df_relpairs <- rbind(df_relpairs, tds) + # Write the batch to disk or accumulate in the data frame. 
+ if (nrow(tds) > 0) { + if (writetodisk == TRUE) { + write_buffer[[length(write_buffer) + 1]] <- tds + + if (length(write_buffer) >= write_buffer_size) { # Write in batches + utils::write.table(do.call(rbind, write_buffer), + file = rel_pairs_file, + row.names = FALSE, col.names = FALSE, append = TRUE, sep = "," + ) + write_buffer <- list() } + } else { + df_relpairs <- rbind(df_relpairs, tds) } } - if (verbose && !(j %% update_rate)) { - cat(paste0("Done with ", j, " of ", nc, "\n")) - } } - } else { - stop("No matrices provided") + if (verbose && (j %% update_rate == 0L)) { + cat("Done with", j, "of", nc, "\n") + } } - # If not writing to disk, return the accumulated data frame. if (writetodisk == FALSE) { return(df_relpairs) @@ -505,110 +313,273 @@ com2links <- function( row.names = FALSE, col.names = FALSE, append = TRUE, sep = "," ) } - # return(NULL) } - } else if (legacy) { - # --- Legacy Mode --- - if (verbose) { - message("Using legacy mode") + if (gc == TRUE) { + remove(newColPos, iss, x) } - # In legacy mode, convert matrices to the expected symmetric formats. - - # load(paste0(outcome_name,'_dataBiggestCnPedigree.Rdata')) - # biggestCnPed <- methods::as(biggestCnPed, "symmetricMatrix") - # load(paste0(outcome_name,'_dataBiggestPedigree.Rdata')) - # load(paste0(outcome_name,'_dataBiggestMtPedigree.Rdata')) - - # rel_pairs_file <- paste0(outcome_name,'_datacnmitBiggestRelatedPairsTake3.csv') - - biggestMtPed <- mit_ped_matrix - remove(mit_ped_matrix) - biggestCnPed <- methods::as(cn_ped_matrix, "symmetricMatrix") - remove(cn_ped_matrix) - biggestPed <- ad_ped_matrix - remove(ad_ped_matrix) - biggestMtPed@x[biggestMtPed@x > 0] <- 1 - - # Set the output file name. 
- if (exists("rel_pairs_file")) { - fname <- rel_pairs_file + } else { + matrix2 <- matrix(rep(1, length(ids)^2), + nrow = length(ids), + dimnames = list(ids, ids) + ) + process_two( + matrix2 = matrix, name2 = rel_name, + matrix1 = methods::as(matrix2, "CsparseMatrix"), + name1 = "phantom", + ids = ids, + nc = nc, + rel_pairs_file = rel_pairs_file, + writetodisk = writetodisk, + write_buffer_size = write_buffer_size, + drop_upper_triangular = drop_upper_triangular, + update_rate = update_rate, + verbose = verbose, + gc = gc + ) + } +} + +process_all_three <- function( + mat1, name1, + mat2, name2, + mat3, name3, + ids, nc, + rel_pairs_file, + writetodisk, + write_buffer_size, + drop_upper_triangular, + update_rate, + verbose, + gc, + ...) { + # Extract matrix slots + p1 <- mat1@p + 1L + i1 <- mat1@i + 1L + x1 <- mat1@x + p2 <- mat2@p + 1L + i2 <- mat2@i + 1L + x2 <- mat2@x + p3 <- mat3@p + 1L + i3 <- mat3@i + 1L + x3 <- mat3@x + + relNames <- c(name1, name2, name3) + df_relpairs <- initialize_empty_df(relNames) + + if (writetodisk) { + utils::write.table(df_relpairs, file = rel_pairs_file, sep = ",", append = FALSE, row.names = FALSE) + write_buffer <- list() + rm(df_relpairs) + } + + for (j in seq_len(nc)) { + ID2 <- ids[j] + + # Get index spans + v1 <- if (p1[j] < p1[j + 1L]) { + idx <- p1[j]:(p1[j + 1L] - 1L) + list(i = i1[idx], x = x1[idx]) } else { - fname <- paste0(outcome_name, "_dataBiggestRelatedPairsTake2.csv") + NULL } - # Initialize the output file with headers. - ds <- data.frame(ID1 = numeric(0), ID2 = numeric(0), addRel = numeric(0), mitRel = numeric(0), cnuRel = numeric(0)) - utils::write.table(ds, file = fname, sep = ",", append = FALSE, row.names = FALSE) - - # Extract IDs from the common nuclear matrix. - ids <- as.numeric(dimnames(biggestCnPed)[[1]]) - - # Extract pointers from the legacy matrices. 
- newColPos1 <- biggestPed@p + 1L - iss1 <- biggestPed@i + 1L - newColPos2 <- biggestMtPed@p + 1L - iss2 <- biggestMtPed@i + 1L - newColPos3 <- biggestCnPed@p + 1L - iss3 <- biggestCnPed@i + 1L - nc <- ncol(biggestPed) - - # Process each individual. - for (j in 1L:nc) { - ID2 <- ids[j] - ncp1 <- newColPos1[j] - ncp1p <- newColPos1[j + 1L] - cond1 <- ncp1 < ncp1p - if (cond1) { - vv1 <- ncp1:(ncp1p - 1L) - iss1vv <- iss1[vv1] + v2 <- if (p2[j] < p2[j + 1L]) { + idx <- p2[j]:(p2[j + 1L] - 1L) + list(i = i2[idx], x = x2[idx]) + } else { + NULL + } + v3 <- if (p3[j] < p3[j + 1L]) { + idx <- p3[j]:(p3[j + 1L] - 1L) + list(i = i3[idx], x = x3[idx]) + } else { + NULL + } + + # Union of index positions + u <- sort(unique(c( + if (!is.null(v1)) v1$i else NULL, + if (!is.null(v2)) v2$i else NULL, + if (!is.null(v3)) v3$i else NULL + ))) + if (length(u) > 0) { + ID1 <- ids[u] + tds <- data.frame(ID1 = ID1, ID2 = ID2) + tds[[name1]] <- if (!is.null(v1)) ifelse(u %in% v1$i, v1$x[match(u, v1$i)], 0) else 0 + tds[[name2]] <- if (!is.null(v2)) ifelse(u %in% v2$i, v2$x[match(u, v2$i)], 0) else 0 + tds[[name3]] <- if (!is.null(v3)) ifelse(u %in% v3$i, v3$x[match(u, v3$i)], 0) else 0 + + if (drop_upper_triangular) { + tds <- tds[tds$ID1 <= tds$ID2, ] } - ncp2 <- newColPos2[j] - ncp2p <- newColPos2[j + 1L] - cond2 <- ncp2 < ncp2p - if (cond2) { - vv2 <- ncp2:(ncp2p - 1L) - iss2vv <- iss2[vv2] + + if (nrow(tds) > 0) { + if (writetodisk) { + write_buffer[[length(write_buffer) + 1L]] <- tds + if (length(write_buffer) >= write_buffer_size) { + utils::write.table(do.call(rbind, write_buffer), + file = rel_pairs_file, row.names = FALSE, + col.names = FALSE, append = TRUE, sep = "," + ) + write_buffer <- list() + } + } else { + df_relpairs <- rbind(df_relpairs, tds) + } } - ncp3 <- newColPos3[j] - ncp3p <- newColPos3[j + 1L] - cond3 <- ncp3 < ncp3p - if (cond3) { - vv3 <- ncp3:(ncp3p - 1L) - iss3vv <- iss3[vv3] + } + + if (verbose && (j %% update_rate == 0L)) { + cat("Done with", j, "of", 
nc, "\n") + } + } + + if (!writetodisk) { + return(df_relpairs) + } else if (length(write_buffer) > 0) { + utils::write.table(do.call(rbind, write_buffer), + file = rel_pairs_file, row.names = FALSE, + col.names = FALSE, append = TRUE, sep = "," + ) + } + + invisible(NULL) +} + +process_two <- function( + matrix1, name1, + matrix2, name2, + ids, nc, + rel_pairs_file, + writetodisk, + write_buffer_size, + drop_upper_triangular, + update_rate, + verbose, + gc, + ...) { + # Extract internal slots + p1 <- matrix1@p + 1L + i1 <- matrix1@i + 1L + x1 <- matrix1@x + p2 <- matrix2@p + 1L + i2 <- matrix2@i + 1L + x2 <- matrix2@x + + relNames <- c(name1, name2) + df_relpairs <- initialize_empty_df(relNames) + + if (writetodisk) { + utils::write.table(df_relpairs, file = rel_pairs_file, sep = ",", append = FALSE, row.names = FALSE) + write_buffer <- list() + rm(df_relpairs) + } + + for (j in seq_len(nc)) { + ID2 <- ids[j] + + # Get index/value slices + v1 <- if (p1[j] < p1[j + 1L]) { + idx <- p1[j]:(p1[j + 1L] - 1L) + list(i = i1[idx], x = x1[idx]) + } else { + NULL + } + v2 <- if (p2[j] < p2[j + 1L]) { + idx <- p2[j]:(p2[j + 1L] - 1L) + list(i = i2[idx], x = x2[idx]) + } else { + NULL + } + + # Union of indices from both matrices + u <- sort(unique(c( + if (!is.null(v1)) v1$i else NULL, + if (!is.null(v2)) v2$i else NULL + ))) + + if (length(u) > 0) { + ID1 <- ids[u] + tds <- data.frame(ID1 = ID1, ID2 = ID2) + tds[[name1]] <- if (!is.null(v1)) ifelse(u %in% v1$i, v1$x[match(u, v1$i)], 0) else 0 + tds[[name2]] <- if (!is.null(v2)) ifelse(u %in% v2$i, v2$x[match(u, v2$i)], 0) else 0 + + if (drop_upper_triangular) { + tds <- tds[tds$ID1 <= tds$ID2, ] } - # Merge indices from all three matrices. 
- u <- sort(igraph::union(igraph::union(if (cond1) { - iss1vv - }, if (cond2) { - iss2vv - }), if (cond3) { - iss3vv - })) - # browser() - if (cond1 || cond2 || cond3) { - ID1 <- ids[u] - tds <- data.frame(ID1 = ID1, ID2 = ID2, addRel = 0, mitRel = 0, cnuRel = 0) - if (cond1) { - tds$addRel[u %in% iss1vv] <- biggestPed@x[vv1] - } - if (cond2) { - tds$mitRel[u %in% iss2vv] <- biggestMtPed@x[vv2] - } - if (cond3) { - tds$cnuRel[u %in% iss3vv] <- biggestCnPed@x[vv3] + if (nrow(tds) > 0) { + if (writetodisk) { + write_buffer[[length(write_buffer) + 1L]] <- tds + if (length(write_buffer) >= write_buffer_size) { + utils::write.table(do.call(rbind, write_buffer), + file = rel_pairs_file, row.names = FALSE, + col.names = FALSE, append = TRUE, sep = "," + ) + write_buffer <- list() + } + } else { + df_relpairs <- rbind(df_relpairs, tds) } - utils::write.table(tds, file = fname, row.names = FALSE, col.names = FALSE, append = TRUE, sep = ",") - } - if (!(j %% 500)) { - cat(paste0("Done with ", j, " of ", nc, "\n")) } } + + if (verbose && (j %% update_rate == 0L)) { + cat("Done with", j, "of", nc, "\n") + } } + if (!writetodisk) { + return(df_relpairs) + } else if (length(write_buffer) > 0) { + utils::write.table(do.call(rbind, write_buffer), + file = rel_pairs_file, row.names = FALSE, + col.names = FALSE, append = TRUE, sep = "," + ) + } + invisible(NULL) +} - # Merge and write the parentage matrices - # df <- full_join(mat_ped_matrix %>% arrange(ID), pat_ped_matrix %>% arrange(ID)) - # write.table(df, file = mapa_id_file, sep = ",", append = FALSE, row.names = FALSE) +#' @title validate_and_convert_matrix +#' @description +#' This function validates and converts a matrix to a specific format. +#' +#' @param mat The matrix to be validated and converted. +#' @param name The name of the matrix for error messages. +#' @param ensure_symmetric Logical indicating whether to ensure the matrix is symmetric. 
+#' @param force_binary Logical indicating whether to force the matrix to be binary. +#' +#' @return The validated and converted matrix. +validate_and_convert_matrix <- function(mat, name, ensure_symmetric = FALSE, force_binary = FALSE) { + if (!inherits(mat, c( + "matrix", "dgCMatrix", "dsCMatrix", "generalMatrix", + "symmetricMatrix", "triangularMatrix", "dsyMatrix", "dspMatrix", "dsyMatrix", "CsparseMatrix" + ))) { + stop(paste0("The '", name, "' must be a matrix or generalMatrix")) + } + if (!inherits(mat, "generalMatrix")) { + mat <- methods::as(mat, if (ensure_symmetric) "symmetricMatrix" else "generalMatrix") + } + if (force_binary) { + mat@x[mat@x > 0] <- 1 + } + return(mat) +} + +#' @title initialize_empty_df +#' @description +#' This function initializes an empty data frame with specified column names. +#' +#' @param relNames A vector of column names to be included in the data frame. +#' +#' @return An empty data frame with specified column names. +#' @keywords internal + +initialize_empty_df <- function(relNames) { + df <- data.frame(ID1 = numeric(0), ID2 = numeric(0)) + for (r in relNames) { + df[[r]] <- numeric(0) + } + return(df) } diff --git a/R/makeLinkslegacy.R b/R/makeLinkslegacy.R new file mode 100644 index 00000000..686c9f5a --- /dev/null +++ b/R/makeLinkslegacy.R @@ -0,0 +1,630 @@ +#' Convert Sparse Relationship Matrices to Kinship Links +#' @inheritParams com2links +#' @keywords internal + + +.com2links.legacy <- function( + rel_pairs_file = "dataRelatedPairs.csv", + ad_ped_matrix = NULL, + mit_ped_matrix = mt_ped_matrix, + mt_ped_matrix = NULL, + cn_ped_matrix = NULL, + # pat_ped_matrix = NULL, + # mat_ped_matrix = NULL, + # mapa_id_file = "data_mapaID.csv", + write_buffer_size = 1000, + update_rate = 1000, + gc = TRUE, + writetodisk = TRUE, + verbose = FALSE, + legacy = FALSE, + outcome_name = "data", + drop_upper_triangular = TRUE, + ...) 
{ + # Non-legacy mode processing + + if (!legacy) { + # --- Input Validations and Preprocessing --- + + # Ensure that at least one relationship matrix is provided. + if (is.null(ad_ped_matrix) && is.null(mit_ped_matrix) && is.null(cn_ped_matrix)) { + stop("At least one of 'ad_ped_matrix', 'mit_ped_matrix', or 'cn_ped_matrix' must be provided.") + } + # Validate and convert ad_ped_matrix to a sparse dgCMatrix if provided. + if (!is.null(ad_ped_matrix)) { + ad_ped_matrix <- validate_and_convert_matrix( + mat = ad_ped_matrix, + name = "ad_ped_matrix" + ) + } + + # Validate and convert cn_ped_matrix to a sparse dgCMatrix if provided. + if (!is.null(cn_ped_matrix)) { + cn_ped_matrix <- validate_and_convert_matrix( + mat = cn_ped_matrix, + name = "cn_ped_matrix", + ensure_symmetric = TRUE + ) + } + + # Validate and process mit_ped_matrix: convert and ensure binary values. + if (!is.null(mit_ped_matrix)) { + mit_ped_matrix <- validate_and_convert_matrix( + mat = mit_ped_matrix, + name = "mit_ped_matrix", force_binary = TRUE, + ensure_symmetric = TRUE + ) + } + + # --- Build IDs and Prepare Matrix Pointers --- + + # Extract individual IDs from the first available matrix. + ids <- NULL + + + + if (!is.null(cn_ped_matrix)) { + ids <- as.numeric(dimnames(cn_ped_matrix)[[1]]) + nc <- ncol(cn_ped_matrix) + } else if (!is.null(ad_ped_matrix)) { + ids <- as.numeric(dimnames(ad_ped_matrix)[[1]]) + nc <- ncol(ad_ped_matrix) + } else if (!is.null(mit_ped_matrix)) { + ids <- as.numeric(dimnames(mit_ped_matrix)[[1]]) + nc <- ncol(mit_ped_matrix) + } + + if (is.null(ids)) { + stop("Could not extract IDs from the provided matrices.") + } + + + # Construct case identifier + matrix_case <- paste(sort(c( + if (!is.null(ad_ped_matrix)) "ad" else NULL, + if (!is.null(mit_ped_matrix)) "mt" else NULL, + if (!is.null(cn_ped_matrix)) "cn" else NULL + )), collapse = "-") + + + # Count how many matrices are provided. 
+ sum_nulls <- sum(!is.null(ad_ped_matrix), + !is.null(mit_ped_matrix), + !is.null(cn_ped_matrix), + na.rm = TRUE + ) + if (verbose) { + print(matrix_case) + } + + # Extract the internal pointers (p, i, and x slots) for each provided matrix. + if (!is.null(ad_ped_matrix)) { + ad_ped_p <- ad_ped_matrix@p + 1L + ad_ped_i <- ad_ped_matrix@i + 1L + ad_ped_x <- ad_ped_matrix@x + } + if (!is.null(mit_ped_matrix)) { + mt_p <- mit_ped_matrix@p + 1L + mt_i <- mit_ped_matrix@i + 1L + mt_x <- mit_ped_matrix@x + } + if (!is.null(cn_ped_matrix)) { + cn_p <- cn_ped_matrix@p + 1L + cn_i <- cn_ped_matrix@i + 1L + cn_x <- cn_ped_matrix@x + } + + # --- Process Based on the Number of Provided Matrices --- + # --- Case: All Three Matrices Provided --- + if (sum_nulls == 3) { + # Set pointers for all three matrices. + newColPos1 <- ad_ped_p + iss1 <- ad_ped_i + x1 <- ad_ped_x + + newColPos2 <- mt_p + iss2 <- mt_i + x2 <- mt_x + + newColPos3 <- cn_p + iss3 <- cn_i + x3 <- cn_x + + # Define relationship column names. + relNames <- c("addRel", "mitRel", "cnuRel") + + # Optionally remove the original pointers to free memory. + if (gc == TRUE) { + remove(ad_ped_p, ad_ped_i, ad_ped_x, mt_p, mt_i, mt_x, cn_p, cn_i, cn_x) + } + if (verbose) { + message("All 3 matrix is present") + } + + # File names + # rel_pairs_file <- paste0(outcome_name, "_dataRelatedPairs.csv") + # mapa_id_file <- paste0(outcome_name, "_data_mapaID.csv") + + # Initialize the related pairs file with headers. + df_relpairs <- initialize_empty_df(relNames = relNames) + + # Write the headers to the related pairs file. + if (writetodisk == TRUE) { + utils::write.table( + df_relpairs, + file = rel_pairs_file, sep = ",", append = FALSE, row.names = FALSE + ) + + # Prepare an empty buffer for batching writes. + write_buffer <- list() + remove(df_relpairs) + } + + # Loop over each column (individual) in the matrix. 
+ for (j in 1L:nc) { + ID2 <- ids[j] + + # Extract column indices for the 1st component + ncp1 <- newColPos1[j] + ncp1p <- newColPos1[j + 1L] + cond1 <- ncp1 < ncp1p + if (cond1) { + vv1 <- ncp1:(ncp1p - 1L) + iss1vv <- iss1[vv1] + } + # Extract indices for the 2nd component + ncp2 <- newColPos2[j] + ncp2p <- newColPos2[j + 1L] + cond2 <- ncp2 < ncp2p + if (cond2) { + vv2 <- ncp2:(ncp2p - 1L) + iss2vv <- iss2[vv2] + } + + # Extract indices for the 3rd component + ncp3 <- newColPos3[j] + ncp3p <- newColPos3[j + 1L] + cond3 <- ncp3 < ncp3p + if (cond3) { + vv3 <- ncp3:(ncp3p - 1L) + iss3vv <- iss3[vv3] + } + + # Create a unique, sorted set of row indices from all provided matrices. + u <- sort(igraph::union(igraph::union(if (cond1) { + iss1vv + }, if (cond2) { + iss2vv + }), if (cond3) { + iss3vv + })) + + # If any relationships exist for this individual, build the related pairs. + if (cond1 || cond2 || cond3) { + ID1 <- ids[u] + tds <- data.frame(ID1 = ID1, ID2 = ID2) + tds[[relNames[1]]] <- 0 + tds[[relNames[2]]] <- 0 + tds[[relNames[3]]] <- 0 + + # Assign the relationship coefficients from each matrix. + if (cond1) { + tds[u %in% iss1vv, relNames[1]] <- x1[vv1] + } + if (cond2) { + tds[u %in% iss2vv, relNames[2]] <- x2[vv2] + } + if (cond3) { + tds[u %in% iss3vv, relNames[3]] <- x3[vv3] + } + + # Optionally drop upper-triangular entries. + if (drop_upper_triangular == TRUE) { + tds <- tds[tds$ID1 <= tds$ID2, ] # or < if you want strictly lower triangle + } + + # Write the batch to disk or accumulate in the data frame. 
+ if (nrow(tds) > 0) { + if (writetodisk == TRUE) { + write_buffer[[length(write_buffer) + 1]] <- tds + + if (length(write_buffer) >= write_buffer_size) { # Write in batches + utils::write.table(do.call(rbind, write_buffer), + file = rel_pairs_file, + row.names = FALSE, col.names = FALSE, append = TRUE, sep = "," + ) + write_buffer <- list() + } + } else { + df_relpairs <- rbind(df_relpairs, tds) + } + } + } + if (verbose && (j %% update_rate == 0L)) { + cat("Done with", j, "of", nc, "\n") + } + } + } else if (sum_nulls == 2) { + # --- Case: Two Matrices Provided --- + # Set pointers and relationship names based on which matrix is missing. + + if (is.null(ad_ped_matrix)) { + newColPos1 <- mt_p + iss1 <- mt_i + x1 <- mt_x + newColPos2 <- cn_p + iss2 <- cn_i + x2 <- cn_x + relNames <- c("mitRel", "cnuRel") + if (gc == TRUE) { + remove(mt_p, mt_i, mt_x, cn_p, cn_i, cn_x) + } + } + if (is.null(mit_ped_matrix)) { + newColPos1 <- ad_ped_p + iss1 <- ad_ped_i + x1 <- ad_ped_x + newColPos2 <- cn_p + iss2 <- cn_i + x2 <- cn_x + relNames <- c("addRel", "cnuRel") + if (gc == TRUE) { + remove(ad_ped_p, ad_ped_i, ad_ped_x, cn_p, cn_i, cn_x) + } + } + if (is.null(cn_ped_matrix)) { + newColPos1 <- ad_ped_p + iss1 <- ad_ped_i + x1 <- ad_ped_x + newColPos2 <- mt_p + iss2 <- mt_i + x2 <- mt_x + relNames <- c("addRel", "mitRel") + if (gc == TRUE) { + remove(ad_ped_p, ad_ped_i, ad_ped_x, mt_p, mt_i, mt_x) + } + } + + # Initialize the related pairs file with the appropriate headers. + df_relpairs <- initialize_empty_df(relNames = relNames) + + if (writetodisk == TRUE) { + utils::write.table( + df_relpairs, + file = rel_pairs_file, sep = ",", append = FALSE, row.names = FALSE + ) + # initial buffer + write_buffer <- list() + remove(df_relpairs) + } + + # Process each column to extract relationships. + for (j in 1L:nc) { + ID2 <- ids[j] + + # Extract indices from the first matrix. 
+ ncp1 <- newColPos1[j] + ncp1p <- newColPos1[j + 1L] + cond1 <- ncp1 < ncp1p + if (cond1) { + vv1 <- ncp1:(ncp1p - 1L) + iss1vv <- iss1[vv1] + } + # Extract indices from the second matrix. + ncp2 <- newColPos2[j] + ncp2p <- newColPos2[j + 1L] + cond2 <- ncp2 < ncp2p + if (cond2) { + vv2 <- ncp2:(ncp2p - 1L) + iss2vv <- iss2[vv2] + } + + # Merge the indices from both matrices. + u <- sort(igraph::union(if (cond1) { + iss1vv + }, if (cond2) { + iss2vv + })) + + # Create related pairs if relationships are found. + if (cond1 || cond2) { + ID1 <- ids[u] + tds <- data.frame(ID1 = ID1, ID2 = ID2) + tds[[relNames[1]]] <- 0 + tds[[relNames[2]]] <- 0 + + if (cond1) { + tds[u %in% iss1vv, relNames[1]] <- x1[vv1] + } + if (cond2) { + tds[u %in% iss2vv, relNames[2]] <- x2[vv2] + } + if (drop_upper_triangular == TRUE) { + tds <- tds[tds$ID1 <= tds$ID2, ] # or < if you want strictly lower triangle + } + + # Write the batch to disk or accumulate in the data frame. + if (nrow(tds) > 0) { + if (writetodisk == TRUE) { + write_buffer[[length(write_buffer) + 1]] <- tds + + if (length(write_buffer) >= write_buffer_size) { # Write in batches + utils::write.table(do.call(rbind, write_buffer), + file = rel_pairs_file, + row.names = FALSE, col.names = FALSE, append = TRUE, sep = "," + ) + write_buffer <- list() + } + } else { + df_relpairs <- rbind(df_relpairs, tds) + } + } + } + if (verbose && (j %% update_rate == 0L)) { + cat("Done with", j, "of", nc, "\n") + } + } + } else if (sum_nulls == 1) { + # --- Case: Only One Matrix Provided --- + if (verbose) { + message("Only one matrix is present") + } + if (!is.null(ad_ped_matrix)) { + newColPos1 <- ad_ped_p + iss1 <- ad_ped_i + x1 <- ad_ped_x + relNames <- c("addRel") + if (gc == TRUE) { + remove(ad_ped_p, ad_ped_i, ad_ped_x) + } + } + if (!is.null(mit_ped_matrix)) { + newColPos1 <- mt_p + iss1 <- mt_i + x1 <- mt_x + relNames <- c("mitRel") + if (gc == TRUE) { + remove(mt_p, mt_i, mt_x) + } + } + if (!is.null(cn_ped_matrix)) { + newColPos1 
<- cn_p + iss1 <- cn_i + x1 <- cn_x + relNames <- c("cnuRel") + if (gc == TRUE) { + remove(cn_p, cn_i, cn_x) + } + } + + # Initialize the related pairs file. + df_relpairs <- initialize_empty_df(relNames = relNames) + + if (writetodisk == TRUE) { + utils::write.table( + df_relpairs, + file = rel_pairs_file, sep = ",", append = FALSE, row.names = FALSE + ) + + # initial buffer + write_buffer <- list() + + remove(df_relpairs) + } + + # Process each column. + for (j in 1L:nc) { + ID2 <- ids[j] + # Extract column indices + ncp1 <- newColPos1[j] + ncp1p <- newColPos1[j + 1L] + cond1 <- ncp1 < ncp1p + if (cond1) { + vv1 <- ncp1:(ncp1p - 1L) + iss1vv <- iss1[vv1] + } + + # Use the indices from the single matrix. + u <- sort(iss1vv) + + if (cond1) { + ID1 <- ids[u] + tds <- data.frame(ID1 = ID1, ID2 = ID2) + tds[[relNames[1]]] <- 0 + + if (cond1) { + tds[u %in% iss1vv, relNames[1]] <- x1[vv1] + } + if (drop_upper_triangular == TRUE) { + tds <- tds[tds$ID1 <= tds$ID2, ] # or < if you want strictly lower triangle + } + + # Write the batch to disk or accumulate in the data frame. + if (nrow(tds) > 0) { + if (writetodisk == TRUE) { + write_buffer[[length(write_buffer) + 1]] <- tds + + if (length(write_buffer) >= write_buffer_size) { # Write in batches + utils::write.table(do.call(rbind, write_buffer), + file = rel_pairs_file, + row.names = FALSE, col.names = FALSE, append = TRUE, sep = "," + ) + write_buffer <- list() + } + } else { + df_relpairs <- rbind(df_relpairs, tds) + } + } + } + if (verbose && (j %% update_rate == 0L)) { + cat("Done with", j, "of", nc, "\n") + } + } + } else { + stop("No matrices provided") + } + + # If not writing to disk, return the accumulated data frame. + if (writetodisk == FALSE) { + return(df_relpairs) + } else { + # Write any remaining buffered rows. 
+ if (length(write_buffer) > 0) { + utils::write.table(do.call(rbind, write_buffer), + file = rel_pairs_file, + row.names = FALSE, col.names = FALSE, append = TRUE, sep = "," + ) + } + # return(NULL) + } + } else if (legacy) { + # --- Legacy Mode --- + # In legacy mode, convert matrices to the expected symmetric formats. + .com2links.og( + rel_pairs_file = rel_pairs_file, + ad_ped_matrix = ad_ped_matrix, + mit_ped_matrix = mit_ped_matrix, + cn_ped_matrix = cn_ped_matrix, + update_rate = update_rate, + verbose = verbose, + outcome_name = outcome_name + ) + return(NULL) + } + + # --- End of Legacy Mode --- + + # Merge and write the parentage matrices + # df <- full_join(mat_ped_matrix %>% arrange(ID), pat_ped_matrix %>% arrange(ID)) + + # write.table(df, file = mapa_id_file, sep = ",", append = FALSE, row.names = FALSE) +} + +#' Convert Pedigree Matrices to Related Pairs File (Legacy) +#' @description +#' This legacy function converts pedigree matrices into a related pairs file. +#' @inheritParams com2links +#' @keywords internal + + +.com2links.og <- function( + rel_pairs_file = "dataRelatedPairs.csv", + ad_ped_matrix = NULL, + mit_ped_matrix = mt_ped_matrix, + mt_ped_matrix = NULL, + cn_ped_matrix = NULL, + update_rate = 500, + verbose = FALSE, + outcome_name = "data", + ...) { + # --- Legacy Mode --- + if (verbose) { + message("Using legacy mode") + } + # In legacy mode, convert matrices to the expected symmetric formats. 
+ + # load(paste0(outcome_name,'_dataBiggestCnPedigree.Rdata')) + # biggestCnPed <- methods::as(biggestCnPed, "symmetricMatrix") + # load(paste0(outcome_name,'_dataBiggestPedigree.Rdata')) + # load(paste0(outcome_name,'_dataBiggestMtPedigree.Rdata')) + + # rel_pairs_file <- paste0(outcome_name,'_datacnmitBiggestRelatedPairsTake3.csv') + + biggestMtPed <- mit_ped_matrix + remove(mit_ped_matrix) + biggestCnPed <- methods::as(cn_ped_matrix, "symmetricMatrix") + remove(cn_ped_matrix) + biggestPed <- ad_ped_matrix + remove(ad_ped_matrix) + biggestMtPed@x[biggestMtPed@x > 0] <- 1 + + # Set the output file name. + if (exists("rel_pairs_file")) { + fname <- rel_pairs_file + } else { + fname <- paste0(outcome_name, "_dataBiggestRelatedPairsTake2.csv") + } + # Initialize the output file with headers. + ds <- data.frame( + ID1 = numeric(0), ID2 = numeric(0), + addRel = numeric(0), + mitRel = numeric(0), cnuRel = numeric(0) + ) + + utils::write.table(ds, + file = fname, sep = ",", + append = FALSE, row.names = FALSE + ) + + # Extract IDs from the common nuclear matrix. + ids <- as.numeric(dimnames(biggestCnPed)[[1]]) + + # Extract pointers from the legacy matrices. + newColPos1 <- biggestPed@p + 1L + iss1 <- biggestPed@i + 1L + newColPos2 <- biggestMtPed@p + 1L + iss2 <- biggestMtPed@i + 1L + newColPos3 <- biggestCnPed@p + 1L + iss3 <- biggestCnPed@i + 1L + nc <- ncol(biggestPed) + + # Process each individual. + for (j in 1L:nc) { + ID2 <- ids[j] + ncp1 <- newColPos1[j] + ncp1p <- newColPos1[j + 1L] + cond1 <- ncp1 < ncp1p + if (cond1) { + vv1 <- ncp1:(ncp1p - 1L) + iss1vv <- iss1[vv1] + } + ncp2 <- newColPos2[j] + ncp2p <- newColPos2[j + 1L] + cond2 <- ncp2 < ncp2p + if (cond2) { + vv2 <- ncp2:(ncp2p - 1L) + iss2vv <- iss2[vv2] + } + ncp3 <- newColPos3[j] + ncp3p <- newColPos3[j + 1L] + cond3 <- ncp3 < ncp3p + if (cond3) { + vv3 <- ncp3:(ncp3p - 1L) + iss3vv <- iss3[vv3] + } + + # Merge indices from all three matrices. 
+ u <- sort(igraph::union(igraph::union(if (cond1) { + iss1vv + }, if (cond2) { + iss2vv + }), if (cond3) { + iss3vv + })) + # browser() + if (cond1 || cond2 || cond3) { + ID1 <- ids[u] + tds <- data.frame( + ID1 = ID1, ID2 = ID2, + addRel = 0, mitRel = 0, cnuRel = 0 + ) + if (cond1) { + tds$addRel[u %in% iss1vv] <- biggestPed@x[vv1] + } + if (cond2) { + tds$mitRel[u %in% iss2vv] <- biggestMtPed@x[vv2] + } + if (cond3) { + tds$cnuRel[u %in% iss3vv] <- biggestCnPed@x[vv3] + } + utils::write.table(tds, + file = fname, row.names = FALSE, + col.names = FALSE, append = TRUE, sep = "," + ) + } + if (!(j %% update_rate)) { + cat(paste0("Done with ", j, " of ", nc, "\n")) + } + } + return(NULL) +} diff --git a/R/plotPedigree.R b/R/plotPedigree.R index 051610e6..f82be4bc 100644 --- a/R/plotPedigree.R +++ b/R/plotPedigree.R @@ -25,14 +25,14 @@ plotPedigree <- function(ped, pconnect = .5, ...) { # Standardize column names in the input dataframe - ped <- standardizeColnames(ped) + ped <- standardizeColnames(ped, verbose = verbose) # Define required columns - simulated_vars <- c("fam", "ID", "dadID", "momID", "sex") + simulated_vars <- c("famID", "ID", "dadID", "momID", "sex") # Check if dataframe contains the required columns if (all(simulated_vars %in% names(ped))) { - p <- ped[, c("fam", "ID", "dadID", "momID", "sex")] + p <- ped[, c("famID", "ID", "dadID", "momID", "sex")] colnames(p) <- c("ped", "id", "father", "mother", "sex") # data conversation diff --git a/R/readGedcom.R b/R/readGedcom.R new file mode 100644 index 00000000..7181fac4 --- /dev/null +++ b/R/readGedcom.R @@ -0,0 +1,636 @@ +#' Read a GEDCOM File +#' +#' This function reads a GEDCOM file and parses it into a structured data frame of individuals. +#' +#' @param file_path The path to the GEDCOM file. +#' @param add_parents A logical value indicating whether to add parents to the data frame. +#' @param remove_empty_cols A logical value indicating whether to remove columns with all missing values. 
+#' @param combine_cols A logical value indicating whether to combine columns with duplicate values. +#' @param verbose A logical value indicating whether to print messages. +#' @param skinny A logical value indicating whether to return a skinny data frame. +#' @param update_rate numeric. The rate at which to print progress +#' @param post_process A logical value indicating whether to post-process the data frame. +#' @param ... Additional arguments to be passed to the function. +#' @return A data frame containing information about individuals, with the following potential columns: +#' - `id`: ID of the individual +#' - `momID`: ID of the individual's mother +#' - `dadID`: ID of the individual's father +#' - `sex`: Sex of the individual +#' - `name`: Full name of the individual +#' - `name_given`: First name of the individual +#' - `name_surn`: Last name of the individual +#' - `name_marriedsurn`: Married name of the individual +#' - `name_nick`: Nickname of the individual +#' - `name_npfx`: Name prefix +#' - `name_nsfx`: Name suffix +#' - `birth_date`: Birth date of the individual +#' - `birth_lat`: Latitude of the birthplace +#' - `birth_long`: Longitude of the birthplace +#' - `birth_place`: Birthplace of the individual +#' - `death_caus`: Cause of death +#' - `death_date`: Death date of the individual +#' - `death_lat`: Latitude of the place of death +#' - `death_long`: Longitude of the place of death +#' - `death_place`: Place of death of the individual +#' - `attribute_caste`: Caste of the individual +#' - `attribute_children`: Number of children of the individual +#' - `attribute_description`: Description of the individual +#' - `attribute_education`: Education of the individual +#' - `attribute_idnumber`: Identification number of the individual +#' - `attribute_marriages`: Number of marriages of the individual +#' - `attribute_nationality`: Nationality of the individual +#' - `attribute_occupation`: Occupation of the individual +#' - `attribute_property`: 
Property owned by the individual +#' - `attribute_religion`: Religion of the individual +#' - `attribute_residence`: Residence of the individual +#' - `attribute_ssn`: Social security number of the individual +#' - `attribute_title`: Title of the individual +#' - `FAMC`: ID(s) of the family where the individual is a child +#' - `FAMS`: ID(s) of the family where the individual is a spouse +#' @export +readGedcom <- function(file_path, + verbose = FALSE, + add_parents = TRUE, + remove_empty_cols = TRUE, + combine_cols = TRUE, + skinny = FALSE, + update_rate = 1000, + post_process = TRUE, + ...) { + # Ensure the file exists and read all lines. + if (!file.exists(file_path)) { + stop("File does not exist: ", file_path) + } + if (verbose) message("Reading file: ", file_path) + lines <- readLines(file_path) + total_lines <- length(lines) + if (verbose) message("File is ", total_lines, " lines long") + + # Count pattern occurrences (pattern_rows remains used in subfunctions) + pattern_rows <- countPatternRows(data.frame(X1 = lines)) + + # List of variables to initialize + all_var_names <- unlist(list( + identifiers = c("id", "momID", "dadID"), + names = c( + "name", "name_given", "name_given_pieces", "name_surn", "name_surn_pieces", "name_marriedsurn", + "name_nick", "name_npfx", "name_nsfx" + ), + sex = c("sex"), + birth = c("birth_date", "birth_lat", "birth_long", "birth_place"), + death = c("death_caus", "death_date", "death_lat", "death_long", "death_place"), + attributes = c( + "attribute_caste", "attribute_children", "attribute_description", "attribute_education", + "attribute_idnumber", "attribute_marriages", "attribute_nationality", "attribute_occupation", + "attribute_property", "attribute_religion", "attribute_residence", "attribute_ssn", + "attribute_title" + ), + relationships = c("FAMC", "FAMS") + ), use.names = FALSE) + + # Split the file into blocks; each block corresponds to one individual. 
+ blocks <- splitIndividuals(lines, verbose) + + # Parse each individual block into a record (a named list) + records <- lapply(blocks, parseIndividualBlock, + pattern_rows = pattern_rows, + all_var_names = all_var_names, verbose = verbose + ) + + # Remove any NULLs (if a block did not contain an individual id) + records <- Filter(Negate(is.null), records) + + if (length(records) == 0) { + warning("No people found in file") + return(NULL) + } + + # Convert the list of records to a data frame. + df_temp <- do.call(rbind, lapply(records, function(rec) { + as.data.frame(rec, stringsAsFactors = FALSE) + })) + + if (verbose) message("File has ", nrow(df_temp), " people") + + # Run post-processing if requested. + if (post_process) { + if (verbose) message("Post-processing data frame") + df_temp <- postProcessGedcom( + df_temp = df_temp, + remove_empty_cols = remove_empty_cols, + combine_cols = combine_cols, + add_parents = add_parents, + skinny = skinny, + verbose = verbose + ) + } + + return(df_temp) +} + +# --- SUBFUNCTIONS --- +#' Split GEDCOM Lines into Individual Blocks +#' +#' This function partitions the GEDCOM file (as a vector of lines) into a list of blocks, +#' where each block corresponds to a single individual starting with an "@ INDI" line. +#' +#' @param lines A character vector of lines from the GEDCOM file. +#' @param verbose Logical indicating whether to output progress messages. +#' @return A list of character vectors, each representing one individual. 
+splitIndividuals <- function(lines, verbose = FALSE) { + indi_idx <- grep("@ INDI", lines) + if (length(indi_idx) == 0) { + return(list()) + } + + blocks <- list() + for (i in seq_along(indi_idx)) { + start <- indi_idx[i] + end <- if (i < length(indi_idx)) indi_idx[i + 1] - 1 else length(lines) + block <- lines[start:end] + blocks[[length(blocks) + 1]] <- block + } + if (verbose) message("Found ", length(blocks), " individual blocks") + return(blocks) +} + +#' Initialize an Empty Individual Record +#' +#' Creates a named list with all GEDCOM fields set to NA. +#' +#' @param all_var_names A character vector of variable names. +#' @return A named list representing an empty individual record. +initializeRecord <- function(all_var_names) { + stats::setNames(as.list(rep(NA_character_, length(all_var_names))), all_var_names) +} + +#' Parse a GEDCOM Individual Block +#' +#' Processes a block of GEDCOM lines corresponding to a single individual. +#' +#' @param block A character vector containing the GEDCOM lines for one individual. +#' @param pattern_rows A list with counts of lines matching specific GEDCOM tags. +#' @param all_var_names A character vector of variable names. +#' @param verbose Logical indicating whether to print progress messages. +#' @return A named list representing the parsed record for the individual, or NULL if no ID is found. +#' @keywords internal +parseIndividualBlock <- function(block, pattern_rows, all_var_names, verbose = FALSE) { + record <- initializeRecord(all_var_names) + n_lines <- length(block) + + # Loop through the block by index so that we can look ahead for event details. + i <- 1 + while (i <= n_lines) { + line <- block[i] + + # Process individual identifier (e.g., "@ INDI ...") + if (grepl("@ INDI", line)) { + record$id <- stringr::str_extract(line, "(?<=@.)\\d*(?=@)") + i <- i + 1 + next + } + + # Special processing for full name using " NAME" tag. 
+ if (grepl(" NAME", line) && pattern_rows$num_name_rows > 0) { + record <- parseNameLine(line, record) + i <- i + 1 + next + } + + # Process birth and death events by consuming multiple lines. + if (grepl(" BIRT", line) && pattern_rows$num_birt_rows > 0) { + record <- processEventLine("birth", block, i, record, pattern_rows) + i <- i + 1 # Skip further processing of this line. + next + } + if (grepl(" DEAT", line) && pattern_rows$num_deat_rows > 0) { + record <- processEventLine("death", block, i, record, pattern_rows) + i <- i + 1 + next + } + + # Process other tags using common mappings. + # Define mappings for name pieces (if not handled by NAME tag). + name_piece_mappings <- list( + list(tag = "GIVN", field = "name_given_pieces", mode = "replace"), + list(tag = "NPFX", field = "name_npfx", mode = "replace"), + list(tag = "NICK", field = "name_nick", mode = "replace"), + list(tag = "SURN", field = "name_surn_pieces", mode = "replace"), + list(tag = "NSFX", field = "name_nsfx", mode = "replace"), + list(tag = "_MARNM", field = "name_marriedsurn", mode = "replace") + ) + out <- applyTagMappings(line, record, pattern_rows, name_piece_mappings) + if (out$matched) { + record <- out$record + i <- i + 1 + next + } + + # Process attribute tags. 
+ attribute_mappings <- list( + list(tag = "SEX", field = "sex", mode = "replace"), + list(tag = "CAST", field = "attribute_caste", mode = "replace"), + list(tag = "DSCR", field = "attribute_description", mode = "replace"), + list(tag = "EDUC", field = "attribute_education", mode = "replace"), + list(tag = "IDNO", field = "attribute_idnumber", mode = "replace"), + list(tag = "NATI", field = "attribute_nationality", mode = "replace"), + list(tag = "NCHI", field = "attribute_children", mode = "replace"), + list(tag = "NMR", field = "attribute_marriages", mode = "replace"), + list(tag = "OCCU", field = "attribute_occupation", mode = "replace"), + list(tag = "PROP", field = "attribute_property", mode = "replace"), + list(tag = "RELI", field = "attribute_religion", mode = "replace"), + list(tag = "RESI", field = "attribute_residence", mode = "replace"), + list(tag = "SSN", field = "attribute_ssn", mode = "replace"), + list(tag = "TITL", field = "attribute_title", mode = "replace") + ) + out <- applyTagMappings(line, record, pattern_rows, attribute_mappings) + if (out$matched) { + record <- out$record + i <- i + 1 + next + } + + # Process relationship tags, using a custom extractor. + relationship_mappings <- list( + list( + tag = "FAMC", field = "FAMC", mode = "append", + extractor = function(x) stringr::str_extract(x, "(?<=@.)\\d*(?=@)") + ), + list( + tag = "FAMS", field = "FAMS", mode = "append", + extractor = function(x) stringr::str_extract(x, "(?<=@.)\\d*(?=@)") + ) + ) + out <- applyTagMappings(line, record, pattern_rows, relationship_mappings) + if (out$matched) { + record <- out$record + i <- i + 1 + next + } + + # Optionally print progress for long records. + i <- i + 1 + } + + # If the record has no ID, return NULL. + if (is.na(record$id)) { + return(NULL) + } + return(record) +} + +#' Parse a Full Name Line +#' +#' Extracts full name information from a GEDCOM "NAME" line and updates the record accordingly. 
+#' +#' @param line A character string containing the name line. +#' @param record A named list representing the individual's record. +#' @return The updated record with parsed name information. +parseNameLine <- function(line, record) { + record$name <- extract_info(line, "NAME") + record$name_given <- stringr::str_extract(record$name, ".*(?= /)") + record$name_surn <- stringr::str_extract(record$name, "(?<=/).*(?=/)") + record$name <- stringr::str_squish(stringr::str_replace(record$name, "/", " ")) + return(record) +} + +#' Process Event Lines (Birth or Death) +#' +#' Extracts event details (e.g., date, place, cause, latitude, longitude) from a block of GEDCOM lines. +#' For "birth": expect DATE on line i+1, PLAC on i+2, LATI on i+4, LONG on i+5. +#' For "death": expect DATE on line i+1, PLAC on i+2, CAUS on i+3, LATI on i+4, LONG on i+5. +#' @param event A character string indicating the event type ("birth" or "death"). +#' @param block A character vector of GEDCOM lines. +#' @param i The current line index where the event tag is found. +#' @param record A named list representing the individual's record. +#' @param pattern_rows A list with counts of GEDCOM tag occurrences. +#' @return The updated record with parsed event information.# +# For "death": expect DATE on line i+1, PLAC on i+2, CAUS on i+3, LATI on i+4, LONG on i+5. 
+processEventLine <- function(event, block, i, record, pattern_rows) { + n_lines <- length(block) + if (event == "birth") { + if (i + 1 <= n_lines) record$birth_date <- extract_info(block[i + 1], "DATE") + if (i + 2 <= n_lines) record$birth_place <- extract_info(block[i + 2], "PLAC") + if (i + 4 <= n_lines) record$birth_lat <- extract_info(block[i + 4], "LATI") + if (i + 5 <= n_lines) record$birth_long <- extract_info(block[i + 5], "LONG") + } else if (event == "death") { + if (i + 1 <= n_lines) record$death_date <- extract_info(block[i + 1], "DATE") + if (i + 2 <= n_lines) record$death_place <- extract_info(block[i + 2], "PLAC") + if (i + 3 <= n_lines) record$death_caus <- extract_info(block[i + 3], "CAUS") + if (i + 4 <= n_lines) record$death_lat <- extract_info(block[i + 4], "LATI") + if (i + 5 <= n_lines) record$death_long <- extract_info(block[i + 5], "LONG") + } + return(record) +} + +#' Apply Tag Mappings to a Line +#' +#' Iterates over a list of tag mappings and, if a tag matches the line, updates the record. +#' +#' @param line A character string from the GEDCOM file. +#' @param record A named list representing the individual's record. +#' @param pattern_rows A list with GEDCOM tag counts. +#' @param tag_mappings A list of lists. Each sublist should define: +#' - \code{tag}: the GEDCOM tag, +#' - \code{field}: the record field to update, +#' - \code{mode}: either "replace" or "append", +#' - \code{extractor}: (optional) a custom extraction function. +#' @return A list with the updated record (\code{record}) and a logical flag (\code{matched}). 
+applyTagMappings <- function(line, record, pattern_rows, tag_mappings) { + for (mapping in tag_mappings) { + extractor <- if (is.null(mapping$extractor)) NULL else mapping$extractor + result <- process_tag(mapping$tag, mapping$field, pattern_rows, line, record, + extractor = extractor, mode = mapping$mode + ) + record <- result$vars + if (result$matched) { + return(list(record = record, matched = TRUE)) + } + } + return(list(record = record, matched = FALSE)) +} + + +#' Extract Information from Line +#' +#' This function extracts information from a line based on a specified type. +#' @param line A character string representing a line from a GEDCOM file. +#' @param type A character string representing the type of information to extract. +#' @return A character string with the extracted information. +#' @keywords internal +extract_info <- function(line, type) { + stringr::str_squish(stringr::str_extract(line, paste0("(?<=", type, " ).+"))) +} + +#' Count GEDCOM Pattern Rows +#' +#' Counts the number of lines in a file (passed as a data frame with column "X1") +#' that match various GEDCOM patterns. +#' +#' @param file A data frame with a column \code{X1} containing GEDCOM lines. +#' @return A list with counts of specific GEDCOM tag occurrences. 
countPatternRows <- function(file) {
  # Every GEDCOM tag this parser cares about; the leading space/"@" keeps the
  # match anchored to the tag position on the line.
  gedcom_tags <- c(
    "@ INDI", " NAME", " GIVN", " NPFX", " NICK", " SURN", " NSFX", " _MARNM",
    " BIRT", " DEAT", " SEX", " CAST", " DSCR", " EDUC", " IDNO", " NATI",
    " NCHI", " NMR", " OCCU", " PROP", " RELI", " RESI", " SSN", " TITL",
    " FAMC", " FAMS", " PLAC", " LATI", " LONG", " DATE", " CAUS"
  )
  tallies <- sapply(gedcom_tags, function(pat) sum(grepl(pat, file$X1)))
  # Derive the list slot names (num_<tag>_rows) from the tag strings
  # themselves, e.g. "@ INDI" -> num_indi_rows, " _MARNM" -> num_marnm_rows.
  slot_names <- paste0("num_", gsub("[@ _]", "", tolower(gedcom_tags)), "_rows")
  # Indexing tallies by tag keeps the per-element name attribute, matching
  # the historical return shape of this function.
  stats::setNames(lapply(gedcom_tags, function(pat) tallies[pat]), slot_names)
}

#' Process a GEDCOM Tag
#'
#' Extracts and assigns a value to a specified field in `vars` if the pattern is present.
#' Returns both the updated variable list and a flag indicating whether the tag was matched.
#'
#' @param tag The GEDCOM tag (e.g., "SEX", "CAST", etc.).
#' @param field_name The name of the variable to assign to in `vars`.
#' @param pattern_rows Output from `countPatternRows()`.
#' @param line The GEDCOM line to parse.
#' @param vars The current list of variables to update.
#' @param extractor Optional function applied to \code{line} instead of the
#'   default \code{extract_info(line, tag)}.
#' @param mode Either "replace" (default) or "append"; "append" joins the new
#'   value onto an existing non-NA value with ", ".
#' @return A list with updated `vars` and a `matched` flag.
#' @keywords internal
process_tag <- function(tag, field_name, pattern_rows, line, vars,
                        extractor = NULL, mode = "replace") {
  counter_key <- paste0("num_", tolower(tag), "_rows")
  # Only attempt extraction when the tag appears somewhere in the file
  # (cheap pre-count) AND on this particular line.
  tag_present <- !is.null(pattern_rows[[counter_key]]) &&
    pattern_rows[[counter_key]] > 0 &&
    grepl(paste0(" ", tag), line)
  if (!tag_present) {
    return(list(vars = vars, matched = FALSE))
  }
  value <- if (is.null(extractor)) extract_info(line, tag) else extractor(line)
  if (mode == "append" && !is.na(vars[[field_name]])) {
    vars[[field_name]] <- paste0(vars[[field_name]], ", ", value)
  } else {
    vars[[field_name]] <- value
  }
  list(vars = vars, matched = TRUE)
}

#' Post-process GEDCOM Data Frame
#'
#' This function optionally adds parent information, combines duplicate columns,
#' and removes empty columns from the GEDCOM data frame.
#'
#' @param df_temp A data frame produced by \code{readGedcom()}.
#' @param remove_empty_cols Logical indicating whether to remove columns that are entirely missing.
#' @param combine_cols Logical indicating whether to combine columns with duplicate values.
#' @param add_parents Logical indicating whether to add parent information.
#' @param skinny Logical indicating whether to slim down the data frame.
#' @param verbose Logical indicating whether to print progress messages.
#' @return The post-processed data frame.
postProcessGedcom <- function(df_temp,
                              remove_empty_cols = TRUE,
                              combine_cols = TRUE,
                              add_parents = TRUE,
                              skinny = TRUE,
                              verbose = FALSE) {
  if (add_parents) {
    if (verbose) message("Processing parents")
    df_temp <- processParents(df_temp, datasource = "gedcom")
  }
  if (combine_cols) {
    df_temp <- collapseNames(verbose = verbose, df_temp = df_temp)
  }
  if (remove_empty_cols) {
    if (verbose) message("Removing empty columns")
    # drop = FALSE guards against `[.data.frame` collapsing the result to a
    # bare vector when only one non-empty column survives the filter.
    df_temp <- df_temp[, colSums(is.na(df_temp)) < nrow(df_temp), drop = FALSE]
  }
  if (skinny) {
    if (verbose) message("Slimming down the data frame")
    df_temp <- df_temp[, colSums(is.na(df_temp)) < nrow(df_temp), drop = FALSE]
    # Family-link bookkeeping columns are not needed once momID/dadID exist.
    df_temp$FAMC <- NULL
    df_temp$FAMS <- NULL
  }
  return(df_temp)
}

#' Process Parents Information from GEDCOM Data
#'
#' Adds parent IDs to the individuals based on family relationship data.
#'
#' @param df_temp A data frame produced by \code{readGedcom()}.
#' @param datasource Character string indicating the data source ("gedcom" or "wiki").
#' @return The updated data frame with parent IDs added; returned unchanged
#'   (with a warning) when required columns are missing or no parents are found.
processParents <- function(df_temp, datasource) {
  if (datasource == "gedcom") {
    required_cols <- c("FAMC", "sex", "FAMS")
  } else if (datasource == "wiki") {
    required_cols <- c("id")
  } else {
    stop("Invalid datasource")
  }
  if (!all(required_cols %in% colnames(df_temp))) {
    missing_cols <- setdiff(required_cols, colnames(df_temp))
    warning("Missing necessary columns: ", paste(missing_cols, collapse = ", "))
    return(df_temp)
  }
  # First map each family ID to its parents, then push those parents onto
  # each child's row.
  family_to_parents <- mapFAMS2parents(df_temp)
  if (is.null(family_to_parents) || length(family_to_parents) == 0) {
    return(df_temp)
  }
  df_temp <- mapFAMC2parents(df_temp, family_to_parents)
  return(df_temp)
}

#' Create a Mapping from Family IDs to Parent IDs
#'
#' This function scans the data frame and creates a mapping of family IDs
#' to the corresponding parent IDs.
#'
#' @param df_temp A data frame produced by \code{readGedcom()}.
#' @return A list mapping family IDs to parent information.
mapFAMS2parents <- function(df_temp) {
  if (!all(c("FAMS", "sex") %in% colnames(df_temp))) {
    warning("The data frame does not contain the necessary columns (FAMS, sex)")
    return(NULL)
  }
  family_to_parents <- list()
  # seq_len() (not 1:nrow) is safe for zero-row data frames.
  for (i in seq_len(nrow(df_temp))) {
    if (is.na(df_temp$FAMS[i])) next
    # An individual can be a spouse in several families ("10, 11").
    fams_ids <- unlist(strsplit(df_temp$FAMS[i], ", "))
    sex_i <- df_temp$sex[i]
    for (fams_id in fams_ids) {
      # Create the family entry on first sight so that spouses of unknown
      # sex still register the family (preserves historical behavior).
      if (is.null(family_to_parents[[fams_id]])) {
        family_to_parents[[fams_id]] <- list()
      }
      # isTRUE() tolerates an NA sex instead of erroring in `if`.
      if (isTRUE(sex_i == "M")) {
        family_to_parents[[fams_id]]$father <- df_temp$id[i]
      } else if (isTRUE(sex_i == "F")) {
        family_to_parents[[fams_id]]$mother <- df_temp$id[i]
      }
    }
  }
  return(family_to_parents)
}

#' Assign momID and dadID based on family mapping
#'
#' This function assigns mother and father IDs to individuals in the data frame
#' based on the mapping of family IDs to parent IDs.
#'
#' @param df_temp A data frame containing individual information.
#' @param family_to_parents A list mapping family IDs to parent IDs.
#' @return A data frame with added momID and dadID columns.
#' @keywords internal
mapFAMC2parents <- function(df_temp, family_to_parents) {
  df_temp$momID <- NA_character_
  df_temp$dadID <- NA_character_
  # seq_len() (not 1:nrow) is safe for zero-row data frames.
  for (i in seq_len(nrow(df_temp))) {
    if (is.na(df_temp$FAMC[i])) next
    # An individual can be a child in several families ("10, 11").
    famc_ids <- unlist(strsplit(df_temp$FAMC[i], ", "))
    for (famc_id in famc_ids) {
      parents <- family_to_parents[[famc_id]]
      if (is.null(parents)) next
      if (!is.null(parents$father)) {
        df_temp$dadID[i] <- parents$father
      }
      if (!is.null(parents$mother)) {
        df_temp$momID[i] <- parents$mother
      }
    }
  }
  return(df_temp)
}

#' collapse Names
#'
#' This function combines the `name_given` and `name_given_pieces` columns in a data frame.
#'
#' @inheritParams readGedcom
#' @param df_temp A data frame containing the columns to be combined.
#' @return A data frame with the combined columns.
collapseNames <- function(verbose, df_temp) {
  if (verbose) message("Combining Duplicate Columns")
  df_temp <- mergeNamePieces(df_temp, "name_given", "name_given_pieces")
  df_temp <- mergeNamePieces(df_temp, "name_surn", "name_surn_pieces")
  return(df_temp)
}

#' Merge a name column with its "_pieces" twin
#'
#' Combines \code{base_col} with \code{pieces_col} via \code{combine_columns()},
#' dropping the pieces column when the merge succeeds.
#'
#' @param df_temp A data frame containing the columns to be combined.
#' @param base_col Name of the primary column (e.g. "name_given").
#' @param pieces_col Name of the companion column (e.g. "name_given_pieces").
#' @return The data frame with the pair merged (or unchanged when either
#'   column is absent or both are entirely missing).
#' @keywords internal
mergeNamePieces <- function(df_temp, base_col, pieces_col) {
  base_vals <- df_temp[[base_col]]
  piece_vals <- df_temp[[pieces_col]]
  # Guard against absent columns ([[ ]] returns NULL), which would otherwise
  # feed zero-length vectors into combine_columns().
  if (is.null(base_vals) || is.null(piece_vals)) {
    return(df_temp)
  }
  # Nothing to merge when both columns are entirely missing.
  if (all(is.na(base_vals)) && all(is.na(piece_vals))) {
    return(df_temp)
  }
  result <- combine_columns(base_vals, piece_vals)
  df_temp[[base_col]] <- result$combined
  if (!result$retain_col2) {
    df_temp[[pieces_col]] <- NULL
  }
  return(df_temp)
}

#' Combine Columns
#'
#' This function combines two columns, handling conflicts and merging non-conflicting data.
#' @param col1 The first column to combine.
#' @param col2 The second column to combine.
#' @return A list with the combined column and a flag indicating if the second column should be retained.
#' @keywords internal
# Merges two columns, refusing the merge when any row disagrees.
combine_columns <- function(col1, col2) {
  lowered_a <- stringr::str_to_lower(col1)
  lowered_b <- stringr::str_to_lower(col2)
  # A conflict is any row where both values are present but disagree
  # (case-insensitively).
  has_conflict <- any(!is.na(lowered_a) & !is.na(lowered_b) &
    lowered_a != lowered_b)
  if (has_conflict) {
    warning("Columns have conflicting values. They were not merged.")
    return(list(combined = col1, retain_col2 = TRUE))
  }
  merged <- ifelse(is.na(col1), col2, col1)
  list(combined = merged, retain_col2 = FALSE)
}

# --- Exported Aliases ---
#' @rdname readGedcom
#' @export
readGed <- readGedcom
#' @rdname readGedcom
#' @export
readgedcom <- readGedcom

# ---------------------------------------------------------------------------
# R/readGedcomlegacy.R — introduced as a new file in the original changeset.
# ---------------------------------------------------------------------------

#' Read a GEDCOM File
#'
#' This function reads a GEDCOM file and parses it into a structured data frame of individuals.
#' Inspired by https://raw.githubusercontent.com/jjfitz/readgedcom/master/R/read_gedcom.R
#'
#' @param file_path The path to the GEDCOM file.
#' @param add_parents A logical value indicating whether to add parents to the data frame.
#' @param remove_empty_cols A logical value indicating whether to remove columns with all missing values.
#' @param combine_cols A logical value indicating whether to combine columns with duplicate values.
#' @param verbose A logical value indicating whether to print messages.
#' @param skinny A logical value indicating whether to return a skinny data frame.
#' @param update_rate numeric. The rate at which to print progress
#' @param ... Additional arguments to be passed to the function.
#' @return A data frame containing information about individuals, with the following potential columns:
#' - `id`: ID of the individual
#' - `momID`: ID of the individual's mother
#' - `dadID`: ID of the individual's father
#' - `sex`: Sex of the individual
#' - `name`: Full name of the individual
#' - `name_given`: First name of the individual
#' - `name_surn`: Last name of the individual
#' - `name_marriedsurn`: Married name of the individual
#' - `name_nick`: Nickname of the individual
#' - `name_npfx`: Name prefix
#' - `name_nsfx`: Name suffix
#' - `birth_date`: Birth date of the individual
#' - `birth_lat`: Latitude of the birthplace
#' - `birth_long`: Longitude of the birthplace
#' - `birth_place`: Birthplace of the individual
#' - `death_caus`: Cause of death
#' - `death_date`: Death date of the individual
#' - `death_lat`: Latitude of the place of death
#' - `death_long`: Longitude of the place of death
#' - `death_place`: Place of death of the individual
#' - `attribute_caste`: Caste of the individual
#' - `attribute_children`: Number of children of the individual
#' - `attribute_description`: Description of the individual
#' - `attribute_education`: Education of the individual
#' - `attribute_idnumber`: Identification number of the individual
#' - `attribute_marriages`: Number of marriages of the individual
#' - `attribute_nationality`: Nationality of the individual
#' - `attribute_occupation`: Occupation of the individual
#' - `attribute_property`: Property owned by the individual
#' - `attribute_religion`: Religion of the individual
#' - `attribute_residence`: Residence of the individual
#' - `attribute_ssn`: Social security number of the individual
#' - `attribute_title`: Title of the individual
#' - `FAMC`: ID(s) of the family where the individual is a child
#' - `FAMS`: ID(s) of the family where the individual is a spouse
#' @keywords internal
.readGedcom.legacy <- function(file_path,
                               verbose = FALSE,
                               add_parents = TRUE,
                               remove_empty_cols = TRUE,
                               combine_cols = TRUE,
                               skinny = FALSE,
                               update_rate = 1000,
                               post_process = TRUE,
                               ...) {
  # Checks
  if (!file.exists(file_path)) stop("File does not exist: ", file_path)
  if (verbose) {
    print(paste("Reading file:", file_path))
  }
  # One GEDCOM line per row, in column X1.
  file <- data.frame(X1 = readLines(file_path))
  file_length <- nrow(file)
  if (verbose) {
    print(paste0("File is ", file_length, " lines long"))
  }

  # Count the number of rows containing specific patterns; the counts let the
  # per-line loop below skip tags that never occur anywhere in the file.
  num_rows <- .countPatternRows.legacy(file)

  # List of variables to initialize (grouped for readability; only the
  # flattened names are used).
  var_names <- list(
    identifiers = c("id", "momID", "dadID"),
    names = c(
      "name", "name_given", "name_given_pieces",
      "name_surn", "name_surn_pieces", "name_marriedsurn",
      "name_nick", "name_npfx", "name_nsfx"
    ),
    sex = c("sex"),
    birth = c("birth_date", "birth_lat", "birth_long", "birth_place"),
    death = c(
      "death_caus", "death_date",
      "death_lat", "death_long", "death_place"
    ),
    attributes = c(
      "attribute_caste", "attribute_children", "attribute_description",
      "attribute_education", "attribute_idnumber", "attribute_marriages",
      "attribute_nationality", "attribute_occupation",
      "attribute_property", "attribute_religion", "attribute_residence",
      "attribute_ssn", "attribute_title"
    ),
    relationships = c("FAMC", "FAMS")
  )

  all_var_names <- unlist(var_names, use.names = FALSE)

  # Initialize all variables to NA
  vars <- stats::setNames(
    as.list(rep(NA_character_, length(all_var_names))),
    all_var_names
  )

  # Accumulator starts with one all-NA row (from the empty matrix); it is
  # dropped later by the !is.na(id) filter.
  df_temp <- as.data.frame(matrix(nrow = 1, ncol = length(all_var_names)))
  names(df_temp) <- all_var_names

  if (verbose) {
    print("Parsing GEDCOM file")
  }
  for (i in 1:length(file[1][[1]])) {
    tmpv <- file[1][[1]][[i]]

    # A new "@ INDI" header flushes the previous individual's record and
    # starts a fresh one.
    if (grepl("@ INDI", tmpv)) {
      line_to_write <- as.data.frame(vars)
      df_temp <- rbind(df_temp, line_to_write)

      # Reset all variables to NA
      vars <- stats::setNames(as.list(rep(
        NA_character_,
        length(all_var_names)
      )), all_var_names)

      vars$id <- stringr::str_extract(tmpv, "(?<=@.)\\d*(?=@)")
      next
    }

    # names: full NAME line is split into given/surname around the slashes.
    if (num_rows$num_name_rows > 0 && grepl(" NAME", tmpv)) {
      vars$name <- .extract_info.legacy(tmpv, "NAME")
      vars$name_given <- stringr::str_extract(vars$name, ".*(?= /)")
      vars$name_surn <- stringr::str_extract(vars$name, "(?<=/).*(?=/)")
      vars$name <- stringr::str_squish(stringr::str_replace(vars$name, "/", " "))
      next
    }
    # PERSONAL_NAME_PIECES := NAME | NPFX | GIVN | NICK | SPFX | SURN | NSFX
    result <- .process_tag.legacy("GIVN", "name_given_pieces", num_rows, tmpv, vars)
    vars <- result$vars
    if (result$matched) next

    # npfx := Name Prefix
    result <- .process_tag.legacy("NPFX", "name_npfx", num_rows, tmpv, vars)
    vars <- result$vars
    if (result$matched) next

    # NICK := Nickname
    result <- .process_tag.legacy("NICK", "name_nick", num_rows, tmpv, vars)
    vars <- result$vars
    if (result$matched) next

    # surn := Surname
    result <- .process_tag.legacy("SURN", "name_surn_pieces", num_rows, tmpv, vars)
    vars <- result$vars
    if (result$matched) next

    # nsfx := Name suffix
    result <- .process_tag.legacy("NSFX", "name_nsfx", num_rows, tmpv, vars)
    vars <- result$vars
    if (result$matched) next

    result <- .process_tag.legacy("_MARNM", "name_marriedsurn", num_rows, tmpv, vars)
    vars <- result$vars
    if (result$matched) next

    # Birth event related information, read from fixed line offsets after
    # the BIRT line (DATE at +1, PLAC at +2, LATI at +4, LONG at +5).
    if (num_rows$num_birt_rows > 0 && grepl(" BIRT", tmpv)) {
      if (num_rows$num_date_rows > 0 && i + 1 <= file_length) {
        vars$birth_date <- .extract_info.legacy(file[1][[1]][[i + 1]], "DATE")
        if (num_rows$num_plac_rows > 0 && i + 2 <= file_length) {
          vars$birth_place <- .extract_info.legacy(file[1][[1]][[i + 2]], "PLAC")
          if (num_rows$num_lati_rows > 0 && i + 4 <= file_length) {
            vars$birth_lat <- .extract_info.legacy(file[1][[1]][[i + 4]], "LATI")
            if (num_rows$num_long_rows > 0 && i + 5 <= file_length) {
              vars$birth_long <- .extract_info.legacy(file[1][[1]][[i + 5]], "LONG")
            }
          }
        }
      }
      next
    }

    # Death event related information
    # the ifs are nested so that there is no need to check if you've already run out of
    # file lines once an outer offset is past the end.
    if (num_rows$num_deat_rows > 0 && grepl(" DEAT", tmpv)) {
      if (num_rows$num_date_rows > 0 && i + 1 <= file_length) {
        vars$death_date <- .extract_info.legacy(file[1][[1]][[i + 1]], "DATE")
        if (num_rows$num_plac_rows > 0 && i + 2 <= file_length) {
          vars$death_place <- .extract_info.legacy(file[1][[1]][[i + 2]], "PLAC")
          if (num_rows$num_caus_rows > 0 && i + 3 <= file_length) {
            vars$death_caus <- .extract_info.legacy(file[1][[1]][[i + 3]], "CAUS")
            if (num_rows$num_lati_rows > 0 && i + 4 <= file_length) {
              vars$death_lat <- .extract_info.legacy(file[1][[1]][[i + 4]], "LATI")
              if (num_rows$num_long_rows > 0 && i + 5 <= file_length) {
                vars$death_long <- .extract_info.legacy(file[1][[1]][[i + 5]], "LONG")
              }
            }
          }
        }
      }
      next
    }


    # Attribute tags using process_tag()
    # NOTE(review): the `next` at the bottom of this loop advances the INNER
    # tag loop, not the outer line loop, so the FAMC/FAMS checks below still
    # run even after a tag matches here.
    for (tag_field in list(
      c("SEX", "sex"),

      # CAST caste
      # g7:CAST The name of an individual's rank or status in society which is sometimes based on racial or religious differences, or differences in wealth, inherited rank, profession, or occupation.
      c("CAST", "attribute_caste"),

      # DSCR physical description
      # g7:DSCR The physical characteristics of a person.
      c("DSCR", "attribute_description"),

      # EDUC education
      # g7:EDUC Indicator of a level of education attained.
      c("EDUC", "attribute_education"),

      # IDNO identifying number
      # g7:IDNO A number or other string assigned to identify a person within some significant external system. It must have a TYPE substructure to define what kind of identification number is being provided.
      c("IDNO", "attribute_idnumber"),

      # NATI nationality
      # g7:NATI An individual's national heritage or origin, or other folk, house, kindred, lineage, or tribal interest.
      c("NATI", "attribute_nationality"),

      # NCHI number of children
      # g7:INDI-NCHI The number of children that this person is known to be the parent of (all marriages).
      c("NCHI", "attribute_children"),

      # NMR number of marriages
      # g7:NMR The number of times this person has participated in a family as a spouse or parent.
      c("NMR", "attribute_marriages"),

      # OCCU occupation
      # g7:OCCU The type of work or profession of an individual.
      c("OCCU", "attribute_occupation"),

      # PROP property
      # g7:PROP Pertaining to possessions such as real estate or other property of interest.
      c("PROP", "attribute_property"),

      # RELI religion
      # g7:INDI-RELI A religious denomination to which a person is affiliated or for which a record applies.
      c("RELI", "attribute_religion"),

      # RESI residence
      # g7:INDI-RESI An address or place of residence where an individual resided.
      c("RESI", "attribute_residence"),

      # SSN social security number
      # g7:SSN A number assigned by the United States Social Security Administration, used for tax identification purposes. It is a type of IDNO.
      c("SSN", "attribute_ssn"),

      # TITL title
      # g7:INDI-TITL A formal designation used by an individual in connection with positions of royalty or other social status, such as Grand Duke.
      c("TITL", "attribute_title")
    )) {
      result <- .process_tag.legacy(tag_field[1], tag_field[2], num_rows, tmpv, vars)
      vars <- result$vars
      if (result$matched) next
    }

    # relationship data
    # g7:INDI-FAMC
    ## The family in which an individual appears as a child. It is also used with a g7:FAMC-STAT substructure to show individuals who are not children of the family. See FAMILY_RECORD for more details.
    result <- .process_tag.legacy("FAMC", "FAMC", num_rows, tmpv, vars,
      extractor = function(x) stringr::str_extract(x, "(?<=@.)\\d*(?=@)"),
      mode = "append"
    )
    vars <- result$vars
    if (result$matched) next

    # FAMS (Family spouse) g7:FAMS
    # The family in which an individual appears as a partner. See FAMILY_RECORD for more details.
    result <- .process_tag.legacy("FAMS", "FAMS", num_rows, tmpv, vars,
      extractor = function(x) stringr::str_extract(x, "(?<=@.)\\d*(?=@)"),
      mode = "append"
    )
    vars <- result$vars
    if (result$matched) next

    if (verbose && i %% update_rate == 0) {
      cat("Processed", i, "lines\n")
    }
  }
  # write final file line (records are flushed on the NEXT "@ INDI" header,
  # so the last individual must be flushed explicitly)
  line_to_write <- as.data.frame(vars)
  df_temp <- rbind(df_temp, line_to_write)
  # Drops the initial all-NA placeholder row as well as any empty flushes.
  df_temp <- df_temp[!is.na(df_temp$id), ]

  if (verbose) {
    print(paste0("File has ", nrow(df_temp), " people"))
  }
  if (nrow(df_temp) == 0) {
    warning("No people found in file")
    return(NULL)
  }
  if (nrow(df_temp) != num_rows$num_indi_rows) {
    warning("The number of people found in the processed file does not match the number of individuals raw data")
  }

  if (post_process) {
    if (verbose) {
      print("Post-processing data frame")
    }
    df_temp <- .postProcessGedcom.legacy(
      df_temp = df_temp,
      remove_empty_cols = remove_empty_cols,
      combine_cols = combine_cols,
      add_parents = add_parents,
      skinny = skinny,
      verbose = verbose
    )
  }

  return(df_temp)
}

#' Post-process GEDCOM Data Frame
#'
#' @inheritParams .readGedcom.legacy
#' @inheritParams .mapFAMS2parents.legacy
#' @return A data frame with processed information.

.postProcessGedcom.legacy <- function(df_temp,
                                      remove_empty_cols = TRUE,
                                      combine_cols = TRUE,
                                      add_parents = TRUE,
                                      skinny = TRUE,
                                      verbose = FALSE) {
  # Add mom and dad ids
  if (add_parents) {
    if (verbose) {
      print("Processing parents")
    }
    df_temp <- .processParents.legacy(df_temp, datasource = "gedcom")
  }

  if (combine_cols) {
    df_temp <- .collapseNames.legacy(verbose = verbose, df_temp = df_temp)
  }

  if (remove_empty_cols) {
    # Remove empty columns
    # NOTE(review): no drop = FALSE here, so a single surviving column would
    # collapse the data frame to a bare vector — confirm whether legacy
    # callers rely on this.
    if (verbose) {
      print("Removing empty columns")
    }
    df_temp <- df_temp[, colSums(is.na(df_temp)) < nrow(df_temp)]
  }
  if (skinny) {
    if (verbose) {
      print("Slimming down the data frame")
    }
    df_temp <- df_temp[, colSums(is.na(df_temp)) < nrow(df_temp)]
    df_temp$FAMC <- NULL
    df_temp$FAMS <- NULL
  }

  return(df_temp)
}

#' Create a mapping of family IDs to parent IDs
#'
#' This function creates a mapping from family IDs to the IDs of the parents.
#'
#' @param df_temp A data frame containing information about individuals.
#' @return A list mapping family IDs to parent IDs.
#' @keywords internal
#'
.mapFAMS2parents.legacy <- function(df_temp) {
  if (!all(c("FAMS", "sex") %in% colnames(df_temp))) {
    warning("The data frame does not contain the necessary columns (FAMS, sex)")
    return(NULL)
  }
  family_to_parents <- list()
  for (i in 1:nrow(df_temp)) {
    if (!is.na(df_temp$FAMS[i])) {
      # An individual can be a spouse in several families ("10, 11").
      fams_ids <- unlist(strsplit(df_temp$FAMS[i], ", "))
      for (fams_id in fams_ids) {
        # Both branches perform the same assignment; the split only decides
        # whether the family entry already exists.
        if (!is.null(family_to_parents[[fams_id]])) {
          if (df_temp$sex[i] == "M") {
            family_to_parents[[fams_id]]$father <- df_temp$id[i]
          } else if (df_temp$sex[i] == "F") {
            family_to_parents[[fams_id]]$mother <- df_temp$id[i]
          }
        } else {
          family_to_parents[[fams_id]] <- list()
          if (df_temp$sex[i] == "M") {
            family_to_parents[[fams_id]]$father <- df_temp$id[i]
          } else if (df_temp$sex[i] == "F") {
            family_to_parents[[fams_id]]$mother <- df_temp$id[i]
          }
        }
      }
    }
  }
  return(family_to_parents)
}

#' Assign momID and dadID based on family mapping
#'
#' This function assigns mother and father IDs to individuals in the data frame
#' based on the mapping of family IDs to parent IDs.
#'
#' @param df_temp A data frame containing individual information.
#' @param family_to_parents A list mapping family IDs to parent IDs.
#' @return A data frame with added momID and dadID columns.
#' @keywords internal
.mapFAMC2parents.legacy <- function(df_temp, family_to_parents) {
  df_temp$momID <- NA_character_
  df_temp$dadID <- NA_character_
  for (i in 1:nrow(df_temp)) {
    if (!is.na(df_temp$FAMC[i])) {
      # An individual can be a child in several families ("10, 11").
      famc_ids <- unlist(strsplit(df_temp$FAMC[i], ", "))
      for (famc_id in famc_ids) {
        if (!is.null(family_to_parents[[famc_id]])) {
          if (!is.null(family_to_parents[[famc_id]]$father)) {
            df_temp$dadID[i] <- family_to_parents[[famc_id]]$father
          }
          if (!is.null(family_to_parents[[famc_id]]$mother)) {
            df_temp$momID[i] <- family_to_parents[[famc_id]]$mother
          }
        }
      }
    }
  }
  return(df_temp)
}

#' Process parents information
#'
#' This function processes the dataframe to add momID and dadID columns.
#'
#' @param df_temp A data frame containing information about individuals.
#' @param datasource Character string indicating the data source ("gedcom" or "wiki").
#' @return A data frame with added momID and dadID columns.
#' @keywords internal
.processParents.legacy <- function(df_temp, datasource) {
  # Ensure required columns are present
  if (datasource == "gedcom") {
    required_cols <- c("FAMC", "sex", "FAMS")
  } else if (datasource == "wiki") {
    required_cols <- c("id")
  } else {
    stop("Invalid datasource")
  }

  if (!all(required_cols %in% colnames(df_temp))) {
    missing_cols <- setdiff(required_cols, colnames(df_temp))
    warning("Missing necessary columns: ", paste(missing_cols, collapse = ", "))
    return(df_temp)
  }

  # First map family IDs to parents, then push those parents onto children.
  family_to_parents <- .mapFAMS2parents.legacy(df_temp)
  if (is.null(family_to_parents) || length(family_to_parents) == 0) {
    return(df_temp)
  }
  df_temp <- .mapFAMC2parents.legacy(df_temp, family_to_parents)
  return(df_temp)
}



#' Extract Information from Line
#'
#' This function extracts information from a line based on a specified type.
#' @param line A character string representing a line from a GEDCOM file.
#' @param type A character string representing the type of information to extract.
#' @return A character string with the extracted information.
#' @keywords internal
.extract_info.legacy <- function(line, type) {
  # Lookbehind grabs everything after "<type> "; str_squish trims and
  # collapses internal whitespace. Returns NA when the tag is absent.
  stringr::str_squish(stringr::str_extract(line, paste0("(?<=", type, " ).+")))
}

#' Combine Columns
#'
#' This function combines two columns, handling conflicts and merging non-conflicting data.
#' @param col1 The first column to combine.
#' @param col2 The second column to combine.
#' @return A list with the combined column and a flag indicating if the second column should be retained.
#' @keywords internal
# Helper function to check for conflicts and merge columns
.combine_columns.legacy <- function(col1, col2) {
  col1_lower <- stringr::str_to_lower(col1)
  col2_lower <- stringr::str_to_lower(col2)

  # Check if there are any conflicting non-NA values (case-insensitive)
  conflicts <- !is.na(col1_lower) & !is.na(col2_lower) & col1_lower != col2_lower

  if (any(conflicts)) {
    warning("Columns have conflicting values. They were not merged.")
    return(list(combined = col1, retain_col2 = TRUE)) # Indicate to retain col2
  } else {
    combined <- ifelse(is.na(col1), col2, col1)
    return(list(combined = combined, retain_col2 = FALSE))
  }
}

#' Check for Pattern Rows
#'
#' This function counts the number of rows containing specific patterns.
#' @param file A data frame containing the GEDCOM file.
#' @return A list with the number of rows containing each pattern.
#' @keywords internal
#'
.countPatternRows.legacy <- function(file) {
  # Count the number of rows containing specific patterns; each element of
  # the returned list keeps the pattern string as its name attribute.
  pattern_counts <- sapply(
    c(
      "@ INDI", " NAME", " GIVN", " NPFX", " NICK", " SURN", " NSFX", " _MARNM",
      " BIRT", " DEAT", " SEX", " CAST", " DSCR", " EDUC", " IDNO", " NATI",
      " NCHI", " NMR", " OCCU", " PROP", " RELI", " RESI", " SSN", " TITL",
      " FAMC", " FAMS", " PLAC", " LATI", " LONG", " DATE", " CAUS"
    ),
    function(pat) sum(grepl(pat, file$X1))
  )
  num_rows <- list(
    num_indi_rows = pattern_counts["@ INDI"],
    num_name_rows = pattern_counts[" NAME"],
    num_givn_rows = pattern_counts[" GIVN"],
    num_npfx_rows = pattern_counts[" NPFX"],
    num_nick_rows = pattern_counts[" NICK"],
    num_surn_rows = pattern_counts[" SURN"],
    num_nsfx_rows = pattern_counts[" NSFX"],
    num_marnm_rows = pattern_counts[" _MARNM"],
    num_birt_rows = pattern_counts[" BIRT"],
    num_deat_rows = pattern_counts[" DEAT"],
    num_sex_rows = pattern_counts[" SEX"],
    num_cast_rows = pattern_counts[" CAST"],
    num_dscr_rows = pattern_counts[" DSCR"],
    num_educ_rows = pattern_counts[" EDUC"],
    num_idno_rows = pattern_counts[" IDNO"],
    num_nati_rows = pattern_counts[" NATI"],
    num_nchi_rows = pattern_counts[" NCHI"],
    num_nmr_rows = pattern_counts[" NMR"],
    num_occu_rows = pattern_counts[" OCCU"],
    num_prop_rows = pattern_counts[" PROP"],
    num_reli_rows = pattern_counts[" RELI"],
    num_resi_rows = pattern_counts[" RESI"],
    num_ssn_rows = pattern_counts[" SSN"],
    num_titl_rows = pattern_counts[" TITL"],
    num_famc_rows = pattern_counts[" FAMC"],
    num_fams_rows = pattern_counts[" FAMS"],
    num_plac_rows = pattern_counts[" PLAC"],
    num_lati_rows = pattern_counts[" LATI"],
    num_long_rows = pattern_counts[" LONG"],
    num_date_rows = pattern_counts[" DATE"],
    num_caus_rows = pattern_counts[" CAUS"]
  )
  return(num_rows)
}

#' Process a GEDCOM Tag
#'
#' Extracts and assigns a value to a specified field in `vars` if the pattern is present.
#' Returns both the updated variable list and a flag indicating whether the tag was matched.
#'
#' @param tag The GEDCOM tag (e.g., "SEX", "CAST", etc.).
#' @param field_name The name of the variable to assign to in `vars`.
#' @param pattern_rows Output from `countPatternRows()`.
#' @param line The GEDCOM line to parse.
#' @param vars The current list of variables to update.
#' @param extractor Optional function applied to `line` instead of the default
#'   `.extract_info.legacy(line, tag)`.
#' @param mode Either "replace" (default) or "append"; "append" joins the new
#'   value onto an existing non-NA value with ", ".
#' @return A list with updated `vars` and a `matched` flag.
#' @keywords internal
#'
.process_tag.legacy <- function(tag, field_name, pattern_rows, line, vars,
                                extractor = NULL, mode = "replace") {
  count_name <- paste0("num_", tolower(tag), "_rows")
  matched <- FALSE
  # Only attempt extraction when the tag occurs somewhere in the file
  # (pre-count) AND on this particular line.
  if (!is.null(pattern_rows[[count_name]]) &&
    pattern_rows[[count_name]] > 0 &&
    grepl(paste0(" ", tag), line)) {
    value <- if (is.null(extractor)) .extract_info.legacy(line, tag) else extractor(line)

    if (mode == "append" && !is.na(vars[[field_name]])) {
      vars[[field_name]] <- paste0(vars[[field_name]], ", ", value)
    } else {
      vars[[field_name]] <- value
    }

    matched <- TRUE
  }
  return(list(vars = vars, matched = matched))
}

#' collapse Names
#'
#' This function combines the `name_given` and `name_given_pieces` columns in a data frame.
#'
#' @inheritParams .readGedcom.legacy
#' @param df_temp A data frame containing the columns to be combined.
#' @keywords internal
.collapseNames.legacy <- function(verbose, df_temp) {
  if (verbose) {
    print("Combining Duplicate Columns")
  }
  # need to check if any values aren't NA in name_given_pieces and name_surn_pieces

  # Combine `name_given` and `name_given_pieces`
  if (!all(is.na(df_temp$name_given_pieces)) | !all(is.na(df_temp$name_given))) {
    result <- .combine_columns.legacy(df_temp$name_given, df_temp$name_given_pieces)
    df_temp$name_given <- result$combined
    if (!result$retain_col2) {
      df_temp$name_given_pieces <- NULL
    }
  }

  # Combine `name_surn` and `name_surn_pieces`
  if (!all(is.na(df_temp$name_surn_pieces)) | !all(is.na(df_temp$name_surn))) {
    result <- .combine_columns.legacy(df_temp$name_surn, df_temp$name_surn_pieces)
    df_temp$name_surn <- result$combined
    if (!result$retain_col2) {
      df_temp$name_surn_pieces <- NULL
    }
  }
  return(df_temp)
}

# NOTE(review): at this point the original changeset deletes R/readPedigree.R
# (752 lines); its exported readGedcom() implementation was superseded by
# R/readGedcom.R and the .readGedcom.legacy() copy above. The deleted file's
# content is not reproduced here.
-#' @return A data frame containing information about individuals, with the following potential columns: -#' - `id`: ID of the individual -#' - `momID`: ID of the individual's mother -#' - `dadID`: ID of the individual's father -#' - `sex`: Sex of the individual -#' - `name`: Full name of the individual -#' - `name_given`: First name of the individual -#' - `name_surn`: Last name of the individual -#' - `name_marriedsurn`: Married name of the individual -#' - `name_nick`: Nickname of the individual -#' - `name_npfx`: Name prefix -#' - `name_nsfx`: Name suffix -#' - `birth_date`: Birth date of the individual -#' - `birth_lat`: Latitude of the birthplace -#' - `birth_long`: Longitude of the birthplace -#' - `birth_place`: Birthplace of the individual -#' - `death_caus`: Cause of death -#' - `death_date`: Death date of the individual -#' - `death_lat`: Latitude of the place of death -#' - `death_long`: Longitude of the place of death -#' - `death_place`: Place of death of the individual -#' - `attribute_caste`: Caste of the individual -#' - `attribute_children`: Number of children of the individual -#' - `attribute_description`: Description of the individual -#' - `attribute_education`: Education of the individual -#' - `attribute_idnumber`: Identification number of the individual -#' - `attribute_marriages`: Number of marriages of the individual -#' - `attribute_nationality`: Nationality of the individual -#' - `attribute_occupation`: Occupation of the individual -#' - `attribute_property`: Property owned by the individual -#' - `attribute_religion`: Religion of the individual -#' - `attribute_residence`: Residence of the individual -#' - `attribute_ssn`: Social security number of the individual -#' - `attribute_title`: Title of the individual -#' - `FAMC`: ID(s) of the family where the individual is a child -#' - `FAMS`: ID(s) of the family where the individual is a spouse -#' @export -readGedcom <- function(file_path, - verbose = FALSE, - add_parents = TRUE, - 
remove_empty_cols = TRUE, - combine_cols = TRUE, - skinny = FALSE) { - # Checks - if (!file.exists(file_path)) stop("File does not exist: ", file_path) - if (verbose) { - print(paste("Reading file:", file_path)) - } - file <- data.frame(X1 = readLines(file_path)) - file_length <- nrow(file) - if (verbose) { - print(paste0("File is ", file_length, " lines long")) - } - - # Count the number of rows containing specific patterns - num_rows <- countPatternRows(file) - - # List of variables to initialize - var_names <- list( - identifiers = c("id", "momID", "dadID"), - names = c( - "name", "name_given", "name_given_pieces", - "name_surn", "name_surn_pieces", "name_marriedsurn", "name_nick", "name_npfx", "name_nsfx" - ), - sex = c("sex"), - birth = c("birth_date", "birth_lat", "birth_long", "birth_place"), - death = c("death_caus", "death_date", "death_lat", "death_long", "death_place"), - attributes = c( - "attribute_caste", "attribute_children", "attribute_description", - "attribute_education", "attribute_idnumber", "attribute_marriages", - "attribute_nationality", "attribute_occupation", - "attribute_property", "attribute_religion", "attribute_residence", - "attribute_ssn", "attribute_title" - ), - relationships = c("FAMC", "FAMS") - ) - - all_var_names <- unlist(var_names, use.names = FALSE) - - # Initialize all variables to NA - vars <- stats::setNames(as.list(rep(NA_character_, length(all_var_names))), all_var_names) - - df_temp <- as.data.frame(matrix(nrow = 1, ncol = length(all_var_names))) - names(df_temp) <- all_var_names - - if (verbose) { - print("Parsing GEDCOM file") - } - for (i in 1:length(file[1][[1]])) { - tmpv <- file[1][[1]][[i]] - - if (grepl("@ INDI", tmpv)) { - line_to_write <- as.data.frame(vars) - df_temp <- rbind(df_temp, line_to_write) - - # Reset all variables to NA - vars <- stats::setNames(as.list(rep(NA_character_, length(all_var_names))), all_var_names) - - vars$id <- stringr::str_extract(tmpv, "(?<=@.)\\d*(?=@)") - next - } - - # names - 
if (num_rows$num_name_rows > 0 && grepl(" NAME", tmpv)) { - vars$name <- extract_info(tmpv, "NAME") - vars$name_given <- stringr::str_extract(vars$name, ".*(?= /)") - vars$name_surn <- stringr::str_extract(vars$name, "(?<=/).*(?=/)") - vars$name <- stringr::str_squish(stringr::str_replace(vars$name, "/", " ")) - next - } - # PERSONAL_NAME_PIECES := NAME | NPFX | GIVN | NICK | SPFX | SURN | NSFX - if (num_rows$num_givn_rows > 0 && grepl(" GIVN", tmpv)) { - vars$name_given_pieces <- extract_info(tmpv, "GIVN") - next - } - - # npfx := Name Prefix - if (num_rows$num_npfx_rows > 0 && grepl(" NPFX", tmpv)) { - vars$name_npfx <- extract_info(tmpv, "NPFX") - next - } - - # NICK := Nickname - if (num_rows$num_nick_rows > 0 && grepl(" NICK", tmpv)) { - vars$name_nick <- extract_info(tmpv, "NICK") - next - } - - # surn := Surname - if (num_rows$num_surn_rows > 0 && grepl(" SURN", tmpv)) { - vars$name_surn_pieces <- extract_info(tmpv, "SURN") - next - } - - # nsfx := Name suffix - if (num_rows$num_nsfx_rows > 0 && grepl(" NSFX", tmpv)) { - vars$name_nsfx <- extract_info(tmpv, "NSFX") - next - } - if (num_rows$num_marnm_rows > 0 && grepl(" _MARNM", tmpv)) { - vars$name_marriedsurn <- extract_info(tmpv, "_MARNM") - next - } - # Birth event related information - if (num_rows$num_birt_rows > 0 && grepl(" BIRT", tmpv)) { - if (num_rows$num_date_rows > 0 && i + 1 <= file_length) { - vars$birth_date <- extract_info(file[1][[1]][[i + 1]], "DATE") - if (num_rows$num_plac_rows > 0 && i + 2 <= file_length) { - vars$birth_place <- extract_info(file[1][[1]][[i + 2]], "PLAC") - if (num_rows$num_lati_rows > 0 && i + 4 <= file_length) { - vars$birth_lat <- extract_info(file[1][[1]][[i + 4]], "LATI") - if (num_rows$num_long_rows > 0 && i + 5 <= file_length) { - vars$birth_long <- extract_info(file[1][[1]][[i + 5]], "LONG") - } - } - } - } - next - } - - # Death event related information - # the ifs are nested so that there is no need to check if you've already run out of - if 
(num_rows$num_deat_rows > 0 && grepl(" DEAT", tmpv)) { - if (num_rows$num_date_rows > 0 && i + 1 <= file_length) { - vars$death_date <- extract_info(file[1][[1]][[i + 1]], "DATE") - if (num_rows$num_plac_rows > 0 && i + 2 <= file_length) { - vars$death_place <- extract_info(file[1][[1]][[i + 2]], "PLAC") - if (num_rows$num_caus_rows > 0 && i + 3 <= file_length) { - vars$death_caus <- extract_info(file[1][[1]][[i + 3]], "CAUS") - if (num_rows$num_lati_rows > 0 && i + 4 <= file_length) { - vars$death_lat <- extract_info(file[1][[1]][[i + 4]], "LATI") - if (num_rows$num_long_rows > 0 && i + 5 <= file_length) { - vars$death_long <- extract_info(file[1][[1]][[i + 5]], "LONG") - } - } - } - } - } - next - } - - if (grepl(" SEX", tmpv)) { - vars$sex <- extract_info(tmpv, "SEX") - next - } - - # Individual Attributes - - # CAST caste - # g7:CAST The name of an individual’s rank or status in society which is sometimes based on racial or religious differences, or differences in wealth, inherited rank, profession, or occupation. - if (num_rows$num_cast_rows > 0 && grepl(" CAST", tmpv)) { - vars$attribute_caste <- extract_info(tmpv, "CAST") - next - } - # DSCR physical description - # g7:DSCR The physical characteristics of a person. - if (num_rows$num_dscr_rows > 0 && grepl(" DSCR", tmpv)) { - vars$attribute_description <- extract_info(tmpv, "DSCR") - next - } - # EDUC education - # g7:EDUC Indicator of a level of education attained. - if (num_rows$num_educ_rows > 0 && grepl(" EDUC", tmpv)) { - vars$attribute_education <- extract_info(tmpv, "EDUC") - next - } - # IDNO identifying number - # g7:IDNO A number or other string assigned to identify a person within some significant external system. It must have a TYPE substructure to define what kind of identification number is being provided. 
- if (num_rows$num_idno_rows > 0 && grepl(" IDNO", tmpv)) { - vars$attribute_idnumber <- extract_info(tmpv, "IDNO") - next - } - # NATI nationality - # g7:NATI An individual’s national heritage or origin, or other folk, house, kindred, lineage, or tribal interest. - if (num_rows$num_nati_rows > 0 && grepl(" NATI", tmpv)) { - vars$attribute_nationality <- extract_info(tmpv, "NATI") - next - } - # NCHI number of children - # g7:INDI-NCHI The number of children that this person is known to be the parent of (all marriages). - if (num_rows$num_nchi_rows > 0 && grepl(" NCHI", tmpv)) { - vars$attribute_children <- extract_info(tmpv, "NCHI") - next - } - - # NMR number of marriages - # g7:NMR The number of times this person has participated in a family as a spouse or parent. - if (num_rows$num_nmr_rows > 0 && grepl(" NMR", tmpv)) { - vars$attribute_marriages <- extract_info(tmpv, "NMR") - next - } - - # OCCU occupation - # g7:OCCU The type of work or profession of an individual. - if (num_rows$num_occu_rows > 0 && grepl(" OCCU", tmpv)) { - vars$attribute_occupation <- extract_info(tmpv, "OCCU") - next - } - # PROP property - # g7:PROP Pertaining to possessions such as real estate or other property of interest. - - if (num_rows$num_prop_rows > 0 && grepl(" PROP", tmpv)) { - vars$attribute_property <- extract_info(tmpv, "PROP") - next - } - - # RELI religion - # g7:INDI-RELI A religious denomination to which a person is affiliated or for which a record applies. - if (num_rows$num_reli_rows > 0 && grepl(" RELI", tmpv)) { - vars$attribute_religion <- extract_info(tmpv, "RELI") - next - } - # RESI residence - # g7:INDI-RESI An address or place of residence where an individual resided. - if (num_rows$num_resi_rows > 0 && grepl(" RESI", tmpv)) { - vars$attribute_residence <- extract_info(tmpv, "RESI") - next - } - - # SSN social security number - # g7:SSN A number assigned by the United States Social Security Administration, used for tax identification purposes. 
It is a type of IDNO. - if (num_rows$num_ssn_rows > 0 && grepl(" SSN", tmpv)) { - vars$attribute_ssn <- extract_info(tmpv, "SSN") - next - } - # TITL title - # g7:INDI-TITL A formal designation used by an individual in connection with positions of royalty or other social status, such as Grand Duke. - if (num_rows$num_titl_rows > 0 && grepl(" TITL", tmpv)) { - vars$attribute_title <- extract_info(tmpv, "TITL") - next - } - - # relationship data - # g7:INDI-FAMC - ## The family in which an individual appears as a child. It is also used with a g7:FAMC-STAT substructure to show individuals who are not children of the family. See FAMILY_RECORD for more details. - if (num_rows$num_famc_rows > 0 && grepl(" FAMC", tmpv)) { - if (is.na(vars$FAMC)) { - vars$FAMC <- stringr::str_extract(tmpv, "(?<=@.)\\d*(?=@)") - } else { - vars$FAMC <- paste0(vars$FAMC, ", ", stringr::str_extract(tmpv, "(?<=@.)\\d*(?=@)")) - } - next - } - # FAMS (Family spouse) g7:FAMS - # The family in which an individual appears as a partner. See FAMILY_RECORD for more details. 
- if (num_rows$num_fams_rows > 0 && grepl(" FAMS", tmpv)) { - if (is.na(vars$FAMS)) { - vars$FAMS <- stringr::str_extract(tmpv, "(?<=@.)\\d*(?=@)") - } else { - vars$FAMS <- paste0(vars$FAMS, ", ", stringr::str_extract(tmpv, "(?<=@.)\\d*(?=@)")) - } - next - } - if (verbose && i %% 1000 == 0) { - cat("Processed", i, "lines\n") - } - } - # write final file line - line_to_write <- as.data.frame(vars) - df_temp <- rbind(df_temp, line_to_write) - df_temp <- df_temp[!is.na(df_temp$id), ] - - if (verbose) { - print(paste0("File has ", nrow(df_temp), " people")) - } - if (nrow(df_temp) == 0) { - warning("No people found in file") - return(NULL) - } - if (nrow(df_temp) != num_rows$num_indi_rows) { - warning("The number of people found in the processed file does not match the number of individuals raw data") - } - # Add mom and dad ids - if (add_parents) { - if (verbose) { - print("Processing parents") - } - df_temp <- processParents(df_temp, datasource = "gedcom") - } - - - - if (combine_cols) { - if (verbose) { - print("Combining Duplicate Columns") - } - # need to check if any values aren't NA in name_given_pieces and name_surn_pieces - # Combine `name_given` and `name_given_pieces` - - # Combine `name_given` and `name_given_pieces` - if (!all(is.na(df_temp$name_given_pieces)) | !all(is.na(df_temp$name_given))) { - result <- combine_columns(df_temp$name_given, df_temp$name_given_pieces) - df_temp$name_given <- result$combined - if (!result$retain_col2) { - df_temp$name_given_pieces <- NULL - } - } - - # Combine `name_surn` and `name_surn_pieces` - if (!all(is.na(df_temp$name_surn_pieces)) | !all(is.na(df_temp$name_surn))) { - result <- combine_columns(df_temp$name_surn, df_temp$name_surn_pieces) - df_temp$name_surn <- result$combined - if (!result$retain_col2) { - df_temp$name_surn_pieces <- NULL - } - } - } - - if (remove_empty_cols) { - # Remove empty columns - if (verbose) { - print("Removing empty columns") - } - df_temp <- df_temp[, colSums(is.na(df_temp)) < 
nrow(df_temp)] - } - if (skinny) { - if (verbose) { - print("Slimming down the data frame") - } - df_temp <- df_temp[, colSums(is.na(df_temp)) < nrow(df_temp)] - df_temp$FAMC <- NULL - df_temp$FAMS <- NULL - } - return(df_temp) -} - - -#' Create a mapping of family IDs to parent IDs -#' -#' This function creates a mapping from family IDs to the IDs of the parents. -#' -#' @param df_temp A data frame containing information about individuals. -#' @return A list mapping family IDs to parent IDs. -#' @keywords internal -createFamilyToParentsMapping <- function(df_temp, datasource) { - if (datasource == "gedcom") { - if (!all(c("FAMS", "sex") %in% colnames(df_temp))) { - warning("The data frame does not contain the necessary columns (FAMS, sex)") - return(NULL) - } - family_to_parents <- list() - for (i in 1:nrow(df_temp)) { - if (!is.na(df_temp$FAMS[i])) { - fams_ids <- unlist(strsplit(df_temp$FAMS[i], ", ")) - for (fams_id in fams_ids) { - if (!is.null(family_to_parents[[fams_id]])) { - if (df_temp$sex[i] == "M") { - family_to_parents[[fams_id]]$father <- df_temp$id[i] - } else if (df_temp$sex[i] == "F") { - family_to_parents[[fams_id]]$mother <- df_temp$id[i] - } - } else { - family_to_parents[[fams_id]] <- list() - if (df_temp$sex[i] == "M") { - family_to_parents[[fams_id]]$father <- df_temp$id[i] - } else if (df_temp$sex[i] == "F") { - family_to_parents[[fams_id]]$mother <- df_temp$id[i] - } - } - } - } - } - } else if (datasource == "wiki") { - message("The data source is not supported") - return(df_temp) - } - return(family_to_parents) -} - -#' Assign momID and dadID based on family mapping -#' -#' This function assigns mother and father IDs to individuals in the data frame -#' based on the mapping of family IDs to parent IDs. -#' -#' @param df_temp A data frame containing individual information. -#' @param family_to_parents A list mapping family IDs to parent IDs. -#' @param datasource A string indicating the data source. Options are "gedcom" and "wiki". 
-#' @return A data frame with added momID and dad_ID columns. -#' @keywords internal -assignParentIDs <- function(df_temp, family_to_parents, datasource) { - df_temp$momID <- NA_character_ - df_temp$dadID <- NA_character_ - if (datasource == "gedcom") { - for (i in 1:nrow(df_temp)) { - if (!is.na(df_temp$FAMC[i])) { - famc_ids <- unlist(strsplit(df_temp$FAMC[i], ", ")) - for (famc_id in famc_ids) { - if (!is.null(family_to_parents[[famc_id]])) { - if (!is.null(family_to_parents[[famc_id]]$father)) { - df_temp$dadID[i] <- family_to_parents[[famc_id]]$father - } - if (!is.null(family_to_parents[[famc_id]]$mother)) { - df_temp$momID[i] <- family_to_parents[[famc_id]]$mother - } - } - } - } - } - return(df_temp) - } else if (datasource == "wiki") { - message("No parents information available for wiki data") - return(df_temp) - } -} - -#' Process parents information -#' -#' This function processes the dataframe to add momID and dadID columns. -#' -#' @param df_temp A data frame containing information about individuals. -#' @return A data frame with added momID and dadID columns. 
-#' @keywords internal -processParents <- function(df_temp, datasource) { - # Ensure required columns are present - if (datasource == "gedcom") { - required_cols <- c("FAMC", "sex", "FAMS") - } else if (datasource == "wiki") { - required_cols <- c("id") - } else { - stop("Invalid datasource") - } - - if (!all(required_cols %in% colnames(df_temp))) { - missing_cols <- setdiff(required_cols, colnames(df_temp)) - warning("Missing necessary columns: ", paste(missing_cols, collapse = ", ")) - return(df_temp) - } - - family_to_parents <- createFamilyToParentsMapping(df_temp, datasource = datasource) - if (is.null(family_to_parents) || length(family_to_parents) == 0) { - return(df_temp) - } - df_temp <- assignParentIDs(df_temp, family_to_parents, datasource = datasource) - return(df_temp) -} - - - -#' Extract Information from Line -#' -#' This function extracts information from a line based on a specified type. -#' @param line A character string representing a line from a GEDCOM file. -#' @param type A character string representing the type of information to extract. -#' @return A character string with the extracted information. -#' @keywords internal -extract_info <- function(line, type) { - stringr::str_squish(stringr::str_extract(line, paste0("(?<=", type, " ).+"))) -} - -#' Combine Columns -#' -#' This function combines two columns, handling conflicts and merging non-conflicting data. -#' @param col1 The first column to combine. -#' @param col2 The second column to combine. -#' @return A list with the combined column and a flag indicating if the second column should be retained. 
-#' @keywords internal -# Helper function to check for conflicts and merge columns -combine_columns <- function(col1, col2) { - col1_lower <- stringr::str_to_lower(col1) - col2_lower <- stringr::str_to_lower(col2) - - # Check if there are any conflicting non-NA values - conflicts <- !is.na(col1_lower) & !is.na(col2_lower) & col1_lower != col2_lower - - if (any(conflicts)) { - warning("Columns have conflicting values. They were not merged.") - return(list(combined = col1, retain_col2 = TRUE)) # Indicate to retain col2 - } else { - combined <- ifelse(is.na(col1), col2, col1) - return(list(combined = combined, retain_col2 = FALSE)) - } -} - -#' Check for Pattern Rows -#' -#' This function counts the number of rows containing specific patterns. -#' @param file A data frame containing the GEDCOM file. -#' @return A list with the number of rows containing each pattern. -#' @keywords internal -#' -countPatternRows <- function(file) { - # Count the number of rows containing specific patterns - pattern_counts <- sapply( - c( - "@ INDI", " NAME", " GIVN", " NPFX", " NICK", " SURN", " NSFX", " _MARNM", - " BIRT", " DEAT", " SEX", " CAST", " DSCR", " EDUC", " IDNO", " NATI", - " NCHI", " NMR", " OCCU", " PROP", " RELI", " RESI", " SSN", " TITL", - " FAMC", " FAMS", " PLAC", " LATI", " LONG", " DATE", " CAUS" - ), - function(pat) sum(grepl(pat, file$X1)) - ) - num_rows <- list( - num_indi_rows = pattern_counts["@ INDI"], - num_name_rows = pattern_counts[" NAME"], - num_givn_rows = pattern_counts[" GIVN"], - num_npfx_rows = pattern_counts[" NPFX"], - num_nick_rows = pattern_counts[" NICK"], - num_surn_rows = pattern_counts[" SURN"], - num_nsfx_rows = pattern_counts[" NSFX"], - num_marnm_rows = pattern_counts[" _MARNM"], - num_birt_rows = pattern_counts[" BIRT"], - num_deat_rows = pattern_counts[" DEAT"], - num_sex_rows = pattern_counts[" SEX"], - num_cast_rows = pattern_counts[" CAST"], - num_dscr_rows = pattern_counts[" DSCR"], - num_educ_rows = pattern_counts[" EDUC"], - 
num_idno_rows = pattern_counts[" IDNO"], - num_nati_rows = pattern_counts[" NATI"], - num_nchi_rows = pattern_counts[" NCHI"], - num_nmr_rows = pattern_counts[" NMR"], - num_occu_rows = pattern_counts[" OCCU"], - num_prop_rows = pattern_counts[" PROP"], - num_reli_rows = pattern_counts[" RELI"], - num_resi_rows = pattern_counts[" RESI"], - num_ssn_rows = pattern_counts[" SSN"], - num_titl_rows = pattern_counts[" TITL"], - num_famc_rows = pattern_counts[" FAMC"], - num_fams_rows = pattern_counts[" FAMS"], - num_plac_rows = pattern_counts[" PLAC"], - num_lati_rows = pattern_counts[" LATI"], - num_long_rows = pattern_counts[" LONG"], - num_date_rows = pattern_counts[" DATE"], - num_caus_rows = pattern_counts[" CAUS"] - ) - return(num_rows) -} - -#' Read Wiki Family Tree -#' -#' @param text A character string containing the text of a family tree in wiki format. -#' @export -readWikifamilytree <- function(text) { - # Extract summary text - - summary_text <- extractSummaryText(text) - # Extract all lines defining the family tree - tree_lines <- unlist(stringr::str_extract_all(text, "\\{\\{familytree.*?\\}\\}")) - tree_lines <- tree_lines[!stringr::str_detect(tree_lines, "start|end")] # Remove start/end markers - tree_lines <- gsub("\\{\\{familytree(.*?)\\}\\}", "\\1", tree_lines) # Remove wrapping markup - - # Convert tree structure into a coordinate grid (preserves symbols!) - tree_df <- parseTree(tree_lines) - - # Identify columns that start with "Y" - cols_to_pivot <- grep("^Y", names(tree_df), value = TRUE) - - # Reshape from wide to long format - tree_long <- makeLongTree(tree_df, cols_to_pivot) - - # Extract member definitions - members_df <- matchMembers(text) - members_df$id <- paste0("P", seq_len(nrow(members_df))) # Assign unique person IDs - - # Merge names into the tree structure (keeping all symbols!) 
- tree_long <- merge(tree_long, members_df, by.x = "Value", by.y = "identifier", all.x = TRUE) - - tree_long$DisplayName <- ifelse(!is.na(tree_long$name), tree_long$name, tree_long$Value) # Use name if available - - # parse relationships and infer them - - relationships_df <- parseRelationships(tree_long) - - # relationships_df <- processParents(tree_long, datasource = "wiki") - - - - # Return structured table of the family tree (symbols included) - list( - summary = summary_text, - members = members_df, - structure = tree_long, - relationships = relationships_df - ) -} - -#' Make Long Tree -#' @param tree_df A data frame containing the tree structure. -#' @param cols_to_pivot A character vector of column names to pivot. -#' @return A long data frame containing the tree structure. -#' @keywords internal -makeLongTree <- function(tree_df, cols_to_pivot) { - tree_long <- stats::reshape(tree_df, - varying = cols_to_pivot, - v.names = "Value", - timevar = "Column", - times = cols_to_pivot, - idvar = setdiff(names(tree_df), cols_to_pivot), - direction = "long" - ) - - tree_long <- tree_long[!is.na(tree_long$Value), ] - tree_long$Value <- stringr::str_trim(tree_long$Value) - tree_long$Column <- as.numeric(gsub("^Y", "", tree_long$Column)) - return(tree_long) -} - -#' Match Members -#' @inheritParams readWikifamilytree -#' @return A data frame containing information about the members of the family tree. 
-#' @keywords internal - -matchMembers <- function(text) { - member_matches <- stringr::str_extract_all(text, "\\|\\s*([A-Za-z0-9]+)\\s*=\\s*([^|}]*)")[[1]] - member_matches <- gsub("\\[|\\]|'''", "", member_matches) # Remove formatting - - members_df <- data.frame( - identifier = stringr::str_trim(stringr::str_extract(member_matches, "^[^=]+")), - name = stringr::str_trim(stringr::str_extract(member_matches, "(?<=\\=).*")), - stringsAsFactors = FALSE - ) - - # Remove leading pipes (`|`) from identifiers for consistency - members_df$identifier <- gsub("^\\|\\s*", "", members_df$identifier) - - # remove summary row - members_df <- members_df[members_df$identifier != "summary", ] - - return(members_df) -} - -#' Extract Summary Text -#' @inheritParams readWikifamilytree -#' @return A character string containing the summary text. -#' @keywords internal -#' @export - -extractSummaryText <- function(text) { - summary_match <- stringr::str_match(text, "\\{\\{familytree/start \\|summary=(.*?)\\}\\}") - summary_text <- ifelse(!is.na(summary_match[, 2]), summary_match[, 2], NA) - return(summary_text) -} - -#' Parse Tree -#' @param tree_lines A character vector containing the lines of the tree structure. -#' @return A data frame containing the tree structure. 
-#' @keywords internal -#' @export - -parseTree <- function(tree_lines) { - tree_matrix <- base::strsplit(tree_lines, "\\|") # Split each row into columns - max_cols <- max(sapply(tree_matrix, length)) # Find the max column count - - # Convert to a data frame (ensures correct structure) - tree_df <- do.call(rbind, lapply(tree_matrix, function(row) { - length(row) <- max_cols # Ensure uniform column length - return(row) - })) - - tree_df <- as.data.frame(tree_df, stringsAsFactors = FALSE) - colnames(tree_df) <- paste0("Y", seq_len(ncol(tree_df))) # Assign column names - tree_df$Row <- seq_len(nrow(tree_df)) # Assign row numbers - return(tree_df) -} - -#' infer relationship from tree template -#' -#' @param tree_long A data frame containing the tree structure in long format. -#' @return A data frame containing the relationships between family members. -#' @keywords internal -#' -parseRelationships <- function(tree_long) { - relationships <- data.frame( - id = tree_long$id, - momID = NA_character_, - dadID = NA_character_, - spouseID = NA_character_, - stringsAsFactors = FALSE - ) - - # Loop through rows to find connections - for (i in seq_len(nrow(tree_long))) { - row <- tree_long[i, ] - - # **Parent-Child Detection** - if (row$Value == "y") { - parent <- tree_long$Value[tree_long$Row == row$Row - 1 & tree_long$Column == row$Column] - child <- tree_long$Value[tree_long$Row == row$Row + 1 & tree_long$Column == row$Column] - - if (length(parent) == 0) parent <- NA - if (length(child) == 0) child <- NA - # Assign mom/dad IDs based on tree structure - if (!is.na(parent) && !is.na(child)) { - relationships$momID[relationships$id == child] <- parent - relationships$dadID[relationships$id == child] <- parent # Assuming one parent detected for now - } - } - - # **Spouse Detection** - if (row$Value == "+") { - spouse1 <- tree_long$Value[tree_long$Row == row$Row & tree_long$Column == row$Column - 1] - spouse2 <- tree_long$Value[tree_long$Row == row$Row & tree_long$Column == 
row$Column + 1] - - if (!is.na(spouse1) && !is.na(spouse2)) { - relationships$spouseID[relationships$id == spouse1] <- spouse2 - relationships$spouseID[relationships$id == spouse2] <- spouse1 - } - } - } - - return(relationships) -} diff --git a/R/readWikifamilytree.R b/R/readWikifamilytree.R new file mode 100644 index 00000000..39f609ea --- /dev/null +++ b/R/readWikifamilytree.R @@ -0,0 +1,383 @@ +#' Read Wiki Family Tree +#' +#' @param text A character string containing the text of a family tree in wiki format. +#' @param verbose A logical value indicating whether to print messages. +#' @param file_path The path to the file containing the family tree. +#' @param ... Additional arguments (not used). +#' +#' @return A list containing the summary, members, structure, and relationships of the family tree. +#' @export +readWikifamilytree <- function(text = NULL, verbose = FALSE, file_path = NULL, ...) { + # Checks + if (is.null(text) && is.null(file_path)) { + stop("Either 'text' or 'file_path' must be provided.") + } + # read from file if provided + if (!is.null(file_path)) { + if (!file.exists(file_path)) stop("File does not exist: ", file_path) + + if (verbose) { + print(paste("Reading file:", file_path)) + } + file <- data.frame(X1 = readLines(file_path)) + file_length <- nrow(file) + if (verbose) { + print(paste0("File is ", file_length, " lines long")) + } + text <- paste0(file$X1, collapse = "\n") + } + # Extract summary text + + summary_text <- getWikiTreeSummary(text) + # Extract all lines defining the family tree + tree_lines <- unlist(stringr::str_extract_all(text, "\\{\\{familytree.*?\\}\\}")) + tree_lines <- tree_lines[!stringr::str_detect(tree_lines, "start|end")] # Remove start/end markers + tree_lines <- gsub("\\{\\{familytree(.*?)\\}\\}", "\\1", tree_lines) # Remove wrapping markup + + # Convert tree structure into a coordinate grid (preserves symbols!) 
+ tree_df <- buildTreeGrid(tree_lines) + + # Identify columns that start with "Y" + cols_to_pivot <- grep("^Y", names(tree_df), value = TRUE) + + # Reshape from wide to long format + tree_long <- convertGrid2LongTree(tree_df, cols_to_pivot) + + # Extract member definitions + members_df <- extractMemberTable(text) + members_df$id <- paste0("P", seq_len(nrow(members_df))) # Assign unique person IDs + + # Merge names into the tree structure (keeping all symbols!) + tree_long <- merge(tree_long, members_df, by.x = "Value", by.y = "identifier", all.x = TRUE) + + tree_long$DisplayName <- ifelse(!is.na(tree_long$name), tree_long$name, tree_long$Value) # Use name if available + + # parse relationships and infer them + tree_paths <- traceTreePaths(tree_long, deduplicate = FALSE) + + parsedRelationships <- parseTreeRelationships(tree_long, tree_paths) + # Return structured table of the family tree (symbols included) + list( + summary = summary_text, + members = members_df, + structure = tree_long, + tree_paths = parsedRelationships$tree_paths, + relationships = parsedRelationships$relationships + ) +} + +#' Make Long Tree +#' @param tree_df A data frame containing the tree structure. +#' @param cols_to_pivot A character vector of column names to pivot. +#' @return A long data frame containing the tree structure. +#' @keywords internal +convertGrid2LongTree <- function(tree_df, cols_to_pivot) { + tree_long <- stats::reshape(tree_df, + varying = cols_to_pivot, + v.names = "Value", + timevar = "Column", + times = cols_to_pivot, + idvar = setdiff(names(tree_df), cols_to_pivot), + direction = "long" + ) + + tree_long <- tree_long[!is.na(tree_long$Value), ] + tree_long$Value <- stringr::str_trim(tree_long$Value) + tree_long$Column <- as.numeric(gsub("^Y", "", tree_long$Column)) + return(tree_long) +} + +#' Match Members +#' @inheritParams readWikifamilytree +#' @return A data frame containing information about the members of the family tree. 
+#' @keywords internal + +extractMemberTable <- function(text) { + member_matches <- stringr::str_extract_all(text, "\\|\\s*([A-Za-z0-9]+)\\s*=\\s*([^|}]*)")[[1]] + member_matches <- gsub("\\[|\\]|'''", "", member_matches) # Remove formatting + + members_df <- data.frame( + identifier = stringr::str_trim(stringr::str_extract(member_matches, "^[^=]+")), + name = stringr::str_trim(stringr::str_extract(member_matches, "(?<=\\=).*")), + stringsAsFactors = FALSE + ) + + # Remove leading pipes (`|`) from identifiers for consistency + members_df$identifier <- gsub("^\\|\\s*", "", members_df$identifier) + + # remove summary row + members_df <- members_df[members_df$identifier != "summary", ] + + return(members_df) +} + +#' Extract Summary Text +#' @inheritParams readWikifamilytree +#' @return A character string containing the summary text. +#' @keywords internal +#' @export + +getWikiTreeSummary <- function(text) { + summary_match <- stringr::str_match(text, "\\{\\{familytree/start \\|summary=(.*?)\\}\\}") + summary_text <- ifelse(!is.na(summary_match[, 2]), summary_match[, 2], NA) + return(summary_text) +} + +#' Parse Tree +#' @param tree_lines A character vector containing the lines of the tree structure. +#' @return A data frame containing the tree structure. 
+#' @keywords internal +#' @export + +buildTreeGrid <- function(tree_lines) { + tree_matrix <- base::strsplit(tree_lines, "\\|") # Split each row into columns + max_cols <- max(sapply(tree_matrix, length)) # Find the max column count + + # Convert to a data frame (ensures correct structure) + tree_df <- do.call(rbind, lapply(tree_matrix, function(row) { + length(row) <- max_cols # Ensure uniform column length + return(row) + })) + + tree_df <- as.data.frame(tree_df, stringsAsFactors = FALSE) + colnames(tree_df) <- paste0("Y", seq_len(ncol(tree_df))) # Assign column names + tree_df$Row <- seq_len(nrow(tree_df)) # Assign row numbers + return(tree_df) +} + +#' infer relationship from tree template +#' +#' @param tree_long A data frame containing the tree structure in long format. +#' @param tree_paths Optional. traceTreePaths output. If NULL, it will be calculated. +#' @return A data frame containing the relationships between family members. +#' @keywords internal +#' +parseTreeRelationships <- function(tree_long, tree_paths = NULL) { + # Check if tree_paths is NULL and call traceTreePaths if necessary + if (is.null(tree_paths)) { + tree_paths <- traceTreePaths(tree_long, deduplicate = FALSE) + } + # Initialize relationships dataframe: one row per unique person + person_ids <- unique(tree_long$id[!is.na(tree_long$id)]) + + # Initialize relationships data frame + relationships <- data.frame( + id = person_ids, + momID = NA_character_, + dadID = NA_character_, + parent_1 = NA_character_, + parent_2 = NA_character_, + spouseID = NA_character_, + stringsAsFactors = FALSE + ) + + tree_paths <- tree_paths[!is.na(tree_paths$from_id) & !is.na(tree_paths$to_id), ] + + # Fill in relationships based on the tree structure + tree_paths$relationship <- NA_character_ + + # map relationships based on the intermediate values + + tree_paths$relationship[ + grepl("\\+", tree_paths$intermediate_values) & + !grepl("y", tree_paths$intermediate_values) + ] <- "spouse" + + # Parent-child: + 
and y both present + tree_paths$relationship[ + grepl("\\+", tree_paths$intermediate_values) & + grepl("y", tree_paths$intermediate_values) + ] <- "offspring" + + tree_paths$relationship[ + is.na(tree_paths$relationship) & grepl("y", tree_paths$intermediate_values) + ] <- "offspring" + + # determine direction + tree_paths$relationship[grepl("^\\+", tree_paths$intermediate_values) & tree_paths$relationship == "offspring"] <- "parent-child" + tree_paths$relationship[grepl("[y\\|]$", tree_paths$intermediate_values) & tree_paths$relationship == "offspring"] <- "parent-child" + tree_paths$relationship[grepl("\\+$", tree_paths$intermediate_values) & tree_paths$relationship == "offspring"] <- "child-parent" + tree_paths$relationship[grepl("^[y\\|]", tree_paths$intermediate_values) & tree_paths$relationship == "offspring"] <- "child-parent" + + # Fill spouse links + spouse_links <- tree_paths[tree_paths$relationship == "spouse", ] + + for (i in seq_len(nrow(spouse_links))) { + a <- spouse_links$from_id[i] + b <- spouse_links$to_id[i] + relationships$spouseID[relationships$id == a] <- b + relationships$spouseID[relationships$id == b] <- a + } + + # Fill parent-child links from directional tags + + pc_links <- tree_paths[tree_paths$relationship == "parent-child", ] + for (i in seq_len(nrow(pc_links))) { + relationships <- populateParents( + df = relationships, + child = pc_links$to_id[i], + parent = pc_links$from_id[i] + ) + } + + # --- Child-parent (to_id = parent) --- + cp_links <- tree_paths[tree_paths$relationship == "child-parent", ] + for (i in seq_len(nrow(cp_links))) { + relationships <- populateParents( + df = relationships, + child = cp_links$from_id[i], + parent = cp_links$to_id[i] + ) + } + + out <- list( + tree_paths = tree_paths, + relationships = relationships + ) + return(out) +} + +#' Assign Parent +#' @param df A data frame containing the relationships. +#' @param child The ID of the child. +#' @param parent The ID of the parent. 
+#' @return A data frame with updated parent information. +#' @keywords internal +populateParents <- function(df, child, parent) { + idx <- which(df$id == child) + if (length(idx) != 1) { + return(df) + } + + if (is.na(df$parent_1[idx])) { + df$parent_1[idx] <- parent + } else if (is.na(df$parent_2[idx]) && df$parent_1[idx] != parent) { + df$parent_2[idx] <- parent + } + return(df) +} + + +#' Trace paths between individuals in a family tree grid +#' +#' @param tree_long A data.frame with columns: Row, Column, Value, id +#' @param deduplicate Logical, if TRUE, will remove duplicate paths +#' @return A data.frame with columns: from_id, to_id, direction, path_length, intermediates +#' @export +#' +traceTreePaths <- function(tree_long, deduplicate = TRUE) { + # Keep only relevant cells (people and path symbols) + path_symbols <- c("|", "-", "+", "v", "^", "y", ",", ".", "`", "!", "~", "x", ")", "(") + tree_long$Value <- gsub("\\s+", "", tree_long$Value) # Remove whitespace + active_cells <- tree_long[!is.na(tree_long$Value) & + (tree_long$Value %in% path_symbols | !is.na(tree_long$id)), ] + + active_cells$key <- paste(active_cells$Row, active_cells$Column, sep = "_") + + edges <- do.call(rbind, lapply(seq_len(nrow(active_cells)), function(i) { + from_key <- active_cells$key[i] + to_keys <- getGridNeighbors(active_cells[i, ], + active_keys = active_cells$key + ) + if (length(to_keys) > 0) { + data.frame(from = from_key, to = to_keys, stringsAsFactors = FALSE) + } + })) + + # Create graph + g <- igraph::graph_from_data_frame(edges, directed = FALSE) + + # Map keys to IDs + person_nodes <- active_cells[!is.na(active_cells$id), c("key", "id")] + id_map <- stats::setNames(person_nodes$id, person_nodes$key) + + # Find all pairs of people and trace paths + person_keys <- names(id_map) + result <- data.frame() + + for (i in seq_along(person_keys)) { + for (j in seq_along(person_keys)) { + if (i == j) next + from_key <- person_keys[i] + to_key <- person_keys[j] + + # skip if 
either endpoint is not in graph + if (!(from_key %in% igraph::V(g)$name) || !(to_key %in% igraph::V(g)$name)) { + next + } + # Find the shortest path between the two keys + sp <- suppressWarnings(igraph::shortest_paths(g, from_key, to_key, output = "vpath")$vpath[[1]]) + if (length(sp) > 1) { + intermediate <- setdiff(names(sp), c(from_key, to_key)) + # Extract values at those intermediate keys + intermediate_values <- sapply(intermediate, function(k) { + cell <- active_cells[active_cells$key == k, ] + if (nrow(cell) > 0) cell$Value else NA + }) + result <- rbind(result, data.frame( + from_id = id_map[[from_key]], + to_id = id_map[[to_key]], + path_length = length(sp) - 1, + intermediates = paste(intermediate, collapse = ";"), + intermediate_values = paste(intermediate_values, collapse = ""), + stringsAsFactors = FALSE + )) + } else { + # If no path found, add a row with NA values + result <- rbind(result, data.frame( + from_id = id_map[[from_key]], + to_id = id_map[[to_key]], + path_length = NA, + intermediates = NA, + intermediate_values = NA, + stringsAsFactors = FALSE + )) + } + } + } + + if (deduplicate == TRUE) { + # Deduplicate pairs + result <- deduplicatePairs(result) + } + + return(result) +} + +#' Build adjacency list (4-way neighbors) +#' +#' @param cell A data frame with columns Row and Column +#' @return A character vector of neighboring cell keys +#' @keywords internal + +getGridNeighbors <- function(cell, active_keys) { + offsets <- list(c(1, 0), c(-1, 0), c(0, 1), c(0, -1)) # down, up, right, left + out <- character() + for (offset in offsets) { + r2 <- cell$Row + offset[1] + c2 <- cell$Column + offset[2] + key2 <- paste(r2, c2, sep = "_") + if (key2 %in% active_keys) { + out <- c(out, key2) + } + } + return(out) +} + +#' Deduplicate pairs of IDs in a data frame +#' +#' @param df A data frame with columns from_id and to_id +#' @return A data frame with unique pairs of IDs +#' @keywords internal +deduplicatePairs <- function(df) { + # Create a new 
column with sorted pairs + df$pair <- apply(df[, c("from_id", "to_id")], 1, function(x) paste(sort(x), collapse = "_")) + + # Remove duplicates based on the pair column + df_dedup <- df[!duplicated(df$pair), ] + + # Drop the pair column + df_dedup$pair <- NULL + + return(df_dedup) +} diff --git a/R/buildPedigree.R b/R/segmentPedigree.R similarity index 92% rename from R/buildPedigree.R rename to R/segmentPedigree.R index 6a89f94e..98dfbaa3 100644 --- a/R/buildPedigree.R +++ b/R/segmentPedigree.R @@ -1,6 +1,7 @@ #' Segment Pedigree into Extended Families #' -#' This function adds an extended family ID variable to a pedigree by segmenting that dataset into independent extended families +#' This function adds an extended family ID variable to a pedigree by segmenting +#' that dataset into independent extended families #' using the weakly connected components algorithm. #' #' @@ -86,7 +87,9 @@ ped2graph <- function(ped, adjacent = c("parents", "mothers", "fathers"), ...) { # Check ped/data.fram - if (!inherits(ped, "data.frame")) stop("ped should be a data.frame or inherit to a data.frame") + if (!inherits(ped, "data.frame")) { + stop("ped should be a data.frame or inherit to a data.frame") + } # Handle adjacent argument adjacent <- match.arg(tolower(adjacent)[1], choices = c( @@ -165,7 +168,7 @@ ped2graph <- function(ped, #' Add a maternal line ID variable to a pedigree #' @inheritParams ped2fam -#' @param matID Character. Maternal line ID variable to be created and added to the pedigree +#' @param matID Character. Maternal line ID variable to be created and added to the pedigree #' @details #' Under various scenarios it is useful to know which people in a pedigree #' belong to the same maternal lines. This function first turns a pedigree @@ -177,9 +180,13 @@ ped2graph <- function(ped, #' @export #' ped2maternal <- function(ped, personID = "ID", - momID = "momID", dadID = "dadID", matID = "matID", ...) { + momID = "momID", dadID = "dadID", + matID = "matID", ...) 
{ # Call to wrapper function - .ped2id(ped = ped, personID = personID, momID = momID, dadID = dadID, famID = matID, type = "mothers") + .ped2id( + ped = ped, personID = personID, momID = momID, + dadID = dadID, famID = matID, type = "mothers" + ) } #' Add a paternal line ID variable to a pedigree @@ -199,5 +206,8 @@ ped2paternal <- function(ped, personID = "ID", momID = "momID", dadID = "dadID", patID = "patID", ...) { # Call to wrapper function - .ped2id(ped = ped, personID = personID, momID = momID, dadID = dadID, famID = patID, type = "fathers") + .ped2id( + ped = ped, personID = personID, momID = momID, + dadID = dadID, famID = patID, type = "fathers" + ) } diff --git a/R/simulatePedigree.R b/R/simulatePedigree.R index d50b7732..65f35ff8 100644 --- a/R/simulatePedigree.R +++ b/R/simulatePedigree.R @@ -24,7 +24,6 @@ buildWithinGenerations <- function(sizeGens, marR, sexR, Ngen) { df_Ngen$sex <- determineSex(idGen = idGen, sexR = sexR) - # print(paste("tiger",i)) # The first generation if (i == 1) { @@ -181,11 +180,9 @@ buildBetweenGenerations <- function(df_Fam, Ngen, sizeGens, verbose, marR, sexR, # count the number of couples in the i th gen countCouple <- (nrow(df_Ngen) - sum(is.na(df_Ngen$spID))) * .5 - # Now, assign couple IDs for the current generation df_Ngen <- assignCoupleIds(df_Ngen) - # get the number of linked female and male children after excluding the single children # get a vector of single person id in the ith generation IdSingle <- df_Ngen$id[is.na(df_Ngen$spID)] @@ -194,9 +191,13 @@ buildBetweenGenerations <- function(df_Fam, Ngen, sizeGens, verbose, marR, sexR, SingleM <- sum(df_Ngen$sex == "M" & is.na(df_Ngen$spID)) CoupleM <- N_LinkedMale - SingleM - df_Fam[df_Fam$gen == i, ] <- markPotentialChildren(df_Ngen = df_Ngen, i = i, Ngen = Ngen, sizeGens = sizeGens, CoupleF = CoupleF) - - + df_Fam[df_Fam$gen == i, ] <- markPotentialChildren( + df_Ngen = df_Ngen, + i = i, + Ngen = Ngen, + sizeGens = sizeGens, + CoupleF = CoupleF + ) if (verbose) { 
print( "Step 2.2: mark a group of potential parents in the i-1 th generation" @@ -251,7 +252,6 @@ buildBetweenGenerations <- function(df_Fam, Ngen, sizeGens, verbose, marR, sexR, # generate link kids to the couples random_numbers <- adjustKidsPerCouple(nMates = sum(df_Ngen$ifparent) / 2, kpc = kpc, rd_kpc = rd_kpc) - # cat("final random numbers",random_numbers, "\n") # cat("mean",sum(random_numbers)/length(random_numbers), "\n") # create two vectors for maId and paId; replicate the ids to match the same length as IdOfp @@ -343,6 +343,7 @@ buildBetweenGenerations <- function(df_Fam, Ngen, sizeGens, verbose, marR, sexR, #' @param balancedSex Not fully developed yet. Always \code{TRUE} in the current version. #' @param balancedMar Not fully developed yet. Always \code{TRUE} in the current version. #' @param verbose logical If TRUE, print progress through stages of algorithm +#' @param ... Additional arguments to be passed to other functions. #' @return A \code{data.frame} with each row representing a simulated individual. The columns are as follows: #' \itemize{ @@ -390,8 +391,6 @@ simulatePedigree <- function(kpc = 3, sizeGens = sizeGens, verbose = verbose, marR = marR, sexR = sexR, kpc = kpc, rd_kpc = rd_kpc ) - - df_Fam <- df_Fam[, 1:7] df_Fam <- df_Fam[!(is.na(df_Fam$pat) & is.na(df_Fam$mat) & is.na(df_Fam$spID)), ] colnames(df_Fam)[c(2, 4, 5)] <- c("ID", "dadID", "momID") @@ -408,3 +407,10 @@ simulatePedigree <- function(kpc = 3, # print(df_Fam) return(df_Fam) } + +#' @rdname simulatePedigree +#' @export +SimPed <- function(...) { # nolint: object_name_linter. + warning("The 'SimPed' function is deprecated. Please use 'simulatePedigree' instead.") + simulatePedigree(...) 
+} diff --git a/R/summarizePedigree.R b/R/summarizePedigree.R index 1fbd06ac..6cee4e5e 100644 --- a/R/summarizePedigree.R +++ b/R/summarizePedigree.R @@ -39,7 +39,7 @@ summarizePedigrees <- function(ped, famID = "famID", personID = "ID", matID = "matID", patID = "patID", type = c("fathers", "mothers", "families"), byr = NULL, include_founder = FALSE, founder_sort_var = NULL, - nbiggest = 5, noldest = 5, skip_var = NULL, + nbiggest = 5, noldest = nbiggest, skip_var = NULL, five_num_summary = FALSE, network_checks = FALSE, verbose = FALSE) { # Fast Fails @@ -104,14 +104,13 @@ summarizePedigrees <- function(ped, famID = "famID", personID = "ID", if (network_checks) { if (verbose) message("Performing network validation checks...") - network_validation_results <- checkPedigreeNetwork( + output$network_validation <- checkPedigreeNetwork( ped, personID = personID, momID = momID, dadID = dadID, verbose = verbose ) - output$network_validation <- network_validation_results } # Calculate summary statistics for families, maternal lines, and paternal lines @@ -124,15 +123,11 @@ summarizePedigrees <- function(ped, famID = "famID", personID = "ID", ) # Find the originating member for each line if (include_founder) { - if (verbose) message("Finding originating members for families...") - originating_member_family <- findFounder(ped_dt, + family_summary_dt <- summarizeFounder( + verbose = verbose, ped_dt = ped_dt, group_var = famID, - sort_var = founder_sort_var - ) - # Merge summary statistics with originating members for additional information - family_summary_dt <- merge(family_summary_dt, - originating_member_family, - by = famID, suffixes = c("", "_founder") + sort_var = founder_sort_var, + foo_summary_dt = family_summary_dt ) } output$family_summary <- family_summary_dt @@ -147,14 +142,11 @@ summarizePedigrees <- function(ped, famID = "famID", personID = "ID", five_num_summary = five_num_summary ) if (include_founder) { - if (verbose) message("Finding originating members for 
matrilineal lines...") - originating_member_maternal <- findFounder(ped_dt, + maternal_summary_dt <- summarizeFounder( + verbose = verbose, ped_dt = ped_dt, group_var = matID, - sort_var = founder_sort_var - ) - maternal_summary_dt <- merge(maternal_summary_dt, - originating_member_maternal, - by = matID, suffixes = c("", "_founder") + sort_var = founder_sort_var, + foo_summary_dt = maternal_summary_dt ) } output$maternal_summary <- maternal_summary_dt @@ -168,14 +160,11 @@ summarizePedigrees <- function(ped, famID = "famID", personID = "ID", five_num_summary = five_num_summary ) if (include_founder) { - if (verbose) message("Finding originating members for patrilineal lines...") - originating_member_paternal <- findFounder(ped_dt, + paternal_summary_dt <- summarizeFounder( + verbose = verbose, ped_dt = ped_dt, group_var = patID, - sort_var = founder_sort_var - ) - paternal_summary_dt <- merge(paternal_summary_dt, - originating_member_paternal, - by = patID, suffixes = c("", "_founder") + sort_var = founder_sort_var, + foo_summary_dt = paternal_summary_dt ) } @@ -191,58 +180,66 @@ summarizePedigrees <- function(ped, famID = "famID", personID = "ID", # repair = FALSE, verbose = verbose) # } - - # Optionally find the superlative lines - # & noldest <= unique(ped_dt[[famID]]) - # determin number of lines - - ## oldest if (!is.null(byr) && noldest > 0) { if (!is.null(n_families) && "families" %in% type) { if (verbose) message("Finding oldest families...") - output$oldest_families <- try_na(family_summary_dt[order(get(byr))][1:min(c(noldest, n_families), - na.rm = TRUE - )]) + output$oldest_families <- findOldest( + foo_summary_dt = family_summary_dt, + byr = byr, + noldest = noldest, + n_foo = n_families + ) } if (!is.null(n_mothers) && "mothers" %in% type) { if (verbose) message("Finding oldest maternal lines...") - output$oldest_maternal <- try_na(maternal_summary_dt[order(get(byr))][1:min(c(noldest, n_mothers), - na.rm = TRUE - )]) + output$oldest_maternal <- 
findOldest( + foo_summary_dt = maternal_summary_dt, + byr = byr, + noldest = noldest, + n_foo = n_mothers + ) } if (!is.null(n_fathers) && "fathers" %in% type) { if (verbose) message("Finding oldest paternal lines...") - output$oldest_paternal <- try_na(paternal_summary_dt[order(get(byr))][1:min(c(noldest, n_fathers), - na.rm = TRUE - )]) + output$oldest_paternal <- findOldest( + foo_summary_dt = paternal_summary_dt, + byr = byr, + noldest = noldest, + n_foo = n_fathers + ) } } # biggest lines if (!is.null(nbiggest) && nbiggest > 0) { if (!is.null(n_families) && "families" %in% type) { - output$biggest_families <- try_na(family_summary_dt[order(-get("count"))][1:min(c(nbiggest, n_families), - na.rm = TRUE - )]) + output$biggest_families <- findBiggest( + foo_summary_dt = family_summary_dt, + nbiggest = nbiggest, + n_foo = n_families + ) } if (!is.null(n_mothers) && "mothers" %in% type) { - output$biggest_maternal <- try_na(maternal_summary_dt[order(-get("count"))][1:min(c(nbiggest, n_mothers), - na.rm = TRUE - )]) + output$biggest_maternal <- findBiggest( + foo_summary_dt = maternal_summary_dt, + nbiggest = nbiggest, + n_foo = n_mothers + ) } if (!is.null(n_fathers) && "fathers" %in% type) { - output$biggest_paternal <- try_na(paternal_summary_dt[order(-get("count"))][1:min(c(nbiggest, n_fathers), - na.rm = TRUE - )]) + output$biggest_paternal <- findBiggest( + foo_summary_dt = paternal_summary_dt, + nbiggest = nbiggest, + n_foo = n_fathers + ) } } - return(output) } -# Function to calculate summary statistics for all numeric variables +#' Function to calculate summary statistics for all numeric variables #' This function calculates summary statistics for all numeric variables in a data.table. It is supposed to be used internally by the \code{summarize_pedigree} function. #' @inheritParams summarizePedigrees #' @param data A data.table containing the pedigree data. 
@@ -290,8 +287,8 @@ calculateSummaryDT <- function(data, group_var, skip_var, return(summary_stats) } -# Function to find the originating member for each line - +#' Function to find the originating member for each line +#' #' This function finds the originating member for each line in a pedigree. It is supposed to be used internally by the \code{summarize_pedigree} function. #' @inheritParams summarizePedigrees #' @param sort_var A character string specifying the column name to sort by. @@ -304,7 +301,29 @@ findFounder <- function(data, group_var, sort_var) { data[order(get(sort_var)), .SD[1], by = group_var] } +#' Function to summarize the originating members for each line +#' +#' This function summarizes the originating members for each line in a pedigree. It is supposed to be used internally by the \code{summarize_pedigree} function. +#' +#' @inheritParams summarizePedigrees +#' @inheritParams findFounder +#' +#' @keywords internal +summarizeFounder <- function(ped_dt, group_var, sort_var, foo_summary_dt, verbose) { + if (verbose) message(paste0("Finding originating members for ", group_var)) + originating_member_foo <- findFounder( + data = ped_dt, + group_var = group_var, + sort_var = sort_var + ) + # Merge summary statistics with originating members for additional information + foo_summary_dt <- merge(foo_summary_dt, + originating_member_foo, + by = group_var, suffixes = c("", "_founder") + ) + return(foo_summary_dt) +} #' Summarize the maternal lines in a pedigree #' @inheritParams summarizePedigrees #' @seealso [summarizePedigrees ()] @@ -332,6 +351,9 @@ summarizeMatrilines <- function(ped, famID = "famID", personID = "ID", ) } + + + #' Summarize the paternal lines in a pedigree #' @inheritParams summarizePedigrees #' @seealso [summarizePedigrees ()] @@ -385,3 +407,46 @@ summarizeFamilies <- function(ped, famID = "famID", personID = "ID", founder_sort_var = founder_sort_var ) } +#' Function to find the oldest individuals in a pedigree +#' This function 
finds the oldest families in a pedigree. It is supposed to be used internally by the \code{summarize_pedigree} function. +#' @inheritParams summarizePedigrees +#' @param foo_summary_dt A data.table containing the summary statistics. +#' @param n_foo An integer specifying the number of individuals in the summary. +#' @returns a data.table containing the oldest families in the pedigree. + +findOldest <- function(foo_summary_dt, byr, noldest, n_foo) { + oldest_foo <- try_na(foo_summary_dt[order(get(byr))][1:min(c(noldest, n_foo), + na.rm = TRUE + )]) + return(oldest_foo) +} + +#' Function to find the biggest families in a pedigree +#' This function finds the biggest families in a pedigree. It is supposed to be used internally by the \code{summarize_pedigree} function. +#' @inheritParams findOldest +#' @inheritParams summarizePedigrees +#' @returns a data.table containing the biggest families in the pedigree. + + +findBiggest <- function(foo_summary_dt, nbiggest, n_foo) { + biggest_foo <- try_na(foo_summary_dt[order(-get("count"))][1:min(c(nbiggest, n_foo), + na.rm = TRUE + )]) + return(biggest_foo) +} + +#' @rdname summarizePedigrees +#' @export +summarisePedigrees <- summarizePedigrees + +#' @rdname summarizeFamilies +#' @export +summariseFamilies <- summarizeFamilies + +#' @rdname summarizeMatrilines +#' @export +summariseMatrilines <- summarizeMatrilines + +#' @rdname summarizePatrilines +#' @export +summarisePatrilines <- summarizePatrilines diff --git a/R/tweakPedigree.R b/R/tweakPedigree.R index 2367fe21..52b67a30 100644 --- a/R/tweakPedigree.R +++ b/R/tweakPedigree.R @@ -15,10 +15,10 @@ makeTwins <- function(ped, ID_twin1 = NA_integer_, ID_twin2 = NA_integer_, gen_twin = 2, verbose = FALSE) { # Check if the ped is the same format as the output of simulatePedigree if (paste0(colnames(ped), collapse = "") != paste0(c( - "fam", "ID", "gen", + "famID", "ID", "gen", "dadID", "momID", "spID", "sex" ), collapse = "")) { - ped <- 
standardizeColnames(ped, verbose = verbose) if (verbose) { cat("The input pedigree is not in the same format as the output of simulatePedigree\n") } @@ -128,10 +128,10 @@ makeInbreeding <- function(ped, if (paste0(colnames(ped), collapse = "" ) != paste0( - c("fam", "ID", "gen", "dadID", "momID", "spID", "sex"), + c("famID", "ID", "gen", "dadID", "momID", "spID", "sex"), collapse = "" )) { - ped <- standardizeColnames(ped) + ped <- standardizeColnames(ped, verbose = verbose) if (verbose) { cat("The input pedigree is not in the same format as the output of simulatePedigree\n") } @@ -255,7 +255,7 @@ dropLink <- function(ped, if (!is.na(ID_drop)) { ped[ped$ID %in% ID_drop, c("dadID", "momID")] <- NA_integer_ } else { - warning("No individual is dropped from his/her parents.") + warning("No individual is dropped from their parents.") } } else { ped[ped$ID == ID_drop, c("dadID", "momID")] <- NA_integer_ diff --git a/README.Rmd b/README.Rmd index ee83aced..9780c791 100644 --- a/README.Rmd +++ b/README.Rmd @@ -17,6 +17,7 @@ options(citation.bibtex.max = 0) # BGmisc +discord website [![status](https://joss.theoj.org/papers/ee3a025be4f61584f977a7657d936187/status.svg)](https://joss.theoj.org/papers/10.21105/joss.06203)
[![Project Status: Active – The project has reached a stable, usable state and is being actively developed.](https://www.repostatus.org/badges/latest/active.svg)](https://www.repostatus.org/#active) [![R package version](https://www.r-pkg.org/badges/version/BGmisc)](https://cran.r-project.org/package=BGmisc) diff --git a/README.md b/README.md index 6b8357fe..eede5556 100644 --- a/README.md +++ b/README.md @@ -5,16 +5,20 @@ +discord website [![status](https://joss.theoj.org/papers/ee3a025be4f61584f977a7657d936187/status.svg)](https://joss.theoj.org/papers/10.21105/joss.06203)
[![Project Status: Active – The project has reached a stable, usable state and is being actively developed.](https://www.repostatus.org/badges/latest/active.svg)](https://www.repostatus.org/#active) -[![R package version](https://www.r-pkg.org/badges/version/BGmisc)](https://cran.r-project.org/package=BGmisc) +[![R package +version](https://www.r-pkg.org/badges/version/BGmisc)](https://cran.r-project.org/package=BGmisc) [![Package downloads](https://cranlogs.r-pkg.org/badges/grand-total/BGmisc)](https://cran.r-project.org/package=BGmisc)
[![R-CMD-check](https://github.com/R-Computing-Lab/BGmisc/actions/workflows/R-CMD-check.yaml/badge.svg)](https://github.com/R-Computing-Lab/BGmisc/actions/workflows/R-CMD-check.yaml) -[![Dev Main branch](https://github.com/R-Computing-Lab/BGmisc/actions/workflows/R-CMD-dev_maincheck.yaml/badge.svg)](https://github.com/R-Computing-Lab/BGmisc/actions/workflows/R-CMD-dev_maincheck.yaml) -[![Codecov test coverage](https://codecov.io/gh/R-Computing-Lab/BGmisc/graph/badge.svg?token=2IARK2XSA6)](https://app.codecov.io/gh/R-Computing-Lab/BGmisc) +[![Dev Main +branch](https://github.com/R-Computing-Lab/BGmisc/actions/workflows/R-CMD-dev_maincheck.yaml/badge.svg)](https://github.com/R-Computing-Lab/BGmisc/actions/workflows/R-CMD-dev_maincheck.yaml) +[![Codecov test +coverage](https://codecov.io/gh/R-Computing-Lab/BGmisc/graph/badge.svg?token=2IARK2XSA6)](https://app.codecov.io/gh/R-Computing-Lab/BGmisc) ![License](https://img.shields.io/badge/License-GPL_v3-blue.svg) diff --git a/data-raw/HouseofHabsburg.ged b/data-raw/HouseofHabsburg.ged index 98b7100c..fd358bf8 100644 --- a/data-raw/HouseofHabsburg.ged +++ b/data-raw/HouseofHabsburg.ged @@ -1,4 +1,4 @@ -0 HEAD +0 HEAD 1 SOUR PAF 2 NAME Personal Ancestral File 2 VERS 5.2.18.0 @@ -260,7 +260,7 @@ 0 @I17@ INDI 1 NAME /Frederick I/ 2 SURN Frederick I -2 _AKA Frederick the Handsome or the Fair +2 _AKA Frederick the Handsome or the Fair 1 SEX M 1 BIRT 2 DATE 1289 diff --git a/data-raw/Targaryen tree Dance.txt b/data-raw/Targaryen tree Dance.txt new file mode 100644 index 00000000..082c263c --- /dev/null +++ b/data-raw/Targaryen tree Dance.txt @@ -0,0 +1,17 @@ +{{familytree/start|style=font: 90% sans-serif;|state={{{state|}}}|title={{{title|}}}}} +{{familytree | | | | | | | | | | | | | JAE |v| ALY | |JAE=[[Jaehaerys I Targaryen|Jaehaerys I]]|ALY=[[Alysanne Targaryen|Alysanne]]|}} +{{familytree | | | | | | | |,|-|-|-|v|-|-|-|^|-|-|-|v|-|-|-|.| | | | | | | | | | | | | | | | | | | | | }} +{{familytree | | |JOC|v|AEM| |DAE|v|ROD| 
|BAE|v|ALY| |JOC=[[Jocelyn Baratheon|Jocelyn
Baratheon]]|AEM=[[Aemon Targaryen (son of Jaehaerys I)|Aemon]]|DAE=[[Daella Targaryen (daughter of Jaehaerys I)|Daella]]|ROD=[[Rodrik Arryn|Rodrik
Arryn]]|BAE=[[Baelon Targaryen (son of Jaehaerys I)|Baelon]]|ALY=[[Alyssa Targaryen|Alyssa]]}} +{{familytree | | | | | |!| | | | | | | |!| | | |,|-|-|-|^|-|-|-|-|-|-|-|-|-|-|-|-|-|v|-|-|-|-|-|-|-|.| }} +{{familytree |COV|v|RHA| | | | | |AEM|v|VIS|-|-|-|-|-|-|-|-|-|-|v|ALI| |DAE|-|RHR| |AEG| |COV=[[Corlys Velaryon|Corlys
Velaryon]]|RHA=[[Rhaenys Targaryen (daughter of Aemon)|Rhaenys]]|AEM=[[Aemma Arryn|Aemma
Arryn]]|VIS=[[Viserys I Targaryen|Viserys I]]|ALI=[[Alicent Hightower|Alicent
Hightower]]|DAE=[[Daemon Targaryen|Daemon]]
Prince Daemon Targaryen was married to Lady Rhea Royce from {{Date|97}} to {{Date|115}}, to Lady Laena Velaryon from {{Date|115}} to {{Date|120}}, and to Princess Rhaenyra Targaryen from {{Date|120}} to {{Date|130}}.|RHR=[[Rhea Royce|Rhea
Royce]]|AEG=[[Aegon Targaryen (son of Baelon)|Aegon]]}} +{{familytree | |,|-|^|-|-|-|-|-|.| | | |,|-|^|-|-|-|-|-|v|-|-|-|.| | | |)|-|-|-|v|-|-|-|v|-|-|-|.| | | }} +{{familytree |LAE|v|DAE| |LEO|v|RHA|v|DAE| |SON| |BAE| |AEG|v|HEL| |AEM| |DAR| |LAE=[[Laena Velaryon|Laena
Velaryon]]|DAE=[[Daemon Targaryen|Daemon]]
|LEO=[[Laenor Velaryon|Laenor
Velaryon]]|RHA=[[Rhaenyra Targaryen|Rhaenyra]]|SON=Son|BAE=[[Baelon Targaryen (son of Viserys I)|Baelon]]|AEG=[[Aegon II Targaryen|Aegon II]]|HEL=[[Helaena Targaryen|Helaena]]|AEM=[[Aemond Targaryen|Aemond]]|DAR=[[Daeron Targaryen (son of Viserys I)|Daeron]]}} +{{familytree | | | |!| | | | | | | |!| | | |`|-|-|-|-|-|-|-|-|-|-|-|.| | | |`|-|-|-|-|-|-|-|.| | | | | | | | | }} +{{familytree | |,|-|^|-|v|-|-|-|.| |`|-|v|-|-|-|v|-|-|-|.| | | |,|-|^|-|v|-|-|-|.| | | |,|-|^|-|v|-|-|-|.| | | }} +{{familytree |BAE| |RHA| |STI| |JAC| |LUC| |JOF| |VIS| |VSN| |AEG|-|JAE| |JAY| |MAE| |BAE=[[Baela Targaryen|Baela]]|RHA=[[Rhaena Targaryen (daughter of Daemon)|Rhaena]]|STI=Stillborn
son|JAC=[[Jacaerys Velaryon|Jacaerys
Velaryon]]|LUC=[[Lucerys Velaryon|Lucerys
Velaryon]]|JOF=[[Joffrey Velaryon|Joffrey
Velaryon]]|VIS=[[Viserys II Targaryen|Viserys II]]|VSN=[[Visenya Targaryen (daughter of Daemon)|Visenya]]|AEG=[[Aegon III Targaryen|Aegon III]]|JAE=[[Jaehaera Targaryen|Jaehaera]]|JAY=[[Jaehaerys Targaryen (son of Aegon II)|Jaehaerys]]|MAE=[[Maelor Targaryen|Maelor]]}} +{{familytree | | | | | | | | |}} +{| style="border-spacing: 2px; - +|style="text-align: left; vertical-align: top"|'''Notes:''' +|{{References|group="Note"}} +|} +{{familytree/end|state={{{state|}}}}} diff --git a/data-raw/benchged.R b/data-raw/benchged.R new file mode 100644 index 00000000..18a9fc47 --- /dev/null +++ b/data-raw/benchged.R @@ -0,0 +1,31 @@ +library(microbenchmark) +library(Matrix) +# library(BGmisc) +# data("hazard") +library(tidyverse) + + + +# Run benchmarking for "loop" and "indexed" methods in ped2com() +benchmark_results <- microbenchmark( + reg = { + readGedcom("data-raw/royal92.ged") + }, + alpha = { + readGedcom.alpha("data-raw/royal92.ged") + }, + times = 5 # Run each method 100 times +) + +summary(benchmark_results) + +lm(benchmark_results$time ~ benchmark_results$expr) %>% + summary() +# Print benchmark results +print(benchmark_results) + +# Optional: Save results to CSV for later analysis +write.csv(summary(benchmark_results), + "benchmark_results.csv", + row.names = FALSE +) diff --git a/data-raw/benchmark.R b/data-raw/benchmark.R index 80d6db40..2add0e4f 100644 --- a/data-raw/benchmark.R +++ b/data-raw/benchmark.R @@ -2,7 +2,7 @@ library(microbenchmark) library(Matrix) # library(BGmisc) # data("hazard") - +library(tidyverse) # make big data set.seed(15) @@ -10,53 +10,307 @@ Ngen <- 5 kpc <- 5 sexR <- .50 marR <- .7 -ped <- simulatePedigree(kpc = kpc, Ngen = Ngen, sexR = sexR, marR = marR) - -# Define parameters -component <- "additive" # Change this to test different components -saveable <- FALSE # Disable saving to avoid disk I/O slowing down benchmarking -resume <- FALSE # Disable resume to ensure full fresh runs -save_path <- "checkpoint/" -verbose <- FALSE # Turn 
off verbose for cleaner output -update_rate <- 100 -save_rate_parlist <- 1000 - -# Run benchmarking for "loop" and "indexed" methods in ped2com() -benchmark_results <- microbenchmark( - loop = { - ped2com( - ped = ped, - component = component, - adjacency_method = "loop", # Test "loop" method - saveable = saveable, - resume = resume, - save_path = save_path, - verbose = verbose, - update_rate = update_rate, - save_rate_parlist = save_rate_parlist - ) - }, - indexed = { - ped2com( - ped = ped, - component = component, - adjacency_method = "indexed", # Test "indexed" method - saveable = saveable, - resume = resume, - save_path = save_path, - verbose = verbose, - update_rate = update_rate, - save_rate_parlist = save_rate_parlist +ped <- simulatePedigree(kpc = kpc, Ngen = Ngen, sexR = sexR, marR = marR) %>% + mutate( + fam = "fam 1" + ) +set.seed(151) +Ngen <- 5 +marR <- .8 +ped2 <- simulatePedigree(kpc = kpc, Ngen = Ngen, sexR = sexR, marR = marR) %>% + mutate( + fam = "fam 2", + ID = ID + max(ped$ID, na.rm = TRUE), + momID = momID + max(ped$ID, na.rm = TRUE), + dadID = dadID + max(ped$ID, na.rm = TRUE), + spID = spID + max(ped$ID, na.rm = TRUE) + ) +set.seed(1151) +kpc <- 8 +Ngen <- 6 +ped3 <- simulatePedigree(kpc = kpc, Ngen = Ngen, sexR = sexR, marR = marR) %>% + mutate( + fam = "fam 3", + ID = ID + max(ped2$ID, na.rm = TRUE), + momID = momID + max(ped2$ID, na.rm = TRUE), + dadID = dadID + max(ped2$ID, na.rm = TRUE), + spID = spID + max(ped2$ID, na.rm = TRUE) + ) +ped3 <- ped3 %>% + mutate( + fam = "fam 4", + ID = ID + max(ped3$ID, na.rm = TRUE), + momID = momID + max(ped3$ID, na.rm = TRUE), + dadID = dadID + max(ped3$ID, na.rm = TRUE), + spID = spID + max(ped3$ID, na.rm = TRUE) + ) %>% + rbind(ped3) + +set.seed(1151) +kpc <- 2 +Ngen <- 10 +ped4 <- simulatePedigree(kpc = kpc, Ngen = Ngen, sexR = sexR, marR = marR) %>% + mutate( + fam = "fam 5", + ID = ID + max(ped3$ID, na.rm = TRUE), + momID = momID + max(ped3$ID, 
na.rm = TRUE), + spID = spID + max(ped3$ID, na.rm = TRUE) + ) + + +ped <- rbind(ped, ped2) +ped <- rbind(ped, ped3) +ped <- rbind(ped, ped4) + +if (TRUE) { + # Define parameters + component <- "common nuclear" # "additive" # Change this to test different components + saveable <- FALSE # Disable saving to avoid disk I/O slowing down benchmarking + resume <- FALSE # Disable resume to ensure full fresh runs + save_path <- "checkpoint/" + verbose <- FALSE # Turn off verbose for cleaner output + update_rate <- 100 + save_rate_parlist <- 1000 + # method_approach <- 1 + # Run benchmarking for "loop" and "indexed" methods in ped2com() + benchmark_results <- microbenchmark( + # loop_big = { + # ped2com( + # ped = ped, + # component = component, + # adjacency_method = "loop", # Test "loop" method + # saveable = saveable, + # resume = resume, + # save_path = save_path, + # verbose = verbose, + # update_rate = update_rate, + # save_rate_parlist = save_rate_parlist + # ) + # }, + indexed_big = { + ped2com( + ped = ped, + component = component, + adjacency_method = "indexed", # Test "indexed" method + saveable = saveable, + resume = resume, + save_path = save_path, + verbose = verbose, + update_rate = update_rate, + save_rate_parlist = save_rate_parlist + ) + }, + direct4_big = { + ped2com( + ped = ped, + component = component, + adjacency_method = "direct", # Test "indexed" method + saveable = saveable, + resume = resume, + save_path = save_path, + verbose = verbose, + update_rate = update_rate, + method_approach = 4, + save_rate_parlist = save_rate_parlist + ) + }, + direct2_big = { + ped2com( + ped = ped, + component = component, + adjacency_method = "direct", # Test "indexed" method + saveable = saveable, + resume = resume, + save_path = save_path, + verbose = verbose, + update_rate = update_rate, + method_approach = 2, + save_rate_parlist = save_rate_parlist + ) + }, + direct5_big = { + ped2com( + ped = ped, + component = component, + adjacency_method = "direct", # Test 
"indexed" method + saveable = saveable, + resume = resume, + save_path = save_path, + verbose = verbose, + update_rate = update_rate, + method_approach = 5, + save_rate_parlist = save_rate_parlist + ) + }, + # loop = { + # ped2com( + # ped = ped2, + # component = component, + # adjacency_method = "loop", # Test "loop" method + # saveable = saveable, + # resume = resume, + # save_path = save_path, + # verbose = verbose, + # update_rate = update_rate, + # save_rate_parlist = save_rate_parlist + # + # ) + # }, + indexed = { + ped2com( + ped = ped2, + component = component, + adjacency_method = "indexed", # Test "indexed" method + saveable = saveable, + resume = resume, + save_path = save_path, + verbose = verbose, + update_rate = update_rate, + save_rate_parlist = save_rate_parlist + ) + }, + direct4 = { + ped2com( + ped = ped2, + component = component, + adjacency_method = "direct", # Test "indexed" method + saveable = saveable, + resume = resume, + save_path = save_path, + verbose = verbose, + update_rate = update_rate, + method_approach = 4, + save_rate_parlist = save_rate_parlist + ) + }, + direct2 = { + ped2com( + ped = ped2, + component = component, + adjacency_method = "direct", # Test "indexed" method + saveable = saveable, + resume = resume, + save_path = save_path, + verbose = verbose, + update_rate = update_rate, + method_approach = 2, + save_rate_parlist = save_rate_parlist + ) + }, + direct5 = { + ped2com( + ped = ped2, + component = component, + adjacency_method = "direct", # Test "indexed" method + saveable = saveable, + resume = resume, + save_path = save_path, + verbose = verbose, + update_rate = update_rate, + method_approach = 5, + save_rate_parlist = save_rate_parlist + ) + }, + times = 10 # Run each method 100 times + ) + + + summary(benchmark_results) + + df_plot <- benchmark_results %>% mutate( + size = case_when( + expr %in% c("loop", "indexed", "direct4", "direct2", "direct5") ~ "small", + expr %in% c("loop_big", "indexed_big", "direct4_big", 
"direct2_big", "direct5_big") ~ "big" + ), + method = case_when( + expr %in% c("loop", "loop_big") ~ "loop", + expr %in% c("indexed", "indexed_big") ~ "indexed", + expr %in% c("direct4", "direct4_big") ~ "direct4", + expr %in% c("direct2", "direct2_big") ~ "direct2", + expr %in% c("direct5", "direct5_big") ~ "direct5" ) - }, - times = 100 # Run each method 100 times -) + ) # %>% + + # set indexed as reference level + df_plot$method <- factor(df_plot$method, levels = c("indexed", "loop", "direct2", "direct4", "direct5")) + df_plot$size <- factor(df_plot$size, levels = c("small", "big")) + + + lm(time ~ method * size, data = df_plot) %>% + summary() %>% + print() + + + p <- ggplot(df_plot, aes(x = method, y = time)) + + geom_boxplot(aes(fill = size), alpha = 0.5) + + labs( + title = "Benchmarking Results", + x = "Method", + y = "Time (seconds)" + ) + + theme_minimal() + + theme(axis.text.x = element_text(angle = 45, hjust = 1)) + # Print benchmark results + + p + print(benchmark_results) + + # Optional: Save results to CSV for later analysis + write.csv(summary(benchmark_results), + "benchmark_results.csv", + row.names = FALSE + ) + # Print benchmark +} -# Print benchmark +if (FALSE) { + verbose <- FALSE + ad_ped_matrix <- ped2com(ped, component = "additive", adjacency_method = "direct", sparse = TRUE) + mit_ped_matrix <- ped2com(ped, component = "mitochondrial", adjacency_method = "direct", sparse = TRUE) + cn_ped_matrix <- ped2com(ped, component = "common nuclear", adjacency_method = "indexed", sparse = TRUE) + benchmark_results <- microbenchmark( + beta = { + com2links.beta( + ad_ped_matrix = ad_ped_matrix, + mit_ped_matrix = mit_ped_matrix, + cn_ped_matrix = cn_ped_matrix, + writetodisk = TRUE, + verbose = verbose + ) + file.remove("dataRelatedPairs.csv") + }, regular = { + com2links( + ad_ped_matrix = ad_ped_matrix, + mit_ped_matrix = mit_ped_matrix, + cn_ped_matrix = cn_ped_matrix, + writetodisk = TRUE, + verbose = verbose + ) + 
file.remove("dataRelatedPairs.csv") + }, legacy = { + com2links( + ad_ped_matrix = ad_ped_matrix, + mit_ped_matrix = mit_ped_matrix, + cn_ped_matrix = cn_ped_matrix, + verbose = verbose, + legacy = TRUE + ) + file.remove("dataRelatedPairs.csv") + }, + times = 100 # Run each method 100 times + ) + summary(benchmark_results) -# Print benchmark results -print(benchmark_results) + lm(benchmark_results$time ~ benchmark_results$expr) %>% + summary() + # Print benchmark results + print(benchmark_results) -# Optional: Save results to CSV for later analysis -write.csv(summary(benchmark_results), "benchmark_results.csv", row.names = FALSE) + # Optional: Save results to CSV for later analysis + write.csv(summary(benchmark_results), + "benchmark_results.csv", + row.names = FALSE + ) +} diff --git a/data-raw/df_ASOIAF.R b/data-raw/df_ASOIAF.R index 4a840d57..9741d04f 100644 --- a/data-raw/df_ASOIAF.R +++ b/data-raw/df_ASOIAF.R @@ -10,7 +10,7 @@ library(BGmisc) ## Create dataframe ASOIAF <- readGedcom("data-raw/ASOIAF.ged") - +# ASOIAF <- readGedcom("data-raw/ASOIAF_040725.ged") df <- ped2fam(ASOIAF, personID = "id") %>% select( @@ -29,7 +29,8 @@ df <- ped2fam(ASOIAF, personID = "id") %>% name = str_remove(name, "/") ) -# pedADD <- ped2com(df , personID = "id", momID = "momID", dadID = "dadID", component = "additive", isChild_method = "partial_parent") +# pedADD <- ped2com(df , personID = "id", momID = "momID", +# dadID = "dadID", component = "additive", isChild_method = "partial_parent") # com2links(ad_ped_matrix=pedADD) # if missing momID or dadID, assign the next available ID diff --git a/data-raw/df_inbreeding.R b/data-raw/df_inbreeding.R index 4419baef..430830ff 100644 --- a/data-raw/df_inbreeding.R +++ b/data-raw/df_inbreeding.R @@ -8,5 +8,5 @@ inbreeding <- raw ## # data processing -write.csv(inbreeding, "data-raw/inbreeding.csv", row.names = FALSE) +# write.csv(inbreeding, "data-raw/inbreeding.csv", row.names = FALSE) usethis::use_data(inbreeding, overwrite = TRUE, 
compress = "xz") diff --git a/data-raw/df_potter.R b/data-raw/df_potter.R index 77c147aa..d1f5faf3 100644 --- a/data-raw/df_potter.R +++ b/data-raw/df_potter.R @@ -45,7 +45,11 @@ potter <- data.frame( "Molly Weasley", "Lucy Weasley" ), - gen = c(1, 1, 1, 1, 1, 2, 2, 2, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3), + gen = c( + 1, 1, 1, 1, 1, 2, 2, 2, 1, 1, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3 + ), momID = c( 101, 101, 103, 103, NA, 3, 4, 10, NA, NA, 10, 10, 10, 10, 10, 10, NA, 105, 105, NA, @@ -137,7 +141,8 @@ potter[nrow(potter) + 1, ] <- list( 0 ) -# potter[nrow(potter) + 1,] <- list(personID,fam,name,gen,momID,dadID,spouseID,sex) +# potter[nrow(potter) + 1,] <- list(personID,fam,name,gen, +# momID,dadID,spouseID,sex) write_csv(potter, here("data-raw", "potter.csv")) usethis::use_data(potter, overwrite = TRUE, compress = "xz") @@ -156,18 +161,24 @@ df <- ped2fam(potter_big, personID = "id") %>% -birth_date, -FAMC, -FAMS - ) %>% rename(personID=id) %>% + ) %>% + rename(personID = id) %>% mutate( personID = as.numeric(personID), momID = as.numeric(momID), dadID = as.numeric(dadID), name = str_remove(name, "/") - ) %>% arrange(name) + ) %>% + arrange(name) -potter_clean <- potter %>% arrange(name) %>% mutate(personID = as.numeric(personID)*1000, - momID = as.numeric(momID)*1000, - dadID = as.numeric(dadID)*1000) +potter_clean <- potter %>% + arrange(name) %>% + mutate( + personID = as.numeric(personID) * 1000, + momID = as.numeric(momID) * 1000, + dadID = as.numeric(dadID) * 1000 + ) # merge so that I have all of A and all the Bs that match A @@ -177,166 +188,168 @@ df_clean <- df %>% potter_clean <- potter_clean %>% - mutate(personID = case_match( - personID, - 22000 ~ 15, - 9000 ~ 18, - 6000 ~ 10, - 18000 ~ 29, - 7000 ~ 1, - 17000 ~ 25, - 25000 ~ 27, - 13000 ~ 20, - 19000 ~ 34, - 23000 ~ 16, - 10000 ~ 17, - 14000 ~ 22, - 3000 ~ 8, - 24000 ~ 26, - 1000 ~ 9, - 26000 ~ 30, - 21000 ~ 14, - 11000 ~ 19, - 8000 ~ 13, - 5000 
~ 2, - 15000 ~ 23, - 16000 ~ 24, - 4000 ~ 3, - 2000 ~ -2000, - 101000 ~12, - 102000 ~ 11, - 20000 ~ -20000, - 27000 ~ -27000, - 106000 ~ 32, - 105000 ~ 33, - 29000 ~ 17, - 103000 ~ 7, - 104000 ~ 6, - 12000 ~ 21, - 30000 ~ -30000, - 28000 ~ -28000, - .default = personID - ), dadID = - case_match( - dadID, - 22000 ~ 15, - 9000 ~ 18, - 6000 ~ 10, - 18000 ~ 29, - 7000 ~ 1, - 17000 ~ 25, - 25000 ~ 27, - 13000 ~ 20, - 19000 ~ 34, - 23000 ~ 16, - 10000 ~ 17, - 14000 ~ 22, - 3000 ~ 8, - 24000 ~ 26, - 1000 ~ 9, - 26000 ~ 30, - 21000 ~ 14, - 11000 ~ 19, - 8000 ~ 13, - 5000 ~ 2, - 15000 ~ 23, - 16000 ~ 24, - 4000 ~ 3, - 2000 ~ 2000, - 101000 ~12, - 102000 ~ 11, - 20000 ~ 20000, - 27000 ~ 27000, - 106000 ~ 32, - 105000 ~ 33, - 29000 ~ 17, - 103000 ~ 7, - 104000 ~ 6, - 12000 ~ 21, - 30000 ~ 30000, - 28000 ~ 28000, - .default = dadID - ), momID = - case_match( - momID, - 22000 ~ 15, - 9000 ~ 18, - 6000 ~ 10, - 18000 ~ 29, - 7000 ~ 1, - 17000 ~ 25, - 25000 ~ 27, - 13000 ~ 20, - 19000 ~ 34, - 23000 ~ 16, - 10000 ~ 17, - 14000 ~ 22, - 3000 ~ 8, - 24000 ~ 26, - 1000 ~ 9, - 26000 ~ 30, - 21000 ~ 14, - 11000 ~ 19, - 8000 ~ 13, - 5000 ~ 2, - 15000 ~ 23, - 16000 ~ 24, - 4000 ~ 3, - 2000 ~ 2000, - 101000 ~12, - 102000 ~ 11, - 20000 ~ 20000, - 27000 ~ 27000, - 106000 ~ 32, - 105000 ~ 33, - 29000 ~ 17, - 103000 ~ 7, - 104000 ~ 6, - 12000 ~ 21, - 30000 ~ 30000, - 28000 ~ 28000, - .default = momID - ), spouseID = - case_match( - spouseID, - 22000 ~ 15, - 9000 ~ 18, - 6000 ~ 10, - 18000 ~ 29, - 7000 ~ 1, - 17000 ~ 25, - 25000 ~ 27, - 13000 ~ 20, - 19000 ~ 34, - 23000 ~ 16, - 10000 ~ 17, - 14000 ~ 22, - 3000 ~ 8, + mutate( + personID = case_match( + personID, + 22000 ~ 15, + 9000 ~ 18, + 6000 ~ 10, + 18000 ~ 29, + 7000 ~ 1, + 17000 ~ 25, + 25000 ~ 27, + 13000 ~ 20, + 19000 ~ 34, + 23000 ~ 16, + 10000 ~ 17, + 14000 ~ 22, + 3000 ~ 8, 24000 ~ 26, - 1000 ~ 9, - 26000 ~ 30, + 1000 ~ 9, + 26000 ~ 30, 21000 ~ 14, - 11000 ~ 19, - 8000 ~ 13, + 11000 ~ 19, + 8000 ~ 13, 5000 ~ 2, - 15000 ~ 23, - 16000 ~ 
24, + 15000 ~ 23, + 16000 ~ 24, 4000 ~ 3, - 2000 ~ 2000, - 101000 ~12, - 102000 ~ 11, - 20000 ~ 20000, - 27000 ~ 27000, + 2000 ~ -2000, + 101000 ~ 12, + 102000 ~ 11, + 20000 ~ -20000, + 27000 ~ -27000, 106000 ~ 32, - 105000 ~ 33, - 29000 ~ 17, - 103000 ~ 7, - 104000 ~ 6, - 12000 ~ 21, - 30000 ~ 30000, - 28000 ~ 28000, - .default = spouseID - )) + 105000 ~ 33, + 29000 ~ 17, + 103000 ~ 7, + 104000 ~ 6, + 12000 ~ 21, + 30000 ~ -30000, + 28000 ~ -28000, + .default = personID + ), dadID = + case_match( + dadID, + 22000 ~ 15, + 9000 ~ 18, + 6000 ~ 10, + 18000 ~ 29, + 7000 ~ 1, + 17000 ~ 25, + 25000 ~ 27, + 13000 ~ 20, + 19000 ~ 34, + 23000 ~ 16, + 10000 ~ 17, + 14000 ~ 22, + 3000 ~ 8, + 24000 ~ 26, + 1000 ~ 9, + 26000 ~ 30, + 21000 ~ 14, + 11000 ~ 19, + 8000 ~ 13, + 5000 ~ 2, + 15000 ~ 23, + 16000 ~ 24, + 4000 ~ 3, + 2000 ~ 2000, + 101000 ~ 12, + 102000 ~ 11, + 20000 ~ 20000, + 27000 ~ 27000, + 106000 ~ 32, + 105000 ~ 33, + 29000 ~ 17, + 103000 ~ 7, + 104000 ~ 6, + 12000 ~ 21, + 30000 ~ 30000, + 28000 ~ 28000, + .default = dadID + ), momID = + case_match( + momID, + 22000 ~ 15, + 9000 ~ 18, + 6000 ~ 10, + 18000 ~ 29, + 7000 ~ 1, + 17000 ~ 25, + 25000 ~ 27, + 13000 ~ 20, + 19000 ~ 34, + 23000 ~ 16, + 10000 ~ 17, + 14000 ~ 22, + 3000 ~ 8, + 24000 ~ 26, + 1000 ~ 9, + 26000 ~ 30, + 21000 ~ 14, + 11000 ~ 19, + 8000 ~ 13, + 5000 ~ 2, + 15000 ~ 23, + 16000 ~ 24, + 4000 ~ 3, + 2000 ~ 2000, + 101000 ~ 12, + 102000 ~ 11, + 20000 ~ 20000, + 27000 ~ 27000, + 106000 ~ 32, + 105000 ~ 33, + 29000 ~ 17, + 103000 ~ 7, + 104000 ~ 6, + 12000 ~ 21, + 30000 ~ 30000, + 28000 ~ 28000, + .default = momID + ), spouseID = + case_match( + spouseID, + 22000 ~ 15, + 9000 ~ 18, + 6000 ~ 10, + 18000 ~ 29, + 7000 ~ 1, + 17000 ~ 25, + 25000 ~ 27, + 13000 ~ 20, + 19000 ~ 34, + 23000 ~ 16, + 10000 ~ 17, + 14000 ~ 22, + 3000 ~ 8, + 24000 ~ 26, + 1000 ~ 9, + 26000 ~ 30, + 21000 ~ 14, + 11000 ~ 19, + 8000 ~ 13, + 5000 ~ 2, + 15000 ~ 23, + 16000 ~ 24, + 4000 ~ 3, + 2000 ~ 2000, + 101000 ~ 12, + 102000 ~ 11, + 
20000 ~ 20000, + 27000 ~ 27000, + 106000 ~ 32, + 105000 ~ 33, + 29000 ~ 17, + 103000 ~ 7, + 104000 ~ 6, + 12000 ~ 21, + 30000 ~ 30000, + 28000 ~ 28000, + .default = spouseID + ) + ) # Left join by name diff --git a/data-raw/hazard.csv b/data-raw/hazard.csv index 1e5007b8..b986fd7c 100644 --- a/data-raw/hazard.csv +++ b/data-raw/hazard.csv @@ -1,4 +1,4 @@ -"FamID","ID","sex","dadID","momID","affected","DA1","DA2","birthYr","onsetYr","deathYr","available","Gen","proband" +"famID","ID","sex","dadID","momID","affected","DA1","DA2","birthYr","onsetYr","deathYr","available","gen","proband" 1,1,1,NA,NA,TRUE,0,1,1902,1940,1970,TRUE,1,FALSE 1,2,0,NA,NA,FALSE,0,0,NA,NA,NA,FALSE,1,FALSE 1,3,0,2,1,FALSE,0,1,1923,NA,1982,TRUE,2,FALSE diff --git a/data-raw/hex.R b/data-raw/hex.R new file mode 100644 index 00000000..8dbfe218 --- /dev/null +++ b/data-raw/hex.R @@ -0,0 +1,2 @@ +library(hexSticker) +sticker("data-raw/logo_archie.png", package = "BGmisc", p_size = 20, s_x = 1, s_y = .75, s_width = .6, h_fill = "#0fa1e0", h_color = "#333333", p_color = "white", filename = "man/figures/hex.png") diff --git a/data-raw/inbreeding.csv b/data-raw/inbreeding.csv index 0c61ca8e..19b66a51 100644 --- a/data-raw/inbreeding.csv +++ b/data-raw/inbreeding.csv @@ -1,4 +1,4 @@ -ID,sex,dadID,momID,FamID,Gen,proband +ID,sex,dadID,momID,famID,gen,proband 1,1,NA,NA,1,1,FALSE 2,0,NA,NA,1,1,FALSE 3,1,NA,NA,1,1,FALSE diff --git a/data-raw/logo_archie.png b/data-raw/logo_archie.png new file mode 100644 index 00000000..2a947a37 Binary files /dev/null and b/data-raw/logo_archie.png differ diff --git a/data/hazard.rda b/data/hazard.rda index 66556950..7c4f244e 100644 Binary files a/data/hazard.rda and b/data/hazard.rda differ diff --git a/data/inbreeding.rda b/data/inbreeding.rda index 44339aa8..4490821b 100755 Binary files a/data/inbreeding.rda and b/data/inbreeding.rda differ diff --git a/data/royal92.rda b/data/royal92.rda index 357f1aba..678261bd 100644 Binary files a/data/royal92.rda and 
b/data/royal92.rda differ diff --git a/man/SimPed.Rd b/man/SimPed.Rd deleted file mode 100644 index 79dd1f96..00000000 --- a/man/SimPed.Rd +++ /dev/null @@ -1,33 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/helpGeneric.R -\name{SimPed} -\alias{SimPed} -\title{SimPed (Deprecated)} -\usage{ -SimPed(...) -} -\arguments{ -\item{...}{Arguments to be passed to `simulatePedigree`.} -} -\value{ -The same result as calling `simulatePedigree`. -} -\description{ -When calling this function, a warning will be issued about its deprecation. -} -\details{ -This function is a wrapper around the new `simulatePedigree` function. -`SimPed` has been deprecated, and it's advised to use `simulatePedigree` directly. -} -\examples{ -\dontrun{ -# This is an example of the deprecated function: -SimPed(...) -# It is recommended to use: -simulatePedigree(...) -} -} -\seealso{ -\code{\link{simulatePedigree}} for the updated function. -} -\keyword{deprecated} diff --git a/man/addRowlessParents.Rd b/man/addRowlessParents.Rd new file mode 100644 index 00000000..83132805 --- /dev/null +++ b/man/addRowlessParents.Rd @@ -0,0 +1,18 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/checkParents.R +\name{addRowlessParents} +\alias{addRowlessParents} +\title{Add addRowlessParents} +\usage{ +addRowlessParents(ped, verbose, validation_results) +} +\arguments{ +\item{ped}{A dataframe representing the pedigree data with columns 'ID', 'dadID', and 'momID'.} + +\item{verbose}{A logical flag indicating whether to print progress and validation messages to the console.} + +\item{validation_results}{validation results} +} +\description{ +This function adds parents who appear in momID or dadID but are missing from ID +} diff --git a/man/applyTagMappings.Rd b/man/applyTagMappings.Rd new file mode 100644 index 00000000..ae08a548 --- /dev/null +++ b/man/applyTagMappings.Rd @@ -0,0 +1,27 @@ +% Generated by roxygen2: do not edit by hand +% 
Please edit documentation in R/readGedcom.R +\name{applyTagMappings} +\alias{applyTagMappings} +\title{Apply Tag Mappings to a Line} +\usage{ +applyTagMappings(line, record, pattern_rows, tag_mappings) +} +\arguments{ +\item{line}{A character string from the GEDCOM file.} + +\item{record}{A named list representing the individual's record.} + +\item{pattern_rows}{A list with GEDCOM tag counts.} + +\item{tag_mappings}{A list of lists. Each sublist should define: +- \code{tag}: the GEDCOM tag, +- \code{field}: the record field to update, +- \code{mode}: either "replace" or "append", +- \code{extractor}: (optional) a custom extraction function.} +} +\value{ +A list with the updated record (\code{record}) and a logical flag (\code{matched}). +} +\description{ +Iterates over a list of tag mappings and, if a tag matches the line, updates the record. +} diff --git a/man/assignCoupleIds.Rd b/man/assignCoupleIDs.Rd similarity index 88% rename from man/assignCoupleIds.Rd rename to man/assignCoupleIDs.Rd index 7f379f64..95165581 100644 --- a/man/assignCoupleIds.Rd +++ b/man/assignCoupleIDs.Rd @@ -1,9 +1,12 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/helpPedigree.R -\name{assignCoupleIds} +\name{assignCoupleIDs} +\alias{assignCoupleIDs} \alias{assignCoupleIds} \title{Assign Couple IDs} \usage{ +assignCoupleIDs(df_Ngen) + assignCoupleIds(df_Ngen) } \arguments{ diff --git a/man/parseTree.Rd b/man/buildTreeGrid.Rd similarity index 68% rename from man/parseTree.Rd rename to man/buildTreeGrid.Rd index e429662c..13205cee 100644 --- a/man/parseTree.Rd +++ b/man/buildTreeGrid.Rd @@ -1,10 +1,10 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/readPedigree.R -\name{parseTree} -\alias{parseTree} +% Please edit documentation in R/readWikifamilytree.R +\name{buildTreeGrid} +\alias{buildTreeGrid} \title{Parse Tree} \usage{ -parseTree(tree_lines) +buildTreeGrid(tree_lines) } \arguments{ \item{tree_lines}{A character vector 
containing the lines of the tree structure.} diff --git a/man/allGens.Rd b/man/calcAllGens.Rd similarity index 87% rename from man/allGens.Rd rename to man/calcAllGens.Rd index 6bc6d9e0..35871be7 100644 --- a/man/allGens.Rd +++ b/man/calcAllGens.Rd @@ -1,10 +1,13 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/calculateFamilySize.R -\name{allGens} +\name{calcAllGens} +\alias{calcAllGens} \alias{allGens} -\title{allGens +\title{calcAllGens A function to calculate the number of individuals in each generation. This is a supporting function for \code{simulatePedigree}.} \usage{ +calcAllGens(kpc, Ngen, marR) + allGens(kpc, Ngen, marR) } \arguments{ @@ -18,6 +21,6 @@ allGens(kpc, Ngen, marR) Returns a vector containing the number of individuals in every generation. } \description{ -allGens +calcAllGens A function to calculate the number of individuals in each generation. This is a supporting function for \code{simulatePedigree}. } diff --git a/man/famSizeCal.Rd b/man/calcFamilySize.Rd similarity index 86% rename from man/famSizeCal.Rd rename to man/calcFamilySize.Rd index e51f95ee..dda31d20 100644 --- a/man/famSizeCal.Rd +++ b/man/calcFamilySize.Rd @@ -1,10 +1,13 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/calculateFamilySize.R -\name{famSizeCal} +\name{calcFamilySize} +\alias{calcFamilySize} \alias{famSizeCal} -\title{famSizeCal +\title{calcFamilySize A function to calculate the total number of individuals in a pedigree given parameters. This is a supporting function for function \code{simulatePedigree}} \usage{ +calcFamilySize(kpc, Ngen, marR) + famSizeCal(kpc, Ngen, marR) } \arguments{ @@ -18,6 +21,6 @@ famSizeCal(kpc, Ngen, marR) Returns a numeric value indicating the total pedigree size. } \description{ -famSizeCal +calcFamilySize A function to calculate the total number of individuals in a pedigree given parameters. 
This is a supporting function for function \code{simulatePedigree} } diff --git a/man/sizeAllGens.Rd b/man/calcFamilySizeByGen.Rd similarity index 80% rename from man/sizeAllGens.Rd rename to man/calcFamilySizeByGen.Rd index e477c19e..f849339c 100644 --- a/man/sizeAllGens.Rd +++ b/man/calcFamilySizeByGen.Rd @@ -1,10 +1,13 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/calculateFamilySize.R -\name{sizeAllGens} +\name{calcFamilySizeByGen} +\alias{calcFamilySizeByGen} \alias{sizeAllGens} -\title{sizeAllGens +\title{calcFamilySizeByGen An internal supporting function for \code{simulatePedigree}.} \usage{ +calcFamilySizeByGen(kpc, Ngen, marR) + sizeAllGens(kpc, Ngen, marR) } \arguments{ @@ -18,6 +21,6 @@ sizeAllGens(kpc, Ngen, marR) Returns a vector including the number of individuals in every generation. } \description{ -sizeAllGens +calcFamilySizeByGen An internal supporting function for \code{simulatePedigree}. } diff --git a/man/calculateH.Rd b/man/calculateH.Rd index 6717b588..5341b7e4 100644 --- a/man/calculateH.Rd +++ b/man/calculateH.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/computeRelatedness.R +% Please edit documentation in R/calculateRelatedness.R \name{calculateH} \alias{calculateH} \title{Falconer's Formula} diff --git a/man/calculateRelatedness.Rd b/man/calculateRelatedness.Rd index b9a8f067..c52ccf48 100644 --- a/man/calculateRelatedness.Rd +++ b/man/calculateRelatedness.Rd @@ -1,7 +1,8 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/computeRelatedness.R +% Please edit documentation in R/calculateRelatedness.R \name{calculateRelatedness} \alias{calculateRelatedness} +\alias{related_coef} \title{Calculate Relatedness Coefficient} \usage{ calculateRelatedness( @@ -18,6 +19,8 @@ calculateRelatedness( denom_m = FALSE, ... ) + +related_coef(...) 
} \arguments{ \item{generations}{Number of generations back of common ancestors the pair share.} diff --git a/man/calculateSummaryDT.Rd b/man/calculateSummaryDT.Rd index 45e67d3f..cb403b04 100644 --- a/man/calculateSummaryDT.Rd +++ b/man/calculateSummaryDT.Rd @@ -2,7 +2,8 @@ % Please edit documentation in R/summarizePedigree.R \name{calculateSummaryDT} \alias{calculateSummaryDT} -\title{This function calculates summary statistics for all numeric variables in a data.table. It is supposed to be used internally by the \code{summarize_pedigree} function.} +\title{Function to calculate summary statistics for all numeric variables +This function calculates summary statistics for all numeric variables in a data.table. It is supposed to be used internally by the \code{summarize_pedigree} function.} \usage{ calculateSummaryDT(data, group_var, skip_var, five_num_summary = FALSE) } @@ -20,6 +21,7 @@ the minimum, median, and maximum values.} A data.table containing the summary statistics for all numeric variables. } \description{ +Function to calculate summary statistics for all numeric variables This function calculates summary statistics for all numeric variables in a data.table. It is supposed to be used internally by the \code{summarize_pedigree} function. 
} \keyword{internal} diff --git a/man/checkIDuniqueness.Rd b/man/checkIDuniqueness.Rd new file mode 100644 index 00000000..6c1aebf6 --- /dev/null +++ b/man/checkIDuniqueness.Rd @@ -0,0 +1,19 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/checkIDs.R +\name{checkIDuniqueness} +\alias{checkIDuniqueness} +\title{Check for duplicated individual IDs} +\usage{ +checkIDuniqueness(ped, verbose = FALSE) +} +\arguments{ +\item{ped}{A dataframe representing the pedigree data with columns `ID`, `dadID`, and `momID`.} + +\item{verbose}{A logical flag indicating whether to print progress and validation messages to the console.} +} +\value{ +A list containing the results of the check +} +\description{ +This function checks for duplicated individual IDs in a pedigree. +} diff --git a/man/checkParentSex.Rd b/man/checkParentSex.Rd new file mode 100644 index 00000000..74a79c96 --- /dev/null +++ b/man/checkParentSex.Rd @@ -0,0 +1,23 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/checkSex.R +\name{checkParentSex} +\alias{checkParentSex} +\title{Check Parental Role Sex Consistency} +\usage{ +checkParentSex(ped, parent_col, sex_col = "sex", verbose = FALSE) +} +\arguments{ +\item{ped}{Pedigree dataframe.} + +\item{parent_col}{The column name for parent IDs ("momID" or "dadID").} + +\item{sex_col}{The column name for sex coding. Default is "sex".} + +\item{verbose}{Logical, whether to print messages.} +} +\value{ +A list containing role, unique sex codes, modal sex, inconsistent parents, and linked children. +} +\description{ +Validates sex coding consistency for a given parental role (momID or dadID). 
+} diff --git a/man/checkPedigreeNetwork.Rd b/man/checkPedigreeNetwork.Rd index d6e92749..9b8c8d24 100644 --- a/man/checkPedigreeNetwork.Rd +++ b/man/checkPedigreeNetwork.Rd @@ -33,7 +33,9 @@ Checks for structural issues in pedigree networks, including: } \examples{ \dontrun{ -results <- checkPedigreeNetwork(ped, personID = "ID", -momID = "momID", dadID = "dadID", verbose = TRUE) +results <- checkPedigreeNetwork(ped, + personID = "ID", + momID = "momID", dadID = "dadID", verbose = TRUE +) } } diff --git a/man/checkSex.Rd b/man/checkSex.Rd index a9e9ed03..2af43da7 100644 --- a/man/checkSex.Rd +++ b/man/checkSex.Rd @@ -9,7 +9,9 @@ checkSex( code_male = NULL, code_female = NULL, verbose = FALSE, - repair = FALSE + repair = FALSE, + momID = "momID", + dadID = "dadID" ) } \arguments{ @@ -22,6 +24,10 @@ checkSex( \item{verbose}{A logical flag indicating whether to print progress and validation messages to the console.} \item{repair}{A logical flag indicating whether to attempt repairs on the sex coding.} + +\item{momID}{The column name for maternal IDs. Default is "momID".} + +\item{dadID}{The column name for paternal IDs. Default is "dadID".} } \value{ Depending on the value of `repair`, either a list containing validation results or a repaired dataframe is returned. 
diff --git a/man/checkWithinRowDuplicates.Rd b/man/checkWithinRowDuplicates.Rd new file mode 100644 index 00000000..4797306c --- /dev/null +++ b/man/checkWithinRowDuplicates.Rd @@ -0,0 +1,19 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/checkIDs.R +\name{checkWithinRowDuplicates} +\alias{checkWithinRowDuplicates} +\title{Check for within-row duplicates (self-parents, same mom/dad)} +\usage{ +checkWithinRowDuplicates(ped, verbose = FALSE) +} +\arguments{ +\item{ped}{A dataframe representing the pedigree data with columns `ID`, `dadID`, and `momID`.} + +\item{verbose}{A logical flag indicating whether to print progress and validation messages to the console.} +} +\value{ +A list containing the results of the check +} +\description{ +This function checks for within-row duplicates in a pedigree. +} diff --git a/man/collapseNames.Rd b/man/collapseNames.Rd new file mode 100644 index 00000000..b3a0c514 --- /dev/null +++ b/man/collapseNames.Rd @@ -0,0 +1,19 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/readGedcom.R +\name{collapseNames} +\alias{collapseNames} +\title{collapse Names} +\usage{ +collapseNames(verbose, df_temp) +} +\arguments{ +\item{verbose}{A logical value indicating whether to print messages.} + +\item{df_temp}{A data frame containing the columns to be combined.} +} +\value{ +A data frame with the combined columns. +} +\description{ +This function combines the `name_given` and `name_given_pieces` columns in a data frame. +} diff --git a/man/com2links.Rd b/man/com2links.Rd index 9dae2f09..dc846645 100644 --- a/man/com2links.Rd +++ b/man/com2links.Rd @@ -18,6 +18,7 @@ com2links( legacy = FALSE, outcome_name = "data", drop_upper_triangular = TRUE, + include_all_links_1ped = FALSE, ... ) } @@ -48,6 +49,8 @@ com2links( \item{drop_upper_triangular}{Logical. If TRUE, drops the upper triangular portion of the matrix.} +\item{include_all_links_1ped}{Logical. 
If TRUE, includes all links in the output. (Default is true when only one ped is provided)} + \item{...}{Additional arguments to be passed to \code{\link{com2links}}} } \value{ diff --git a/man/combine_columns.Rd b/man/combine_columns.Rd index 86e6d4f9..43554b0b 100644 --- a/man/combine_columns.Rd +++ b/man/combine_columns.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/readPedigree.R +% Please edit documentation in R/readGedcom.R \name{combine_columns} \alias{combine_columns} \title{Combine Columns} diff --git a/man/compute_parent_adjacency.Rd b/man/computeParentAdjacency.Rd similarity index 72% rename from man/compute_parent_adjacency.Rd rename to man/computeParentAdjacency.Rd index 9cd4311c..6177f91f 100644 --- a/man/compute_parent_adjacency.Rd +++ b/man/computeParentAdjacency.Rd @@ -1,24 +1,25 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/convertPedigree.R -\name{compute_parent_adjacency} -\alias{compute_parent_adjacency} +% Please edit documentation in R/constructAdjacency.R +\name{computeParentAdjacency} +\alias{computeParentAdjacency} \title{Compute Parent Adjacency Matrix with Multiple Approaches} \usage{ -compute_parent_adjacency( +computeParentAdjacency( ped, component, - adjacency_method = "indexed", + adjacency_method = "direct", saveable, resume, save_path, - verbose, - lastComputed, - nr, + verbose = FALSE, + lastComputed = 0, checkpoint_files, update_rate, parList, lens, save_rate_parlist, + adjBeta_method = NULL, + config, ... ) } @@ -27,7 +28,7 @@ compute_parent_adjacency( \item{component}{character. Which component of the pedigree to return. See Details.} -\item{adjacency_method}{character. The method to use for computing the adjacency matrix. Options are "loop" or "indexed"} +\item{adjacency_method}{character. The method to use for computing the adjacency matrix. Options are "loop", "indexed", direct or beta} \item{saveable}{logical. 
If TRUE, save the intermediate results to disk} @@ -39,11 +40,9 @@ compute_parent_adjacency( \item{lastComputed}{the last computed index} -\item{nr}{the number of rows in the pedigree dataset} - \item{checkpoint_files}{a list of checkpoint files} -\item{update_rate}{numeric. The rate at which to print progress} +\item{update_rate}{the rate at which to update the progress} \item{parList}{a list of parent-child relationships} @@ -51,6 +50,10 @@ compute_parent_adjacency( \item{save_rate_parlist}{numeric. The rate at which to save the intermediate results by parent list. If NULL, defaults to save_rate*1000} +\item{adjBeta_method}{numeric The method to use for computing the building the adjacency_method matrix when using the "beta" build} + +\item{config}{a configuration list that passes parameters to the function} + \item{...}{additional arguments to be passed to \code{\link{ped2com}}} } \description{ diff --git a/man/makeLongTree.Rd b/man/convertGrid2LongTree.Rd similarity index 67% rename from man/makeLongTree.Rd rename to man/convertGrid2LongTree.Rd index e03db2f9..d3bd5b5f 100644 --- a/man/makeLongTree.Rd +++ b/man/convertGrid2LongTree.Rd @@ -1,10 +1,10 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/readPedigree.R -\name{makeLongTree} -\alias{makeLongTree} +% Please edit documentation in R/readWikifamilytree.R +\name{convertGrid2LongTree} +\alias{convertGrid2LongTree} \title{Make Long Tree} \usage{ -makeLongTree(tree_df, cols_to_pivot) +convertGrid2LongTree(tree_df, cols_to_pivot) } \arguments{ \item{tree_df}{A data frame containing the tree structure.} diff --git a/man/countPatternRows.Rd b/man/countPatternRows.Rd index edcdda52..3fe3c3af 100644 --- a/man/countPatternRows.Rd +++ b/man/countPatternRows.Rd @@ -1,18 +1,18 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/readPedigree.R +% Please edit documentation in R/readGedcom.R \name{countPatternRows} \alias{countPatternRows} -\title{Check for 
Pattern Rows} +\title{Count GEDCOM Pattern Rows} \usage{ countPatternRows(file) } \arguments{ -\item{file}{A data frame containing the GEDCOM file.} +\item{file}{A data frame with a column \code{X1} containing GEDCOM lines.} } \value{ -A list with the number of rows containing each pattern. +A list with counts of specific GEDCOM tag occurrences. } \description{ -This function counts the number of rows containing specific patterns. +Counts the number of lines in a file (passed as a data frame with column "X1") +that match various GEDCOM patterns. } -\keyword{internal} diff --git a/man/deduplicatePairs.Rd b/man/deduplicatePairs.Rd new file mode 100644 index 00000000..e133d803 --- /dev/null +++ b/man/deduplicatePairs.Rd @@ -0,0 +1,18 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/readWikifamilytree.R +\name{deduplicatePairs} +\alias{deduplicatePairs} +\title{Deduplicate pairs of IDs in a data frame} +\usage{ +deduplicatePairs(df) +} +\arguments{ +\item{df}{A data frame with columns from_id and to_id} +} +\value{ +A data frame with unique pairs of IDs +} +\description{ +Deduplicate pairs of IDs in a data frame +} +\keyword{internal} diff --git a/man/determineSex.Rd b/man/determineSex.Rd index c98644f6..39711ada 100644 --- a/man/determineSex.Rd +++ b/man/determineSex.Rd @@ -4,12 +4,16 @@ \alias{determineSex} \title{Determine Sex of Offspring} \usage{ -determineSex(idGen, sexR) +determineSex(idGen, sexR, code_male = "M", code_female = "F") } \arguments{ \item{idGen}{Vector of IDs for the generation.} \item{sexR}{Numeric value indicating the sex ratio (proportion of males).} + +\item{code_male}{The value to use for males. Default is "M"} + +\item{code_female}{The value to use for females. Default is "F"} } \value{ Vector of sexes ("M" for male, "F" for female) for the offspring. 
diff --git a/man/dot-assignParentValue.Rd b/man/dot-assignParentValue.Rd new file mode 100644 index 00000000..4ca25a10 --- /dev/null +++ b/man/dot-assignParentValue.Rd @@ -0,0 +1,14 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/buildComponent.R +\name{.assignParentValue} +\alias{.assignParentValue} +\title{Assign parent values based on component type} +\usage{ +.assignParentValue(component) +} +\arguments{ +\item{component}{character. Which component of the pedigree to return. See Details.} +} +\description{ +Assign parent values based on component type +} diff --git a/man/dot-collapseNames.legacy.Rd b/man/dot-collapseNames.legacy.Rd new file mode 100644 index 00000000..c127dfe0 --- /dev/null +++ b/man/dot-collapseNames.legacy.Rd @@ -0,0 +1,17 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/readGedcomlegacy.R +\name{.collapseNames.legacy} +\alias{.collapseNames.legacy} +\title{collapse Names} +\usage{ +.collapseNames.legacy(verbose, df_temp) +} +\arguments{ +\item{verbose}{A logical value indicating whether to print messages.} + +\item{df_temp}{A data frame containing the columns to be combined.} +} +\description{ +This function combines the `name_given` and `name_given_pieces` columns in a data frame. 
+} +\keyword{internal} diff --git a/man/dot-com2links.legacy.Rd b/man/dot-com2links.legacy.Rd new file mode 100644 index 00000000..b82497a9 --- /dev/null +++ b/man/dot-com2links.legacy.Rd @@ -0,0 +1,56 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/makeLinkslegacy.R +\name{.com2links.legacy} +\alias{.com2links.legacy} +\title{Convert Sparse Relationship Matrices to Kinship Links} +\usage{ +.com2links.legacy( + rel_pairs_file = "dataRelatedPairs.csv", + ad_ped_matrix = NULL, + mit_ped_matrix = mt_ped_matrix, + mt_ped_matrix = NULL, + cn_ped_matrix = NULL, + write_buffer_size = 1000, + update_rate = 1000, + gc = TRUE, + writetodisk = TRUE, + verbose = FALSE, + legacy = FALSE, + outcome_name = "data", + drop_upper_triangular = TRUE, + ... +) +} +\arguments{ +\item{rel_pairs_file}{File path to write related pairs to (CSV format).} + +\item{ad_ped_matrix}{Matrix of additive genetic relatedness coefficients.} + +\item{mit_ped_matrix}{Matrix of mitochondrial relatedness coefficients. Alias: \code{mt_ped_matrix}.} + +\item{mt_ped_matrix}{Matrix of mitochondrial relatedness coefficients.} + +\item{cn_ped_matrix}{Matrix of common nuclear relatedness coefficients.} + +\item{write_buffer_size}{Number of related pairs to write to disk at a time.} + +\item{update_rate}{Numeric. Frequency (in iterations) at which progress messages are printed.} + +\item{gc}{Logical. If TRUE, performs garbage collection via \code{\link{gc}} to free memory.} + +\item{writetodisk}{Logical. If TRUE, writes the related pairs to disk; if FALSE, returns a data frame.} + +\item{verbose}{Logical. If TRUE, prints progress messages.} + +\item{legacy}{Logical. If TRUE, uses the legacy branch of the function.} + +\item{outcome_name}{Character string representing the outcome name (used in file naming).} + +\item{drop_upper_triangular}{Logical. 
If TRUE, drops the upper triangular portion of the matrix.} + +\item{...}{Additional arguments to be passed to \code{\link{com2links}}} +} +\description{ +Convert Sparse Relationship Matrices to Kinship Links +} +\keyword{internal} diff --git a/man/dot-com2links.og.Rd b/man/dot-com2links.og.Rd new file mode 100644 index 00000000..b87b70b2 --- /dev/null +++ b/man/dot-com2links.og.Rd @@ -0,0 +1,41 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/makeLinkslegacy.R +\name{.com2links.og} +\alias{.com2links.og} +\title{Convert Pedigree Matrices to Related Pairs File (Legacy)} +\usage{ +.com2links.og( + rel_pairs_file = "dataRelatedPairs.csv", + ad_ped_matrix = NULL, + mit_ped_matrix = mt_ped_matrix, + mt_ped_matrix = NULL, + cn_ped_matrix = NULL, + update_rate = 500, + verbose = FALSE, + outcome_name = "data", + ... +) +} +\arguments{ +\item{rel_pairs_file}{File path to write related pairs to (CSV format).} + +\item{ad_ped_matrix}{Matrix of additive genetic relatedness coefficients.} + +\item{mit_ped_matrix}{Matrix of mitochondrial relatedness coefficients. Alias: \code{mt_ped_matrix}.} + +\item{mt_ped_matrix}{Matrix of mitochondrial relatedness coefficients.} + +\item{cn_ped_matrix}{Matrix of common nuclear relatedness coefficients.} + +\item{update_rate}{Numeric. Frequency (in iterations) at which progress messages are printed.} + +\item{verbose}{Logical. If TRUE, prints progress messages.} + +\item{outcome_name}{Character string representing the outcome name (used in file naming).} + +\item{...}{Additional arguments to be passed to \code{\link{com2links}}} +} +\description{ +This legacy function converts pedigree matrices into a related pairs file. 
+} +\keyword{internal} diff --git a/man/dot-combine_columns.legacy.Rd b/man/dot-combine_columns.legacy.Rd new file mode 100644 index 00000000..fffafda2 --- /dev/null +++ b/man/dot-combine_columns.legacy.Rd @@ -0,0 +1,20 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/readGedcomlegacy.R +\name{.combine_columns.legacy} +\alias{.combine_columns.legacy} +\title{Combine Columns} +\usage{ +.combine_columns.legacy(col1, col2) +} +\arguments{ +\item{col1}{The first column to combine.} + +\item{col2}{The second column to combine.} +} +\value{ +A list with the combined column and a flag indicating if the second column should be retained. +} +\description{ +This function combines two columns, handling conflicts and merging non-conflicting data. +} +\keyword{internal} diff --git a/man/dot-computeTranspose.Rd b/man/dot-computeTranspose.Rd index 38c8fa82..4d90dcbc 100644 --- a/man/dot-computeTranspose.Rd +++ b/man/dot-computeTranspose.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/convertPedigree.R +% Please edit documentation in R/buildComponent.R \name{.computeTranspose} \alias{.computeTranspose} \title{Compute the transpose multiplication for the relatedness matrix} diff --git a/man/dot-countPatternRows.legacy.Rd b/man/dot-countPatternRows.legacy.Rd new file mode 100644 index 00000000..8aba84fb --- /dev/null +++ b/man/dot-countPatternRows.legacy.Rd @@ -0,0 +1,18 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/readGedcomlegacy.R +\name{.countPatternRows.legacy} +\alias{.countPatternRows.legacy} +\title{Check for Pattern Rows} +\usage{ +.countPatternRows.legacy(file) +} +\arguments{ +\item{file}{A data frame containing the GEDCOM file.} +} +\value{ +A list with the number of rows containing each pattern. +} +\description{ +This function counts the number of rows containing specific patterns. 
+} +\keyword{internal} diff --git a/man/dot-extract_info.legacy.Rd b/man/dot-extract_info.legacy.Rd new file mode 100644 index 00000000..6ebf6180 --- /dev/null +++ b/man/dot-extract_info.legacy.Rd @@ -0,0 +1,20 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/readGedcomlegacy.R +\name{.extract_info.legacy} +\alias{.extract_info.legacy} +\title{Extract Information from Line} +\usage{ +.extract_info.legacy(line, type) +} +\arguments{ +\item{line}{A character string representing a line from a GEDCOM file.} + +\item{type}{A character string representing the type of information to extract.} +} +\value{ +A character string with the extracted information. +} +\description{ +This function extracts information from a line based on a specified type. +} +\keyword{internal} diff --git a/man/dot-getModalValue.Rd b/man/dot-getModalValue.Rd new file mode 100644 index 00000000..fd97c2d6 --- /dev/null +++ b/man/dot-getModalValue.Rd @@ -0,0 +1,19 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/checkSex.R +\name{.getModalValue} +\alias{.getModalValue} +\title{Get the Modal Value of a Vector} +\usage{ +.getModalValue(x) +} +\arguments{ +\item{x}{A vector of values.} +} +\value{ +The modal value of the vector. If the vector is empty or contains only NA values, returns NA. +} +\description{ +This function calculates the modal value of a vector, which is the most frequently occurring value. +If the vector is empty or contains only NA values, it returns NA. 
+} +\keyword{internal} diff --git a/man/dot-loadOrComputeIsChild.Rd b/man/dot-loadOrComputeIsChild.Rd new file mode 100644 index 00000000..7656d261 --- /dev/null +++ b/man/dot-loadOrComputeIsChild.Rd @@ -0,0 +1,20 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/buildComponent.R +\name{.loadOrComputeIsChild} +\alias{.loadOrComputeIsChild} +\title{Load or compute the isChild matrix} +\usage{ +.loadOrComputeIsChild(ped, checkpoint_files, config) +} +\arguments{ +\item{ped}{a pedigree dataset. Needs ID, momID, and dadID columns} + +\item{checkpoint_files}{A list of checkpoint file paths.} + + + +\item{config}{A list containing configuration parameters such as `resume`, `verbose`, and `saveable`.} +} +\description{ +Load or compute the isChild matrix +} diff --git a/man/dot-loadOrComputeIsPar.Rd b/man/dot-loadOrComputeIsPar.Rd new file mode 100644 index 00000000..7879c078 --- /dev/null +++ b/man/dot-loadOrComputeIsPar.Rd @@ -0,0 +1,25 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/buildComponent.R +\name{.loadOrComputeIsPar} +\alias{.loadOrComputeIsPar} +\title{Load or compute the isPar matrix} +\usage{ +.loadOrComputeIsPar(iss, jss, parVal, ped, checkpoint_files, config) +} +\arguments{ +\item{iss}{The row indices of the sparse matrix.} + +\item{jss}{The column indices of the sparse matrix.} + +\item{parVal}{The value to assign to the non-zero elements of the sparse matrix.} + +\item{ped}{The pedigree dataset.} + +\item{checkpoint_files}{A list of checkpoint file paths.} + +\item{config}{A list containing configuration parameters such as `resume`, `verbose`, and `saveable`.} +} +\description{ +Load or compute the isPar matrix +} +\keyword{internal} diff --git a/man/dot-loadOrComputeParList.Rd b/man/dot-loadOrComputeParList.Rd new file mode 100644 index 00000000..91fe33fa --- /dev/null +++ b/man/dot-loadOrComputeParList.Rd @@ -0,0 +1,32 @@ +% Generated by roxygen2: do not edit by 
hand +% Please edit documentation in R/buildComponent.R +\name{.loadOrComputeParList} +\alias{.loadOrComputeParList} +\title{parent-child adjacency data} +\usage{ +.loadOrComputeParList( + checkpoint_files, + config, + ped = NULL, + parList = NULL, + lens = NULL +) +} +\arguments{ +\item{checkpoint_files}{A list of checkpoint file paths.} + +\item{config}{A list containing configuration parameters such as `resume`, `verbose`, and `saveable`.} + +\item{ped}{a pedigree dataset. Needs ID, momID, and dadID columns} + +\item{parList}{A list of parent-child adjacency data.} + +\item{lens}{A vector of lengths for each parent-child relationship.} +} +\value{ +A list containing the parent-child adjacency data either loaded from a checkpoint or initialized. +} +\description{ +parent-child adjacency data +} +\keyword{internal} diff --git a/man/assignParentIDs.Rd b/man/dot-mapFAMC2parents.legacy.Rd similarity index 66% rename from man/assignParentIDs.Rd rename to man/dot-mapFAMC2parents.legacy.Rd index 8bd15170..11109e6e 100644 --- a/man/assignParentIDs.Rd +++ b/man/dot-mapFAMC2parents.legacy.Rd @@ -1,17 +1,15 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/readPedigree.R -\name{assignParentIDs} -\alias{assignParentIDs} +% Please edit documentation in R/readGedcomlegacy.R +\name{.mapFAMC2parents.legacy} +\alias{.mapFAMC2parents.legacy} \title{Assign momID and dadID based on family mapping} \usage{ -assignParentIDs(df_temp, family_to_parents, datasource) +.mapFAMC2parents.legacy(df_temp, family_to_parents) } \arguments{ \item{df_temp}{A data frame containing individual information.} \item{family_to_parents}{A list mapping family IDs to parent IDs.} - -\item{datasource}{A string indicating the data source. Options are "gedcom" and "wiki".} } \value{ A data frame with added momID and dad_ID columns. 
diff --git a/man/createFamilyToParentsMapping.Rd b/man/dot-mapFAMS2parents.legacy.Rd similarity index 67% rename from man/createFamilyToParentsMapping.Rd rename to man/dot-mapFAMS2parents.legacy.Rd index 20d8216d..7f07a076 100644 --- a/man/createFamilyToParentsMapping.Rd +++ b/man/dot-mapFAMS2parents.legacy.Rd @@ -1,10 +1,10 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/readPedigree.R -\name{createFamilyToParentsMapping} -\alias{createFamilyToParentsMapping} +% Please edit documentation in R/readGedcomlegacy.R +\name{.mapFAMS2parents.legacy} +\alias{.mapFAMS2parents.legacy} \title{Create a mapping of family IDs to parent IDs} \usage{ -createFamilyToParentsMapping(df_temp, datasource) +.mapFAMS2parents.legacy(df_temp) } \arguments{ \item{df_temp}{A data frame containing information about individuals.} diff --git a/man/dot-postProcessGedcom.legacy.Rd b/man/dot-postProcessGedcom.legacy.Rd new file mode 100644 index 00000000..8deeadd3 --- /dev/null +++ b/man/dot-postProcessGedcom.legacy.Rd @@ -0,0 +1,34 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/readGedcomlegacy.R +\name{.postProcessGedcom.legacy} +\alias{.postProcessGedcom.legacy} +\title{Post-process GEDCOM Data Frame} +\usage{ +.postProcessGedcom.legacy( + df_temp, + remove_empty_cols = TRUE, + combine_cols = TRUE, + add_parents = TRUE, + skinny = TRUE, + verbose = FALSE +) +} +\arguments{ +\item{df_temp}{A data frame containing information about individuals.} + +\item{remove_empty_cols}{A logical value indicating whether to remove columns with all missing values.} + +\item{combine_cols}{A logical value indicating whether to combine columns with duplicate values.} + +\item{add_parents}{A logical value indicating whether to add parents to the data frame.} + +\item{skinny}{A logical value indicating whether to return a skinny data frame.} + +\item{verbose}{A logical value indicating whether to print messages.} +} +\value{ +A data frame with 
processed information. +} +\description{ +Post-process GEDCOM Data Frame +} diff --git a/man/dot-processParents.legacy.Rd b/man/dot-processParents.legacy.Rd new file mode 100644 index 00000000..eca10438 --- /dev/null +++ b/man/dot-processParents.legacy.Rd @@ -0,0 +1,20 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/readGedcomlegacy.R +\name{.processParents.legacy} +\alias{.processParents.legacy} +\title{Process parents information} +\usage{ +.processParents.legacy(df_temp, datasource) +} +\arguments{ +\item{df_temp}{A data frame containing information about individuals.} + +\item{datasource}{A string indicating the data source. Options are "gedcom" and "wiki".} +} +\value{ +A data frame with added momID and dadID columns. +} +\description{ +This function processes the dataframe to add momID and dadID columns. +} +\keyword{internal} diff --git a/man/dot-process_tag.legacy.Rd b/man/dot-process_tag.legacy.Rd new file mode 100644 index 00000000..0fbfac81 --- /dev/null +++ b/man/dot-process_tag.legacy.Rd @@ -0,0 +1,39 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/readGedcomlegacy.R +\name{.process_tag.legacy} +\alias{.process_tag.legacy} +\title{Process a GEDCOM Tag} +\usage{ +.process_tag.legacy( + tag, + field_name, + pattern_rows, + line, + vars, + extractor = NULL, + mode = "replace" +) +} +\arguments{ +\item{tag}{The GEDCOM tag (e.g., "SEX", "CAST", etc.).} + +\item{field_name}{The name of the variable to assign to in `vars`.} + +\item{pattern_rows}{Output from `countPatternRows()`.} + +\item{line}{The GEDCOM line to parse.} + +\item{vars}{The current list of variables to update.} + +\item{extractor}{Optional extraction function for the tag value. Default is \code{NULL}.} + +\item{mode}{Assignment mode for the extracted value. Default is \code{"replace"}.} +} +\value{ +A list with updated `vars` and a `matched` flag. +} +\description{ +Extracts and assigns a value to a specified field in `vars` if the pattern is present. +Returns both the updated variable list and a flag indicating whether the tag was matched. 
+} +\keyword{internal} diff --git a/man/dot-readGedcom.legacy.Rd b/man/dot-readGedcom.legacy.Rd new file mode 100644 index 00000000..515262e0 --- /dev/null +++ b/man/dot-readGedcom.legacy.Rd @@ -0,0 +1,78 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/readGedcomlegacy.R +\name{.readGedcom.legacy} +\alias{.readGedcom.legacy} +\title{Read a GEDCOM File} +\usage{ +.readGedcom.legacy( + file_path, + verbose = FALSE, + add_parents = TRUE, + remove_empty_cols = TRUE, + combine_cols = TRUE, + skinny = FALSE, + update_rate = 1000, + post_process = TRUE, + ... +) +} +\arguments{ +\item{file_path}{The path to the GEDCOM file.} + +\item{verbose}{A logical value indicating whether to print messages.} + +\item{add_parents}{A logical value indicating whether to add parents to the data frame.} + +\item{remove_empty_cols}{A logical value indicating whether to remove columns with all missing values.} + +\item{combine_cols}{A logical value indicating whether to combine columns with duplicate values.} + +\item{skinny}{A logical value indicating whether to return a skinny data frame.} + +\item{update_rate}{numeric. 
The rate at which to print progress} + +\item{...}{Additional arguments to be passed to the function.} +} +\value{ +A data frame containing information about individuals, with the following potential columns: +- `id`: ID of the individual +- `momID`: ID of the individual's mother +- `dadID`: ID of the individual's father +- `sex`: Sex of the individual +- `name`: Full name of the individual +- `name_given`: First name of the individual +- `name_surn`: Last name of the individual +- `name_marriedsurn`: Married name of the individual +- `name_nick`: Nickname of the individual +- `name_npfx`: Name prefix +- `name_nsfx`: Name suffix +- `birth_date`: Birth date of the individual +- `birth_lat`: Latitude of the birthplace +- `birth_long`: Longitude of the birthplace +- `birth_place`: Birthplace of the individual +- `death_caus`: Cause of death +- `death_date`: Death date of the individual +- `death_lat`: Latitude of the place of death +- `death_long`: Longitude of the place of death +- `death_place`: Place of death of the individual +- `attribute_caste`: Caste of the individual +- `attribute_children`: Number of children of the individual +- `attribute_description`: Description of the individual +- `attribute_education`: Education of the individual +- `attribute_idnumber`: Identification number of the individual +- `attribute_marriages`: Number of marriages of the individual +- `attribute_nationality`: Nationality of the individual +- `attribute_occupation`: Occupation of the individual +- `attribute_property`: Property owned by the individual +- `attribute_religion`: Religion of the individual +- `attribute_residence`: Residence of the individual +- `attribute_ssn`: Social security number of the individual +- `attribute_title`: Title of the individual +- `FAMC`: ID(s) of the family where the individual is a child +- `FAMS`: ID(s) of the family where the individual is a spouse +} +\description{ +This function reads a GEDCOM file and parses it into a structured data frame 
of individuals. +Inspired by https://raw.githubusercontent.com/jjfitz/readgedcom/master/R/read_gedcom.R +} +\keyword{internal} diff --git a/man/matchMembers.Rd b/man/extractMemberTable.Rd similarity index 70% rename from man/matchMembers.Rd rename to man/extractMemberTable.Rd index e02311af..72b0945b 100644 --- a/man/matchMembers.Rd +++ b/man/extractMemberTable.Rd @@ -1,10 +1,10 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/readPedigree.R -\name{matchMembers} -\alias{matchMembers} +% Please edit documentation in R/readWikifamilytree.R +\name{extractMemberTable} +\alias{extractMemberTable} \title{Match Members} \usage{ -matchMembers(text) +extractMemberTable(text) } \arguments{ \item{text}{A character string containing the text of a family tree in wiki format.} diff --git a/man/extract_info.Rd b/man/extract_info.Rd index 0001eb99..3f5bbc1d 100644 --- a/man/extract_info.Rd +++ b/man/extract_info.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/readPedigree.R +% Please edit documentation in R/readGedcom.R \name{extract_info} \alias{extract_info} \title{Extract Information from Line} diff --git a/man/figures/hex.png b/man/figures/hex.png new file mode 100644 index 00000000..fa28002f Binary files /dev/null and b/man/figures/hex.png differ diff --git a/man/findBiggest.Rd b/man/findBiggest.Rd new file mode 100644 index 00000000..c3b3d23e --- /dev/null +++ b/man/findBiggest.Rd @@ -0,0 +1,21 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/summarizePedigree.R +\name{findBiggest} +\alias{findBiggest} +\title{This function finds the biggest families in a pedigree. It is supposed to be used internally by the \code{summarize_pedigree} function.} +\usage{ +findBiggest(foo_summary_dt, nbiggest, n_foo) +} +\arguments{ +\item{foo_summary_dt}{A data.table containing the summary statistics.} + +\item{nbiggest}{Integer. 
Number of largest lineages to return (sorted by count).} + +\item{n_foo}{An integer specifying the number of individuals in the summary.} +} +\value{ +a data.table containing the biggest families in the pedigree. +} +\description{ +This function finds the biggest families in a pedigree. It is supposed to be used internally by the \code{summarize_pedigree} function. +} diff --git a/man/findFounder.Rd b/man/findFounder.Rd index a4cf70bb..19bfcb1f 100644 --- a/man/findFounder.Rd +++ b/man/findFounder.Rd @@ -2,7 +2,7 @@ % Please edit documentation in R/summarizePedigree.R \name{findFounder} \alias{findFounder} -\title{This function finds the originating member for each line in a pedigree. It is supposed to be used internally by the \code{summarize_pedigree} function.} +\title{Function to find the originating member for each line} \usage{ findFounder(data, group_var, sort_var) } diff --git a/man/findOldest.Rd b/man/findOldest.Rd new file mode 100644 index 00000000..f97fa65c --- /dev/null +++ b/man/findOldest.Rd @@ -0,0 +1,23 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/summarizePedigree.R +\name{findOldest} +\alias{findOldest} +\title{This function finds the oldest families in a pedigree. It is supposed to be used internally by the \code{summarize_pedigree} function.} +\usage{ +findOldest(foo_summary_dt, byr, noldest, n_foo) +} +\arguments{ +\item{foo_summary_dt}{A data.table containing the summary statistics.} + +\item{byr}{Character. Optional column name for birth year. Used to determine the oldest lineages.} + +\item{noldest}{Integer. Number of oldest lineages to return (sorted by birth year).} + +\item{n_foo}{An integer specifying the number of individuals in the summary.} +} +\value{ +a data.table containing the oldest families in the pedigree. +} +\description{ +This function finds the oldest families in a pedigree. It is supposed to be used internally by the \code{summarize_pedigree} function. 
+} diff --git a/man/getGridNeighbors.Rd b/man/getGridNeighbors.Rd new file mode 100644 index 00000000..cc1e20fa --- /dev/null +++ b/man/getGridNeighbors.Rd @@ -0,0 +1,20 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/readWikifamilytree.R +\name{getGridNeighbors} +\alias{getGridNeighbors} +\title{Build adjacency list (4-way neighbors)} +\usage{ +getGridNeighbors(cell, active_keys) +} +\arguments{ +\item{cell}{A data frame with columns Row and Column} + +\item{active_keys}{A character vector of active cell keys.} +} +\value{ +A character vector of neighboring cell keys +} +\description{ +Build adjacency list (4-way neighbors) +} +\keyword{internal} diff --git a/man/extractSummaryText.Rd b/man/getWikiTreeSummary.Rd similarity index 69% rename from man/extractSummaryText.Rd rename to man/getWikiTreeSummary.Rd index ad1e90a9..dadc4c33 100644 --- a/man/extractSummaryText.Rd +++ b/man/getWikiTreeSummary.Rd @@ -1,10 +1,10 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/readPedigree.R -\name{extractSummaryText} -\alias{extractSummaryText} +% Please edit documentation in R/readWikifamilytree.R +\name{getWikiTreeSummary} +\alias{getWikiTreeSummary} \title{Extract Summary Text} \usage{ -extractSummaryText(text) +getWikiTreeSummary(text) } \arguments{ \item{text}{A character string containing the text of a family tree in wiki format.} diff --git a/man/inferRelatedness.Rd b/man/inferRelatedness.Rd index dba7a0ed..3d25312b 100644 --- a/man/inferRelatedness.Rd +++ b/man/inferRelatedness.Rd @@ -1,10 +1,13 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/computeRelatedness.R +% Please edit documentation in R/calculateRelatedness.R \name{inferRelatedness} \alias{inferRelatedness} +\alias{relatedness} \title{Infer Relatedness Coefficient} \usage{ inferRelatedness(obsR, aceA = 0.9, aceC = 0, sharedC = 0) + +relatedness(...) } \arguments{ \item{obsR}{Numeric. Observed correlation between the two groups. 
Must be between -1 and 1.} @@ -14,6 +17,8 @@ inferRelatedness(obsR, aceA = 0.9, aceC = 0, sharedC = 0) \item{aceC}{Numeric. Proportion of variance attributable to shared environmental variance. Must be between 0 and 1. Default is 0.} \item{sharedC}{Numeric. Proportion of shared environment shared between the two individuals. Must be between 0 (no shared environment) and 1 (completely shared environment). Default is 0.} + +\item{...}{Further named arguments that may be passed to another function.} } \value{ Numeric. The calculated relatedness coefficient (`est_r`). @@ -32,3 +37,4 @@ By considering the observed correlation (`obsR`), the proportion of variance att inferRelatedness(obsR = 0.5, aceA = 0.9, aceC = 0, sharedC = 0) } } +\keyword{internal} diff --git a/man/initializeCheckpoint.Rd b/man/initializeCheckpoint.Rd new file mode 100644 index 00000000..2b06a258 --- /dev/null +++ b/man/initializeCheckpoint.Rd @@ -0,0 +1,15 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/buildComponent.R +\name{initializeCheckpoint} +\alias{initializeCheckpoint} +\title{Initialize checkpoint files} +\usage{ +initializeCheckpoint( + config = list(verbose = FALSE, saveable = FALSE, resume = FALSE, save_path = + "checkpoint/") +) +} +\description{ +Initialize checkpoint files +} +\keyword{internal} diff --git a/man/initializeRecord.Rd b/man/initializeRecord.Rd new file mode 100644 index 00000000..3c0c08ca --- /dev/null +++ b/man/initializeRecord.Rd @@ -0,0 +1,17 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/readGedcom.R +\name{initializeRecord} +\alias{initializeRecord} +\title{Initialize an Empty Individual Record} +\usage{ +initializeRecord(all_var_names) +} +\arguments{ +\item{all_var_names}{A character vector of variable names.} +} +\value{ +A named list representing an empty individual record. +} +\description{ +Creates a named list with all GEDCOM fields set to NA. 
+} diff --git a/man/initialize_empty_df.Rd b/man/initialize_empty_df.Rd new file mode 100644 index 00000000..637d83cb --- /dev/null +++ b/man/initialize_empty_df.Rd @@ -0,0 +1,18 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/makeLinks.R +\name{initialize_empty_df} +\alias{initialize_empty_df} +\title{initialize_empty_df} +\usage{ +initialize_empty_df(relNames) +} +\arguments{ +\item{relNames}{A vector of column names to be included in the data frame.} +} +\value{ +An empty data frame with specified column names. +} +\description{ +This function initializes an empty data frame with specified column names. +} +\keyword{internal} diff --git a/man/evenInsert.Rd b/man/insertEven.Rd similarity index 93% rename from man/evenInsert.Rd rename to man/insertEven.Rd index 2dae39c1..7fc08138 100644 --- a/man/evenInsert.Rd +++ b/man/insertEven.Rd @@ -1,10 +1,13 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/insertEven.R -\name{evenInsert} +\name{insertEven} +\alias{insertEven} \alias{evenInsert} \title{evenInsert A function to insert m elements evenly into a length n vector.} \usage{ +insertEven(m, n, verbose = FALSE) + evenInsert(m, n, verbose = FALSE) } \arguments{ diff --git a/man/isChild.Rd b/man/isChild.Rd new file mode 100644 index 00000000..be64125d --- /dev/null +++ b/man/isChild.Rd @@ -0,0 +1,19 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/constructAdjacency.R +\name{isChild} +\alias{isChild} +\title{Determine isChild Status, isChild is the 'S' matrix from RAM} +\usage{ +isChild(isChild_method, ped) +} +\arguments{ +\item{isChild_method}{method to determine isChild status} + +\item{ped}{pedigree data frame} +} +\value{ +isChild 'S' matrix +} +\description{ +Determine isChild Status, isChild is the 'S' matrix from RAM +} diff --git a/man/loadOrComputeCheckpoint.Rd b/man/loadOrComputeCheckpoint.Rd new file mode 100644 index 00000000..20b8f807 --- /dev/null +++ 
b/man/loadOrComputeCheckpoint.Rd @@ -0,0 +1,32 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/buildComponent.R +\name{loadOrComputeCheckpoint} +\alias{loadOrComputeCheckpoint} +\title{Load or compute a checkpoint} +\usage{ +loadOrComputeCheckpoint( + file, + compute_fn, + config, + message_resume = NULL, + message_compute = NULL +) +} +\arguments{ +\item{file}{The file path to load the checkpoint from.} + +\item{compute_fn}{The function to compute the checkpoint if it doesn't exist.} + +\item{config}{A list containing configuration parameters such as `resume`, `verbose`, and `saveable`.} + +\item{message_resume}{Optional message to display when resuming from a checkpoint.} + +\item{message_compute}{Optional message to display when computing the checkpoint.} +} +\value{ +The loaded or computed checkpoint. +} +\description{ +Load or compute a checkpoint +} +\keyword{internal} diff --git a/man/mapFAMC2parents.Rd b/man/mapFAMC2parents.Rd new file mode 100644 index 00000000..3ebd6dd7 --- /dev/null +++ b/man/mapFAMC2parents.Rd @@ -0,0 +1,21 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/readGedcom.R +\name{mapFAMC2parents} +\alias{mapFAMC2parents} +\title{Assign momID and dadID based on family mapping} +\usage{ +mapFAMC2parents(df_temp, family_to_parents) +} +\arguments{ +\item{df_temp}{A data frame containing individual information.} + +\item{family_to_parents}{A list mapping family IDs to parent IDs.} +} +\value{ +A data frame with added momID and dad_ID columns. +} +\description{ +This function assigns mother and father IDs to individuals in the data frame +based on the mapping of family IDs to parent IDs. 
+} +\keyword{internal} diff --git a/man/mapFAMS2parents.Rd b/man/mapFAMS2parents.Rd new file mode 100644 index 00000000..10cfb4e4 --- /dev/null +++ b/man/mapFAMS2parents.Rd @@ -0,0 +1,18 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/readGedcom.R +\name{mapFAMS2parents} +\alias{mapFAMS2parents} +\title{Create a Mapping from Family IDs to Parent IDs} +\usage{ +mapFAMS2parents(df_temp) +} +\arguments{ +\item{df_temp}{A data frame produced by \code{readGedcom()}.} +} +\value{ +A list mapping family IDs to parent information. +} +\description{ +This function scans the data frame and creates a mapping of family IDs +to the corresponding parent IDs. +} diff --git a/man/nullToNA.Rd b/man/null2NA.Rd similarity index 86% rename from man/nullToNA.Rd rename to man/null2NA.Rd index 4bccb4b3..cb6d7571 100644 --- a/man/nullToNA.Rd +++ b/man/null2NA.Rd @@ -1,9 +1,12 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/helpGeneric.R -\name{nullToNA} +\name{null2NA} +\alias{null2NA} \alias{nullToNA} \title{nullToNA} \usage{ +null2NA(x) + nullToNA(x) } \arguments{ diff --git a/man/parseIndividualBlock.Rd b/man/parseIndividualBlock.Rd new file mode 100644 index 00000000..8f58554b --- /dev/null +++ b/man/parseIndividualBlock.Rd @@ -0,0 +1,24 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/readGedcom.R +\name{parseIndividualBlock} +\alias{parseIndividualBlock} +\title{Parse a GEDCOM Individual Block} +\usage{ +parseIndividualBlock(block, pattern_rows, all_var_names, verbose = FALSE) +} +\arguments{ +\item{block}{A character vector containing the GEDCOM lines for one individual.} + +\item{pattern_rows}{A list with counts of lines matching specific GEDCOM tags.} + +\item{all_var_names}{A character vector of variable names.} + +\item{verbose}{Logical indicating whether to print progress messages.} +} +\value{ +A named list representing the parsed record for the individual, or NULL if no ID 
is found. +} +\description{ +Processes a block of GEDCOM lines corresponding to a single individual. +} +\keyword{internal} diff --git a/man/parseNameLine.Rd b/man/parseNameLine.Rd new file mode 100644 index 00000000..44490d60 --- /dev/null +++ b/man/parseNameLine.Rd @@ -0,0 +1,19 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/readGedcom.R +\name{parseNameLine} +\alias{parseNameLine} +\title{Parse a Full Name Line} +\usage{ +parseNameLine(line, record) +} +\arguments{ +\item{line}{A character string containing the name line.} + +\item{record}{A named list representing the individual's record.} +} +\value{ +The updated record with parsed name information. +} +\description{ +Extracts full name information from a GEDCOM "NAME" line and updates the record accordingly. +} diff --git a/man/parseRelationships.Rd b/man/parseTreeRelationships.Rd similarity index 57% rename from man/parseRelationships.Rd rename to man/parseTreeRelationships.Rd index a02a5983..a29a56da 100644 --- a/man/parseRelationships.Rd +++ b/man/parseTreeRelationships.Rd @@ -1,13 +1,15 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/readPedigree.R -\name{parseRelationships} -\alias{parseRelationships} +% Please edit documentation in R/readWikifamilytree.R +\name{parseTreeRelationships} +\alias{parseTreeRelationships} \title{infer relationship from tree template} \usage{ -parseRelationships(tree_long) +parseTreeRelationships(tree_long, tree_paths = NULL) } \arguments{ \item{tree_long}{A data frame containing the tree structure in long format.} + +\item{tree_paths}{Optional. traceTreePaths output. If NULL, it will be calculated.} } \value{ A data frame containing the relationships between family members. 
diff --git a/man/ped2add.Rd b/man/ped2add.Rd index 79803b21..45eeedea 100644 --- a/man/ped2add.Rd +++ b/man/ped2add.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/convertPedigree.R +% Please edit documentation in R/buildComponent.R \name{ped2add} \alias{ped2add} \title{Take a pedigree and turn it into an additive genetics relatedness matrix} @@ -18,7 +18,7 @@ ped2add( resume = FALSE, save_rate = 5, save_rate_gen = save_rate, - save_rate_parlist = 1000 * save_rate, + save_rate_parlist = 1e+05 * save_rate, save_path = "checkpoint/", ... ) @@ -42,7 +42,7 @@ ped2add( \item{transpose_method}{character. The method to use for computing the transpose. Options are "tcrossprod", "crossprod", or "star"} -\item{adjacency_method}{character. The method to use for computing the adjacency matrix. Options are "loop" or "indexed"} +\item{adjacency_method}{character. The method to use for computing the adjacency matrix. Options are "loop", "indexed", "direct", or "beta"} \item{saveable}{logical. 
If TRUE, save the intermediate results to disk} diff --git a/man/ped2ce.Rd b/man/ped2ce.Rd index b4e3db1d..ed22966a 100644 --- a/man/ped2ce.Rd +++ b/man/ped2ce.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/convertPedigree.R +% Please edit documentation in R/buildComponent.R \name{ped2ce} \alias{ped2ce} \title{Take a pedigree and turn it into an extended environmental relatedness matrix} diff --git a/man/ped2cn.Rd b/man/ped2cn.Rd index 82d25902..3178cab6 100644 --- a/man/ped2cn.Rd +++ b/man/ped2cn.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/convertPedigree.R +% Please edit documentation in R/buildComponent.R \name{ped2cn} \alias{ped2cn} \title{Take a pedigree and turn it into a common nuclear environmental relatedness matrix} @@ -16,7 +16,7 @@ ped2cn( saveable = FALSE, resume = FALSE, save_rate = 5, - adjacency_method = "indexed", + adjacency_method = "direct", save_rate_gen = save_rate, save_rate_parlist = 1000 * save_rate, save_path = "checkpoint/", @@ -48,7 +48,7 @@ ped2cn( \item{save_rate}{numeric. The rate at which to save the intermediate results} -\item{adjacency_method}{character. The method to use for computing the adjacency matrix. Options are "loop" or "indexed"} +\item{adjacency_method}{character. The method to use for computing the adjacency matrix. Options are "loop", "indexed", "direct", or "beta"} \item{save_rate_gen}{numeric. The rate at which to save the intermediate results by generation. 
If NULL, defaults to save_rate} diff --git a/man/ped2com.Rd b/man/ped2com.Rd index 27f632f5..b49a6b03 100644 --- a/man/ped2com.Rd +++ b/man/ped2com.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/convertPedigree.R +% Please edit documentation in R/buildComponent.R \name{ped2com} \alias{ped2com} \title{Take a pedigree and turn it into a relatedness matrix} @@ -14,7 +14,7 @@ ped2com( flatten.diag = FALSE, standardize.colnames = TRUE, transpose_method = "tcrossprod", - adjacency_method = "indexed", + adjacency_method = "direct", isChild_method = "classic", saveable = FALSE, resume = FALSE, @@ -23,6 +23,7 @@ ped2com( save_rate_parlist = 1e+05 * save_rate, update_rate = 100, save_path = "checkpoint/", + adjBeta_method = NULL, ... ) } @@ -47,7 +48,7 @@ ped2com( \item{transpose_method}{character. The method to use for computing the transpose. Options are "tcrossprod", "crossprod", or "star"} -\item{adjacency_method}{character. The method to use for computing the adjacency matrix. Options are "loop" or "indexed"} +\item{adjacency_method}{character. The method to use for computing the adjacency matrix. Options are "loop", "indexed", "direct", or "beta"} \item{isChild_method}{character. The method to use for computing the isChild matrix. Options are "classic" or "partialparent"} @@ -65,6 +66,8 @@ ped2com( \item{save_path}{character. 
The path to save the checkpoint files} +\item{adjBeta_method}{numeric. The method to use for building the adjacency matrix when using the "beta" build} + \item{...}{additional arguments to be passed to \code{\link{ped2com}}} } \description{ diff --git a/man/ped2fam.Rd b/man/ped2fam.Rd index 4dbc9aa5..3052e568 100644 --- a/man/ped2fam.Rd +++ b/man/ped2fam.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/buildPedigree.R +% Please edit documentation in R/segmentPedigree.R \name{ped2fam} \alias{ped2fam} \title{Segment Pedigree into Extended Families} @@ -30,7 +30,8 @@ ped2fam( A pedigree dataset with one additional column for the newly created extended family ID } \description{ -This function adds an extended family ID variable to a pedigree by segmenting that dataset into independent extended families +This function adds an extended family ID variable to a pedigree by segmenting +that dataset into independent extended families using the weakly connected components algorithm. } \details{ diff --git a/man/ped2graph.Rd b/man/ped2graph.Rd index 13c191d7..5e7ac7b1 100755 --- a/man/ped2graph.Rd +++ b/man/ped2graph.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/buildPedigree.R +% Please edit documentation in R/segmentPedigree.R \name{ped2graph} \alias{ped2graph} \title{Turn a pedigree into a graph} diff --git a/man/ped2maternal.Rd b/man/ped2maternal.Rd index 88106fb6..03e02311 100755 --- a/man/ped2maternal.Rd +++ b/man/ped2maternal.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/buildPedigree.R +% Please edit documentation in R/segmentPedigree.R \name{ped2maternal} \alias{ped2maternal} \title{Add a maternal line ID variable to a pedigree} @@ -22,7 +22,7 @@ ped2maternal( \item{dadID}{character. Name of the column in ped for the father ID variable} -\item{matID}{Character. 
Maternal line ID variable to be created and added to the pedigree} +\item{matID}{Character. Maternal line ID variable to be created and added to the pedigree} \item{...}{additional arguments to be passed to \code{\link{ped2com}}} } diff --git a/man/ped2mit.Rd b/man/ped2mit.Rd index d11480d1..5b43fa5d 100644 --- a/man/ped2mit.Rd +++ b/man/ped2mit.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/convertPedigree.R +% Please edit documentation in R/buildComponent.R \name{ped2mit} \alias{ped2mit} \alias{ped2mt} @@ -19,7 +19,7 @@ ped2mit( resume = FALSE, save_rate = 5, save_rate_gen = save_rate, - save_rate_parlist = 1000 * save_rate, + save_rate_parlist = 1e+05 * save_rate, save_path = "checkpoint/", ... ) @@ -43,7 +43,7 @@ ped2mit( \item{transpose_method}{character. The method to use for computing the transpose. Options are "tcrossprod", "crossprod", or "star"} -\item{adjacency_method}{character. The method to use for computing the adjacency matrix. Options are "loop" or "indexed"} +\item{adjacency_method}{character. The method to use for computing the adjacency matrix. Options are "loop", "indexed", "direct", or "beta"} \item{saveable}{logical. 
If TRUE, save the intermediate results to disk} diff --git a/man/ped2paternal.Rd b/man/ped2paternal.Rd index 16a9e35a..e893ec03 100755 --- a/man/ped2paternal.Rd +++ b/man/ped2paternal.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/buildPedigree.R +% Please edit documentation in R/segmentPedigree.R \name{ped2paternal} \alias{ped2paternal} \title{Add a paternal line ID variable to a pedigree} diff --git a/man/populateParents.Rd b/man/populateParents.Rd new file mode 100644 index 00000000..f0871d6a --- /dev/null +++ b/man/populateParents.Rd @@ -0,0 +1,22 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/readWikifamilytree.R +\name{populateParents} +\alias{populateParents} +\title{Assign Parent} +\usage{ +populateParents(df, child, parent) +} +\arguments{ +\item{df}{A data frame containing the relationships.} + +\item{child}{The ID of the child.} + +\item{parent}{The ID of the parent.} +} +\value{ +A data frame with updated parent information. 
+} +\description{ +Assign Parent +} +\keyword{internal} diff --git a/man/postProcessGedcom.Rd b/man/postProcessGedcom.Rd new file mode 100644 index 00000000..62bbc50e --- /dev/null +++ b/man/postProcessGedcom.Rd @@ -0,0 +1,35 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/readGedcom.R +\name{postProcessGedcom} +\alias{postProcessGedcom} +\title{Post-process GEDCOM Data Frame} +\usage{ +postProcessGedcom( + df_temp, + remove_empty_cols = TRUE, + combine_cols = TRUE, + add_parents = TRUE, + skinny = TRUE, + verbose = FALSE +) +} +\arguments{ +\item{df_temp}{A data frame produced by \code{readGedcom()}.} + +\item{remove_empty_cols}{Logical indicating whether to remove columns that are entirely missing.} + +\item{combine_cols}{Logical indicating whether to combine columns with duplicate values.} + +\item{add_parents}{Logical indicating whether to add parent information.} + +\item{skinny}{Logical indicating whether to slim down the data frame.} + +\item{verbose}{Logical indicating whether to print progress messages.} +} +\value{ +The post-processed data frame. +} +\description{ +This function optionally adds parent information, combines duplicate columns, +and removes empty columns from the GEDCOM data frame. 
+} diff --git a/man/processEventLine.Rd b/man/processEventLine.Rd new file mode 100644 index 00000000..d4cff3d3 --- /dev/null +++ b/man/processEventLine.Rd @@ -0,0 +1,27 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/readGedcom.R +\name{processEventLine} +\alias{processEventLine} +\title{Process Event Lines (Birth or Death)} +\usage{ +processEventLine(event, block, i, record, pattern_rows) +} +\arguments{ +\item{event}{A character string indicating the event type ("birth" or "death").} + +\item{block}{A character vector of GEDCOM lines.} + +\item{i}{The current line index where the event tag is found.} + +\item{record}{A named list representing the individual's record.} + +\item{pattern_rows}{A list with counts of GEDCOM tag occurrences.} +} +\value{ +The updated record with parsed event information. +} +\description{ +Extracts event details (e.g., date, place, cause, latitude, longitude) from a block of GEDCOM lines. +For "birth": expect DATE on line i+1, PLAC on i+2, LATI on i+4, LONG on i+5. +For "death": expect DATE on line i+1, PLAC on i+2, CAUS on i+3, LATI on i+4, LONG on i+5. +} diff --git a/man/processParents.Rd b/man/processParents.Rd index e8077472..aec5deb3 100644 --- a/man/processParents.Rd +++ b/man/processParents.Rd @@ -1,18 +1,19 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/readPedigree.R +% Please edit documentation in R/readGedcom.R \name{processParents} \alias{processParents} -\title{Process parents information} +\title{Process Parents Information from GEDCOM Data} \usage{ processParents(df_temp, datasource) } \arguments{ -\item{df_temp}{A data frame containing information about individuals.} +\item{df_temp}{A data frame produced by \code{readGedcom()}.} + +\item{datasource}{Character string indicating the data source ("gedcom" or "wiki").} } \value{ -A data frame with added momID and dadID columns. +The updated data frame with parent IDs added. 
} \description{ -This function processes the dataframe to add momID and dadID columns. +Adds parent IDs to the individuals based on family relationship data. } -\keyword{internal} diff --git a/man/process_one.Rd b/man/process_one.Rd new file mode 100644 index 00000000..0d57de2b --- /dev/null +++ b/man/process_one.Rd @@ -0,0 +1,45 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/makeLinks.R +\name{process_one} +\alias{process_one} +\title{Convert Sparse Relationship Matrices to Kinship Links for one Matrix} +\usage{ +process_one( + matrix, + rel_name, + ids, + nc, + rel_pairs_file, + writetodisk, + write_buffer_size, + drop_upper_triangular, + update_rate, + verbose, + gc, + include_all_links = TRUE, + ... +) +} +\arguments{ +\item{rel_pairs_file}{File path to write related pairs to (CSV format).} + +\item{writetodisk}{Logical. If TRUE, writes the related pairs to disk; if FALSE, returns a data frame.} + +\item{write_buffer_size}{Number of related pairs to write to disk at a time.} + +\item{drop_upper_triangular}{Logical. If TRUE, drops the upper triangular portion of the matrix.} + +\item{update_rate}{Numeric. Frequency (in iterations) at which progress messages are printed.} + +\item{verbose}{Logical. If TRUE, prints progress messages.} + +\item{gc}{Logical. If TRUE, performs garbage collection via \code{\link{gc}} to free memory.} + +\item{include_all_links}{Logical. 
If TRUE, all links are included in the output.} + +\item{...}{Additional arguments to be passed to \code{\link{com2links}}} +} +\description{ +Convert Sparse Relationship Matrices to Kinship Links for one Matrix +} +\keyword{internal} diff --git a/man/process_tag.Rd b/man/process_tag.Rd new file mode 100644 index 00000000..14dfa65e --- /dev/null +++ b/man/process_tag.Rd @@ -0,0 +1,35 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/readGedcom.R +\name{process_tag} +\alias{process_tag} +\title{Process a GEDCOM Tag} +\usage{ +process_tag( + tag, + field_name, + pattern_rows, + line, + vars, + extractor = NULL, + mode = "replace" +) +} +\arguments{ +\item{tag}{The GEDCOM tag (e.g., "SEX", "CAST", etc.).} + +\item{field_name}{The name of the variable to assign to in `vars`.} + +\item{pattern_rows}{Output from `countPatternRows()`.} + +\item{line}{The GEDCOM line to parse.} + +\item{vars}{The current list of variables to update.} +} +\value{ +A list with updated `vars` and a `matched` flag. +} +\description{ +Extracts and assigns a value to a specified field in `vars` if the pattern is present. +Returns both the updated variable list and a flag indicating whether the tag was matched. +} +\keyword{internal} diff --git a/man/readGedcom.Rd b/man/readGedcom.Rd index a5136bc5..d9701427 100644 --- a/man/readGedcom.Rd +++ b/man/readGedcom.Rd @@ -1,7 +1,9 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/readPedigree.R +% Please edit documentation in R/readGedcom.R \name{readGedcom} \alias{readGedcom} +\alias{readGed} +\alias{readgedcom} \title{Read a GEDCOM File} \usage{ readGedcom( @@ -10,7 +12,34 @@ readGedcom( add_parents = TRUE, remove_empty_cols = TRUE, combine_cols = TRUE, - skinny = FALSE + skinny = FALSE, + update_rate = 1000, + post_process = TRUE, + ... 
+) + +readGed( + file_path, + verbose = FALSE, + add_parents = TRUE, + remove_empty_cols = TRUE, + combine_cols = TRUE, + skinny = FALSE, + update_rate = 1000, + post_process = TRUE, + ... +) + +readgedcom( + file_path, + verbose = FALSE, + add_parents = TRUE, + remove_empty_cols = TRUE, + combine_cols = TRUE, + skinny = FALSE, + update_rate = 1000, + post_process = TRUE, + ... ) } \arguments{ @@ -25,6 +54,12 @@ readGedcom( \item{combine_cols}{A logical value indicating whether to combine columns with duplicate values.} \item{skinny}{A logical value indicating whether to return a skinny data frame.} + +\item{update_rate}{numeric. The rate at which to print progress} + +\item{post_process}{A logical value indicating whether to post-process the data frame.} + +\item{...}{Additional arguments to be passed to the function.} } \value{ A data frame containing information about individuals, with the following potential columns: @@ -66,5 +101,4 @@ A data frame containing information about individuals, with the following potent } \description{ This function reads a GEDCOM file and parses it into a structured data frame of individuals. -Inspired by https://raw.githubusercontent.com/jjfitz/readgedcom/master/R/read_gedcom.R } diff --git a/man/readWikifamilytree.Rd b/man/readWikifamilytree.Rd index b06acf7b..58a64efd 100644 --- a/man/readWikifamilytree.Rd +++ b/man/readWikifamilytree.Rd @@ -1,13 +1,22 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/readPedigree.R +% Please edit documentation in R/readWikifamilytree.R \name{readWikifamilytree} \alias{readWikifamilytree} \title{Read Wiki Family Tree} \usage{ -readWikifamilytree(text) +readWikifamilytree(text = NULL, verbose = FALSE, file_path = NULL, ...) 
} \arguments{ \item{text}{A character string containing the text of a family tree in wiki format.} + +\item{verbose}{A logical value indicating whether to print messages.} + +\item{file_path}{The path to the file containing the family tree.} + +\item{...}{Additional arguments (not used).} +} +\value{ +A list containing the summary, members, structure, and relationships of the family tree. } \description{ Read Wiki Family Tree diff --git a/man/related_coef.Rd b/man/related_coef.Rd deleted file mode 100644 index 11112982..00000000 --- a/man/related_coef.Rd +++ /dev/null @@ -1,33 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/helpGeneric.R -\name{related_coef} -\alias{related_coef} -\title{related_coef (Deprecated)} -\usage{ -related_coef(...) -} -\arguments{ -\item{...}{Arguments to be passed to `calculateRelatedness`.} -} -\value{ -The same result as calling `calculateRelatedness`. -} -\description{ -When calling this function, a warning will be issued about its deprecation. -} -\details{ -This function is a wrapper around the new `calculateRelatedness` function. -`related_coef` has been deprecated, and it's advised to use `calculateRelatedness` directly. -} -\examples{ -\dontrun{ -# This is an example of the deprecated function: -related_coef(...) -# It is recommended to use: -calculateRelatedness(...) -} -} -\seealso{ -\code{\link{calculateRelatedness}} for the updated function. -} -\keyword{deprecated} diff --git a/man/relatedness.Rd b/man/relatedness.Rd deleted file mode 100644 index f84740df..00000000 --- a/man/relatedness.Rd +++ /dev/null @@ -1,33 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/helpGeneric.R -\name{relatedness} -\alias{relatedness} -\title{relatedness (Deprecated)} -\usage{ -relatedness(...) -} -\arguments{ -\item{...}{Arguments to be passed to `inferRelatedness`.} -} -\value{ -The same result as calling `inferRelatedness`. 
-} -\description{ -When calling this function, a warning will be issued about its deprecation. -} -\details{ -This function is a wrapper around the new `inferRelatedness` function. -`relatedness` has been deprecated, and it's advised to use `inferRelatedness` directly. -} -\examples{ -\dontrun{ -# This is an example of the deprecated function: -relatedness(...) -# It is recommended to use: -inferRelatedness(...) -} -} -\seealso{ -\code{\link{inferRelatedness}} for the updated function. -} -\keyword{deprecated} diff --git a/man/repairIDs.Rd b/man/repairIDs.Rd index 2ce49f7a..cc37d78f 100644 --- a/man/repairIDs.Rd +++ b/man/repairIDs.Rd @@ -7,9 +7,9 @@ repairIDs(ped, verbose = FALSE) } \arguments{ -\item{ped}{A pedigree object} +\item{ped}{A dataframe representing the pedigree data with columns `ID`, `dadID`, and `momID`.} -\item{verbose}{A logical indicating whether to print progress messages} +\item{verbose}{A logical flag indicating whether to print progress and validation messages to the console.} } \value{ A corrected pedigree diff --git a/man/simulatePedigree.Rd b/man/simulatePedigree.Rd index a2e07b23..b7c28650 100644 --- a/man/simulatePedigree.Rd +++ b/man/simulatePedigree.Rd @@ -2,6 +2,7 @@ % Please edit documentation in R/simulatePedigree.R \name{simulatePedigree} \alias{simulatePedigree} +\alias{SimPed} \title{Simulate Pedigrees This function simulates "balanced" pedigrees based on a group of parameters: 1) k - Kids per couple; @@ -19,6 +20,8 @@ simulatePedigree( balancedMar = TRUE, verbose = FALSE ) + +SimPed(...) } \arguments{ \item{kpc}{Number of kids per couple. An integer >= 2 that determines how many kids each fertilized mated couple will have in the pedigree. Default value is 3. Returns an error when kpc equals 1.} @@ -36,6 +39,8 @@ simulatePedigree( \item{balancedMar}{Not fully developed yet. 
Always \code{TRUE} in the current version.} \item{verbose}{logical If TRUE, print progress through stages of algorithm} + +\item{...}{Additional arguments to be passed to other functions.} } \value{ A \code{data.frame} with each row representing a simulated individual. The columns are as follows: diff --git a/man/splitIndividuals.Rd b/man/splitIndividuals.Rd new file mode 100644 index 00000000..dcb9a7e8 --- /dev/null +++ b/man/splitIndividuals.Rd @@ -0,0 +1,20 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/readGedcom.R +\name{splitIndividuals} +\alias{splitIndividuals} +\title{Split GEDCOM Lines into Individual Blocks} +\usage{ +splitIndividuals(lines, verbose = FALSE) +} +\arguments{ +\item{lines}{A character vector of lines from the GEDCOM file.} + +\item{verbose}{Logical indicating whether to output progress messages.} +} +\value{ +A list of character vectors, each representing one individual. +} +\description{ +This function partitions the GEDCOM file (as a vector of lines) into a list of blocks, +where each block corresponds to a single individual starting with an "@ INDI" line. +} diff --git a/man/summarizeFamilies.Rd b/man/summarizeFamilies.Rd index 54fc978d..2cb50761 100644 --- a/man/summarizeFamilies.Rd +++ b/man/summarizeFamilies.Rd @@ -2,6 +2,7 @@ % Please edit documentation in R/summarizePedigree.R \name{summarizeFamilies} \alias{summarizeFamilies} +\alias{summariseFamilies} \title{Summarize the families in a pedigree} \usage{ summarizeFamilies( @@ -21,6 +22,24 @@ summarizeFamilies( five_num_summary = FALSE, verbose = FALSE ) + +summariseFamilies( + ped, + famID = "famID", + personID = "ID", + momID = "momID", + dadID = "dadID", + matID = "matID", + patID = "patID", + byr = NULL, + founder_sort_var = NULL, + include_founder = FALSE, + nbiggest = 5, + noldest = 5, + skip_var = NULL, + five_num_summary = FALSE, + verbose = FALSE +) } \arguments{ \item{ped}{a pedigree dataset. 
Needs ID, momID, and dadID columns} @@ -33,7 +52,7 @@ summarizeFamilies( \item{dadID}{character. Name of the column in ped for the father ID variable} -\item{matID}{Character. Maternal line ID variable to be created and added to the pedigree} +\item{matID}{Character. Maternal line ID variable to be created and added to the pedigree} \item{patID}{Character. Paternal line ID variable to be created and added to the pedigree} diff --git a/man/summarizeFounder.Rd b/man/summarizeFounder.Rd new file mode 100644 index 00000000..703834e3 --- /dev/null +++ b/man/summarizeFounder.Rd @@ -0,0 +1,17 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/summarizePedigree.R +\name{summarizeFounder} +\alias{summarizeFounder} +\title{Function to summarize the originating members for each line} +\usage{ +summarizeFounder(ped_dt, group_var, sort_var, foo_summary_dt, verbose) +} +\arguments{ +\item{sort_var}{A character string specifying the column name to sort by.} + +\item{verbose}{Logical, if TRUE, print progress messages.} +} +\description{ +This function summarizes the originating members for each line in a pedigree. It is supposed to be used internally by the \code{summarize_pedigree} function. 
+} +\keyword{internal} diff --git a/man/summarizeMatrilines.Rd b/man/summarizeMatrilines.Rd index 1930858a..577204f6 100644 --- a/man/summarizeMatrilines.Rd +++ b/man/summarizeMatrilines.Rd @@ -2,6 +2,7 @@ % Please edit documentation in R/summarizePedigree.R \name{summarizeMatrilines} \alias{summarizeMatrilines} +\alias{summariseMatrilines} \title{Summarize the maternal lines in a pedigree} \usage{ summarizeMatrilines( @@ -21,6 +22,24 @@ summarizeMatrilines( five_num_summary = FALSE, verbose = FALSE ) + +summariseMatrilines( + ped, + famID = "famID", + personID = "ID", + momID = "momID", + dadID = "dadID", + matID = "matID", + patID = "patID", + byr = NULL, + include_founder = FALSE, + founder_sort_var = NULL, + nbiggest = 5, + noldest = 5, + skip_var = NULL, + five_num_summary = FALSE, + verbose = FALSE +) } \arguments{ \item{ped}{a pedigree dataset. Needs ID, momID, and dadID columns} @@ -33,7 +52,7 @@ summarizeMatrilines( \item{dadID}{character. Name of the column in ped for the father ID variable} -\item{matID}{Character. Maternal line ID variable to be created and added to the pedigree} +\item{matID}{Character. Maternal line ID variable to be created and added to the pedigree} \item{patID}{Character. 
Paternal line ID variable to be created and added to the pedigree} diff --git a/man/summarizePatrilines.Rd b/man/summarizePatrilines.Rd index 846b0e4b..27fd9494 100644 --- a/man/summarizePatrilines.Rd +++ b/man/summarizePatrilines.Rd @@ -2,6 +2,7 @@ % Please edit documentation in R/summarizePedigree.R \name{summarizePatrilines} \alias{summarizePatrilines} +\alias{summarisePatrilines} \title{Summarize the paternal lines in a pedigree} \usage{ summarizePatrilines( @@ -21,6 +22,24 @@ summarizePatrilines( five_num_summary = FALSE, verbose = FALSE ) + +summarisePatrilines( + ped, + famID = "famID", + personID = "ID", + momID = "momID", + dadID = "dadID", + matID = "matID", + patID = "patID", + byr = NULL, + founder_sort_var = NULL, + include_founder = FALSE, + nbiggest = 5, + noldest = 5, + skip_var = NULL, + five_num_summary = FALSE, + verbose = FALSE +) } \arguments{ \item{ped}{a pedigree dataset. Needs ID, momID, and dadID columns} @@ -33,7 +52,7 @@ summarizePatrilines( \item{dadID}{character. Name of the column in ped for the father ID variable} -\item{matID}{Character. Maternal line ID variable to be created and added to the pedigree} +\item{matID}{Character. Maternal line ID variable to be created and added to the pedigree} \item{patID}{Character. 
Paternal line ID variable to be created and added to the pedigree} diff --git a/man/summarizePedigrees.Rd b/man/summarizePedigrees.Rd index 1a076fdc..a4f6a6e9 100644 --- a/man/summarizePedigrees.Rd +++ b/man/summarizePedigrees.Rd @@ -2,6 +2,7 @@ % Please edit documentation in R/summarizePedigree.R \name{summarizePedigrees} \alias{summarizePedigrees} +\alias{summarisePedigrees} \title{Summarize Pedigree Data} \usage{ summarizePedigrees( @@ -17,7 +18,27 @@ summarizePedigrees( include_founder = FALSE, founder_sort_var = NULL, nbiggest = 5, - noldest = 5, + noldest = nbiggest, + skip_var = NULL, + five_num_summary = FALSE, + network_checks = FALSE, + verbose = FALSE +) + +summarisePedigrees( + ped, + famID = "famID", + personID = "ID", + momID = "momID", + dadID = "dadID", + matID = "matID", + patID = "patID", + type = c("fathers", "mothers", "families"), + byr = NULL, + include_founder = FALSE, + founder_sort_var = NULL, + nbiggest = 5, + noldest = nbiggest, skip_var = NULL, five_num_summary = FALSE, network_checks = FALSE, @@ -35,7 +56,7 @@ summarizePedigrees( \item{dadID}{character. Name of the column in ped for the father ID variable} -\item{matID}{Character. Maternal line ID variable to be created and added to the pedigree} +\item{matID}{Character. Maternal line ID variable to be created and added to the pedigree} \item{patID}{Character. 
Paternal line ID variable to be created and added to the pedigree} diff --git a/man/traceTreePaths.Rd b/man/traceTreePaths.Rd new file mode 100644 index 00000000..5522ed83 --- /dev/null +++ b/man/traceTreePaths.Rd @@ -0,0 +1,19 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/readWikifamilytree.R +\name{traceTreePaths} +\alias{traceTreePaths} +\title{Trace paths between individuals in a family tree grid} +\usage{ +traceTreePaths(tree_long, deduplicate = TRUE) +} +\arguments{ +\item{tree_long}{A data.frame with columns: Row, Column, Value, id} + +\item{deduplicate}{Logical, if TRUE, will remove duplicate paths} +} +\value{ +A data.frame with columns: from_id, to_id, direction, path_length, intermediates +} +\description{ +Trace paths between individuals in a family tree grid +} diff --git a/man/try_na.Rd b/man/tryNA.Rd similarity index 89% rename from man/try_na.Rd rename to man/tryNA.Rd index 0f8fddeb..388fa600 100644 --- a/man/try_na.Rd +++ b/man/tryNA.Rd @@ -1,9 +1,12 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/helpGeneric.R -\name{try_na} +\name{tryNA} +\alias{tryNA} \alias{try_na} \title{modified tryCatch function} \usage{ +tryNA(x) + try_na(x) } \arguments{ diff --git a/man/validate_and_convert_matrix.Rd b/man/validate_and_convert_matrix.Rd new file mode 100644 index 00000000..f6bf5c2a --- /dev/null +++ b/man/validate_and_convert_matrix.Rd @@ -0,0 +1,28 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/makeLinks.R +\name{validate_and_convert_matrix} +\alias{validate_and_convert_matrix} +\title{validate_and_convert_matrix} +\usage{ +validate_and_convert_matrix( + mat, + name, + ensure_symmetric = FALSE, + force_binary = FALSE +) +} +\arguments{ +\item{mat}{The matrix to be validated and converted.} + +\item{name}{The name of the matrix for error messages.} + +\item{ensure_symmetric}{Logical indicating whether to ensure the matrix is symmetric.} + 
+\item{force_binary}{Logical indicating whether to force the matrix to be binary.} +} +\value{ +The validated and converted matrix. +} +\description{ +This function validates and converts a matrix to a specific format. +} diff --git a/tests/testthat/test-calculateFamilySize.R b/tests/testthat/test-calculateFamilySize.R index b6e32c7d..f648c8fd 100644 --- a/tests/testthat/test-calculateFamilySize.R +++ b/tests/testthat/test-calculateFamilySize.R @@ -65,13 +65,12 @@ test_that("allGens behaves with Ngens", { result <- allGens(kpc = kpc, Ngen = Ngen, marR = marR) - expect_equal(result, 2) + expect_equal(result, 2) Ngen <- 0 - expect_error( allGens(kpc = kpc, Ngen = Ngen, marR = marR)) + expect_error(allGens(kpc = kpc, Ngen = Ngen, marR = marR)) - expect_error(allGens(kpc = kpc, Ngen= NULL, marR = marR)) - expect_error(allGens(kpc = kpc, Ngen= -1, marR = marR)) + expect_error(allGens(kpc = kpc, Ngen = NULL, marR = marR)) + expect_error(allGens(kpc = kpc, Ngen = -1, marR = marR)) }) - diff --git a/tests/testthat/test-checkParents.R b/tests/testthat/test-checkParents.R index 86776577..50916289 100644 --- a/tests/testthat/test-checkParents.R +++ b/tests/testthat/test-checkParents.R @@ -1,29 +1,27 @@ # Test Case 1: Validate sex coding without repair test_that("checkParentIDs identifies parent coding correctly in potter dataset", { - results <- checkParentIDs(potter,verbose = TRUE, repair = FALSE) + results <- checkParentIDs(potter, verbose = TRUE, repair = FALSE) expect_false("parents_in_both" %in% names(results)) expect_false(results$single_parents) expect_false(results$missing_parents) expect_true(results$female_moms) expect_true(results$male_dads) - expect_equal(results$female_var,0) - expect_equal(results$male_var,1) - expect_equal(results$mom_sex,0) - expect_equal(results$dad_sex,1) + expect_equal(results$female_var, 0) + expect_equal(results$male_var, 1) + expect_equal(results$mom_sex, 0) + expect_equal(results$dad_sex, 1) }) # Test Case 2: Validate sex coding without 
repair test_that("checksif single parents found correctly in ASOIAF dataset", { data(ASOIAF) df_asoiaf <- ASOIAF - results <- checkParentIDs(df_asoiaf,verbose = FALSE, repair = FALSE) + results <- checkParentIDs(df_asoiaf, verbose = FALSE, repair = FALSE) expect_true(results$single_parents) - single_dads <- length(df_asoiaf$id[!is.na(df_asoiaf$dadID)&is.na(df_asoiaf$momID)]) - single_moms <- length(df_asoiaf$id[is.na(df_asoiaf$dadID)&!is.na(df_asoiaf$momID)]) - expect_equal(single_moms, length(results$missing_fathers)) - expect_equal(single_dads, length(results$missing_mothers)) - repaired_df <- checkParentIDs(df_asoiaf,verbose = FALSE, repair = TRUE) - expect_equal(nrow(repaired_df), nrow(df_asoiaf)+single_moms+single_dads) - + single_dads <- length(df_asoiaf$id[!is.na(df_asoiaf$dadID) & is.na(df_asoiaf$momID)]) + single_moms <- length(df_asoiaf$id[is.na(df_asoiaf$dadID) & !is.na(df_asoiaf$momID)]) + expect_equal(single_moms, length(results$missing_fathers)) + expect_equal(single_dads, length(results$missing_mothers)) + repaired_df <- checkParentIDs(df_asoiaf, verbose = FALSE, repair = TRUE, parentswithoutrow = TRUE) + expect_equal(nrow(repaired_df), nrow(df_asoiaf) + single_moms + single_dads) }) - diff --git a/tests/testthat/test-computeRelatedness.R b/tests/testthat/test-computeRelatedness.R index 10b0de29..192df30b 100644 --- a/tests/testthat/test-computeRelatedness.R +++ b/tests/testthat/test-computeRelatedness.R @@ -25,6 +25,17 @@ test_that("calculateRelatedness function with empirical", { }) +# Test 7: empirical divide by zero + +test_that("calculateH handles divide by zero for empirical", { + expect_error( + calculateRelatedness( + generations = 2, + empirical = TRUE, total_a = 0, + total_m = 0 + ) + ) +}) test_that("inferRelatedness performs as expected", { result <- inferRelatedness(0, aceA = .9, aceC = 0, sharedC = 0) expect_equal(result, 0) diff --git a/tests/testthat/test-convertPedigree.R b/tests/testthat/test-convertPedigree.R index 
fb6e06fe..00139c58 100644 --- a/tests/testthat/test-convertPedigree.R +++ b/tests/testthat/test-convertPedigree.R @@ -1,7 +1,7 @@ test_that("ped2add produces correct matrix dims, values, and dimnames for hazard", { tolerance <- 1e-10 data(hazard) - add <- ped2add(hazard,sparse = FALSE) + add <- ped2add(hazard, sparse = FALSE) # Check dimension expect_equal(dim(add), c(nrow(hazard), nrow(hazard))) # Check several values @@ -21,7 +21,7 @@ test_that("ped2add produces correct matrix dims, values, and dimnames for hazard test_that("ped2add produces correct matrix dims, values, and dimnames for alternative transpose", { tolerance <- 1e-10 data(hazard) - add <- ped2add(hazard, tcross.alt.crossprod = TRUE,sparse = FALSE) + add <- ped2add(hazard, tcross.alt.crossprod = TRUE, sparse = FALSE) # Check dimension expect_equal(dim(add), c(nrow(hazard), nrow(hazard)), tolerance = tolerance) # Check several values @@ -37,12 +37,12 @@ test_that("ped2add produces correct matrix dims, values, and dimnames for altern expect_equal(dn[[1]], dn[[2]]) expect_equal(dn[[1]], as.character(hazard$ID)) }) -# to do, combine the sets that are equalivant. shouldn't need to run 1000 expect equals +# to do, combine the sets that are equivalent. 
shouldn't need to run 1000 expect equals test_that("ped2add produces correct matrix dims, values, and dimnames for inbreeding data", { tolerance <- 1e-10 data(inbreeding) - add <- ped2add(inbreeding,sparse = FALSE) + add <- ped2add(inbreeding, sparse = FALSE) # Check dimension expect_equal(dim(add), c(nrow(inbreeding), nrow(inbreeding)), tolerance = tolerance) # Check several values @@ -62,7 +62,7 @@ test_that("ped2add produces correct matrix dims, values, and dimnames for inbree test_that("ped2add produces correct matrix dims, values, and dimnames for inbreeding data with alternative transpose", { tolerance <- 1e-10 data(inbreeding) - add <- ped2add(inbreeding, transpose_method = "star",sparse = FALSE) + add <- ped2add(inbreeding, transpose_method = "star", sparse = FALSE) # Check dimension expect_equal(dim(add), c(nrow(inbreeding), nrow(inbreeding))) # Check several values @@ -81,7 +81,7 @@ test_that("ped2add produces correct matrix dims, values, and dimnames for inbree test_that("ped2add produces correct matrix dims, values, and dimnames for inbreeding data with 2nd alternative transpose", { tolerance <- 1e-10 data(inbreeding) - add <- ped2add(inbreeding, transpose_method = "crossprod",sparse = FALSE) + add <- ped2add(inbreeding, transpose_method = "crossprod", sparse = FALSE) # Check dimension expect_equal(dim(add), c(nrow(inbreeding), nrow(inbreeding))) # Check several values @@ -101,7 +101,7 @@ test_that("ped2add produces correct matrix dims, values, and dimnames for inbree test_that("ped2add flattens diagonal for inbreeding data", { tolerance <- 1e-10 data(inbreeding) - add <- ped2add(inbreeding, flatten.diag = TRUE,sparse = FALSE) + add <- ped2add(inbreeding, flatten.diag = TRUE, sparse = FALSE) # Check dimension expect_equal(dim(add), c(nrow(inbreeding), nrow(inbreeding)), tolerance = tolerance) # Check several values @@ -121,7 +121,7 @@ test_that("ped2mit produces correct matrix dims, values, and dimnames for inbree tolerance <- 1e-10 # Check dimension 
data(inbreeding) - mit <- ped2mit(inbreeding,sparse = FALSE) + mit <- ped2mit(inbreeding, sparse = FALSE) # Check dimension expect_equal(dim(mit), c(nrow(inbreeding), nrow(inbreeding))) # Check several values @@ -142,7 +142,7 @@ test_that("ped2mit produces correct matrix dims, values, and dimnames for inbree tolerance <- 1e-10 # Check dimension data(inbreeding) - mit <- ped2mit(inbreeding,sparse = FALSE) + mit <- ped2mit(inbreeding, sparse = FALSE) # Check dimension expect_equal(dim(mit), c(nrow(inbreeding), nrow(inbreeding)), tolerance = tolerance) # Check several values @@ -164,7 +164,7 @@ test_that("ped2cn produces correct matrix dims, values, and dimnames", { # Check dimension data(inbreeding) - cn <- ped2cn(inbreeding,sparse = FALSE) + cn <- ped2cn(inbreeding, sparse = FALSE) expect_equal(dim(cn), c( nrow(inbreeding), nrow(inbreeding) @@ -189,7 +189,7 @@ test_that("ped2cn produces correct matrix dims, values, and dimnames", { test_that("ped2ce produces correct matrix dims, values, and dimnames", { tolerance <- 1e-10 data(inbreeding) - ce <- ped2ce(inbreeding,sparse = FALSE) + ce <- ped2ce(inbreeding, sparse = FALSE) expect_equal(dim(ce), c(nrow(inbreeding), nrow(inbreeding)), tolerance = tolerance) # Check several values # expect_true(all(diag(ce) == 1)) @@ -207,7 +207,7 @@ test_that("ped2ce produces correct matrix dims, values, and dimnames", { test_that("ped2add verbose prints updates", { data(hazard) - expect_output(ped2add(hazard, verbose = TRUE,sparse = FALSE), regexp = "Family Size =") + expect_output(ped2add(hazard, verbose = TRUE, sparse = FALSE), regexp = "Family Size =") }) @@ -246,6 +246,7 @@ test_that("ped2com handles checkpoint saving and resuming", { gen_checkpoint = file.path(save_path, "gen_checkpoint.rds"), newIsPar_checkpoint = file.path(save_path, "newIsPar_checkpoint.rds"), mtSum_checkpoint = file.path(save_path, "mtSum_checkpoint.rds"), + ram_checkpoint = file.path(save_path, "ram_checkpoint.rds"), r2_checkpoint = file.path(save_path, 
"r2_checkpoint.rds"), tcrossprod_checkpoint = file.path(save_path, "tcrossprod_checkpoint.rds"), count_checkpoint = file.path(save_path, "count_checkpoint.rds"), @@ -256,7 +257,6 @@ test_that("ped2com handles checkpoint saving and resuming", { checkpoint_files_v1 <- list.files(save_path, pattern = "\\.rds$", full.names = TRUE) expect_equal(length(checkpoint_files_v1), length(checkpoint_files_v0)) - # Resume from checkpoint resumed_matrix <- ped2com(hazard, component = "additive", resume = TRUE, save_path = save_path, @@ -297,14 +297,26 @@ test_that("adjacency_method 'indexed', 'loop', and direct produce the same resu test_that("adjacency_method 'indexed', 'loop', and direct produce the same results for common nuclear matrix", { data(hazard) tolerance <- 1e-10 - + adjBeta_method_1 <- 1 + adjBeta_method_2 <- 4 + adjBeta_method_3 <- 5 # common nuclear ped_common_indexed <- ped2com(hazard, component = "common nuclear", adjacency_method = "indexed") ped_common_loop <- ped2com(hazard, component = "common nuclear", adjacency_method = "loop") - # ped_common_direct <- ped2com(hazard, component = "common nuclear", adjacency_method = "direct") + ped_common_direct <- ped2com(hazard, component = "common nuclear", adjacency_method = "direct") + ped_common_adjBeta_1 <- ped2com(hazard, component = "common nuclear", adjacency_method = "beta", adjBeta_method = adjBeta_method_2) + ped_common_adjBeta_2 <- ped2com(hazard, component = "common nuclear", adjacency_method = "beta", adjBeta_method = adjBeta_method_3) + expect_equal(ped_common_indexed, ped_common_loop, tolerance = tolerance) - # expect_equal(ped_common_loop, ped_common_direct, tolerance = tolerance) - # expect_equal(ped_common_indexed, ped_common_direct, tolerance = tolerance) + expect_equal(ped_common_loop, ped_common_direct, tolerance = tolerance) + expect_equal(ped_common_indexed, ped_common_direct, tolerance = tolerance) + expect_equal(ped_common_loop, ped_common_adjBeta_1, tolerance = tolerance) + 
expect_equal(ped_common_indexed, ped_common_adjBeta_1, tolerance = tolerance) + expect_equal(ped_common_loop, ped_common_adjBeta_2, tolerance = tolerance) + expect_equal(ped_common_indexed, ped_common_adjBeta_2, tolerance = tolerance) + expect_equal(ped_common_direct, ped_common_adjBeta_1, tolerance = tolerance) + expect_equal(ped_common_direct, ped_common_adjBeta_2, tolerance = tolerance) + expect_equal(ped_common_adjBeta_1, ped_common_adjBeta_2, tolerance = tolerance) }) diff --git a/tests/testthat/test-makeLinks.R b/tests/testthat/test-makeLinks.R index 2edb8377..7d5135f7 100644 --- a/tests/testthat/test-makeLinks.R +++ b/tests/testthat/test-makeLinks.R @@ -1,16 +1,15 @@ test_that("com2links handles missing matrices properly", { expect_error( com2links(ad_ped_matrix = NULL, mit_ped_matrix = NULL, cn_ped_matrix = NULL), - "At least one of 'ped_matrix', 'mit_ped_matrix', or 'cn_ped_matrix' must be provided." + "At least one of 'ad_ped_matrix', 'mit_ped_matrix', or 'cn_ped_matrix' must be provided." 
) }) - test_that("com2links rejects invalid matrix types", { fake_matrix <- data.frame(A = c(1, 2), B = c(3, 4)) - expect_error(com2links(ad_ped_matrix = fake_matrix), "The 'ad_ped_matrix' must be a matrix or dgCMatrix.") + expect_error(com2links(ad_ped_matrix = fake_matrix), "The 'ad_ped_matrix' must be a matrix or generalMatrix") }) test_that("com2links produces correct output with a single relationship matrix (hazard dataset)", { @@ -25,6 +24,18 @@ test_that("com2links produces correct output with a single relationship matrix ( expect_true(all(result$addRel >= 0)) # Relatedness values should be non-negative }) +test_that("com2links produces correct output with cn_ped_matrix", { + data(ASOIAF) + cn_ped_matrix <- ped2cn(ASOIAF, sparse = TRUE) + + result <- com2links(cn_ped_matrix = cn_ped_matrix, writetodisk = FALSE) + + expect_true(is.data.frame(result)) + expect_true(all(c("ID1", "ID2", "cnuRel") %in% colnames(result))) + expect_equal(ncol(result), 3) # Expect ID1, ID2, and addRel + expect_true(all(result$cnRel >= 0)) # Relatedness values should be non-negative +}) + test_that("com2links produces correct output with mt_ped_matrix", { data(hazard) mit_ped_matrix <- ped2mit(hazard, sparse = TRUE) @@ -34,7 +45,7 @@ test_that("com2links produces correct output with mt_ped_matrix", { expect_true(is.data.frame(result)) expect_true(all(c("ID1", "ID2", "mitRel") %in% colnames(result))) expect_equal(ncol(result), 3) # Expect ID1, ID2, and addRel - expect_true(all(result$addRel >= 0)) # Relatedness values should be non-negative + expect_true(all(result$mitRel %in% c(0, 1))) # Mitochondrial should be binary }) test_that("com2links processes multiple matrices correctly (hazard dataset)", { @@ -53,7 +64,46 @@ test_that("com2links processes multiple matrices correctly (hazard dataset)", { expect_true(all(result$cnuRel >= 0)) }) +test_that("com2links processes creates same length for cn with 3, 2, and 1 matrices are used", { + data(hazard) + ad_ped_matrix <- ped2add(hazard, 
sparse = TRUE) + mit_ped_matrix <- ped2mit(hazard, sparse = TRUE) + cn_ped_matrix <- ped2cn(hazard, sparse = TRUE) + result3 <- com2links(ad_ped_matrix = ad_ped_matrix, mit_ped_matrix = mit_ped_matrix, cn_ped_matrix = cn_ped_matrix, writetodisk = FALSE) + + expect_true(is.data.frame(result3)) + expect_true(all(c("ID1", "ID2", "addRel", "mitRel", "cnuRel") %in% colnames(result3))) + expect_equal(ncol(result3), 5) # Expect ID1, ID2, addRel, mitRel, and cnuRel + expect_true(all(result3$addRel >= 0)) + expect_true(all(result3$mitRel %in% c(0, 1))) # Mitochondrial should be binary + expect_true(all(result3$cnuRel >= 0)) + + result2 <- com2links(ad_ped_matrix = ad_ped_matrix, cn_ped_matrix = cn_ped_matrix, writetodisk = FALSE) + expect_true(is.data.frame(result2)) + expect_true(all(c("ID1", "ID2", "addRel", "cnuRel") %in% colnames(result2))) + expect_equal(ncol(result2), 4) # Expect ID1, ID2, addRel, and cnuRel + expect_true(all(result2$addRel >= 0)) + expect_true(all(result2$cnuRel >= 0)) + + expect_equal(result3$cnuRel, result2$cnuRel) + + result1 <- com2links(cn_ped_matrix = cn_ped_matrix, writetodisk = FALSE) + result1_legacy <- .com2links.legacy(cn_ped_matrix = cn_ped_matrix, writetodisk = FALSE) + expect_true(is.data.frame(result1)) + expect_true(is.data.frame(result1_legacy)) + expect_true(all(c("ID1", "ID2", "cnuRel") %in% colnames(result1))) + expect_true(all(c("ID1", "ID2", "cnuRel") %in% colnames(result1_legacy))) + expect_equal(ncol(result1), 3) # Expect ID1, ID2, and cnuRel + expect_equal(ncol(result1_legacy), 3) # Expect ID1, ID2, and cnuRel + expect_true(all(result1$cnuRel >= 0)) + expect_true(all(result1_legacy$cnuRel >= 0)) + expect_equal(result3$cnuRel[result3$cnuRel == 1], result1$cnuRel[result1$cnuRel == 1]) + expect_equal(result3$cnuRel[result3$cnuRel == 1], result1_legacy$cnuRel[result1_legacy$cnuRel == 1]) + expect_equal(result2$cnuRel[result2$cnuRel == 1], result1$cnuRel[result1$cnuRel == 1]) + expect_equal(result2$cnuRel[result2$cnuRel == 1], 
result1_legacy$cnuRel[result1_legacy$cnuRel == 1]) + expect_equal(result1$cnuRel[result1$cnuRel == 1], result1_legacy$cnuRel[result1_legacy$cnuRel == 1]) +}) test_that("com2links written version matchs", { data(hazard) ad_ped_matrix <- ped2com(hazard, component = "additive", adjacency_method = "direct", sparse = TRUE) @@ -95,7 +145,7 @@ test_that("com2links legacy works", { mit_ped_matrix <- ped2com(hazard, component = "mitochondrial", adjacency_method = "direct", sparse = TRUE) cn_ped_matrix <- ped2com(hazard, component = "common nuclear", adjacency_method = "indexed", sparse = TRUE) - resultlegacy <- com2links( + resultlegacy <- .com2links.legacy( ad_ped_matrix = ad_ped_matrix, mit_ped_matrix = mit_ped_matrix, cn_ped_matrix = cn_ped_matrix, legacy = TRUE @@ -108,22 +158,112 @@ test_that("com2links legacy works", { expect_true(all(c("ID1", "ID2", "addRel", "mitRel", "cnuRel") %in% colnames(written_data))) - result <- com2links( + + result_beta <- com2links( + ad_ped_matrix = ad_ped_matrix, + mit_ped_matrix = mit_ped_matrix, cn_ped_matrix = cn_ped_matrix, + writetodisk = FALSE + ) + + expect_true(is.data.frame(result_beta)) + expect_true(all(c("ID1", "ID2", "addRel", "mitRel", "cnuRel") %in% colnames(result_beta))) + + + result <- .com2links.legacy( ad_ped_matrix = ad_ped_matrix, mit_ped_matrix = mit_ped_matrix, cn_ped_matrix = cn_ped_matrix, writetodisk = FALSE ) expect_true(is.data.frame(result)) + expect_true(all(c("ID1", "ID2", "addRel", "mitRel", "cnuRel") %in% colnames(result))) # Drop row names to avoid mismatches in expect_equal rownames(result) <- NULL rownames(written_data) <- NULL + rownames(result_beta) <- NULL # Final comparison between written versions expect_equal(written_data, result) + expect_equal(result_beta, result) }) +test_that("com2links beta works", { + data(hazard) + ad_ped_matrix <- ped2com(hazard, component = "additive", adjacency_method = "direct", sparse = TRUE) + mit_ped_matrix <- ped2com(hazard, component = "mitochondrial", 
adjacency_method = "direct", sparse = TRUE) + cn_ped_matrix <- ped2com(hazard, component = "common nuclear", adjacency_method = "indexed", sparse = TRUE) + + # compare 2 + result_beta <- com2links( + ad_ped_matrix = ad_ped_matrix, + mit_ped_matrix = mit_ped_matrix, + writetodisk = FALSE + ) + + expect_true(is.data.frame(result_beta)) + expect_true(all(c("ID1", "ID2", "addRel", "mitRel") %in% colnames(result_beta))) + + + result <- .com2links.legacy( + ad_ped_matrix = ad_ped_matrix, + mit_ped_matrix = mit_ped_matrix, + writetodisk = FALSE + ) + + expect_true(is.data.frame(result)) + expect_true(all(c("ID1", "ID2", "addRel", "mitRel") %in% colnames(result))) + # Drop row names to avoid mismatches in expect_equal + rownames(result) <- NULL + rownames(result_beta) <- NULL + + # Final comparison between versions + expect_equal(result_beta, result) + + + # write to disk + result_disk <- com2links( + ad_ped_matrix = ad_ped_matrix, + mit_ped_matrix = mit_ped_matrix, + writetodisk = TRUE + ) + expect_true(file.exists("dataRelatedPairs.csv")) + written_data <- read.csv("dataRelatedPairs.csv") + # remove the file + expect_true(file.remove("dataRelatedPairs.csv")) + + expect_true(all(c("ID1", "ID2", "addRel", "mitRel") %in% colnames(written_data))) + rownames(written_data) <- NULL + expect_equal(result_beta, written_data) + expect_equal(result, written_data) + # compare 1 + + result_beta <- com2links( + mit_ped_matrix = mit_ped_matrix, + writetodisk = FALSE + ) + + expect_true(is.data.frame(result_beta)) + expect_true(all(c("ID1", "ID2", "mitRel") %in% colnames(result_beta))) + + + result <- com2links( + mit_ped_matrix = mit_ped_matrix, + writetodisk = FALSE + ) + + expect_true(is.data.frame(result)) + expect_true(all(c("ID1", "ID2", "mitRel") %in% colnames(result))) + # Drop row names to avoid mismatches in expect_equal + rownames(result) <- NULL + rownames(result_beta) <- NULL + + # Final comparison between versions + expect_equal(result_beta, result) +}) + + + 
test_that("com2links correctly handles missing matrices", { data(hazard) @@ -131,10 +271,10 @@ test_that("com2links correctly handles missing matrices", { expect_error( com2links(ad_ped_matrix = NULL, mit_ped_matrix = NULL, cn_ped_matrix = NULL), - "At least one of 'ped_matrix', 'mit_ped_matrix', or 'cn_ped_matrix' must be provided." + "At least one of 'ad_ped_matrix', 'mit_ped_matrix', or 'cn_ped_matrix' must be provided." ) - expect_error(com2links(ad_ped_matrix = hazard), "The 'ad_ped_matrix' must be a matrix or dgCMatrix.") + expect_error(com2links(ad_ped_matrix = hazard), "The 'ad_ped_matrix' must be a matrix or generalMatrix") }) test_that("com2links correctly processes inbreeding dataset", { @@ -179,14 +319,15 @@ test_that("com2links handles large batch writing correctly", { sexR <- 0.5 df_fam <- simulatePedigree(kpc = kpc, Ngen = Ngen, sexR = sexR, marR = marR) - ad_ped_matrix <- ped2add(df_fam, sparse = TRUE) + cn_ped_matrix <- ped2cn(df_fam, sparse = TRUE) temp_file <- tempfile(fileext = ".csv") - com2links(ad_ped_matrix = ad_ped_matrix, rel_pairs_file = temp_file, writetodisk = TRUE, verbose = TRUE) + com2links(cn_ped_matrix = cn_ped_matrix, rel_pairs_file = temp_file, writetodisk = TRUE, verbose = TRUE) expect_true(file.exists(temp_file)) written_data <- read.csv(temp_file) - expect_true(nrow(written_data) > 1000) # Ensuring batch writing logic works + expect_true(nrow(written_data) == 155) # Ensuring batch writing logic works + expect_true(file.remove(temp_file)) }) test_that("com2links garbage collection does not affect output, using two components", { diff --git a/tests/testthat/test-networks.R b/tests/testthat/test-networks.R index 882e7df8..84e008a0 100644 --- a/tests/testthat/test-networks.R +++ b/tests/testthat/test-networks.R @@ -9,15 +9,15 @@ test_that("inbreeding data loads", { test_that("ped2fam gets the right families for hazard data", { data(hazard) ds <- ped2fam(hazard, famID = "newFamID") - tab <- table(ds$FamID, ds$newFamID) - 
expect_equal(ds$FamID, ds$newFamID) + tab <- table(ds$famID, ds$newFamID) + expect_equal(ds$famID, ds$newFamID) }) test_that("ped2fam gets the right families for inbreeding data", { data(inbreeding) ds <- ped2fam(inbreeding, famID = "newFamID") - tab <- table(ds$FamID, ds$newFamID) - expect_equal(ds$FamID, ds$newFamID) + tab <- table(ds$famID, ds$newFamID) + expect_equal(ds$famID, ds$newFamID) }) test_that("ped2graph produces a graph for hazard data with mothers", { diff --git a/tests/testthat/test-plotPedigree.R b/tests/testthat/test-plotPedigree.R index 4944bcdb..ce4697c4 100644 --- a/tests/testthat/test-plotPedigree.R +++ b/tests/testthat/test-plotPedigree.R @@ -32,3 +32,16 @@ test_that("pedigree plots correctly with affected variables", { # file.remove("Rplots.pdf") }) # file.remove("Rplots.pdf") + +test_that("pedigree errs when affected variables named", { + data(inbreeding) + + expect_error(plotPedigree(data, verbose = TRUE, affected = "affected")) +}) + + +test_that("pedigree plots multiple families", { + data(inbreeding) + + expect_output(plotPedigree(inbreeding, verbose = TRUE)) +}) diff --git a/tests/testthat/test-readPedigrees.R b/tests/testthat/test-readPedigrees.R index 94cbcb28..779dde29 100644 --- a/tests/testthat/test-readPedigrees.R +++ b/tests/testthat/test-readPedigrees.R @@ -18,7 +18,8 @@ test_that("readGedcom reads and parses a GEDCOM file correctly", { "1 SEX F", "1 BIRT", "2 DATE 2 FEB 1910", - "2 PLAC Anotherplace" + "2 PLAC Anotherplace", + "1 NCHI 2" ) temp_file <- tempfile(fileext = ".ged") writeLines(gedcom_content, temp_file) @@ -179,29 +180,71 @@ test_that("if file does not exist, readGedcom throws an error", { -# readWikifamilytree +test_that("readGedcom parses death event correctly", { + # Test that a GEDCOM file with a death event is parsed correctly. 
+ gedcom_content <- c( + "0 @I1@ INDI", + "1 NAME John /Doe/", + "1 SEX M", + "1 DEAT", + "2 DATE 31 DEC 2000", + "2 PLAC Lastplace", + "2 CAUS Old age", + "2 LATI 12.3456", + "2 LONG -65.4321" + ) + temp_file <- tempfile(fileext = ".ged") + writeLines(gedcom_content, temp_file) -test_that("readWikifamilytree reads a simple file correctly", { - # Create a temporary WikiFamilyTree file for testing - # Example usage - family_tree_text <- "{{familytree/start |summary=I have a brother Joe and a little sister: my mom married my dad, and my dad's parents were Grandma and Grandpa; they had another child, Aunt Daisy.}} -{{familytree | | | | GMa |~|y|~| GPa | | GMa=Gladys|GPa=Sydney}} -{{familytree | | | | | | | |)|-|-|-|.| }} -{{familytree | | | MOM |y| DAD | |DAISY| MOM=Mom|DAD=Dad|DAISY=[[Daisy Duke]]}} -{{familytree | |,|-|-|-|+|-|-|-|.| | | }} -{{familytree | JOE | | ME | | SIS | | | JOE=My brother Joe|ME='''Me!'''|SIS=My little sister}} -{{familytree/end}}" + df <- readGedcom(temp_file, verbose = TRUE) + + expect_true("death_date" %in% colnames(df)) + expect_true("death_place" %in% colnames(df)) + expect_true("death_caus" %in% colnames(df)) + expect_true("death_lat" %in% colnames(df)) + expect_true("death_long" %in% colnames(df)) + + expect_equal(df$death_date[1], "31 DEC 2000") + expect_equal(df$death_place[1], "Lastplace") + expect_equal(df$death_caus[1], "Old age") + expect_equal(df$death_lat[1], "12.3456") + expect_equal(df$death_long[1], "-65.4321") + df_leg <- .readGedcom.legacy(temp_file, verbose = TRUE) + + expect_true("death_date" %in% colnames(df_leg)) + expect_true("death_place" %in% colnames(df_leg)) + expect_true("death_caus" %in% colnames(df_leg)) + expect_true("death_lat" %in% colnames(df_leg)) + expect_true("death_long" %in% colnames(df_leg)) + + expect_equal(df_leg$death_date[1], "31 DEC 2000") + expect_equal(df_leg$death_place[1], "Lastplace") + expect_equal(df_leg$death_caus[1], "Old age") + expect_equal(df_leg$death_lat[1], "12.3456") + 
expect_equal(df_leg$death_long[1], "-65.4321") + + row.names(df) <- NULL + row.names(df_leg) <- NULL + expect_equal(df_leg, df) - result <- readWikifamilytree(family_tree_text) + unlink(temp_file) +}) - # list( - # summary = summary_text, - # members = members_df, - # structure = tree_long, - # relationships = relationships_df - # ) - expect_equal( - result$summary, - "I have a brother Joe and a little sister: my mom married my dad, and my dad's parents were Grandma and Grandpa; they had another child, Aunt Daisy." +test_that("readGedcom handles incomplete individual records gracefully", { + # Test that an individual record missing a NAME line is handled without error. + gedcom_content <- c( + "0 @I1@ INDI", + "1 SEX M" + # No NAME or BIRT information. ) + temp_file <- tempfile(fileext = ".ged") + writeLines(gedcom_content, temp_file) + + df <- readGedcom(temp_file, verbose = TRUE) + + # Expect one record with missing name fields. + expect_equal(nrow(df), 1) + expect_true(is.null(df$name[1])) + + unlink(temp_file) }) diff --git a/tests/testthat/test-readWikiTree.R b/tests/testthat/test-readWikiTree.R new file mode 100644 index 00000000..e7602e03 --- /dev/null +++ b/tests/testthat/test-readWikiTree.R @@ -0,0 +1,98 @@ +# readWikifamilytree + + +test_that("traceTreePaths works correctly for horizontal tree", { + # Create a mock tree_horizontal data frame + # This is a simplified version of the tree_horizontal data frame + # with Row, Column, Value, and id columns + # The id column is used to identify the nodes in the tree + + tree_horizontal <- data.frame( + Row = rep(1, 3), + Column = 1:3, + Value = c("A", "+", "B"), + stringsAsFactors = FALSE + ) + tree_horizontal$id <- NA + tree_horizontal$id[tree_horizontal$Value %in% c("A", "B")] <- tree_horizontal$Value[tree_horizontal$Value %in% c("A", "B")] + + result <- traceTreePaths(tree_horizontal) + # Check the result + # Check the result + expect_equal(names(result), c("from_id", "to_id", "path_length", "intermediates", 
"intermediate_values")) + expect_equal(c("A", "B") %in% c(result$from_id, result$to_id), rep(TRUE, 2)) + expect_equal(result$path_length[result$from_id == "A" & result$to_id == "B"], 2) + expect_equal(result$intermediate_values[result$from_id == "A" & result$to_id == "B"], "+") +}) + + +test_that("traceTreePaths works correctly for vertical tree", { + # Create a mock tree_vertical data frame + # This is a simplified version of the tree_vertical data frame + # with Row, Column, Value, and id columns + # The id column is used to identify the nodes in the tree + tree_spouse_child <- data.frame( + Row = c(1, 1, 1, 2, 3, 4, 5), + Column = c(1, 2, 3, 2, 2, 2, 2), + Value = c("A", "+", "B", "|", "y", "|", "C"), + stringsAsFactors = FALSE + ) + tree_spouse_child$id <- NA + tree_spouse_child$id[tree_spouse_child$Value %in% c("A", "B", "C")] <- tree_spouse_child$Value[tree_spouse_child$Value %in% c("A", "B", "C")] + + result <- traceTreePaths(tree_spouse_child) + # Check the result + expect_equal(names(result), c("from_id", "to_id", "path_length", "intermediates", "intermediate_values")) + expect_equal(c("A", "B", "C") %in% c(result$from_id, result$to_id), rep(TRUE, 3)) + expect_equal(result$path_length[result$from_id == "A" & result$to_id == "B"], 2) + expect_equal(result$path_length[result$from_id == "A" & result$to_id == "C"], 5) + expect_equal(result$path_length[result$from_id == "B" & result$to_id == "C"], 5) + expect_equal(result$intermediate_values[result$from_id == "A" & result$to_id == "B"], "+") +}) + + + + + + + + +test_that("readWikifamilytree reads a string correctly", { + # Create a temporary WikiFamilyTree file for testing + # Example usage + family_tree_text <- "{{familytree/start |summary=I have a brother Joe and a little sister: my mom married my dad, and my dad's parents were Grandma and Grandpa; they had another child, Aunt Daisy.}} +{{familytree | | | | GMa |~|y|~| GPa | | GMa=Gladys|GPa=Sydney}} +{{familytree | | | | | | | |)|-|-|-|.| }} +{{familytree | 
| | MOM |y| DAD | |DAISY| MOM=Mom|DAD=Dad|DAISY=[[Daisy Duke]]}} +{{familytree | |,|-|-|-|+|-|-|-|.| | | }} +{{familytree | JOE | | ME | | SIS | | | JOE=My brother Joe|ME='''Me!'''|SIS=My little sister}} +{{familytree/end}}" + + temp_file <- tempfile(fileext = ".txt") + writeLines(family_tree_text, temp_file) + + + result <- readWikifamilytree(text = family_tree_text) + result2 <- readWikifamilytree(file_path = temp_file) + + expect_equal( + result$summary, + "I have a brother Joe and a little sister: my mom married my dad, and my dad's parents were Grandma and Grandpa; they had another child, Aunt Daisy." + ) + + expect_equal( + result2$summary, + result$summary + ) +}) + + +# read E:/Dropbox/Lab/Research/Projects/2024/BGMiscJoss/BGmisc_main/data-raw/Targaryen tree Dance.txt + +# test_that("readWikifamilytree reads a file correctly", { +# Create a temporary WikiFamilyTree file for testing +# Example usage +# family_tree_file_path <- "data-raw/Targaryen tree Dance.txt" # system.file("extdata", "Targaryen tree Dance.txt", package = "BGmisc") + +# result <- readWikifamilytree(file_path=family_tree_file_path) +# }) diff --git a/tests/testthat/test-summarizePedigrees.R b/tests/testthat/test-summarizePedigrees.R index 13cb46e0..3fd62464 100644 --- a/tests/testthat/test-summarizePedigrees.R +++ b/tests/testthat/test-summarizePedigrees.R @@ -7,7 +7,13 @@ test_that("Counts the correct number people", { expect_equal(result_observed, result_expected) }) - +# Test: SummarizeFamilies is used when SummariseFamilies +test_that("SummarizeFamilies works like SummariseFamilies", { + df <- ped2fam(potter, famID = "newFamID", personID = "personID") + df_summarized <- summarizeFamilies(df, famID = "newFamID", personID = "personID") + df_summarised <- summariseFamilies(df, famID = "newFamID", personID = "personID") + expect_equal(df_summarised, df_summarized) +}) # Test Case 2: Multiple families test_that("summarizeFamilies() works with multiple families", { df <- ped2fam(inbreeding, 
famID = "newFamID", personID = "ID") @@ -51,39 +57,66 @@ test_that("summarizeFamilies() works with additional summary stats", { # Test Case 4: Does this function work for summarizeMatrilines test_that("summarizeMatrilines() works", { nbiggest <- 2 - df <- ped2fam(potter, famID = "newFamID", personID = "personID") %>% ped2maternal(personID = "personID") - df_summarized <- summarizeMatrilines(df, famID = "newFamID", personID = "personID", nbiggest = nbiggest) + df <- ped2fam(potter, famID = "newFamID", personID = "personID") %>% + ped2maternal(personID = "personID") + df_summarized <- summarizeMatrilines(df, + famID = "newFamID", + personID = "personID", + nbiggest = nbiggest + ) # is the total count from the family summary the same as the raw data? result_observed <- sum(df_summarized$maternal_summary$count) result_expected <- nrow(potter) expect_equal(result_observed, result_expected) - # is the count of the summarized data frame equal to the number of unique families in the input data frame? + # is the count of the summarized data frame equal to the number of + # unique families in the input data frame? result_observed <- length(df_summarized$maternal_summary$count) result_expected <- length(unique(df$matID)) expect_equal(result_observed, result_expected) - # is the count of the biggest families equal to the number of unique families in the input data frame? + # is the count of the biggest families equal to the number of + # unique families in the input data frame? 
result_observed <- nrow(df_summarized$biggest_maternal) expect_equal(result_observed, nbiggest) }) - +# Test: SummarizeMatrilines is used when SummariseMatrilines +test_that("SummarizeMatrilines works like SummariseMatrilines", { + df <- ped2fam(potter, famID = "newFamID", personID = "personID") + df_summarized <- summarizeMatrilines(df, famID = "newFamID", personID = "personID") + df_summarised <- summariseMatrilines(df, famID = "newFamID", personID = "personID") + expect_equal(df_summarised, df_summarized) +}) # Test Case 5: Does this function work for summarizePatrilines test_that("summarizePatrilines() works", { nbiggest <- 4 - df <- ped2fam(potter, famID = "newFamID", personID = "personID") %>% ped2paternal(personID = "personID") - df_summarized <- summarizePatrilines(df, famID = "newFamID", personID = "personID", nbiggest = nbiggest) + df <- ped2fam(potter, famID = "newFamID", personID = "personID") %>% + ped2paternal(personID = "personID") + df_summarized <- summarizePatrilines(df, + famID = "newFamID", + personID = "personID", + nbiggest = nbiggest + ) # is the total count from the family summary the same as the raw data? result_observed <- sum(df_summarized$paternal_summary$count) result_expected <- nrow(potter) expect_equal(result_observed, result_expected) - # is the count of the summarized data frame equal to the number of unique families in the input data frame? + # is the count of the summarized data frame equal to the number of + # unique families in the input data frame? result_observed <- length(df_summarized$paternal_summary$count) result_expected <- length(unique(df$patID)) expect_equal(result_observed, result_expected) - # is the count of the biggest families equal to the number of unique families in the input data frame? + # is the count of the biggest families equal to the number of + # unique families in the input data frame? 
result_observed <- nrow(df_summarized$biggest_paternal) expect_equal(result_observed, nbiggest) }) +# Test: summarizePatrilines is used when SummarisePatrilines +test_that("summarizePatrilines works like SummarisePatrilines", { + df <- ped2fam(potter, famID = "newFamID", personID = "personID") + df_summarized <- summarizePatrilines(df, famID = "newFamID", personID = "personID") + df_summarised <- summarisePatrilines(df, famID = "newFamID", personID = "personID") + expect_equal(df_summarised, df_summarized) +}) # Test Case 6: Handling of missing values in critical columns test_that("summarizePedigrees() handles missing values correctly", { df <- data.frame( @@ -101,7 +134,7 @@ test_that("summarizePedigrees() handles missing values correctly", { }) # Test Case 7: When all variables are skipped -test_that("summarizePedigrees() works when all numeric variables are skipped", { +test_that("summarizePedigrees works when all numeric variables are skipped", { df <- data.frame( ID = 1:5, momID = c(NA, 1, 1, NA, 4), @@ -116,10 +149,16 @@ test_that("summarizePedigrees() works when all numeric variables are skipped", { # Test Case 8: Handling invalid column names test_that("summarizePedigrees() throws error on invalid column names", { - df <- data.frame(ID = 1:5, momID = c(NA, 1, 1, NA, 4), dadID = c(NA, 2, 2, NA, 5), famID = c(1, 1, 1, 2, 2)) + df <- data.frame( + ID = 1:5, momID = c(NA, 1, 1, NA, 4), + dadID = c(NA, 2, 2, NA, 5), famID = c(1, 1, 1, 2, 2) + ) expect_error(summarizePedigrees(df, byr = "unknown_column")) }) + + + # Test Case 9: Handling empty dataset # test_that("summarizePedigrees() handles empty dataset gracefully", { # df <- data.frame(ID = integer(), momID = integer(), dadID = integer(), famID = integer()) @@ -138,3 +177,11 @@ test_that("summarizePedigrees() works for single-entry pedigree", { expect_equal(nrow(df_summarized$family_summary), 1) expect_equal(df_summarized$oldest_families$byr_mean, 1920) }) + +# Test: summarizePedigrees is used when 
SummarisePedigrees +test_that("SummarizePedigrees works like SummarisePedigrees", { + df <- ped2fam(potter, famID = "newFamID", personID = "personID") + df_summarized <- summarizePedigrees(df, famID = "newFamID", personID = "personID") + df_summarised <- summarisePedigrees(df, famID = "newFamID", personID = "personID") + expect_equal(df_summarised, df_summarized) +}) diff --git a/tests/testthat/test-tweakPedigree.R b/tests/testthat/test-tweakPedigree.R index 24f81ff9..eb00908a 100644 --- a/tests/testthat/test-tweakPedigree.R +++ b/tests/testthat/test-tweakPedigree.R @@ -2,7 +2,7 @@ test_that("makeTwins - Twins specified by IDs", { set.seed(1234) ped <- data.frame( - fam = c(1, 1, 2, 2), + famID = c(1, 1, 2, 2), ID = c(1, 2, 3, 4), gen = c(1, 1, 2, 2), dadID = c(NA, NA, 1, 1), @@ -11,7 +11,7 @@ test_that("makeTwins - Twins specified by IDs", { sex = c("M", "F", "M", "F") ) expected_result <- data.frame( - fam = c(1, 1, 2, 2), + famID = c(1, 1, 2, 2), ID = c(1, 2, 3, 4), gen = c(1, 1, 2, 2), dadID = c(NA, NA, 1, 1), @@ -25,7 +25,7 @@ test_that("makeTwins - Twins specified by IDs", { expect_equal(result, expected_result) # does it handle weird variable names? "fam" = "famID" - names(ped)[1] <- "famID" + names(ped)[1] <- "fam" result <- makeTwins(ped, ID_twin1 = 1, ID_twin2 = 2, verbose = TRUE) expect_equal(result, expected_result) @@ -41,7 +41,7 @@ test_that("makeTwins - Twins specified by generation", { ped <- simulatePedigree(kpc = kpc, Ngen = Ngen, sexR = sexR, marR = marR) # result <- makeTwins(ped, gen_twin = gen_twin) - expect_equal(names(result), c("fam", "ID", "gen", "dadID", "momID", "spID", "sex", "MZtwin")) + expect_equal(names(result), c("famID", "ID", "gen", "dadID", "momID", "spID", "sex", "MZtwin")) # do we have the same people? expect_equal(result$ID, ped$ID) # did it make one pair of twins? 
@@ -61,7 +61,7 @@ test_that("makeTwins - Twins specified by generation", { # Test for makeInbreeding function test_that("makeInbreeding - Inbred mates specified by IDs", { ped <- data.frame( - fam = c(1, 1, 2, 2), + famID = c(1, 1, 2, 2), ID = c(1, 2, 3, 4), gen = c(1, 1, 2, 2), dadID = c(NA, NA, 1, 1), @@ -70,7 +70,7 @@ test_that("makeInbreeding - Inbred mates specified by IDs", { sex = c("M", "F", "M", "F") ) expected_result <- data.frame( - fam = c(1, 1, 2, 2), + famID = c(1, 1, 2, 2), ID = c(1, 2, 3, 4), gen = c(1, 1, 2, 2), dadID = c(NA, NA, 1, 1), @@ -94,7 +94,7 @@ test_that("makeInbreeding - Inbred mates specified by generation and sibling", { ped <- simulatePedigree(kpc = kpc, Ngen = Ngen, sexR = sexR, marR = marR) # result <- makeInbreeding(ped, gen_inbred = gen_inbred, type_inbred = type_inbred) - expect_equal(names(result), c("fam", "ID", "gen", "dadID", "momID", "spID", "sex")) + expect_equal(names(result), c("famID", "ID", "gen", "dadID", "momID", "spID", "sex")) # do we have the same people? expect_equal(result$ID, ped$ID) @@ -115,7 +115,7 @@ test_that("makeInbreeding - Inbred mates specified by generation and cousin", { expect_error(makeInbreeding(ped, gen_inbred = gen_inbred, - type_inbred = type_inbred,verbose = TRUE + type_inbred = type_inbred, verbose = TRUE ), regexp = "Cousin inbreedin") }) diff --git a/vignettes/ASOIAF.Rmd b/vignettes/ASOIAF.Rmd index ddc07ebe..00211864 100644 --- a/vignettes/ASOIAF.Rmd +++ b/vignettes/ASOIAF.Rmd @@ -7,12 +7,15 @@ vignette: > %\VignetteEncoding{UTF-8} --- +## Introduction -Just how related are Jon Snow and Daenerys Targaryen? This vignette walks through how to quantify their genetic relatedness using functions from the BGmisc package. While the Game of Thrones canon gives us some clues, we can use a formal pedigree-based approach to quantify their genetic relatedness. 
This vignette demonstrates how to compute coefficients of relatedness using the `BGmisc` package, along with basic data manipulation from tidyverse. We will also handle incomplete parental information programmatically and generate a plot of the reconstructed pedigree. +Just how closely related are Jon Snow and Daenerys Targaryen? According to the lore of *A Song of Ice and Fire*, Daenerys is Jon's paternal aunt. This would suggest a theoretical genetic relatedness of 0.25, assuming a simple pedigree and no inbreeding. But with tangled ancestries and potentially missing information, how confident can we be in that estimate? + +In this vignette, we use the `BGmisc` package to reconstruct the *ASOIAF* pedigree, handle incomplete parentage data, and compute additive genetic and common nuclear relatedness. We'll focus on Jon and Daenerys as a case study, but the methods generalize to any characters in the provided dataset. ## Load Packages and Data -We begin by loading the necessary packages and accessing the built-in `ASOIAF` pedigree dataset included with `BGmisc`. +We begin by loading the required libraries and examining the structure of the built-in `ASOIAF` pedigree. ```{r echo=TRUE, message=FALSE, warning=FALSE} @@ -21,7 +24,7 @@ library(tidyverse) data(ASOIAF) ``` -The ASOIAF data contains character IDs, family identifiers, and parent-child links extracted from A Song of Ice and Fire lore. +The ASOIAF dataset includes character IDs, names, family identifiers, and parent identifiers for a subset of characters drawn from the *A Song of Ice and Fire* canon. ```{r} head(ASOIAF) @@ -29,7 +32,8 @@ head(ASOIAF) ## Prepare and Validate Sex Codes -We use `checkSex()` to ensure that all individuals have valid sex codes, repairing as needed. This is important for correct pedigree plotting and downstream calculations. +Many pedigree-based algorithms rely on biological sex for downstream calculationss and visualization. 
We use `checkSex()` to inspect the sex variable, repairing inconsistencies programmatically. + ```{r} df_got <- checkSex(ASOIAF, @@ -39,46 +43,69 @@ df_got <- checkSex(ASOIAF, ) ``` + ## Compute Relatedness Matrices -We now compute the additive genetic relatedness matrix (add) and the common nuclear relatedness matrix (cn) from the pedigree using ped2com() and ped2cn(), respectively. -The `isChild_method` argument specifies how to identify child-parent relationships. We use "partialparent" to account for missing parent information. -The `adjacency_method` argument specifies how to construct the adjacency matrix. We use "direct" for the additive matrix and "indexed" for the common nuclear matrix. The direct method is much faster. The `sparse` argument is set to FALSE to return dense matrices. +With validated pedigree data, we can now compute two distinct relationship matrices: + +- Additive genetic relatedness (add): Proportion of shared additive genetic variance between individuals. + +- Common nuclear relatedness (cn): Indicates shared full-sibling (nuclear family) environments. + +These are derived using ped2add() and ped2cn(), respectively. Both functions rely on internal graph traversal and adjacency structures. In this case: + +- We specify isChild_method = "partialparent" to allow inclusion of dyads where one parent is unknown. + +- We choose adjacency_method = "direct" for the additive matrix to optimize for computational speed. + +- For the common nuclear matrix, we use adjacency_method = "indexed", which is slower but necessary for resolving sibling-group structures. + +- We set `sparse = FALSE` to return full (dense) matrices rather than compressed sparse formats. 
+ ```{r} add <- ped2com(df_got, isChild_method = "partialparent", component = "additive", adjacency_method = "direct", - sparse = FALSE + sparse = TRUE +) + +mt <- ped2com(df_got, + isChild_method = "partialparent", + component = "mitochondrial", + adjacency_method = "direct", + sparse = TRUE ) cn <- ped2cn(df_got, isChild_method = "partialparent", adjacency_method = "indexed", - sparse = FALSE + sparse = TRUE ) ``` ## Convert to Pairwise Format -We convert the component matrices into a long-format table of pairwise relationships using `com2links()`. This gives us a long dataframe where each row represents a pair of individuals and their relatedness. The function can return the entire matrix or just the lower triangular part, which is often sufficient for our purposes. We set `writetodisk = FALSE` to keep the data in memory. +For interpretability, we convert these square matrices into long-format tables using `com2links()`. This function returns a dataframe where each row represents a unique pair of individuals, including their additive and common nuclear coefficients. ```{r} df_links <- com2links( writetodisk = FALSE, - ad_ped_matrix = add, cn_ped_matrix = cn, + ad_ped_matrix = add, cn_ped_matrix = cn, mit_ped_matrix = mt, drop_upper_triangular = TRUE -)# %>% +) # %>% # filter(ID1 != ID2) ``` +The function can return the entire matrix or just the lower triangular part, which is often sufficient for our purposes. Setting `drop_upper_triangular = TRUE` ensures we only retain one entry per dyad, since the matrices are symmetric. We also keep the data in memory by setting `writetodisk = FALSE`. + ## Locate Jon and Daenerys -Next, we extract the IDs corresponding to Jon Snow and Daenerys Targaryen. -We use the `filter()` function to find the rows in the `df_links` dataframe where either ID1 or ID2 corresponds to Jon Snow, and then filter again to find the row where the other ID corresponds to Daenerys Targaryen. 
-```{r} +We next identify the rows in the pairwise relatedness table that correspond to Jon Snow and Daenerys Targaryen. First, we retrieve their individual IDs: + +```{r} # Find the IDs of Jon Snow and Daenerys Targaryen jon_id <- df_got %>% @@ -90,36 +117,43 @@ dany_id <- df_got %>% pull(ID) ``` -We then filter the pairwise table to retrieve the row containing their relationship. +Then we isolate their dyad: ```{r} - jon_dany_row <- df_links %>% filter(ID1 == jon_id | ID2 == jon_id) %>% - filter(ID1 %in% dany_id| ID2 %in% dany_id) + filter(ID1 %in% dany_id | ID2 %in% dany_id) -jon_dany_row +jon_dany_row ``` -This row contains the additive relatedness coefficient between Jon and Daenerys, which allows us to assess how closely related they are genetically. We'd expect to see a value of 0.25 for an Aunt-Nephew relationship, which is what Jon and Daenerys are in the show. However, the value is `r jon_dany_row$addRel[1]`, indicating a more complex relationship. +This table contains the additive and nuclear relatedness estimates for Jon and Daenerys. If the pedigree reflects their canonical aunt-nephew relationship and is free from inbreeding, we’d expect to see an additive coefficient close to 0.25. However, the value is `r jon_dany_row$addRel[1]`, indicating a more complex relationship. -## Plotting the Pedigree with incomplete parental information +## Plotting the Pedigree with Incomplete Parental Information -To facilitate plotting, we check for individuals with one known parent but a missing other. For those cases, we assign a placeholder ID to the missing parent. +Many real-world and fictional pedigrees contain individuals with unknown or partially known parentage. In such cases, plotting tools typically fail unless these gaps are handled. 
We use `checkParentIDs()` to: -```{r} +- Identify individuals with one known parent and one missing -df_repaired <- checkParentIDs(df_got,addphantoms=TRUE, - repair=TRUE, - parentswithoutrow=FALSE, - repairsex=FALSE - ) %>% mutate(fam=1, - affected = case_when(ID %in% c(jon_id,dany_id) ~ 1, - TRUE ~ 0) - ) - +- Create "phantom" placeholders for the missing parent +-Optionally repair and harmonize parent fields +To facilitate plotting, we check for individuals with one known parent but a missing other. For those cases, we assign a placeholder ID to the missing parent. + +```{r} +df_repaired <- checkParentIDs(df_got, + addphantoms = TRUE, + repair = TRUE, + parentswithoutrow = FALSE, + repairsex = FALSE +) %>% mutate( + famID = 1, + affected = case_when( + ID %in% c(jon_id, dany_id, "365") ~ 1, + TRUE ~ 0 + ) +) ``` This code creates new IDs for individuals with one known parent and a missing other. It checks if either `momID` or `dadID` is missing, and if so, it assigns a new ID based on the row number. This allows us to visualize the pedigree even when some parental information is incomplete. @@ -128,10 +162,8 @@ This code creates new IDs for individuals with one known parent and a missing ot ## Visualize the Pedigree -```{r, message=FALSE, warning=FALSE} - -#fixParents(id=df_got$ID, dadid=df_got$dadID, momid=df_got$momID, sex=df_got$sex, missid = NA) - -plotPedigree(df_repaired,affected=df_repaired$affected,verbose=FALSE) +We can now visualize the repaired pedigree using the `plotPedigree()` function. This function generates a plot of the pedigree, with individuals colored based on their affected status. In this case, we highlight Jon and Daenerys as "affected" individuals. Otherwise they would be difficult to distinguish from the rest of the pedigree. 
+```{r, message=FALSE, warning=FALSE} +plotPedigree(df_repaired, affected = df_repaired$affected, verbose = FALSE) ``` diff --git a/vignettes/ASOIAF.html b/vignettes/ASOIAF.html index 709281cc..7848e21a 100644 --- a/vignettes/ASOIAF.html +++ b/vignettes/ASOIAF.html @@ -340,25 +340,30 @@

ASOIAF: How related are Jon and Danny?

-

Just how related are Jon Snow and Daenerys Targaryen? This vignette -walks through how to quantify their genetic relatedness using functions -from the BGmisc package. While the Game of Thrones canon gives us some -clues, we can use a formal pedigree-based approach to quantify their -genetic relatedness. This vignette demonstrates how to compute -coefficients of relatedness using the BGmisc package, along -with basic data manipulation from tidyverse. We will also handle -incomplete parental information programmatically and generate a plot of -the reconstructed pedigree.

+
+

Introduction

+

Just how closely related are Jon Snow and Daenerys Targaryen? +According to the lore of A Song of Ice and Fire, Daenerys is +Jon’s paternal aunt. This would suggest a theoretical genetic +relatedness of 0.25, assuming a simple pedigree and no inbreeding. But +with tangled ancestries and potentially missing information, how +confident can we be in that estimate?

+

In this vignette, we use the BGmisc package to +reconstruct the ASOIAF pedigree, handle incomplete parentage +data, and compute additive genetic and common nuclear relatedness. We’ll +focus on Jon and Daenerys as a case study, but the methods generalize to +any characters in the provided dataset.

+

Load Packages and Data

-

We begin by loading the necessary packages and accessing the built-in -ASOIAF pedigree dataset included with -BGmisc.

+

We begin by loading the required libraries and examining the +structure of the built-in ASOIAF pedigree.

library(BGmisc)
 library(tidyverse)
 data(ASOIAF)
-

The ASOIAF data contains character IDs, family identifiers, and -parent-child links extracted from A Song of Ice and Fire lore.

+

The ASOIAF dataset includes character IDs, names, family identifiers, +and parent identifiers for a subset of characters drawn from the A +Song of Ice and Fire canon.

head(ASOIAF)
##   id famID momID dadID          name sex
 ## 1  1     1    NA    NA   Walder Frey   M
@@ -370,9 +375,10 @@ 

Load Packages and Data

Prepare and Validate Sex Codes

-

We use checkSex() to ensure that all individuals have -valid sex codes, repairing as needed. This is important for correct -pedigree plotting and downstream calculations.

+

Many pedigree-based algorithms rely on biological sex for downstream +calculationss and visualization. We use checkSex() to +inspect the sex variable, repairing inconsistencies +programmatically.

df_got <- checkSex(ASOIAF,
   code_male = 1,
   code_female = 0,
@@ -381,51 +387,71 @@ 

Prepare and Validate Sex Codes

Compute Relatedness Matrices

-

We now compute the additive genetic relatedness matrix (add) and the -common nuclear relatedness matrix (cn) from the pedigree using ped2com() -and ped2cn(), respectively. The isChild_method argument -specifies how to identify child-parent relationships. We use -“partialparent” to account for missing parent information. The -adjacency_method argument specifies how to construct the -adjacency matrix. We use “direct” for the additive matrix and “indexed” -for the common nuclear matrix. The direct method is much faster. The -sparse argument is set to FALSE to return dense -matrices.

+

With validated pedigree data, we can now compute two distinct +relationship matrices:

+
    +
  • Additive genetic relatedness (add): Proportion of shared additive +genetic variance between individuals.

  • +
  • Common nuclear relatedness (cn): Indicates shared full-sibling +(nuclear family) environments.

  • +
+

These are derived using ped2add() and ped2cn(), respectively. Both +functions rely on internal graph traversal and adjacency structures. In +this case:

+
    +
  • We specify isChild_method = “partialparent” to allow inclusion of +dyads where one parent is unknown.

  • +
  • We choose adjacency_method = “direct” for the additive matrix to +optimize for computational speed.

  • +
  • For the common nuclear matrix, we use adjacency_method = +“indexed”, which is slower but necessary for resolving sibling-group +structures.

  • +
  • We set sparse = FALSE to return full (dense) +matrices rather than compressed sparse formats.

  • +
add <- ped2com(df_got,
   isChild_method = "partialparent",
   component = "additive",
   adjacency_method = "direct",
-  sparse = FALSE
+  sparse = TRUE
 )
 
-cn <- ped2cn(df_got,
+mt <- ped2com(df_got,
   isChild_method = "partialparent",
-  adjacency_method = "indexed",
-  sparse = FALSE
-)
+ component = "mitochondrial", + adjacency_method = "direct", + sparse = TRUE +) + +cn <- ped2cn(df_got, + isChild_method = "partialparent", + adjacency_method = "indexed", + sparse = TRUE +)

Convert to Pairwise Format

-

We convert the component matrices into a long-format table of -pairwise relationships using com2links(). This gives us a -long dataframe where each row represents a pair of individuals and their -relatedness. The function can return the entire matrix or just the lower -triangular part, which is often sufficient for our purposes. We set -writetodisk = FALSE to keep the data in memory.

+

For interpretability, we convert these square matrices into +long-format tables using com2links(). This function returns +a dataframe where each row represents a unique pair of individuals, +including their additive and common nuclear coefficients.

df_links <- com2links(
   writetodisk = FALSE,
-  ad_ped_matrix = add, cn_ped_matrix = cn,
+  ad_ped_matrix = add, cn_ped_matrix = cn, mit_ped_matrix = mt,
   drop_upper_triangular = TRUE
-)# %>%
+) # %>%
 #  filter(ID1 != ID2)
+

The function can return the entire matrix or just the lower +triangular part, which is often sufficient for our purposes. Setting +drop_upper_triangular = TRUE ensures we only retain one +entry per dyad, since the matrices are symmetric. We also keep the data +in memory by setting writetodisk = FALSE.

Locate Jon and Daenerys

-

Next, we extract the IDs corresponding to Jon Snow and Daenerys -Targaryen. We use the filter() function to find the rows in -the df_links dataframe where either ID1 or ID2 corresponds -to Jon Snow, and then filter again to find the row where the other ID -corresponds to Daenerys Targaryen.

+

We next identify the rows in the pairwise relatedness table that +correspond to Jon Snow and Daenerys Targaryen. First, we retrieve their +individual IDs:

# Find the IDs of Jon Snow and Daenerys Targaryen
 
 jon_id <- df_got %>%
@@ -435,35 +461,48 @@ 

Locate Jon and Daenerys

dany_id <- df_got %>% filter(name == "Daenerys Targaryen") %>% pull(ID)
-

We then filter the pairwise table to retrieve the row containing -their relationship.

+

Then we isolate their dyad:

jon_dany_row <- df_links %>%
   filter(ID1 == jon_id | ID2 == jon_id) %>%
-  filter(ID1 %in% dany_id| ID2 %in% dany_id)
+  filter(ID1 %in% dany_id | ID2 %in% dany_id)
 
-jon_dany_row 
-
##   ID1 ID2     addRel cnuRel
-## 1 206 211 0.31274414      0
-## 2 211 304 0.01953125      0
-

This row contains the additive relatedness coefficient between Jon -and Daenerys, which allows us to assess how closely related they are -genetically. We’d expect to see a value of 0.25 for an Aunt-Nephew -relationship, which is what Jon and Daenerys are in the show. However, -the value is 0.3127441, indicating a more complex relationship.

+jon_dany_row
+
##   ID1 ID2     addRel mitRel cnuRel
+## 1 206 211 0.31274414      0      0
+## 2 211 304 0.01953125      0      0
+

This table contains the additive and nuclear relatedness estimates +for Jon and Daenerys. If the pedigree reflects their canonical +aunt-nephew relationship and is free from inbreeding, we’d expect to see +an additive coefficient close to 0.25. However, the value is 0.3127441, +indicating a more complex relationship.

-

Plotting the Pedigree with incomplete parental information

+

Plotting the Pedigree with Incomplete Parental Information

+

Many real-world and fictional pedigrees contain individuals with +unknown or partially known parentage. In such cases, plotting tools +typically fail unless these gaps are handled. We use +checkParentIDs() to:

+ +

-Optionally repair and harmonize parent fields

To facilitate plotting, we check for individuals with one known parent but a missing other. For those cases, we assign a placeholder ID to the missing parent.

-
df_repaired <- checkParentIDs(df_got,addphantoms=TRUE,
-                              repair=TRUE,
-                              parentswithoutrow=FALSE,
-                              repairsex=FALSE
-                              ) %>% mutate(fam=1,
-                                           affected = case_when(ID %in% c(jon_id,dany_id) ~ 1,
-                                                               TRUE ~ 0)
-                              )
+
df_repaired <- checkParentIDs(df_got,
+  addphantoms = TRUE,
+  repair = TRUE,
+  parentswithoutrow = FALSE,
+  repairsex = FALSE
+) %>% mutate(
+  famID = 1,
+  affected = case_when(
+    ID %in% c(jon_id, dany_id, "365") ~ 1,
+    TRUE ~ 0
+  )
+)
## REPAIR IN EARLY ALPHA

This code creates new IDs for individuals with one known parent and a missing other. It checks if either momID or @@ -473,9 +512,13 @@

Plotting the Pedigree with incomplete parental information

Visualize the Pedigree

-
#fixParents(id=df_got$ID, dadid=df_got$dadID, momid=df_got$momID, sex=df_got$sex, missid = NA)
-
-plotPedigree(df_repaired,affected=df_repaired$affected,verbose=FALSE)
+

We can now visualize the repaired pedigree using the +plotPedigree() function. This function generates a plot of +the pedigree, with individuals colored based on their affected status. +In this case, we highlight Jon and Daenerys as “affected” individuals. +Otherwise they would be difficult to distinguish from the rest of the +pedigree.

+
plotPedigree(df_repaired, affected = df_repaired$affected, verbose = FALSE)

## Did not plot the following people: 85 88 125 142 228 229 258 259 274 275 305 336 357 381 388 405 409 418 420 424 428 451 487
## named list()
diff --git a/vignettes/modelingrelatedness.Rmd b/vignettes/modelingvariancecomponents.Rmd similarity index 98% rename from vignettes/modelingrelatedness.Rmd rename to vignettes/modelingvariancecomponents.Rmd index 6f793690..c82441a9 100644 --- a/vignettes/modelingrelatedness.Rmd +++ b/vignettes/modelingvariancecomponents.Rmd @@ -2,7 +2,7 @@ title: "Modeling variance components" output: rmarkdown::html_vignette vignette: > - %\VignetteIndexEntry{modelingandrelatedness} + %\VignetteIndexEntry{modelingvariancecomponents} %\VignetteEngine{knitr::rmarkdown} %\VignetteEncoding{UTF-8} --- diff --git a/vignettes/modelingvariancecomponents.html b/vignettes/modelingvariancecomponents.html new file mode 100644 index 00000000..7b5df889 --- /dev/null +++ b/vignettes/modelingvariancecomponents.html @@ -0,0 +1,554 @@ + + + + + + + + + + + + + + +Modeling variance components + + + + + + + + + + + + + + + + + + + + + + + + + + +

Modeling variance components

+ + + +
+

Introduction

+

This vignette provides a detailed guide to specific functions within +the BGmisc package that aid in the identification and +fitting of variance component models common in behavior genetics. We +will explore key functions such as identifyComponentModel, +providing practical examples and theoretical background. Identification +ensures a unique set of parameters that define the model-implied +covariance matrix, preventing free parameters from trading off one +another.

+
+

Loading Required Libraries

+

Ensure that the BGmisc package is installed and +loaded.

+

Ensure that the following dependencies are installed before +proceeding as they provide us with behavior genetic data and models:

+
    +
  • EasyMx

  • +
  • OpenMx

  • +
+
library(BGmisc)
+library(EasyMx)
+library(OpenMx)
+

Note: If any of the libraries are not installed, you can install them +using install.packages(“package_name”).

+
+
+
+

Working with Variance Component Models

+

In this section, we will demonstrate core functions related to the +identification and fitting of variance component models.

+
+

Using comp2vech Function

+

The comp2vech function is used to vectorize a components +model. The function is often used in conjunction with the identification +process. In this example, we apply it to a list of matrices:

+
comp2vech(list(
+  matrix(c(1, .5, .5, 1), 2, 2),
+  matrix(1, 2, 2)
+))
+#> [1] 1.0 0.5 1.0 1.0 1.0 1.0
+

The result showcases how the matrices have been transformed, +reflecting their role in subsequent variance component analysis.

+
+
+

Using identifyComponentModel Function

+

The identifyComponentModel function helps determine if a +variance components model is identified. It accepts relatedness +component matrices and returns information about identified and +non-identified parameters.

+

Here’s an example using the classical twin model with only MZ +twins:

+
identifyComponentModel(
+  A = list(matrix(1, 2, 2)),
+  C = list(matrix(1, 2, 2)),
+  E = diag(1, 2)
+)
+#> Component model is not identified.
+#> Non-identified parameters are  A, C
+#> $identified
+#> [1] FALSE
+#> 
+#> $nidp
+#> [1] "A" "C"
+

As you can see, the model is not identified. We need to add an +additional group so that we have sufficient information. Let us add the +rest of the classical twin model, in this case DZ twins.

+
identifyComponentModel(
+  A = list(matrix(c(1, .5, .5, 1), 2, 2), matrix(1, 2, 2)),
+  C = list(matrix(1, 2, 2), matrix(1, 2, 2)),
+  E = diag(1, 4)
+)
+#> Component model is identified.
+#> $identified
+#> [1] TRUE
+#> 
+#> $nidp
+#> character(0)
+

As you can see the model is identified, now that we’ve added another +group. Let us confirm by fitting a model. First we prepare the data.

+
require(dplyr)
+#> Loading required package: dplyr
+#> 
+#> Attaching package: 'dplyr'
+#> The following objects are masked from 'package:BGmisc':
+#> 
+#>     between, first, last
+#> The following objects are masked from 'package:stats':
+#> 
+#>     filter, lag
+#> The following objects are masked from 'package:base':
+#> 
+#>     intersect, setdiff, setequal, union
+# require(purrr)
+
+data(twinData, package = "OpenMx")
+selVars <- c("ht1", "ht2")
+
+mzdzData <- subset(
+  twinData, zyg %in% c(1, 3),
+  c(selVars, "zyg")
+)
+
+mzdzData$RCoef <- c(1, NA, .5)[mzdzData$zyg]
+
+
+mzData <- mzdzData %>% filter(zyg == 1)
+

Let us fit the data with MZ twins by themselves.

+
run1 <- emxTwinModel(
+  model = "Cholesky",
+  relatedness = "RCoef",
+  data = mzData,
+  use = selVars,
+  run = TRUE, name = "TwCh"
+)
+#> Running TwCh with 4 parameters
+#> Warning: In model 'TwCh' Optimizer returned a non-zero status code 5. The
+#> Hessian at the solution does not appear to be convex. See
+#> ?mxCheckIdentification for possible diagnosis (Mx status RED).
+
+summary(run1)
+#> Summary of TwCh 
+#>  
+#> The Hessian at the solution does not appear to be convex. See ?mxCheckIdentification for possible diagnosis (Mx status RED). 
+#>  
+#> free parameters:
+#>      name matrix row col   Estimate    Std.Error A lbound ubound
+#> 1 sqrtA11  sqrtA   1   1 0.05122646           NA    1e-06       
+#> 2 sqrtC11  sqrtC   1   1 0.03518629           NA       0!       
+#> 3 sqrtE11  sqrtE   1   1 0.02325722 0.0007017955 !     0!       
+#> 4    Mht1  Means ht1   1 1.62974908 0.0027023907                
+#> 
+#> Model Statistics: 
+#>                |  Parameters  |  Degrees of Freedom  |  Fit (-2lnL units)
+#>        Model:              4                   1112             -3693.148
+#>    Saturated:              5                   1111                    NA
+#> Independence:              4                   1112                    NA
+#> Number of observations/statistics: 569/1116
+#> 
+#> 
+#> ** Information matrix is not positive definite (not at a candidate optimum).
+#>   Be suspicious of these results. At minimum, do not trust the standard errors.
+#> 
+#> Information Criteria: 
+#>       |  df Penalty  |  Parameters Penalty  |  Sample-Size Adjusted
+#> AIC:      -5917.148              -3685.148                -3685.078
+#> BIC:     -10747.543              -3667.773                -3680.471
+#> To get additional fit indices, see help(mxRefModels)
+#> timestamp: 2025-04-21 16:43:57 
+#> Wall clock time: 0.04138112 secs 
+#> optimizer:  SLSQP 
+#> OpenMx version number: 2.21.13 
+#> Need help?  See help(mxSummary)
+

As you can see, the model fit was unsuccessful because the model was not identified. But when we add another group, so that the model is identified, the model now fits.

+
run2 <- emxTwinModel(
+  model = "Cholesky",
+  relatedness = "RCoef",
+  data = mzdzData,
+  use = selVars,
+  run = TRUE, name = "TwCh"
+)
+#> Running TwCh with 4 parameters
+
+summary(run2)
+#> Summary of TwCh 
+#>  
+#> free parameters:
+#>      name matrix row col   Estimate    Std.Error A lbound ubound
+#> 1 sqrtA11  sqrtA   1   1 0.06339271 0.0014377690    1e-06       
+#> 2 sqrtC11  sqrtC   1   1 0.00000100 0.0250260004 !     0!       
+#> 3 sqrtE11  sqrtE   1   1 0.02330040 0.0007015267       0!       
+#> 4    Mht1  Means ht1   1 1.63295540 0.0020511844                
+#> 
+#> Model Statistics: 
+#>                |  Parameters  |  Degrees of Freedom  |  Fit (-2lnL units)
+#>        Model:              4                   1803             -5507.092
+#>    Saturated:              5                   1802                    NA
+#> Independence:              4                   1803                    NA
+#> Number of observations/statistics: 920/1807
+#> 
+#> Information Criteria: 
+#>       |  df Penalty  |  Parameters Penalty  |  Sample-Size Adjusted
+#> AIC:      -9113.092              -5499.092                -5499.048
+#> BIC:     -17811.437              -5479.794                -5492.498
+#> To get additional fit indices, see help(mxRefModels)
+#> timestamp: 2025-04-21 16:43:57 
+#> Wall clock time: 0.03366184 secs 
+#> optimizer:  SLSQP 
+#> OpenMx version number: 2.21.13 
+#> Need help?  See help(mxSummary)
+
+
+ + + + + + + + + + + diff --git a/vignettes/partial.Rmd b/vignettes/partial.Rmd index 45fb7ce6..3e4b11fb 100644 --- a/vignettes/partial.Rmd +++ b/vignettes/partial.Rmd @@ -49,11 +49,13 @@ We compute the additive genetic relationship matrix using both the classic and p ped_add_partial_complete <- ped2com(df, isChild_method = "partialparent", component = "additive", - adjacency_method = "direct" + adjacency_method = "direct", + sparse = FALSE ) ped_add_classic_complete <- ped2com(df, isChild_method = "classic", - component = "additive", adjacency_method = "direct" + component = "additive", adjacency_method = "direct", + sparse = FALSE ) ``` @@ -67,22 +69,31 @@ library(corrplot) corrplot(as.matrix(ped_add_classic_complete), method = "color", type = "lower", col.lim = c(0, 1), - is.corr = FALSE, title = "Additive component - Classic method" + is.corr = FALSE, title = "Additive component - Classic method", + order = "hclust", + tl.pos = "l", tl.col = "black", tl.srt = 5, tl.cex = 0.2, + col = COL1("Reds", 100), mar = c(0, 0, 2, 0) ) + corrplot(as.matrix(ped_add_partial_complete), method = "color", type = "lower", col.lim = c(0, 1), - is.corr = FALSE, title = "Additive component - Partial parent method" + is.corr = FALSE, title = "Additive component - Partial parent method", + order = "hclust", + tl.pos = "l", tl.col = "black", tl.srt = 5, tl.cex = 0.2, + col = COL1("Reds", 100), mar = c(0, 0, 2, 0) ) ``` To verify this, we subtract one matrix from the other and calculate RMSE. The difference should be numerically zero. Indeed, it is `r sqrt(mean((ped_add_classic_complete-ped_add_partial_complete)^2))`. 
-```{r} -corrplot(as.matrix(ped_add_classic_complete - ped_add_partial_complete), +```{r,warning=FALSE} +corrplot((as.matrix(ped_add_classic_complete) - as.matrix(ped_add_partial_complete)), method = "color", type = "lower", col.lim = c(0, 1), - is.corr = FALSE + is.corr = FALSE, order = "hclust", + tl.pos = "l", tl.col = "black", tl.srt = 5, tl.cex = 0.2, + col = COL1("Reds", 100), mar = c(0, 0, 2, 0) ) ``` @@ -101,12 +112,14 @@ df$momID[df$ID == 4] <- NA ped_add_partial_mom <- ped_add_partial <- ped2com(df, isChild_method = "partialparent", component = "additive", - adjacency_method = "direct" + adjacency_method = "direct", + sparse = FALSE ) ped_add_classic_mom <- ped_add_classic <- ped2com(df, isChild_method = "classic", - component = "additive", adjacency_method = "direct" + component = "additive", adjacency_method = "direct", + sparse = FALSE ) ``` @@ -119,12 +132,17 @@ The resulting additive matrices reflect this difference. The RMSE between the tw ```{r} corrplot(as.matrix(ped_add_classic), method = "color", type = "lower", col.lim = c(0, 1), - is.corr = FALSE, title = "Classic (mother removed)" + is.corr = FALSE, title = "Classic (mother removed)", + order = "hclust", + tl.pos = "l", tl.col = "black", tl.srt = 5, tl.cex = 0.2, + col = COL1("Reds", 100), mar = c(0, 0, 2, 0) ) - corrplot(as.matrix(ped_add_partial), method = "color", type = "lower", col.lim = c(0, 1), - is.corr = FALSE, title = "Partial (mother removed)" + is.corr = FALSE, title = "Partial (mother removed)", + order = "hclust", + tl.pos = "l", tl.col = "black", tl.srt = 5, tl.cex = 0.2, + col = COL1("Reds", 100), mar = c(0, 0, 2, 0) ) ``` @@ -132,16 +150,19 @@ corrplot(as.matrix(ped_add_partial), We quantify the overall matrix difference: ```{r} -sqrt(mean((ped_add_classic - ped_add_partial)^2)) +sqrt(mean((as.matrix(ped_add_classic) - as.matrix(ped_add_partial))^2)) ``` Next, we compare each method to the matrix from the complete pedigree. 
This evaluates how much each method deviates from the correct additive structure. ```{r} -corrplot(as.matrix(ped_add_classic_complete - ped_add_classic), +corrplot(as.matrix(ped_add_classic_complete) - as.matrix(ped_add_classic), method = "color", type = "lower", col.lim = c(0, 1), - is.corr = FALSE + is.corr = FALSE, + order = "hclust", + tl.pos = "l", tl.col = "black", tl.srt = 5, tl.cex = 0.2, + col = COL1("Reds", 100), mar = c(0, 0, 2, 0) ) sqrt(mean((ped_add_classic_complete - ped_add_classic)^2)) @@ -152,7 +173,10 @@ The RMSE between the true additive component and the classic method is `r sqrt(m ```{r} corrplot(as.matrix(ped_add_classic_complete - ped_add_partial), method = "color", type = "lower", col.lim = c(0, 1), - is.corr = FALSE + is.corr = FALSE, + order = "hclust", + tl.pos = "l", tl.col = "black", tl.srt = 5, tl.cex = 0.2, + col = COL1("Reds", 100), mar = c(0, 0, 2, 0) ) sqrt(mean((ped_add_classic_complete - ped_add_partial)^2)) @@ -190,17 +214,23 @@ ped_add_classic_dad <- ped_add_classic <- ped2com(df, ``` -As we can see, the two matrices are different. The RMSE between the two matrices is `r sqrt(mean((ped_add_classic-ped_add_partial)^2))`. +As we can see, the two matrices are different. The RMSE between the two matrices is `r sqrt(mean((as.matrix(ped_add_classic)-as.matrix(ped_add_partial))^2))`. 
```{r} corrplot(as.matrix(ped_add_classic_dad), method = "color", type = "lower", col.lim = c(0, 1), - is.corr = FALSE, title = "Classic (father removed)" + is.corr = FALSE, title = "Classic (father removed)", + order = "hclust", + tl.pos = "l", tl.col = "black", tl.srt = 5, tl.cex = 0.2, + col = COL1("Reds", 100), mar = c(0, 0, 2, 0) ) corrplot(as.matrix(ped_add_partial_dad), method = "color", type = "lower", col.lim = c(0, 1), - is.corr = FALSE, title = "Partial (father removed)" + is.corr = FALSE, title = "Partial (father removed)", + order = "hclust", + tl.pos = "l", tl.col = "black", tl.srt = 5, tl.cex = 0.2, + col = COL1("Reds", 100), mar = c(0, 0, 2, 0) ) ``` @@ -210,7 +240,10 @@ Again, we compare to the true matrix from the complete pedigree: ```{r} corrplot(as.matrix(ped_add_classic_complete - ped_add_classic), method = "color", type = "lower", col.lim = c(0, 1), - is.corr = FALSE + is.corr = FALSE, + order = "hclust", + tl.pos = "l", tl.col = "black", tl.srt = 5, tl.cex = 0.2, + col = COL1("Reds", 100), mar = c(0, 0, 2, 0) ) sqrt(mean((ped_add_classic_complete - ped_add_classic)^2)) @@ -220,7 +253,10 @@ sqrt(mean((ped_add_classic_complete - ped_add_classic)^2)) ```{r} corrplot(as.matrix(ped_add_classic_complete - ped_add_partial), method = "color", type = "lower", col.lim = c(0, 1), - is.corr = FALSE + is.corr = FALSE, + order = "hclust", + tl.pos = "l", tl.col = "black", tl.srt = 5, tl.cex = 0.2, + col = COL1("Reds", 100), mar = c(0, 0, 2, 0) ) sqrt(mean((ped_add_classic_complete - ped_add_partial)^2)) @@ -239,7 +275,7 @@ data("inbreeding") df <- inbreeding -FamIDs <- unique(df$FamID) +famIDs <- unique(df$famID) ``` For each one, we construct the additive relationship matrix under complete information and then simulate two missingness scenarios: @@ -255,16 +291,16 @@ In each condition, we recompute the additive matrix using both the classic and p ```{r} inbreeding_list <- list() results <- data.frame( - FamIDs = FamIDs, - RMSE_partial_dad = rep(NA, 
length(FamIDs)), - RMSE_partial_mom = rep(NA, length(FamIDs)), - RMSE_classic_dad = rep(NA, length(FamIDs)), - RMSE_classic_mom = rep(NA, length(FamIDs)), - max_R_classic_dad = rep(NA, length(FamIDs)), - max_R_partial_dad = rep(NA, length(FamIDs)), - max_R_classic_mom = rep(NA, length(FamIDs)), - max_R_partial_mom = rep(NA, length(FamIDs)), - max_R_classic = rep(NA, length(FamIDs)) + famIDs = famIDs, + RMSE_partial_dad = rep(NA, length(famIDs)), + RMSE_partial_mom = rep(NA, length(famIDs)), + RMSE_classic_dad = rep(NA, length(famIDs)), + RMSE_classic_mom = rep(NA, length(famIDs)), + max_R_classic_dad = rep(NA, length(famIDs)), + max_R_partial_dad = rep(NA, length(famIDs)), + max_R_classic_mom = rep(NA, length(famIDs)), + max_R_partial_mom = rep(NA, length(famIDs)), + max_R_classic = rep(NA, length(famIDs)) ) ``` @@ -272,9 +308,9 @@ The loop below performs this procedure for all families in the dataset and store ```{r} -for (i in 1:length(FamIDs)) { +for (i in 1:length(famIDs)) { # make three versions to filter down - df_fam_dad <- df_fam_mom <- df_fam <- df[df$FamID == FamIDs[i], ] + df_fam_dad <- df_fam_mom <- df_fam <- df[df$famID == famIDs[i], ] results$RMSE_partial_mom[i] <- sqrt(mean((ped_add_classic_complete - ped_add_partial_mom)^2)) @@ -345,7 +381,7 @@ for (i in 1:length(FamIDs)) { } ``` -### Example: Family ``r FamIDs[1]`` +### Example: Family ``r famIDs[1]`` To understand what these matrices look like, we visualize them for one representative family. For this example, we select the first family in the dataset. 
@@ -362,27 +398,42 @@ fam1 <- inbreeding_list[[1]] corrplot(as.matrix(fam1$ped_add_classic_complete), method = "color", type = "lower", col.lim = c(0, 1), - is.corr = FALSE, title = "Classic - Complete" + is.corr = FALSE, title = "Classic - Complete", + order = "hclust", + tl.pos = "l", tl.col = "black", tl.srt = 5, tl.cex = 0.2, + col = COL1("Reds", 100), mar = c(0, 0, 2, 0) ) corrplot(as.matrix(fam1$ped_add_classic_mom), method = "color", type = "lower", col.lim = c(0, 1), - is.corr = FALSE, title = "Classic - Mom Missing" + is.corr = FALSE, title = "Classic - Mom Missing", + order = "hclust", + tl.pos = "l", tl.col = "black", tl.srt = 5, tl.cex = 0.2, + col = COL1("Reds", 100), mar = c(0, 0, 2, 0) ) corrplot(as.matrix(fam1$ped_add_partial_mom), method = "color", type = "lower", col.lim = c(0, 1), - is.corr = FALSE, title = "Partial - Mom Missing" + is.corr = FALSE, title = "Partial - Mom Missing", + order = "hclust", + tl.pos = "l", tl.col = "black", tl.srt = 5, tl.cex = 0.2, + col = COL1("Reds", 100), mar = c(0, 0, 2, 0) ) corrplot(as.matrix(fam1$ped_add_classic_dad), method = "color", type = "lower", col.lim = c(0, 1), - is.corr = FALSE, title = "Classic - Dad Missing" + is.corr = FALSE, title = "Classic - Dad Missing", + order = "hclust", + tl.pos = "l", tl.col = "black", tl.srt = 5, tl.cex = 0.2, + col = COL1("Reds", 100), mar = c(0, 0, 2, 0) ) corrplot(as.matrix(fam1$ped_add_partial_dad), method = "color", type = "lower", col.lim = c(0, 1), - is.corr = FALSE, title = "Partial - Dad Missing" + is.corr = FALSE, title = "Partial - Dad Missing", + order = "hclust", + tl.pos = "l", tl.col = "black", tl.srt = 5, tl.cex = 0.2, + col = COL1("Reds", 100), mar = c(0, 0, 2, 0) ) ``` @@ -392,22 +443,34 @@ To visualize the differences from the true matrix: ```{r} corrplot(as.matrix(fam1$ped_add_classic_complete - fam1$ped_add_classic_mom), method = "color", type = "lower", col.lim = c(0, 1), - is.corr = FALSE, title = "Classic Mom Diff from Complete" + is.corr = FALSE, 
title = "Classic Mom Diff from Complete", + order = "hclust", + tl.pos = "l", tl.col = "black", tl.srt = 5, tl.cex = 0.2, + col = COL1("Reds", 100), mar = c(0, 0, 2, 0) ) corrplot(as.matrix(fam1$ped_add_classic_complete - fam1$ped_add_partial_mom), method = "color", type = "lower", col.lim = c(0, 1), - is.corr = FALSE, title = "Partial Mom Diff from Complete" + is.corr = FALSE, title = "Partial Mom Diff from Complete", + order = "hclust", + tl.pos = "l", tl.col = "black", tl.srt = 5, tl.cex = 0.2, + col = COL1("Reds", 100), mar = c(0, 0, 2, 0) ) corrplot(as.matrix(fam1$ped_add_classic_complete - fam1$ped_add_classic_dad), method = "color", type = "lower", col.lim = c(0, 1), - is.corr = FALSE, title = "Classic Dad Diff from Complete" + is.corr = FALSE, title = "Classic Dad Diff from Complete", + order = "hclust", + tl.pos = "l", tl.col = "black", tl.srt = 5, tl.cex = 0.2, + col = COL1("Reds", 100), mar = c(0, 0, 2, 0) ) corrplot(as.matrix(fam1$ped_add_classic_complete - fam1$ped_add_partial_dad), method = "color", type = "lower", col.lim = c(0, 1), - is.corr = FALSE, title = "Partial Dad Diff from Complete" + is.corr = FALSE, title = "Partial Dad Diff from Complete", + order = "hclust", + tl.pos = "l", tl.col = "black", tl.srt = 5, tl.cex = 0.2, + col = COL1("Reds", 100), mar = c(0, 0, 2, 0) ) ``` @@ -456,7 +519,7 @@ These proportions show how often the partial method produces a lower RMSE across results %>% as.data.frame() %>% select( - -FamIDs, -RMSE_diff_mom, -RMSE_diff_dad, -max_R_classic_dad, + -famIDs, -RMSE_diff_mom, -RMSE_diff_dad, -max_R_classic_dad, -max_R_partial_dad, -max_R_classic_mom, -max_R_partial_mom, -max_R_classic ) %>% summary() diff --git a/vignettes/partial.html b/vignettes/partial.html index aaa1fbec..3e0901a1 100644 --- a/vignettes/partial.html +++ b/vignettes/partial.html @@ -377,12 +377,14 @@

Hazard Data Example

ped_add_partial_complete <- ped2com(df,
   isChild_method = "partialparent",
   component = "additive",
-  adjacency_method = "direct"
-)
-ped_add_classic_complete <- ped2com(df,
-  isChild_method = "classic",
-  component = "additive", adjacency_method = "direct"
-)
+ adjacency_method = "direct", + sparse = FALSE +) +ped_add_classic_complete <- ped2com(df, + isChild_method = "classic", + component = "additive", adjacency_method = "direct", + sparse = FALSE +)

The following plots display the full additive matrices. These matrices should be identical.

This can be confirmed visually and numerically.

@@ -392,25 +394,31 @@

Hazard Data Example

corrplot(as.matrix(ped_add_classic_complete), method = "color", type = "lower", col.lim = c(0, 1), - is.corr = FALSE, title = "Additive component - Classic method" -) -

+ is.corr = FALSE, title = "Additive component - Classic method", + order = "hclust", + tl.pos = "l", tl.col = "black", tl.srt = 5, tl.cex = 0.2, + col = COL1("Reds", 100), mar = c(0, 0, 2, 0) +) +


-corrplot(as.matrix(ped_add_partial_complete),
-  method = "color", type = "lower", col.lim = c(0, 1),
-  is.corr = FALSE, title = "Additive component - Partial parent method"
-)
-

+ +corrplot(as.matrix(ped_add_partial_complete), + method = "color", type = "lower", col.lim = c(0, 1), + is.corr = FALSE, title = "Additive component - Partial parent method", + order = "hclust", + tl.pos = "l", tl.col = "black", tl.srt = 5, tl.cex = 0.2, + col = COL1("Reds", 100), mar = c(0, 0, 2, 0) +) +

To verify this, we subtract one matrix from the other and calculate RMSE. The difference should be numerically zero. Indeed, it is 0.

-
corrplot(as.matrix(ped_add_classic_complete - ped_add_partial_complete),
+
corrplot((as.matrix(ped_add_classic_complete) - as.matrix(ped_add_partial_complete)),
   method = "color", type = "lower", col.lim = c(0, 1),
-  is.corr = FALSE
-)
-#> Warning in corrplot(as.matrix(ped_add_classic_complete -
-#> ped_add_partial_complete), : col.lim interval too wide, please set a suitable
-#> value
-

+ is.corr = FALSE, order = "hclust", + tl.pos = "l", tl.col = "black", tl.srt = 5, tl.cex = 0.2, + col = COL1("Reds", 100), mar = c(0, 0, 2, 0) +)
+

Introducing Missingness: Remove a Parent

@@ -420,13 +428,15 @@

Introducing Missingness: Remove a Parent

ped_add_partial_mom <- ped_add_partial <- ped2com(df,
   isChild_method = "partialparent",
   component = "additive",
-  adjacency_method = "direct"
-)
-
-ped_add_classic_mom <- ped_add_classic <- ped2com(df,
-  isChild_method = "classic",
-  component = "additive", adjacency_method = "direct"
-)
+ adjacency_method = "direct", + sparse = FALSE +) + +ped_add_classic_mom <- ped_add_classic <- ped2com(df, + isChild_method = "classic", + component = "additive", adjacency_method = "direct", + sparse = FALSE +)

The two methods now treat individual 4 differently in the parent adjacency matrix. The classic method applies a fixed contribution because one parent remains. The partial parent method inflates the @@ -436,26 +446,34 @@

Introducing Missingness: Remove a Parent

between the two matrices is 0.009811.

corrplot(as.matrix(ped_add_classic),
   method = "color", type = "lower", col.lim = c(0, 1),
-  is.corr = FALSE, title = "Classic (mother removed)"
-)
-

-

-corrplot(as.matrix(ped_add_partial),
-  method = "color", type = "lower", col.lim = c(0, 1),
-  is.corr = FALSE, title = "Partial (mother removed)"
-)
-

+ is.corr = FALSE, title = "Classic (mother removed)", + order = "hclust", + tl.pos = "l", tl.col = "black", tl.srt = 5, tl.cex = 0.2, + col = COL1("Reds", 100), mar = c(0, 0, 2, 0) +) +

+
corrplot(as.matrix(ped_add_partial),
+  method = "color", type = "lower", col.lim = c(0, 1),
+  is.corr = FALSE, title = "Partial (mother removed)",
+  order = "hclust",
+  tl.pos = "l", tl.col = "black", tl.srt = 5, tl.cex = 0.2,
+  col = COL1("Reds", 100), mar = c(0, 0, 2, 0)
+)
+

We quantify the overall matrix difference:

-
sqrt(mean((ped_add_classic - ped_add_partial)^2))
+
sqrt(mean((as.matrix(ped_add_classic) - as.matrix(ped_add_partial))^2))
 #> [1] 0.009811047

Next, we compare each method to the matrix from the complete pedigree. This evaluates how much each method deviates from the correct additive structure.

-
corrplot(as.matrix(ped_add_classic_complete - ped_add_classic),
+
corrplot(as.matrix(ped_add_classic_complete) - as.matrix(ped_add_classic),
   method = "color", type = "lower", col.lim = c(0, 1),
-  is.corr = FALSE
-)
-

+ is.corr = FALSE, + order = "hclust", + tl.pos = "l", tl.col = "black", tl.srt = 5, tl.cex = 0.2, + col = COL1("Reds", 100), mar = c(0, 0, 2, 0) +)
+


 sqrt(mean((ped_add_classic_complete - ped_add_classic)^2))
 #> [1] 0.02991371
@@ -463,9 +481,12 @@

Introducing Missingness: Remove a Parent

is 0.0299137.

corrplot(as.matrix(ped_add_classic_complete - ped_add_partial),
   method = "color", type = "lower", col.lim = c(0, 1),
-  is.corr = FALSE
-)
-

+ is.corr = FALSE, + order = "hclust", + tl.pos = "l", tl.col = "black", tl.srt = 5, tl.cex = 0.2, + col = COL1("Reds", 100), mar = c(0, 0, 2, 0) +)
+


 sqrt(mean((ped_add_classic_complete - ped_add_partial)^2))
 #> [1] 0.02825904
@@ -499,32 +520,48 @@

Removing the Father Instead

two matrices is 0.009811.

corrplot(as.matrix(ped_add_classic_dad),
   method = "color", type = "lower", col.lim = c(0, 1),
-  is.corr = FALSE, title = "Classic (father removed)"
-)
-

+ is.corr = FALSE, title = "Classic (father removed)", + order = "hclust", + tl.pos = "l", tl.col = "black", tl.srt = 5, tl.cex = 0.2, + col = COL1("Reds", 100), mar = c(0, 0, 2, 0) +) +


 corrplot(as.matrix(ped_add_partial_dad),
   method = "color", type = "lower", col.lim = c(0, 1),
-  is.corr = FALSE, title = "Partial (father removed)"
-)
-

+ is.corr = FALSE, title = "Partial (father removed)", + order = "hclust", + tl.pos = "l", tl.col = "black", tl.srt = 5, tl.cex = 0.2, + col = COL1("Reds", 100), mar = c(0, 0, 2, 0) +) +

Again, we compare to the true matrix from the complete pedigree:

corrplot(as.matrix(ped_add_classic_complete - ped_add_classic),
   method = "color", type = "lower", col.lim = c(0, 1),
-  is.corr = FALSE
-)
-

+ is.corr = FALSE, + order = "hclust", + tl.pos = "l", tl.col = "black", tl.srt = 5, tl.cex = 0.2, + col = COL1("Reds", 100), mar = c(0, 0, 2, 0) +) +


 sqrt(mean((ped_add_classic_complete - ped_add_classic)^2))
-#> [1] 0.02991371
+#> Warning in mean.default((ped_add_classic_complete - ped_add_classic)^2): +#> argument is not numeric or logical: returning NA +#> [1] NA
corrplot(as.matrix(ped_add_classic_complete - ped_add_partial),
   method = "color", type = "lower", col.lim = c(0, 1),
-  is.corr = FALSE
-)
-

+ is.corr = FALSE, + order = "hclust", + tl.pos = "l", tl.col = "black", tl.srt = 5, tl.cex = 0.2, + col = COL1("Reds", 100), mar = c(0, 0, 2, 0) +) +


 sqrt(mean((ped_add_classic_complete - ped_add_partial)^2))
-#> [1] 0.02825904
+#> Warning in mean.default((ped_add_classic_complete - ped_add_partial)^2): +#> argument is not numeric or logical: returning NA +#> [1] NA

The partial parent method again yields a matrix closer to the full-data version.

@@ -538,7 +575,7 @@

Inbreeding Dataset: Family-Level Evaluation

df <- inbreeding -FamIDs <- unique(df$FamID) +famIDs <- unique(df$famID)

For each one, we construct the additive relationship matrix under complete information and then simulate two missingness scenarios: