diff --git a/.gitignore b/.gitignore index f84ea082..22835dfa 100644 --- a/.gitignore +++ b/.gitignore @@ -1,19 +1,22 @@ -.Rproj.user +*.ASOIAF.ged +*.Rproj +*.knit.md .DS_Store -data-raw/ASOIAF.ged -R/.Rhistory +.Rdata .Rhistory -paper/paper.html +.Rproj.user +.httr-oauth +.quarto +.vscode/launch.json /Meta/ -*.knit.md -vignettes/articles/paper.html BGmisc.code-workspace -tests/testthat/Rplots.pdf -*.ASOIAF.ged -ASOIAF.ged -*.Rproj +R/.Rhistory benchmark_results.csv -.vscode/launch.json -dataRelatedPairs_new2.csv +data-raw/ASOIAF.ged data-raw/ASOIAF_040725.ged dataRelatedPairs.csv +dataRelatedPairs_new2.csv +paper/paper.html +tests/testthat/Rplots.pdf +vignettes/articles/paper.html +.vscode/settings.json diff --git a/NAMESPACE b/NAMESPACE index 69e23307..879d7895 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,8 +1,8 @@ # Generated by roxygen2: do not edit by hand export(SimPed) -export(allGens) export(assignCoupleIDs) +export(buildTreeGrid) export(calcAllGens) export(calcFamilySize) export(calculateRelatedness) @@ -12,18 +12,17 @@ export(checkPedigreeNetwork) export(checkSex) export(com2links) export(comp2vech) +export(computeParentAdjacency) export(createGenDataFrame) export(dropLink) export(evenInsert) -export(extractSummaryText) -export(famSizeCal) export(fitComponentModel) +export(getWikiTreeSummary) export(identifyComponentModel) export(inferRelatedness) export(insertEven) export(makeInbreeding) export(makeTwins) -export(parseTree) export(ped2add) export(ped2ce) export(ped2cn) @@ -44,7 +43,6 @@ export(relatedness) export(repairSex) export(resample) export(simulatePedigree) -export(sizeAllGens) export(summariseFamilies) export(summariseMatrilines) export(summarisePatrilines) @@ -53,6 +51,7 @@ export(summarizeFamilies) export(summarizeMatrilines) export(summarizePatrilines) export(summarizePedigrees) +export(traceTreePaths) export(vech) import(data.table) import(kinship2) diff --git a/NEWS.md b/NEWS.md index c0daa9a2..7693f0eb 100644 --- a/NEWS.md +++ b/NEWS.md @@ -7,6 +7,7 @@ * list SimPed and related_coef as aliases for functions * harmonizing function names like calcFamilySize from famSizeCal * implemented adjBeta function to evaluation alternative build method +* reorganize file names to be more consistent # BGmisc 1.3.5.1 * Setting the default for the `sparse` argument in `ped2com()` to TRUE diff --git a/R/buildComponent.R b/R/buildComponent.R new file mode 100644 index 00000000..b57e90dc --- /dev/null +++ b/R/buildComponent.R @@ -0,0 +1,504 @@ +#' Take a pedigree and turn it into a relatedness matrix +#' @param ped a pedigree dataset. Needs ID, momID, and dadID columns +#' @param component character. Which component of the pedigree to return. See Details. +#' @param max.gen the maximum number of generations to compute +#' (e.g., only up to 4th degree relatives). The default is 25. However it can be set to infinity. +#' `Inf` uses as many generations as there are in the data. +#' @param sparse logical. If TRUE, use and return sparse matrices from Matrix package +#' @param verbose logical. If TRUE, print progress through stages of algorithm +#' @param update_rate numeric. The rate at which to print progress +#' @param gc logical. If TRUE, do frequent garbage collection via \code{\link{gc}} to save memory +#' @param saveable logical. If TRUE, save the intermediate results to disk +#' @param save_rate numeric. The rate at which to save the intermediate results +#' @param save_rate_gen numeric. The rate at which to save the intermediate results by generation. If NULL, defaults to save_rate +#' @param save_rate_parlist numeric. The rate at which to save the intermediate results by parent list. If NULL, defaults to save_rate*1000 +#' @param resume logical. If TRUE, resume from a checkpoint +#' @param save_path character. The path to save the checkpoint files +#' @param flatten.diag logical. If TRUE, overwrite the diagonal of the final relatedness matrix with ones +#' @param standardize.colnames logical. If TRUE, standardize the column names of the pedigree dataset +#' @param transpose_method character. The method to use for computing the transpose. Options are "tcrossprod", "crossprod", or "star" +#' @param adjacency_method character. The method to use for computing the adjacency matrix. Options are "loop", "indexed", direct or beta +#' @param isChild_method character. The method to use for computing the isChild matrix. Options are "classic" or "partialparent" +#' @param adjBeta_method numeric The method to use for computing the building the adjacency_method matrix when using the "beta" build +#' @param ... additional arguments to be passed to \code{\link{ped2com}} +#' @details The algorithms and methodologies used in this function are further discussed and exemplified in the vignette titled "examplePedigreeFunctions". For more advanced scenarios and detailed explanations, consult this vignette. +#' @export +#' +ped2com <- function(ped, component, + max.gen = 25, + sparse = TRUE, + verbose = FALSE, + gc = FALSE, + flatten.diag = FALSE, + standardize.colnames = TRUE, + transpose_method = "tcrossprod", + adjacency_method = "direct", + isChild_method = "classic", + saveable = FALSE, + resume = FALSE, + save_rate = 5, + save_rate_gen = save_rate, + save_rate_parlist = 100000 * save_rate, + update_rate = 100, + save_path = "checkpoint/", + adjBeta_method = NULL, + ...) { + #------ + # Checkpointing + #------ + if (saveable || resume) { # prepare checkpointing + if (verbose) cat("Preparing checkpointing...\n") + # Ensure save path exists + if (saveable && !dir.exists(save_path)) { + if (verbose) cat("Creating save path...\n") + dir.create(save_path, recursive = TRUE) + } else if (resume && !dir.exists(save_path)) { + stop("Cannot resume from checkpoint. Save path does not exist.") + } + + # Define checkpoint files + checkpoint_files <- list( + parList = file.path(save_path, "parList.rds"), + lens = file.path(save_path, "lens.rds"), + isPar = file.path(save_path, "isPar.rds"), + iss = file.path(save_path, "iss.rds"), + jss = file.path(save_path, "jss.rds"), + isChild = file.path(save_path, "isChild.rds"), + r_checkpoint = file.path(save_path, "r_checkpoint.rds"), + gen_checkpoint = file.path(save_path, "gen_checkpoint.rds"), + newIsPar_checkpoint = file.path(save_path, "newIsPar_checkpoint.rds"), + mtSum_checkpoint = file.path(save_path, "mtSum_checkpoint.rds"), + r2_checkpoint = file.path(save_path, "r2_checkpoint.rds"), + tcrossprod_checkpoint = file.path(save_path, "tcrossprod_checkpoint.rds"), + count_checkpoint = file.path(save_path, "count_checkpoint.rds"), + final_matrix = file.path(save_path, "final_matrix.rds") + ) + } + #------ + # Validation/Preparation + #------ + + # Validate the 'component' argument and match it against predefined choices + component <- match.arg(tolower(component), + choices = c( + "generation", + "additive", + "common nuclear", + "mitochondrial" + ) + ) + + if (!transpose_method %in% c("tcrossprod", "crossprod", "star", "tcross.alt.crossprod", "tcross.alt.star")) { + stop("Invalid method specified. Choose from 'tcrossprod', 'crossprod', or 'star' or 'tcross.alt.crossprod' or 'tcross.alt.star'.") + } + if (!adjacency_method %in% c("indexed", "loop", "direct", "beta")) { + stop("Invalid method specified. Choose from 'indexed', 'loop', 'direct', or 'beta'.") + } + + # standardize colnames + if (standardize.colnames) { + ped <- standardizeColnames(ped, verbose = verbose) + } + + # Load final result if computation was completed + if (resume && file.exists(checkpoint_files$final_matrix)) { + if (verbose) cat("Loading final computed matrix...\n") + return(readRDS(checkpoint_files$final_matrix)) + } + + + #------ + # Algorithm + #------ + + # Get the number of rows in the pedigree dataset, representing the size of the family + nr <- nrow(ped) + + # Print the family size if verbose is TRUE + if (verbose) { + cat(paste0("Family Size = ", nr, "\n")) + } + + # Step 1: Construct parent-child adjacency matrix + ## A. Resume from Checkpoint if Needed + if (resume && file.exists(checkpoint_files$parList) && file.exists(checkpoint_files$lens)) { + if (verbose) cat("Resuming: Loading parent-child adjacency data...\n") + parList <- readRDS(checkpoint_files$parList) + lens <- readRDS(checkpoint_files$lens) + computed_indices <- which(!sapply(parList, is.null)) + lastComputed <- if (length(computed_indices) > 0) max(computed_indices) else 0 + if (verbose) cat("Resuming from iteration", lastComputed + 1, "\n") + } else { + ## Initialize variables + parList <- vector("list", nr) + lens <- integer(nr) + lastComputed <- 0 + + if (verbose) cat("Building parent adjacency matrix...\n") + } + + + ## B. Resume loop from the next uncomputed index + + if (verbose) cat("Computing parent-child adjacency matrix...\n") + # Construct sparse matrix + if (resume && file.exists(checkpoint_files$iss) && file.exists(checkpoint_files$jss)) { # fix to check actual + if (verbose) cat("Resuming: Constructed matrix...\n") + jss <- readRDS(checkpoint_files$jss) + iss <- readRDS(checkpoint_files$iss) + list_of_adjacencies <- list(iss = iss, jss = jss) + } else { + list_of_adjacencies <- computeParentAdjacency( + ped = ped, + save_rate_parlist = save_rate_parlist, + checkpoint_files = checkpoint_files, + component = component, + adjacency_method = adjacency_method, # adjacency_method, + saveable = saveable, + resume = resume, + save_path = save_path, + update_rate = update_rate, + verbose = verbose, + lastComputed = lastComputed, + nr = nr, + parList = parList, + lens = lens, + adjBeta_method = adjBeta_method + ) + + # Construct sparse matrix + iss <- list_of_adjacencies$iss + jss <- list_of_adjacencies$jss + + if (verbose) { + cat("Constructed sparse matrix\n") + } + if (saveable) { + saveRDS(jss, file = checkpoint_files$jss) + saveRDS(iss, file = checkpoint_files$iss) + } + # Garbage collection if gc is TRUE + if (gc) { + rm(parList, lens, list_of_adjacencies) + gc() + } + } + # Set parent values depending on the component type + if (component %in% c("generation", "additive")) { + parVal <- .5 + } else if (component %in% c("common nuclear", "mitochondrial")) { + parVal <- 1 + } else { + stop("Don't know how to set parental value") + } + # Construct sparse matrix + if (resume && file.exists(checkpoint_files$isPar)) { + if (verbose) cat("Resuming: Loading adjacency matrix...\n") + isPar <- readRDS(checkpoint_files$isPar) + } else { + # Initialize adjacency matrix for parent-child relationships + isPar <- Matrix::sparseMatrix( + i = iss, + j = jss, + x = parVal, + dims = c(nr, nr), + dimnames = list(ped$ID, ped$ID) + ) + if (verbose) { + cat("Completed first degree relatives (adjacency)\n") + } + if (saveable) { + saveRDS(isPar, file = checkpoint_files$isPar) + } + } + + # isPar is the adjacency matrix. 'A' matrix from RAM + if (component %in% c("common nuclear")) { + Matrix::diag(isPar) <- 1 + if (sparse == FALSE) { + isPar <- as.matrix(isPar) + } + return(isPar) + } + + if (resume && file.exists(checkpoint_files$isChild)) { + if (verbose) cat("Resuming: Loading isChild matrix...\n") + isChild <- readRDS(checkpoint_files$isChild) + } else { + # isChild is the 'S' matrix from RAM + + isChild <- isChild(isChild_method = isChild_method, ped = ped) + + if (saveable) { + saveRDS(isChild, file = checkpoint_files$isChild) + } + } + # --- Step 2: Compute Relatedness Matrix --- + if (resume && file.exists(checkpoint_files$r_checkpoint) && file.exists(checkpoint_files$gen_checkpoint) && file.exists(checkpoint_files$mtSum_checkpoint) && file.exists(checkpoint_files$newIsPar_checkpoint) && + file.exists(checkpoint_files$count_checkpoint) + ) { + if (verbose) cat("Resuming: Loading previous computation...\n") + r <- readRDS(checkpoint_files$r_checkpoint) + gen <- readRDS(checkpoint_files$gen_checkpoint) + mtSum <- readRDS(checkpoint_files$mtSum_checkpoint) + newIsPar <- readRDS(checkpoint_files$newIsPar_checkpoint) + count <- readRDS(checkpoint_files$count_checkpoint) + } else { + r <- Matrix::Diagonal(x = 1, n = nr) + gen <- rep(1, nr) + mtSum <- sum(r, na.rm = TRUE) + newIsPar <- isPar + count <- 0 + } + maxCount <- max.gen + 1 + if (verbose) { + cat("About to do RAM path tracing\n") + } + + # r is I + A + A^2 + ... = (I-A)^-1 from RAM + # could trim, here + while (mtSum != 0 && count < maxCount) { + r <- r + newIsPar + gen <- gen + (Matrix::rowSums(newIsPar) > 0) + newIsPar <- newIsPar %*% isPar + mtSum <- sum(newIsPar) + count <- count + 1 + if (verbose) { + cat(paste0("Completed ", count - 1, " degree relatives\n")) + } + # Save progress every save_rate iterations + if (saveable && (count %% save_rate_gen == 0)) { + saveRDS(r, file = checkpoint_files$r_checkpoint) + saveRDS(gen, file = checkpoint_files$gen_checkpoint) + saveRDS(newIsPar, file = checkpoint_files$newIsPar_checkpoint) + saveRDS(mtSum, file = checkpoint_files$mtSum_checkpoint) + saveRDS(count, file = checkpoint_files$count_checkpoint) + } + } + # compute rsq <- r %*% sqrt(diag(isChild)) + # compute rel <- tcrossprod(rsq) + if (gc) { + rm(isPar, newIsPar) + gc() + } + + if (component == "generation") { # no need to do the rest + return(gen) + } else { + if (verbose) { + cat("Completed RAM path tracing\n") + } + } + + # --- Step 3: I-A inverse times diagonal multiplication --- + if (resume && file.exists(checkpoint_files$r2_checkpoint)) { + if (verbose) cat("Resuming: Loading I-A inverse...\n") + r2 <- readRDS(checkpoint_files$r2_checkpoint) + } else { + if (verbose) { + cat("Doing I-A inverse times diagonal multiplication\n") + } + r2 <- r %*% Matrix::Diagonal(x = sqrt(isChild), n = nr) + if (gc) { + rm(r, isChild) + gc() + } + if (saveable) { + saveRDS(r2, file = checkpoint_files$r2_checkpoint) + } + } + + # --- Step 4: T crossproduct --- + + if (resume && file.exists(checkpoint_files$tcrossprod_checkpoint) && component != "generation") { + if (verbose) cat("Resuming: Loading tcrossprod...\n") + r <- readRDS(checkpoint_files$tcrossprod_checkpoint) + } else { + r <- .computeTranspose(r2 = r2, transpose_method = transpose_method, verbose = verbose) + if (saveable) { + saveRDS(r, file = checkpoint_files$tcrossprod_checkpoint) + } + } + + + if (component == "mitochondrial") { + r@x <- rep(1, length(r@x)) + # Assign 1 to all nonzero elements for mitochondrial component + } + + if (sparse == FALSE) { + r <- as.matrix(r) + } + if (flatten.diag) { # flattens diagonal if you don't want to deal with inbreeding + diag(r) <- 1 + } + if (saveable) { + saveRDS(r, file = checkpoint_files$final_matrix) + } + return(r) +} + +#' Take a pedigree and turn it into an additive genetics relatedness matrix +#' @inheritParams ped2com +#' @inherit ped2com details +#' @export +#' +ped2add <- function(ped, max.gen = 25, sparse = TRUE, verbose = FALSE, + gc = FALSE, + flatten.diag = FALSE, standardize.colnames = TRUE, + transpose_method = "tcrossprod", + adjacency_method = "direct", + saveable = FALSE, + resume = FALSE, + save_rate = 5, + save_rate_gen = save_rate, + save_rate_parlist = 1000 * save_rate, + save_path = "checkpoint/", + ...) { + ped2com( + ped = ped, + max.gen = max.gen, + sparse = sparse, + verbose = verbose, + gc = gc, + component = "additive", + flatten.diag = flatten.diag, + standardize.colnames = standardize.colnames, + transpose_method = transpose_method, + adjacency_method = adjacency_method, + saveable = saveable, + resume = resume, + save_rate_gen = save_rate_gen, + save_rate_parlist = save_rate_parlist, + save_path = save_path + ) +} + +#' Take a pedigree and turn it into a mitochondrial relatedness matrix +#' @inheritParams ped2com +#' @inherit ped2com details +#' @export +#' @aliases ped2mt +#' +ped2mit <- ped2mt <- function(ped, max.gen = 25, + sparse = TRUE, + verbose = FALSE, gc = FALSE, + flatten.diag = FALSE, + standardize.colnames = TRUE, + transpose_method = "tcrossprod", + adjacency_method = "direct", + saveable = FALSE, + resume = FALSE, + save_rate = 5, + save_rate_gen = save_rate, + save_rate_parlist = 1000 * save_rate, + save_path = "checkpoint/", + ...) { + ped2com( + ped = ped, + max.gen = max.gen, + sparse = sparse, + verbose = verbose, + gc = gc, + component = "mitochondrial", + flatten.diag = flatten.diag, + standardize.colnames = standardize.colnames, + transpose_method = transpose_method, + adjacency_method = adjacency_method, + saveable = saveable, + resume = resume, + save_rate_gen = save_rate_gen, + save_rate_parlist = save_rate_parlist, + save_path = save_path, + ... + ) +} + +#' Take a pedigree and turn it into a common nuclear environmental relatedness matrix +#' @inheritParams ped2com +#' @inherit ped2com details +#' @export +#' +ped2cn <- function(ped, max.gen = 25, sparse = TRUE, verbose = FALSE, + gc = FALSE, flatten.diag = FALSE, + standardize.colnames = TRUE, + transpose_method = "tcrossprod", + saveable = FALSE, + resume = FALSE, + save_rate = 5, + adjacency_method = "direct", + save_rate_gen = save_rate, + save_rate_parlist = 1000 * save_rate, + save_path = "checkpoint/", + ...) { + ped2com( + ped = ped, + max.gen = max.gen, + sparse = sparse, + verbose = verbose, + gc = gc, + component = "common nuclear", + adjacency_method = adjacency_method, + flatten.diag = flatten.diag, + standardize.colnames = standardize.colnames, + transpose_method = transpose_method, + saveable = saveable, + resume = resume, + save_rate_gen = save_rate_gen, + save_rate_parlist = save_rate_parlist, + save_path = save_path, + ... + ) +} +#' Take a pedigree and turn it into an extended environmental relatedness matrix +#' @inheritParams ped2com +#' @inherit ped2com details +#' @export +#' +ped2ce <- function(ped, + ...) { + matrix(1, nrow = nrow(ped), ncol = nrow(ped), dimnames = list(ped$ID, ped$ID)) +} + + +#' Compute the transpose multiplication for the relatedness matrix +#' @inheritParams ped2com +#' @inherit ped2com details +#' @param r2 a relatedness matrix +#' +.computeTranspose <- function(r2, transpose_method = "tcrossprod", verbose = FALSE) { + valid_methods <- c("tcrossprod", "crossprod", "star", + "tcross.alt.crossprod", "tcross.alt.star") + if (!transpose_method %in% valid_methods) { + stop("Invalid method specified. Choose from 'tcrossprod', 'crossprod', 'star', 'tcross.alt.crossprod', or 'tcross.alt.star'.") + } + + # Map aliases to core methods + alias_map <- c( + "tcross.alt.crossprod" = "crossprod", + "tcross.alt.star" = "star" + ) + + if (transpose_method %in% names(alias_map)) { + method_normalized <- alias_map[[transpose_method]] + } else { + method_normalized <- transpose_method + } + + result <- switch(method_normalized, + "tcrossprod" = { + if (verbose) cat("Doing tcrossprod\n") + Matrix::tcrossprod(r2) + }, + "crossprod" = { + if (verbose) cat("Doing tcrossprod using crossprod(t(.))\n") + crossprod(t(as.matrix(r2))) + }, + "star" = { + if (verbose) cat("Doing tcrossprod using %*% t(.)\n") + r2 %*% t(as.matrix(r2)) + } + ) + + return(result) +} diff --git a/R/calculateFamilySize.R b/R/calculateFamilySize.R index 06cc3f16..c346cb66 100644 --- a/R/calculateFamilySize.R +++ b/R/calculateFamilySize.R @@ -1,4 +1,4 @@ -#' allGens +#' calcAllGens #' A function to calculate the number of individuals in each generation. This is a supporting function for \code{simulatePedigree}. #' @param kpc Number of kids per couple (integer >= 2). #' @param Ngen Number of generations (integer >= 1). @@ -24,10 +24,9 @@ calcAllGens <- function(kpc, Ngen, marR) { return(allGens) } #' @rdname calcAllGens -#' @export allGens <- calcAllGens -#' sizeAllGens +#' calcFamilySizeByGen #' An internal supporting function for \code{simulatePedigree}. #' @inheritParams calcAllGens #' @return Returns a vector including the number of individuals in every generation. @@ -49,10 +48,9 @@ calcFamilySizeByGen <- function(kpc, Ngen, marR) { return(allGens) } #' @rdname calcFamilySizeByGen -#' @export sizeAllGens <- calcFamilySizeByGen -#' famSizeCal +#' calcFamilySize #' A function to calculate the total number of individuals in a pedigree given parameters. This is a supporting function for function \code{simulatePedigree} #' @inheritParams calcAllGens #' @return Returns a numeric value indicating the total pedigree size. @@ -77,6 +75,5 @@ calcFamilySize <- function(kpc, Ngen, marR) { } #' @rdname calcFamilySize -#' @export #' famSizeCal <- calcFamilySize diff --git a/R/computeRelatedness.R b/R/calculateRelatedness.R similarity index 99% rename from R/computeRelatedness.R rename to R/calculateRelatedness.R index 3fa91ffd..d1da241d 100644 --- a/R/computeRelatedness.R +++ b/R/calculateRelatedness.R @@ -107,7 +107,6 @@ inferRelatedness <- function(obsR, aceA = .9, aceC = 0, sharedC = 0) { } #' @rdname inferRelatedness -#' @export relatedness <- function(...) { warning("The 'relatedness' function is deprecated. Please use 'inferRelatedness' instead.") inferRelatedness(...) diff --git a/R/constructAdjacency.R b/R/constructAdjacency.R new file mode 100644 index 00000000..6a2c337b --- /dev/null +++ b/R/constructAdjacency.R @@ -0,0 +1,548 @@ +.adjLoop <- function(ped, component, saveable, resume, + save_path, verbose, lastComputed, + nr, checkpoint_files, update_rate, + parList, lens, save_rate_parlist, + ...) { + # Loop through each individual in the pedigree + # Build the adjacency matrix for parent-child relationships + # Is person in column j the parent of the person in row i? .5 for yes, 0 for no. + ped$momID <- as.numeric(ped$momID) + ped$dadID <- as.numeric(ped$dadID) + ped$ID <- as.numeric(ped$ID) + + for (i in (lastComputed + 1):nr) { + x <- ped[i, , drop = FALSE] + # Handle parentage according to the 'component' specified + if (component %in% c("generation", "additive")) { + # Code for 'generation' and 'additive' components + # Checks if is mom of ID or is dad of ID + xID <- as.numeric(x["ID"]) + sMom <- (xID == ped$momID) + sDad <- (xID == ped$dadID) + val <- sMom | sDad + val[is.na(val)] <- FALSE + } else if (component %in% c("common nuclear")) { + # Code for 'common nuclear' component + # IDs have the Same mom and Same dad + sMom <- (as.numeric(x["momID"]) == ped$momID) + sMom[is.na(sMom)] <- FALSE + sDad <- (as.numeric(x["dadID"]) == ped$dadID) + sDad[is.na(sDad)] <- FALSE + val <- sMom & sDad + } else if (component %in% c("mitochondrial")) { + # Code for 'mitochondrial' component + val <- (as.numeric(x["ID"]) == ped$momID) + val[is.na(val)] <- FALSE + } else { + stop("Unknown relatedness component requested") + } + # Storing the indices of the parent-child relationships + # Keep track of indices only, and then initialize a single sparse matrix + wv <- which(val) + parList[[i]] <- wv + lens[i] <- length(wv) + # Print progress if verbose is TRUE + if (verbose && (i %% update_rate == 0)) { + cat(paste0("Done with ", i, " of ", nr, "\n")) + } + # Checkpointing every save_rate iterations + if (saveable && (i %% save_rate_parlist == 0)) { + saveRDS(parList, file = checkpoint_files$parList) + saveRDS(lens, file = checkpoint_files$lens) + if (verbose) cat("Checkpointed parlist saved at iteration", i, "\n") + } + } + jss <- rep(1L:nr, times = lens) + iss <- unlist(parList) + list_of_adjacency <- list(iss = iss, jss = jss) + return(list_of_adjacency) +} + +.adjIndexed <- function(ped, component, saveable, resume, + save_path, verbose, lastComputed, + nr, checkpoint_files, update_rate, + parList, lens, save_rate_parlist) { + # Loop through each individual in the pedigree + # Build the adjacency matrix for parent-child relationships + # Is person in column j the parent of the person in row i? .5 for yes, 0 for no. + + # Convert IDs + ped$ID <- as.numeric(ped$ID) + ped$momID <- as.numeric(ped$momID) + ped$dadID <- as.numeric(ped$dadID) + + # parent-child lookup + mom_index <- match(ped$momID, ped$ID, nomatch = 0) + dad_index <- match(ped$dadID, ped$ID, nomatch = 0) + + for (i in (lastComputed + 1):nr) { + if (component %in% c("generation", "additive")) { + sMom <- (mom_index == i) + sDad <- (dad_index == i) + val <- sMom | sDad + } else if (component %in% c("common nuclear")) { + # Code for 'common nuclear' component + # IDs have the Same mom and Same dad + sMom <- (ped$momID[i] == ped$momID) + sMom[is.na(sMom)] <- FALSE + sDad <- (ped$dadID[i] == ped$dadID) + sDad[is.na(sDad)] <- FALSE + val <- sMom & sDad + } else if (component %in% c("mitochondrial")) { + val <- (mom_index == i) + } else { + stop("Unknown relatedness component requested") + } + + val[is.na(val)] <- FALSE + parList[[i]] <- which(val) + lens[i] <- length(parList[[i]]) + + # Print progress if verbose is TRUE + if (verbose && (i %% update_rate == 0)) { + cat(paste0("Done with ", i, " of ", nr, "\n")) + } + + # Checkpointing every save_rate iterations + if (saveable && (i %% save_rate_parlist == 0)) { + saveRDS(parList, file = checkpoint_files$parList) + saveRDS(lens, file = checkpoint_files$lens) + if (verbose) cat("Checkpointed parlist saved at iteration", i, "\n") + } + } + jss <- rep(1L:nr, times = lens) + iss <- unlist(parList) + list_of_adjacency <- list(iss = iss, jss = jss) + return(list_of_adjacency) +} + +.adjDirect <- function(ped, component, saveable, resume, + save_path, verbose, lastComputed, + nr, checkpoint_files, update_rate, + parList, lens, save_rate_parlist, adjBeta_method, + ...) { + # Loop through each individual in the pedigree + # Build the adjacency matrix for parent-child relationships + # Is person in column j the parent of the person in row i? .5 for yes, 0 for no. + uniID <- ped$ID # live dangerously without sort(unique(ped$ID)) + ped$ID <- as.numeric(factor(ped$ID, levels = uniID)) + ped$momID <- as.numeric(factor(ped$momID, levels = uniID)) + ped$dadID <- as.numeric(factor(ped$dadID, levels = uniID)) + + if (component %in% c("generation", "additive")) { + mIDs <- stats::na.omit(data.frame(rID = ped$ID, cID = ped$momID)) + dIDs <- stats::na.omit(data.frame(rID = ped$ID, cID = ped$dadID)) + iss <- c(mIDs$rID, dIDs$rID) + jss <- c(mIDs$cID, dIDs$cID) + } else if (component %in% c("common nuclear")) { + # message("Common Nuclear component is not yet implemented for direct method. Using index method.\n") + + # 1) Create a logical mask for only known parents + mask <- !is.na(ped$momID) & !is.na(ped$dadID) + + # 2) Create a single hash label for each known (momID, dadID) pair + base <- max(ped$ID, na.rm = TRUE) + 1L + pairCode <- ped$momID[mask] + base * ped$dadID[mask] + + # 3) Factor that label => each row with the same (mom,dad) gets the same integer code + childVec <- which(mask) + + # 4) Group children by pairCode, so each group is "all children with that (mom,dad)" + groupList <- split(childVec, pairCode) + + # 5) For each group with >1 children, form pairwise adjacency (i->j) for i != j + iss_list <- vector("list", length(groupList)) + jss_list <- vector("list", length(groupList)) + counter <- 1 + + for (g in groupList) { + k <- length(g) + if (k > 1) { + # We'll form all k^2 combos, then remove the diagonal i=j + # rep() calls faster than expand.grid + + # v = each child repeated k times + # w = entire group repeated once for each child + v <- rep(g, each = k) # row index + w <- rep(g, times = k) # col index + + keep <- (v != w) # remove diagonal where v == w + iss_list[[counter]] <- v[keep] + jss_list[[counter]] <- w[keep] + counter <- counter + 1 + } + } + + + iss <- unlist(iss_list, use.names = FALSE) + jss <- unlist(jss_list, use.names = FALSE) + + # list_of_adjacency <- .adjBeta(ped=ped,adjBeta_method=adjBeta_method, + # component = component, + # saveable = saveable, resume = resume, + # save_path = save_path, verbose = verbose, + # lastComputed = lastComputed, nr = nr, + # checkpoint_files = checkpoint_files, + # update_rate = update_rate, + # parList = parList, + # lens = lens, save_rate_parlist = save_rate_parlist, + # ...) + + # return(list_of_adjacency) + } else if (component %in% c("mitochondrial")) { + mIDs <- stats::na.omit(data.frame(rID = ped$ID, cID = ped$momID)) + iss <- c(mIDs$rID) + jss <- c(mIDs$cID) + } else { + stop("Unknown relatedness component requested") + } + list_of_adjacency <- list( + iss = iss, + jss = jss + ) + return(list_of_adjacency) +} + + + +.adjBeta <- function(ped, component, + adjBeta_method = 5, + parList = NULL, + lastComputed = 0, + nr = NULL, + lens = NULL, + saveable = FALSE, + resume = FALSE, + save_path = NULL, + verbose = FALSE, + save_rate_parlist = NULL, + update_rate = NULL, + checkpoint_files = NULL, + ...) { # 1) Pairwise compare mother IDs + if (adjBeta_method == 1) { + # gets slow when data are bigger. much slower than indexed + momMatch <- outer(ped$momID, ped$momID, FUN = "==") + momMatch[is.na(momMatch)] <- FALSE + + # 2) Pairwise compare father IDs + dadMatch <- outer(ped$dadID, ped$dadID, FUN = "==") + dadMatch[is.na(dadMatch)] <- FALSE + + # 3) Sibling adjacency if both mom & dad match + adj <- momMatch & dadMatch + + # 4) Extract indices where adj[i,j] is TRUE + w <- which(adj, arr.ind = TRUE) + # iss <- w[, 1] + # jss <- w[, 2] + # + list_of_adjacency <- list( + iss = w[, 1], + jss = w[, 2] + ) + } else if (adjBeta_method == 2) { + # 1) Create a logical mask for known parents + mask <- !is.na(ped$momID) & !is.na(ped$dadID) + + # 2) Create a single string label for each known (momID, dadID) pair + pairLabel <- paste0(ped$momID[mask], "_", ped$dadID[mask]) + + # 3) Factor that label => each row with the same (mom,dad) gets the same integer code + # This is "creating a new ID" for each unique parent pair + pairCode <- match(pairLabel, unique(pairLabel)) + + # childVec are the row indices in 'ped' that have known parents + childVec <- which(mask) # length(childVec) = sum(mask) + + # 4) Group children by pairCode, so each group is "all children with that (mom,dad)" + groupList <- split(childVec, pairCode) + + # 5) For each group with >1 children, form pairwise adjacency i->j + iss_list <- list() + jss_list <- list() + counter <- 1 + + for (g in groupList) { + if (length(g) > 1) { + combos <- expand.grid(g, g, KEEP.OUT.ATTRS = FALSE) + combos <- combos[combos[, 1] != combos[, 2], , drop = FALSE] + iss_list[[counter]] <- combos[, 1] + jss_list[[counter]] <- combos[, 2] + counter <- counter + 1 + } + } + # iss <- unlist(iss_list, use.names = FALSE) + # jss <- unlist(jss_list, use.names = FALSE) + + list_of_adjacency <- list( + iss = unlist(iss_list, use.names = FALSE), + jss = unlist(jss_list, use.names = FALSE) + ) + } else if (adjBeta_method == 3) { + nr <- nrow(ped) + # terrible + # Define a scalar-checking function: + f_check <- function(i, j) { + # i, j are each single integers + # Return one boolean: do they share both parents? + !is.na(ped$momID[i]) && !is.na(ped$dadID[i]) && + !is.na(ped$momID[j]) && !is.na(ped$dadID[j]) && + (ped$momID[i] == ped$momID[j]) && + (ped$dadID[i] == ped$dadID[j]) + } + + # Vectorize it so outer() will produce an nr x nr matrix + vf_check <- Vectorize(f_check) + + # Now outer() calls vf_check(...) in a way that yields scalar results + adj <- outer(seq_len(nr), seq_len(nr), FUN = vf_check) + + # Extract which cells of adj are TRUE + w <- which(adj, arr.ind = TRUE) + # iss <- w[, 1] + # jss <- w[, 2] + + list_of_adjacency <- list( + iss = iss <- w[, 1], + jss = jss <- w[, 2] + ) + } else if (adjBeta_method == 4) { + # 1) Create a logical mask for known parents + mask <- !is.na(ped$momID) & !is.na(ped$dadID) + + # 2) Create a single string label for each known (momID, dadID) pair + pairLabel <- paste0(ped$momID[mask], "_", ped$dadID[mask]) + + # 3) Factor that label => each row with the same (mom,dad) gets the same integer code + pairCode <- match(pairLabel, unique(pairLabel)) + + # childVec are the row indices in 'ped' that have known parents + childVec <- which(mask) + + # 4) Group children by pairCode, so each group is "all children with that (mom,dad)" + groupList <- split(childVec, pairCode) + + # 5) For each group with >1 children, form pairwise adjacency (i->j) for i != j + iss_list <- vector("list", length(groupList)) + jss_list <- vector("list", length(groupList)) + counter <- 1 + + for (g in groupList) { + k <- length(g) + if (k > 1) { + # We'll form all k^2 combos, then remove the diagonal i=j + # Instead of expand.grid, do rep() calls: + + # v = each child repeated k times + # w = entire group repeated once for each child + v <- rep(g, each = k) # row index + w <- rep(g, times = k) # col index + + keep <- (v != w) # remove diagonal where v == w + iss_list[[counter]] <- v[keep] + jss_list[[counter]] <- w[keep] + counter <- counter + 1 + } + } + + list_of_adjacency <- list( + iss = unlist(iss_list, use.names = FALSE), + jss = unlist(jss_list, use.names = FALSE) + ) + } else if (adjBeta_method == 5) { + # 1) Create a logical mask for known parents + mask <- !is.na(ped$momID) & !is.na(ped$dadID) + + # 2) Create a single hash label for each known (momID, dadID) pair + # pairLabel <- paste0(ped$momID[mask], "_", ped$dadID[mask]) + base <- max(ped$ID, na.rm = TRUE) + 1L + pairCode <- ped$momID[mask] + base * ped$dadID[mask] + + # 3) Factor that label => each row with the same (mom,dad) gets the same integer code + childVec <- which(mask) + + # 4) Group children by pairCode, so each group is "all children with that (mom,dad)" + groupList <- split(childVec, pairCode) + + # 5) For each group with >1 children, form pairwise adjacency (i->j) for i != j + iss_list <- vector("list", length(groupList)) + jss_list <- vector("list", length(groupList)) + counter <- 1 + + for (g in groupList) { + k <- length(g) + if (k > 1) { + # We'll form all k^2 combos, then remove the diagonal i=j + # Instead of expand.grid, do rep() calls: + + # v = each child repeated k times + # w = entire group repeated once for each child + v <- rep(g, each = k) # row index + w <- rep(g, times = k) # col index + + keep <- (v != w) # remove diagonal where v == w + iss_list[[counter]] <- v[keep] + jss_list[[counter]] <- w[keep] + counter <- counter + 1 + } + } + + list_of_adjacency <- list( + iss = unlist(iss_list, use.names = FALSE), + jss = unlist(jss_list, use.names = FALSE) + ) + } else { + list_of_adjacency <- .adjIndexed( + ped = ped, component = component, + saveable = saveable, resume = resume, + save_path = save_path, verbose = verbose, + lastComputed = lastComputed, nr = nr, + checkpoint_files = checkpoint_files, + update_rate = update_rate, parList = parList, + lens = lens, save_rate_parlist = save_rate_parlist + ) + } + return(list_of_adjacency) +} + + +#' Compute Parent Adjacency Matrix with Multiple Approaches +#' @inheritParams ped2com +#' @inherit ped2com details +#' @param nr the number of rows in the pedigree dataset +#' @param lastComputed the last computed index +#' @param parList a list of parent-child relationships +#' @param lens a vector of the lengths of the parent-child relationships +#' @param checkpoint_files a list of checkpoint files +#' @param update_rate the rate at which to update the progress +#' +#' @export +computeParentAdjacency <- function(ped, component, + adjacency_method = "direct", + saveable, resume, + save_path, + verbose = FALSE, + lastComputed = 0, nr, + checkpoint_files, + update_rate, + parList, lens, save_rate_parlist, + adjBeta_method = NULL, + ...) { + if (!adjacency_method %in% c("loop", "indexed", "direct", "beta")) { + stop("Invalid method specified. Choose from 'loop', 'direct', 'indexed', or 'beta'.") + } + # For loop/indexed/direct: skip if already complete + if (adjacency_method != "beta" && lastComputed >= nr) { + list_of_adjacency <- NULL + } else { + list_of_adjacency <- switch(adjacency_method, + + "loop" = { + # Original version + .adjLoop( + ped = ped, + component = component, + saveable = saveable, + resume = resume, + save_path = save_path, + verbose = verbose, + lastComputed = lastComputed, + nr = nr, + checkpoint_files = checkpoint_files, + update_rate = update_rate, + parList = parList, + lens = lens, + save_rate_parlist = save_rate_parlist, + ... + ) + }, + + "indexed" = { + # Garrison version + .adjIndexed( + ped = ped, + component = component, + saveable = saveable, + resume = resume, + save_path = save_path, + verbose = verbose, + lastComputed = lastComputed, + nr = nr, + checkpoint_files = checkpoint_files, + update_rate = update_rate, + parList = parList, + lens = lens, + save_rate_parlist = save_rate_parlist, + ... + ) + }, + + "direct" = { + # Hunter version + .adjDirect( + ped = ped, + component = component, + saveable = saveable, + resume = resume, + save_path = save_path, + verbose = verbose, + lastComputed = lastComputed, + nr = nr, + checkpoint_files = checkpoint_files, + update_rate = update_rate, + parList = parList, + lens = lens, + save_rate_parlist = save_rate_parlist, + ... + ) + }, + + "beta" = { + .adjBeta( + ped = ped, + adjBeta_method = adjBeta_method, + component = component, + saveable = saveable, + resume = resume, + save_path = save_path, + verbose = verbose, + lastComputed = lastComputed, + nr = nr, + checkpoint_files = checkpoint_files, + update_rate = update_rate, + parList = parList, + lens = lens, + save_rate_parlist = save_rate_parlist, + ... + ) + } + ) + } + if (saveable) { + saveRDS(parList, file = checkpoint_files$parList) + saveRDS(lens, file = checkpoint_files$lens) + if (verbose) { + cat("Final checkpoint saved for adjacency matrix.\n") + } + } + return(list_of_adjacency) +} + + +#' Determine isChild Status, isChild is the 'S' matrix from RAM +#' @param isChild_method method to determine isChild status +#' @param ped pedigree data frame +#' @return isChild 'S' matrix +#' + +isChild <- function(isChild_method, ped) { + if (isChild_method == "partialparent") { + isChild <- apply(ped[, c("momID", "dadID")], 1, function(x) { + .5 + .25 * sum(is.na(x)) # 2 parents -> .5, 1 parent -> .75, 0 parents -> 1 + }) + } else { + isChild <- apply(ped[, c("momID", "dadID")], 1, function(x) { + 2^(-!all(is.na(x))) + }) + } +} diff --git a/R/convertPedigree.R b/R/convertPedigree.R deleted file mode 100644 index 86e9970a..00000000 --- a/R/convertPedigree.R +++ /dev/null @@ -1,1012 +0,0 @@ -#' Take a pedigree and turn it into a relatedness matrix -#' @param ped a pedigree dataset. Needs ID, momID, and dadID columns -#' @param component character. Which component of the pedigree to return. See Details. -#' @param max.gen the maximum number of generations to compute -#' (e.g., only up to 4th degree relatives). The default is 25. However it can be set to infinity. -#' `Inf` uses as many generations as there are in the data. -#' @param sparse logical. If TRUE, use and return sparse matrices from Matrix package -#' @param verbose logical. If TRUE, print progress through stages of algorithm -#' @param update_rate numeric. The rate at which to print progress -#' @param gc logical. If TRUE, do frequent garbage collection via \code{\link{gc}} to save memory -#' @param saveable logical. If TRUE, save the intermediate results to disk -#' @param save_rate numeric. The rate at which to save the intermediate results -#' @param save_rate_gen numeric. The rate at which to save the intermediate results by generation. If NULL, defaults to save_rate -#' @param save_rate_parlist numeric. The rate at which to save the intermediate results by parent list. If NULL, defaults to save_rate*1000 -#' @param resume logical. If TRUE, resume from a checkpoint -#' @param save_path character. The path to save the checkpoint files -#' @param flatten.diag logical. If TRUE, overwrite the diagonal of the final relatedness matrix with ones -#' @param standardize.colnames logical. If TRUE, standardize the column names of the pedigree dataset -#' @param transpose_method character. The method to use for computing the transpose. Options are "tcrossprod", "crossprod", or "star" -#' @param adjacency_method character. The method to use for computing the adjacency matrix. Options are "loop", "indexed", direct or beta -#' @param isChild_method character. The method to use for computing the isChild matrix. Options are "classic" or "partialparent" -#' @param adjBeta_method numeric The method to use for computing the building the adjacency_method matrix when using the "beta" build -#' @param ... additional arguments to be passed to \code{\link{ped2com}} -#' @details The algorithms and methodologies used in this function are further discussed and exemplified in the vignette titled "examplePedigreeFunctions". For more advanced scenarios and detailed explanations, consult this vignette. -#' @export -#' -ped2com <- function(ped, component, - max.gen = 25, - sparse = TRUE, - verbose = FALSE, - gc = FALSE, - flatten.diag = FALSE, - standardize.colnames = TRUE, - transpose_method = "tcrossprod", - adjacency_method = "direct", - isChild_method = "classic", - saveable = FALSE, - resume = FALSE, - save_rate = 5, - save_rate_gen = save_rate, - save_rate_parlist = 100000 * save_rate, - update_rate = 100, - save_path = "checkpoint/", - adjBeta_method = NULL, - ...) { - #------ - # Checkpointing - #------ - if (saveable || resume) { # prepare checkpointing - if (verbose) cat("Preparing checkpointing...\n") - # Ensure save path exists - if (saveable && !dir.exists(save_path)) { - if (verbose) cat("Creating save path...\n") - dir.create(save_path, recursive = TRUE) - } else if (resume && !dir.exists(save_path)) { - stop("Cannot resume from checkpoint. Save path does not exist.") - } - - # Define checkpoint files - checkpoint_files <- list( - parList = file.path(save_path, "parList.rds"), - lens = file.path(save_path, "lens.rds"), - isPar = file.path(save_path, "isPar.rds"), - iss = file.path(save_path, "iss.rds"), - jss = file.path(save_path, "jss.rds"), - isChild = file.path(save_path, "isChild.rds"), - r_checkpoint = file.path(save_path, "r_checkpoint.rds"), - gen_checkpoint = file.path(save_path, "gen_checkpoint.rds"), - newIsPar_checkpoint = file.path(save_path, "newIsPar_checkpoint.rds"), - mtSum_checkpoint = file.path(save_path, "mtSum_checkpoint.rds"), - r2_checkpoint = file.path(save_path, "r2_checkpoint.rds"), - tcrossprod_checkpoint = file.path(save_path, "tcrossprod_checkpoint.rds"), - count_checkpoint = file.path(save_path, "count_checkpoint.rds"), - final_matrix = file.path(save_path, "final_matrix.rds") - ) - } - #------ - # Validation/Preparation - #------ - - # Validate the 'component' argument and match it against predefined choices - component <- match.arg(tolower(component), - choices = c( - "generation", - "additive", - "common nuclear", - "mitochondrial" - ) - ) - - if (!transpose_method %in% c("tcrossprod", "crossprod", "star", "tcross.alt.crossprod", "tcross.alt.star")) { - stop("Invalid method specified. Choose from 'tcrossprod', 'crossprod', or 'star' or 'tcross.alt.crossprod' or 'tcross.alt.star'.") - } - if (!adjacency_method %in% c("indexed", "loop", "direct", "beta")) { - stop("Invalid method specified. Choose from 'indexed', 'loop', 'direct', or 'beta'.") - } - - # standardize colnames - if (standardize.colnames) { - ped <- standardizeColnames(ped, verbose = verbose) - } - - # Load final result if computation was completed - if (resume && file.exists(checkpoint_files$final_matrix)) { - if (verbose) cat("Loading final computed matrix...\n") - return(readRDS(checkpoint_files$final_matrix)) - } - - - #------ - # Algorithm - #------ - - # Get the number of rows in the pedigree dataset, representing the size of the family - nr <- nrow(ped) - - # Print the family size if verbose is TRUE - if (verbose) { - cat(paste0("Family Size = ", nr, "\n")) - } - - # Step 1: Construct parent-child adjacency matrix - ## A. Resume from Checkpoint if Needed - if (resume && file.exists(checkpoint_files$parList) && file.exists(checkpoint_files$lens)) { - if (verbose) cat("Resuming: Loading parent-child adjacency data...\n") - parList <- readRDS(checkpoint_files$parList) - lens <- readRDS(checkpoint_files$lens) - computed_indices <- which(!sapply(parList, is.null)) - lastComputed <- if (length(computed_indices) > 0) max(computed_indices) else 0 - if (verbose) cat("Resuming from iteration", lastComputed + 1, "\n") - } else { - ## Initialize variables - parList <- vector("list", nr) - lens <- integer(nr) - lastComputed <- 0 - - if (verbose) cat("Building parent adjacency matrix...\n") - } - - - ## B. Resume loop from the next uncomputed index - - if (verbose) cat("Computing parent-child adjacency matrix...\n") - # Construct sparse matrix - if (resume && file.exists(checkpoint_files$iss) && file.exists(checkpoint_files$jss)) { # fix to check actual - if (verbose) cat("Resuming: Constructed matrix...\n") - jss <- readRDS(checkpoint_files$jss) - iss <- readRDS(checkpoint_files$iss) - list_of_adjacencies <- list(iss = iss, jss = jss) - } else { - list_of_adjacencies <- compute_parent_adjacency( - ped = ped, - save_rate_parlist = save_rate_parlist, - checkpoint_files = checkpoint_files, - component = component, - adjacency_method = adjacency_method, # adjacency_method, - saveable = saveable, - resume = resume, - save_path = save_path, - update_rate = update_rate, - verbose = verbose, - lastComputed = lastComputed, - nr = nr, - parList = parList, - lens = lens, - adjBeta_method = adjBeta_method - ) - - # Construct sparse matrix - iss <- list_of_adjacencies$iss - jss <- list_of_adjacencies$jss - - if (verbose) { - cat("Constructed sparse matrix\n") - } - if (saveable) { - saveRDS(jss, file = checkpoint_files$jss) - saveRDS(iss, file = checkpoint_files$iss) - } - # Garbage collection if gc is TRUE - if (gc) { - rm(parList, lens, list_of_adjacencies) - gc() - } - } - # Set parent values depending on the component type - if (component %in% c("generation", "additive")) { - parVal <- .5 - } else if (component %in% c("common nuclear", "mitochondrial")) { - parVal <- 1 - } else { - stop("Don't know how to set parental value") - } - # Construct sparse matrix - if (resume && file.exists(checkpoint_files$isPar)) { - if (verbose) cat("Resuming: Loading adjacency matrix...\n") - isPar <- readRDS(checkpoint_files$isPar) - } else { - # Initialize adjacency matrix for parent-child relationships - isPar <- Matrix::sparseMatrix( - i = iss, - j = jss, - x = parVal, - dims = c(nr, nr), - dimnames = list(ped$ID, ped$ID) - ) - if (verbose) { - cat("Completed first degree relatives (adjacency)\n") - } - if (saveable) { - saveRDS(isPar, file = checkpoint_files$isPar) - } - } - - # isPar is the adjacency matrix. 'A' matrix from RAM - if (component %in% c("common nuclear")) { - Matrix::diag(isPar) <- 1 - if (sparse == FALSE) { - isPar <- as.matrix(isPar) - } - return(isPar) - } - - if (resume && file.exists(checkpoint_files$isChild)) { - if (verbose) cat("Resuming: Loading isChild matrix...\n") - isChild <- readRDS(checkpoint_files$isChild) - } else { - # isChild is the 'S' matrix from RAM - - isChild <- isChild(isChild_method = isChild_method, ped = ped) - - if (saveable) { - saveRDS(isChild, file = checkpoint_files$isChild) - } - } - # --- Step 2: Compute Relatedness Matrix --- - if (resume && file.exists(checkpoint_files$r_checkpoint) && file.exists(checkpoint_files$gen_checkpoint) && file.exists(checkpoint_files$mtSum_checkpoint) && file.exists(checkpoint_files$newIsPar_checkpoint) && - file.exists(checkpoint_files$count_checkpoint) - ) { - if (verbose) cat("Resuming: Loading previous computation...\n") - r <- readRDS(checkpoint_files$r_checkpoint) - gen <- readRDS(checkpoint_files$gen_checkpoint) - mtSum <- readRDS(checkpoint_files$mtSum_checkpoint) - newIsPar <- readRDS(checkpoint_files$newIsPar_checkpoint) - count <- readRDS(checkpoint_files$count_checkpoint) - } else { - r <- Matrix::Diagonal(x = 1, n = nr) - gen <- rep(1, nr) - mtSum <- sum(r, na.rm = TRUE) - newIsPar <- isPar - count <- 0 - } - maxCount <- max.gen + 1 - if (verbose) { - cat("About to do RAM path tracing\n") - } - - # r is I + A + A^2 + ... = (I-A)^-1 from RAM - # could trim, here - while (mtSum != 0 && count < maxCount) { - r <- r + newIsPar - gen <- gen + (Matrix::rowSums(newIsPar) > 0) - newIsPar <- newIsPar %*% isPar - mtSum <- sum(newIsPar) - count <- count + 1 - if (verbose) { - cat(paste0("Completed ", count - 1, " degree relatives\n")) - } - # Save progress every save_rate iterations - if (saveable && (count %% save_rate_gen == 0)) { - saveRDS(r, file = checkpoint_files$r_checkpoint) - saveRDS(gen, file = checkpoint_files$gen_checkpoint) - saveRDS(newIsPar, file = checkpoint_files$newIsPar_checkpoint) - saveRDS(mtSum, file = checkpoint_files$mtSum_checkpoint) - saveRDS(count, file = checkpoint_files$count_checkpoint) - } - } - # compute rsq <- r %*% sqrt(diag(isChild)) - # compute rel <- tcrossprod(rsq) - if (gc) { - rm(isPar, newIsPar) - gc() - } - - if (component == "generation") { # no need to do the rest - return(gen) - } else { - if (verbose) { - cat("Completed RAM path tracing\n") - } - } - - # --- Step 3: I-A inverse times diagonal multiplication --- - if (resume && file.exists(checkpoint_files$r2_checkpoint)) { - if (verbose) cat("Resuming: Loading I-A inverse...\n") - r2 <- readRDS(checkpoint_files$r2_checkpoint) - } else { - if (verbose) { - cat("Doing I-A inverse times diagonal multiplication\n") - } - r2 <- r %*% Matrix::Diagonal(x = sqrt(isChild), n = nr) - if (gc) { - rm(r, isChild) - gc() - } - if (saveable) { - saveRDS(r2, file = checkpoint_files$r2_checkpoint) - } - } - - # --- Step 4: T crossproduct --- - - if (resume && file.exists(checkpoint_files$tcrossprod_checkpoint) && component != "generation") { - if (verbose) cat("Resuming: Loading tcrossprod...\n") - r <- readRDS(checkpoint_files$tcrossprod_checkpoint) - } else { - r <- .computeTranspose(r2 = r2, transpose_method = transpose_method, verbose = verbose) - if (saveable) { - saveRDS(r, file = checkpoint_files$tcrossprod_checkpoint) - } - } - - - if (component == "mitochondrial") { - r@x <- rep(1, length(r@x)) - # Assign 1 to all nonzero elements for mitochondrial component - } - - if (sparse == FALSE) { - r <- as.matrix(r) - } - if (flatten.diag) { # flattens diagonal if you don't want to deal with inbreeding - diag(r) <- 1 - } - if (saveable) { - saveRDS(r, file = checkpoint_files$final_matrix) - } - return(r) -} - -#' Take a pedigree and turn it into an additive genetics relatedness matrix -#' @inheritParams ped2com -#' @inherit ped2com details -#' @export -#' -ped2add <- function(ped, max.gen = 25, sparse = TRUE, verbose = FALSE, - gc = FALSE, - flatten.diag = FALSE, standardize.colnames = TRUE, - transpose_method = "tcrossprod", - adjacency_method = "direct", - saveable = FALSE, - resume = FALSE, - save_rate = 5, - save_rate_gen = save_rate, - save_rate_parlist = 1000 * save_rate, - save_path = "checkpoint/", - ...) { - ped2com( - ped = ped, - max.gen = max.gen, - sparse = sparse, - verbose = verbose, - gc = gc, - component = "additive", - flatten.diag = flatten.diag, - standardize.colnames = standardize.colnames, - transpose_method = transpose_method, - adjacency_method = adjacency_method, - saveable = saveable, - resume = resume, - save_rate_gen = save_rate_gen, - save_rate_parlist = save_rate_parlist, - save_path = save_path - ) -} - -#' Take a pedigree and turn it into a mitochondrial relatedness matrix -#' @inheritParams ped2com -#' @inherit ped2com details -#' @export -#' @aliases ped2mt -#' -ped2mit <- ped2mt <- function(ped, max.gen = 25, - sparse = TRUE, - verbose = FALSE, gc = FALSE, - flatten.diag = FALSE, - standardize.colnames = TRUE, - transpose_method = "tcrossprod", - adjacency_method = "direct", - saveable = FALSE, - resume = FALSE, - save_rate = 5, - save_rate_gen = save_rate, - save_rate_parlist = 1000 * save_rate, - save_path = "checkpoint/", - ...) { - ped2com( - ped = ped, - max.gen = max.gen, - sparse = sparse, - verbose = verbose, - gc = gc, - component = "mitochondrial", - flatten.diag = flatten.diag, - standardize.colnames = standardize.colnames, - transpose_method = transpose_method, - adjacency_method = adjacency_method, - saveable = saveable, - resume = resume, - save_rate_gen = save_rate_gen, - save_rate_parlist = save_rate_parlist, - save_path = save_path, - ... - ) -} - -#' Take a pedigree and turn it into a common nuclear environmental relatedness matrix -#' @inheritParams ped2com -#' @inherit ped2com details -#' @export -#' -ped2cn <- function(ped, max.gen = 25, sparse = TRUE, verbose = FALSE, - gc = FALSE, flatten.diag = FALSE, - standardize.colnames = TRUE, - transpose_method = "tcrossprod", - saveable = FALSE, - resume = FALSE, - save_rate = 5, - adjacency_method = "direct", - save_rate_gen = save_rate, - save_rate_parlist = 1000 * save_rate, - save_path = "checkpoint/", - ...) { - ped2com( - ped = ped, - max.gen = max.gen, - sparse = sparse, - verbose = verbose, - gc = gc, - component = "common nuclear", - adjacency_method = adjacency_method, - flatten.diag = flatten.diag, - standardize.colnames = standardize.colnames, - transpose_method = transpose_method, - saveable = saveable, - resume = resume, - save_rate_gen = save_rate_gen, - save_rate_parlist = save_rate_parlist, - save_path = save_path, - ... - ) -} -#' Take a pedigree and turn it into an extended environmental relatedness matrix -#' @inheritParams ped2com -#' @inherit ped2com details -#' @export -#' -ped2ce <- function(ped, - ...) { - matrix(1, nrow = nrow(ped), ncol = nrow(ped), dimnames = list(ped$ID, ped$ID)) -} - - -#' Compute the transpose multiplication for the relatedness matrix -#' @inheritParams ped2com -#' @inherit ped2com details -#' @param r2 a relatedness matrix -#' -.computeTranspose <- function(r2, transpose_method = "tcrossprod", verbose = FALSE) { - if (!transpose_method %in% c("tcrossprod", "crossprod", "star", "tcross.alt.crossprod", "tcross.alt.star")) { - stop("Invalid method specified. Choose from 'tcrossprod', 'crossprod', or 'star'.") - } - if (transpose_method %in% c("crossprod", "tcross.alt.crossprod")) { - if (verbose) cat("Doing alt tcrossprod crossprod t \n") - return(crossprod(t(as.matrix(r2)))) - } else if (transpose_method %in% c("star", "tcross.alt.star")) { - if (verbose) cat("Doing alt tcrossprod %*% t \n") - return(r2 %*% t(as.matrix(r2))) - } else { - if (verbose) cat("Doing tcrossprod\n") - return(Matrix::tcrossprod(r2)) - } -} - -.adjLoop <- function(ped, component, saveable, resume, - save_path, verbose, lastComputed, - nr, checkpoint_files, update_rate, - parList, lens, save_rate_parlist, - ...) { - # Loop through each individual in the pedigree - # Build the adjacency matrix for parent-child relationships - # Is person in column j the parent of the person in row i? .5 for yes, 0 for no. - ped$momID <- as.numeric(ped$momID) - ped$dadID <- as.numeric(ped$dadID) - ped$ID <- as.numeric(ped$ID) - - for (i in (lastComputed + 1):nr) { - x <- ped[i, , drop = FALSE] - # Handle parentage according to the 'component' specified - if (component %in% c("generation", "additive")) { - # Code for 'generation' and 'additive' components - # Checks if is mom of ID or is dad of ID - xID <- as.numeric(x["ID"]) - sMom <- (xID == ped$momID) - sDad <- (xID == ped$dadID) - val <- sMom | sDad - val[is.na(val)] <- FALSE - } else if (component %in% c("common nuclear")) { - # Code for 'common nuclear' component - # IDs have the Same mom and Same dad - sMom <- (as.numeric(x["momID"]) == ped$momID) - sMom[is.na(sMom)] <- FALSE - sDad <- (as.numeric(x["dadID"]) == ped$dadID) - sDad[is.na(sDad)] <- FALSE - val <- sMom & sDad - } else if (component %in% c("mitochondrial")) { - # Code for 'mitochondrial' component - val <- (as.numeric(x["ID"]) == ped$momID) - val[is.na(val)] <- FALSE - } else { - stop("Unknown relatedness component requested") - } - # Storing the indices of the parent-child relationships - # Keep track of indices only, and then initialize a single sparse matrix - wv <- which(val) - parList[[i]] <- wv - lens[i] <- length(wv) - # Print progress if verbose is TRUE - if (verbose && (i %% update_rate == 0)) { - cat(paste0("Done with ", i, " of ", nr, "\n")) - } - # Checkpointing every save_rate iterations - if (saveable && (i %% save_rate_parlist == 0)) { - saveRDS(parList, file = checkpoint_files$parList) - saveRDS(lens, file = checkpoint_files$lens) - if (verbose) cat("Checkpointed parlist saved at iteration", i, "\n") - } - } - jss <- rep(1L:nr, times = lens) - iss <- unlist(parList) - list_of_adjacency <- list(iss = iss, jss = jss) - return(list_of_adjacency) -} - -.adjIndexed <- function(ped, component, saveable, resume, - save_path, verbose, lastComputed, - nr, checkpoint_files, update_rate, - parList, lens, save_rate_parlist) { - # Loop through each individual in the pedigree - # Build the adjacency matrix for parent-child relationships - # Is person in column j the parent of the person in row i? .5 for yes, 0 for no. - - # Convert IDs - ped$ID <- as.numeric(ped$ID) - ped$momID <- as.numeric(ped$momID) - ped$dadID <- as.numeric(ped$dadID) - - # parent-child lookup - mom_index <- match(ped$momID, ped$ID, nomatch = 0) - dad_index <- match(ped$dadID, ped$ID, nomatch = 0) - - for (i in (lastComputed + 1):nr) { - if (component %in% c("generation", "additive")) { - sMom <- (mom_index == i) - sDad <- (dad_index == i) - val <- sMom | sDad - } else if (component %in% c("common nuclear")) { - # Code for 'common nuclear' component - # IDs have the Same mom and Same dad - sMom <- (ped$momID[i] == ped$momID) - sMom[is.na(sMom)] <- FALSE - sDad <- (ped$dadID[i] == ped$dadID) - sDad[is.na(sDad)] <- FALSE - val <- sMom & sDad - } else if (component %in% c("mitochondrial")) { - val <- (mom_index == i) - } else { - stop("Unknown relatedness component requested") - } - - val[is.na(val)] <- FALSE - parList[[i]] <- which(val) - lens[i] <- length(parList[[i]]) - - # Print progress if verbose is TRUE - if (verbose && (i %% update_rate == 0)) { - cat(paste0("Done with ", i, " of ", nr, "\n")) - } - - # Checkpointing every save_rate iterations - if (saveable && (i %% save_rate_parlist == 0)) { - saveRDS(parList, file = checkpoint_files$parList) - saveRDS(lens, file = checkpoint_files$lens) - if (verbose) cat("Checkpointed parlist saved at iteration", i, "\n") - } - } - jss <- rep(1L:nr, times = lens) - iss <- unlist(parList) - list_of_adjacency <- list(iss = iss, jss = jss) - return(list_of_adjacency) -} - -.adjDirect <- function(ped, component, saveable, resume, - save_path, verbose, lastComputed, - nr, checkpoint_files, update_rate, - parList, lens, save_rate_parlist, adjBeta_method, - ...) { - # Loop through each individual in the pedigree - # Build the adjacency matrix for parent-child relationships - # Is person in column j the parent of the person in row i? .5 for yes, 0 for no. - uniID <- ped$ID # live dangerously without sort(unique(ped$ID)) - ped$ID <- as.numeric(factor(ped$ID, levels = uniID)) - ped$momID <- as.numeric(factor(ped$momID, levels = uniID)) - ped$dadID <- as.numeric(factor(ped$dadID, levels = uniID)) - - if (component %in% c("generation", "additive")) { - mIDs <- stats::na.omit(data.frame(rID = ped$ID, cID = ped$momID)) - dIDs <- stats::na.omit(data.frame(rID = ped$ID, cID = ped$dadID)) - iss <- c(mIDs$rID, dIDs$rID) - jss <- c(mIDs$cID, dIDs$cID) - } else if (component %in% c("common nuclear")) { - # message("Common Nuclear component is not yet implemented for direct method. Using index method.\n") - - # 1) Create a logical mask for only known parents - mask <- !is.na(ped$momID) & !is.na(ped$dadID) - - # 2) Create a single hash label for each known (momID, dadID) pair - base <- max(ped$ID, na.rm = TRUE) + 1L - pairCode <- ped$momID[mask] + base * ped$dadID[mask] - - # 3) Factor that label => each row with the same (mom,dad) gets the same integer code - childVec <- which(mask) - - # 4) Group children by pairCode, so each group is "all children with that (mom,dad)" - groupList <- split(childVec, pairCode) - - # 5) For each group with >1 children, form pairwise adjacency (i->j) for i != j - iss_list <- vector("list", length(groupList)) - jss_list <- vector("list", length(groupList)) - counter <- 1 - - for (g in groupList) { - k <- length(g) - if (k > 1) { - # We'll form all k^2 combos, then remove the diagonal i=j - # rep() calls faster than expand.grid - - # v = each child repeated k times - # w = entire group repeated once for each child - v <- rep(g, each = k) # row index - w <- rep(g, times = k) # col index - - keep <- (v != w) # remove diagonal where v == w - iss_list[[counter]] <- v[keep] - jss_list[[counter]] <- w[keep] - counter <- counter + 1 - } - } - - - iss <- unlist(iss_list, use.names = FALSE) - jss <- unlist(jss_list, use.names = FALSE) - - # list_of_adjacency <- .adjBeta(ped=ped,adjBeta_method=adjBeta_method, - # component = component, - # saveable = saveable, resume = resume, - # save_path = save_path, verbose = verbose, - # lastComputed = lastComputed, nr = nr, - # checkpoint_files = checkpoint_files, - # update_rate = update_rate, - # parList = parList, - # lens = lens, save_rate_parlist = save_rate_parlist, - # ...) - - # return(list_of_adjacency) - } else if (component %in% c("mitochondrial")) { - mIDs <- stats::na.omit(data.frame(rID = ped$ID, cID = ped$momID)) - iss <- c(mIDs$rID) - jss <- c(mIDs$cID) - } else { - stop("Unknown relatedness component requested") - } - list_of_adjacency <- list( - iss = iss, - jss = jss - ) - return(list_of_adjacency) -} - -#' Compute Parent Adjacency Matrix with Multiple Approaches -#' @inheritParams ped2com -#' @inherit ped2com details -#' @param nr the number of rows in the pedigree dataset -#' @param lastComputed the last computed index -#' @param parList a list of parent-child relationships -#' @param lens a vector of the lengths of the parent-child relationships -#' @param checkpoint_files a list of checkpoint files - -compute_parent_adjacency <- function(ped, component, - adjacency_method = "direct", - saveable, resume, - save_path, verbose = FALSE, - lastComputed = 0, nr, checkpoint_files, update_rate, - parList, lens, save_rate_parlist, adjBeta_method = NULL, - ...) { - if (adjacency_method == "loop") { - if (lastComputed < nr) { # Original version - list_of_adjacency <- .adjLoop( - ped = ped, - component = component, - saveable = saveable, - resume = resume, - save_path = save_path, - verbose = verbose, - lastComputed = lastComputed, - nr = nr, - checkpoint_files = checkpoint_files, - update_rate = update_rate, - parList = parList, - lens = lens, - save_rate_parlist = save_rate_parlist, - ... - ) - } - } else if (adjacency_method == "indexed") { # Garrison version - if (lastComputed < nr) { - list_of_adjacency <- .adjIndexed( - ped = ped, - component = component, - saveable = saveable, - resume = resume, - save_path = save_path, - verbose = verbose, - lastComputed = lastComputed, - nr = nr, - checkpoint_files = checkpoint_files, - update_rate = update_rate, - parList = parList, - lens = lens, - save_rate_parlist = save_rate_parlist, - ... - ) - } - } else if (adjacency_method == "direct") { # Hunter version - if (lastComputed < nr) { - list_of_adjacency <- .adjDirect( - ped = ped, - component = component, - saveable = saveable, - resume = resume, - save_path = save_path, - verbose = verbose, - lastComputed = lastComputed, - nr = nr, - checkpoint_files = checkpoint_files, - update_rate = update_rate, - parList = parList, - lens = lens, - save_rate_parlist = save_rate_parlist, - ... - ) - } - } else if (adjacency_method == "beta") { - list_of_adjacency <- .adjBeta( - ped = ped, - adjBeta_method = adjBeta_method, - component = component, - saveable = saveable, - resume = resume, - save_path = save_path, - verbose = verbose, - lastComputed = lastComputed, - nr = nr, - checkpoint_files = checkpoint_files, - update_rate = update_rate, - parList = parList, - lens = lens, - save_rate_parlist = save_rate_parlist, - ... - ) - } else { - stop("Invalid method specified. Choose from 'loop', 'direct', 'indexed', or beta") - } - if (saveable) { - saveRDS(parList, file = checkpoint_files$parList) - saveRDS(lens, file = checkpoint_files$lens) - if (verbose) { - cat("Final checkpoint saved for adjacency matrix.\n") - } - } - return(list_of_adjacency) -} - - -#' Determine isChild Status, isChild is the 'S' matrix from RAM -#' @param isChild_method method to determine isChild status -#' @param ped pedigree data frame -#' @return isChild 'S' matrix -#' - -isChild <- function(isChild_method, ped) { - if (isChild_method == "partialparent") { - isChild <- apply(ped[, c("momID", "dadID")], 1, function(x) { - .5 + .25 * sum(is.na(x)) # 2 parents -> .5, 1 parent -> .75, 0 parents -> 1 - }) - } else { - isChild <- apply(ped[, c("momID", "dadID")], 1, function(x) { - 2^(-!all(is.na(x))) - }) - } -} - - -.adjBeta <- function(ped, component, - adjBeta_method = 5, - parList = NULL, - lastComputed = 0, - nr = NULL, - lens = NULL, - saveable = FALSE, - resume = FALSE, - save_path = NULL, - verbose = FALSE, - save_rate_parlist = NULL, - update_rate = NULL, - checkpoint_files = NULL, - ...) { # 1) Pairwise compare mother IDs - if (adjBeta_method == 1) { - # gets slow when data are bigger. much slower than indexed - momMatch <- outer(ped$momID, ped$momID, FUN = "==") - momMatch[is.na(momMatch)] <- FALSE - - # 2) Pairwise compare father IDs - dadMatch <- outer(ped$dadID, ped$dadID, FUN = "==") - dadMatch[is.na(dadMatch)] <- FALSE - - # 3) Sibling adjacency if both mom & dad match - adj <- momMatch & dadMatch - - # 4) Extract indices where adj[i,j] is TRUE - w <- which(adj, arr.ind = TRUE) - # iss <- w[, 1] - # jss <- w[, 2] - # - list_of_adjacency <- list( - iss = w[, 1], - jss = w[, 2] - ) - } else if (adjBeta_method == 2) { - # 1) Create a logical mask for known parents - mask <- !is.na(ped$momID) & !is.na(ped$dadID) - - # 2) Create a single string label for each known (momID, dadID) pair - pairLabel <- paste0(ped$momID[mask], "_", ped$dadID[mask]) - - # 3) Factor that label => each row with the same (mom,dad) gets the same integer code - # This is "creating a new ID" for each unique parent pair - pairCode <- match(pairLabel, unique(pairLabel)) - - # childVec are the row indices in 'ped' that have known parents - childVec <- which(mask) # length(childVec) = sum(mask) - - # 4) Group children by pairCode, so each group is "all children with that (mom,dad)" - groupList <- split(childVec, pairCode) - - # 5) For each group with >1 children, form pairwise adjacency i->j - iss_list <- list() - jss_list <- list() - counter <- 1 - - for (g in groupList) { - if (length(g) > 1) { - combos <- expand.grid(g, g, KEEP.OUT.ATTRS = FALSE) - combos <- combos[combos[, 1] != combos[, 2], , drop = FALSE] - iss_list[[counter]] <- combos[, 1] - jss_list[[counter]] <- combos[, 2] - counter <- counter + 1 - } - } - # iss <- unlist(iss_list, use.names = FALSE) - # jss <- unlist(jss_list, use.names = FALSE) - - list_of_adjacency <- list( - iss = unlist(iss_list, use.names = FALSE), - jss = unlist(jss_list, use.names = FALSE) - ) - } else if (adjBeta_method == 3) { - nr <- nrow(ped) - # terrible - # Define a scalar-checking function: - f_check <- function(i, j) { - # i, j are each single integers - # Return one boolean: do they share both parents? - !is.na(ped$momID[i]) && !is.na(ped$dadID[i]) && - !is.na(ped$momID[j]) && !is.na(ped$dadID[j]) && - (ped$momID[i] == ped$momID[j]) && - (ped$dadID[i] == ped$dadID[j]) - } - - # Vectorize it so outer() will produce an nr x nr matrix - vf_check <- Vectorize(f_check) - - # Now outer() calls vf_check(...) in a way that yields scalar results - adj <- outer(seq_len(nr), seq_len(nr), FUN = vf_check) - - # Extract which cells of adj are TRUE - w <- which(adj, arr.ind = TRUE) - # iss <- w[, 1] - # jss <- w[, 2] - - list_of_adjacency <- list( - iss = iss <- w[, 1], - jss = jss <- w[, 2] - ) - } else if (adjBeta_method == 4) { - # 1) Create a logical mask for known parents - mask <- !is.na(ped$momID) & !is.na(ped$dadID) - - # 2) Create a single string label for each known (momID, dadID) pair - pairLabel <- paste0(ped$momID[mask], "_", ped$dadID[mask]) - - # 3) Factor that label => each row with the same (mom,dad) gets the same integer code - pairCode <- match(pairLabel, unique(pairLabel)) - - # childVec are the row indices in 'ped' that have known parents - childVec <- which(mask) - - # 4) Group children by pairCode, so each group is "all children with that (mom,dad)" - groupList <- split(childVec, pairCode) - - # 5) For each group with >1 children, form pairwise adjacency (i->j) for i != j - iss_list <- vector("list", length(groupList)) - jss_list <- vector("list", length(groupList)) - counter <- 1 - - for (g in groupList) { - k <- length(g) - if (k > 1) { - # We'll form all k^2 combos, then remove the diagonal i=j - # Instead of expand.grid, do rep() calls: - - # v = each child repeated k times - # w = entire group repeated once for each child - v <- rep(g, each = k) # row index - w <- rep(g, times = k) # col index - - keep <- (v != w) # remove diagonal where v == w - iss_list[[counter]] <- v[keep] - jss_list[[counter]] <- w[keep] - counter <- counter + 1 - } - } - - list_of_adjacency <- list( - iss = unlist(iss_list, use.names = FALSE), - jss = unlist(jss_list, use.names = FALSE) - ) - } else if (adjBeta_method == 5) { - # 1) Create a logical mask for known parents - mask <- !is.na(ped$momID) & !is.na(ped$dadID) - - # 2) Create a single hash label for each known (momID, dadID) pair - # pairLabel <- paste0(ped$momID[mask], "_", ped$dadID[mask]) - base <- max(ped$ID, na.rm = TRUE) + 1L - pairCode <- ped$momID[mask] + base * ped$dadID[mask] - - # 3) Factor that label => each row with the same (mom,dad) gets the same integer code - childVec <- which(mask) - - # 4) Group children by pairCode, so each group is "all children with that (mom,dad)" - groupList <- split(childVec, pairCode) - - # 5) For each group with >1 children, form pairwise adjacency (i->j) for i != j - iss_list <- vector("list", length(groupList)) - jss_list <- vector("list", length(groupList)) - counter <- 1 - - for (g in groupList) { - k <- length(g) - if (k > 1) { - # We'll form all k^2 combos, then remove the diagonal i=j - # Instead of expand.grid, do rep() calls: - - # v = each child repeated k times - # w = entire group repeated once for each child - v <- rep(g, each = k) # row index - w <- rep(g, times = k) # col index - - keep <- (v != w) # remove diagonal where v == w - iss_list[[counter]] <- v[keep] - jss_list[[counter]] <- w[keep] - counter <- counter + 1 - } - } - - list_of_adjacency <- list( - iss = unlist(iss_list, use.names = FALSE), - jss = unlist(jss_list, use.names = FALSE) - ) - } else { - list_of_adjacency <- .adjIndexed( - ped = ped, component = component, - saveable = saveable, resume = resume, - save_path = save_path, verbose = verbose, - lastComputed = lastComputed, nr = nr, - checkpoint_files = checkpoint_files, - update_rate = update_rate, parList = parList, - lens = lens, save_rate_parlist = save_rate_parlist - ) - } - return(list_of_adjacency) -} diff --git a/R/readWikifamilytree.R b/R/readWikifamilytree.R index fc0a7521..018caaef 100644 --- a/R/readWikifamilytree.R +++ b/R/readWikifamilytree.R @@ -28,23 +28,23 @@ readWikifamilytree <- function(text = NULL, verbose = FALSE, file_path = NULL, . } # Extract summary text - summary_text <- extractSummaryText(text) + summary_text <- getWikiTreeSummary(text) # Extract all lines defining the family tree tree_lines <- unlist(stringr::str_extract_all(text, "\\{\\{familytree.*?\\}\\}")) tree_lines <- tree_lines[!stringr::str_detect(tree_lines, "start|end")] # Remove start/end markers tree_lines <- gsub("\\{\\{familytree(.*?)\\}\\}", "\\1", tree_lines) # Remove wrapping markup # Convert tree structure into a coordinate grid (preserves symbols!) - tree_df <- parseTree(tree_lines) + tree_df <- buildTreeGrid(tree_lines) # Identify columns that start with "Y" cols_to_pivot <- grep("^Y", names(tree_df), value = TRUE) # Reshape from wide to long format - tree_long <- makeLongTree(tree_df, cols_to_pivot) + tree_long <- convertGrid2LongTree(tree_df, cols_to_pivot) # Extract member definitions - members_df <- matchMembers(text) + members_df <- extractMemberTable(text) members_df$id <- paste0("P", seq_len(nrow(members_df))) # Assign unique person IDs # Merge names into the tree structure (keeping all symbols!) @@ -53,19 +53,16 @@ readWikifamilytree <- function(text = NULL, verbose = FALSE, file_path = NULL, . tree_long$DisplayName <- ifelse(!is.na(tree_long$name), tree_long$name, tree_long$Value) # Use name if available # parse relationships and infer them + tree_paths <- traceTreePaths(tree_long, deduplicate = FALSE) - relationships_df <- parseRelationships(tree_long) - - # relationships_df <- processParents(tree_long, datasource = "wiki") - - - + parsedRelationships <- parseTreeRelationships(tree_long, tree_paths) # Return structured table of the family tree (symbols included) list( summary = summary_text, members = members_df, structure = tree_long, - relationships = relationships_df + tree_paths = parsedRelationships$tree_paths, + relationships = parsedRelationships$relationships ) } @@ -74,7 +71,7 @@ readWikifamilytree <- function(text = NULL, verbose = FALSE, file_path = NULL, . #' @param cols_to_pivot A character vector of column names to pivot. #' @return A long data frame containing the tree structure. #' @keywords internal -makeLongTree <- function(tree_df, cols_to_pivot) { +convertGrid2LongTree <- function(tree_df, cols_to_pivot) { tree_long <- stats::reshape(tree_df, varying = cols_to_pivot, v.names = "Value", @@ -95,7 +92,7 @@ makeLongTree <- function(tree_df, cols_to_pivot) { #' @return A data frame containing information about the members of the family tree. #' @keywords internal -matchMembers <- function(text) { +extractMemberTable <- function(text) { member_matches <- stringr::str_extract_all(text, "\\|\\s*([A-Za-z0-9]+)\\s*=\\s*([^|}]*)")[[1]] member_matches <- gsub("\\[|\\]|'''", "", member_matches) # Remove formatting @@ -120,7 +117,7 @@ matchMembers <- function(text) { #' @keywords internal #' @export -extractSummaryText <- function(text) { +getWikiTreeSummary <- function(text) { summary_match <- stringr::str_match(text, "\\{\\{familytree/start \\|summary=(.*?)\\}\\}") summary_text <- ifelse(!is.na(summary_match[, 2]), summary_match[, 2], NA) return(summary_text) @@ -132,7 +129,7 @@ extractSummaryText <- function(text) { #' @keywords internal #' @export -parseTree <- function(tree_lines) { +buildTreeGrid <- function(tree_lines) { tree_matrix <- base::strsplit(tree_lines, "\\|") # Split each row into columns max_cols <- max(sapply(tree_matrix, length)) # Find the max column count @@ -151,47 +148,236 @@ parseTree <- function(tree_lines) { #' infer relationship from tree template #' #' @param tree_long A data frame containing the tree structure in long format. +#' @param tree_paths Optional. traceTreePaths output. If NULL, it will be calculated. #' @return A data frame containing the relationships between family members. #' @keywords internal #' -parseRelationships <- function(tree_long) { +parseTreeRelationships <- function(tree_long, tree_paths = NULL) { + # Check if tree_paths is NULL and call traceTreePaths if necessary + if (is.null(tree_paths)) { + tree_paths <- traceTreePaths(tree_long, deduplicate = FALSE) + } + # Initialize relationships dataframe: one row per unique person + person_ids <- unique(tree_long$id[!is.na(tree_long$id)]) + + # Initialize relationships data frame relationships <- data.frame( - id = tree_long$id, + id = person_ids, momID = NA_character_, dadID = NA_character_, + parent_1 = NA_character_, + parent_2 = NA_character_, spouseID = NA_character_, stringsAsFactors = FALSE ) - # Loop through rows to find connections - for (i in seq_len(nrow(tree_long))) { - row <- tree_long[i, ] - - # **Parent-Child Detection** - if (row$Value == "y") { - parent <- tree_long$Value[tree_long$Row == row$Row - 1 & tree_long$Column == row$Column] - child <- tree_long$Value[tree_long$Row == row$Row + 1 & tree_long$Column == row$Column] - - if (length(parent) == 0) parent <- NA - if (length(child) == 0) child <- NA - # Assign mom/dad IDs based on tree structure - if (!is.na(parent) && !is.na(child)) { - relationships$momID[relationships$id == child] <- parent - relationships$dadID[relationships$id == child] <- parent # Assuming one parent detected for now - } + tree_paths <- tree_paths[!is.na(tree_paths$from_id) & !is.na(tree_paths$to_id), ] + + # Fill in relationships based on the tree structure + tree_paths$relationship <- NA_character_ + + # map relationships based on the intermediate values + + tree_paths$relationship[ + grepl("\\+", tree_paths$intermediate_values) & + !grepl("y", tree_paths$intermediate_values) + ] <- "spouse" + + # Parent-child: + and y both present + tree_paths$relationship[ + grepl("\\+", tree_paths$intermediate_values) & + grepl("y", tree_paths$intermediate_values) + ] <- "offspring" + + tree_paths$relationship[ + is.na(tree_paths$relationship) & grepl("y", tree_paths$intermediate_values) + ] <- "offspring" + + # determine direction + tree_paths$relationship[grepl("^\\+", tree_paths$intermediate_values) & tree_paths$relationship == "offspring"] <- "parent-child" + tree_paths$relationship[grepl("[y\\|]$", tree_paths$intermediate_values) & tree_paths$relationship == "offspring"] <- "parent-child" + tree_paths$relationship[grepl("\\+$", tree_paths$intermediate_values) & tree_paths$relationship == "offspring"] <- "child-parent" + tree_paths$relationship[grepl("^[y\\|]", tree_paths$intermediate_values) & tree_paths$relationship == "offspring"] <- "child-parent" + + # Fill spouse links + spouse_links <- tree_paths[tree_paths$relationship == "spouse", ] + + for (i in seq_len(nrow(spouse_links))) { + a <- spouse_links$from_id[i] + b <- spouse_links$to_id[i] + relationships$spouseID[relationships$id == a] <- b + relationships$spouseID[relationships$id == b] <- a + } + + # Fill parent-child links from directional tags + + pc_links <- tree_paths[tree_paths$relationship == "parent-child", ] + for (i in seq_len(nrow(pc_links))) { + relationships <- populateParents( + df = relationships, + child = pc_links$to_id[i], + parent = pc_links$from_id[i] + ) + } + + # --- Child-parent (to_id = parent) --- + cp_links <- tree_paths[tree_paths$relationship == "child-parent", ] + for (i in seq_len(nrow(cp_links))) { + relationships <- populateParents( + df = relationships, + child = cp_links$from_id[i], + parent = cp_links$to_id[i] + ) + } + + out <- list( + tree_paths = tree_paths, + relationships = relationships + ) + return(out) +} + +#' Assign Parent +#' @param df A data frame containing the relationships. +#' @param child The ID of the child. +#' @param parent The ID of the parent. +#' @return A data frame with updated parent information. +#' @keywords internal +populateParents <- function(df, child, parent) { + idx <- which(df$id == child) + if (length(idx) != 1) { + return(df) + } + + if (is.na(df$parent_1[idx])) { + df$parent_1[idx] <- parent + } else if (is.na(df$parent_2[idx]) && df$parent_1[idx] != parent) { + df$parent_2[idx] <- parent + } + return(df) +} + + +#' Trace paths between individuals in a family tree grid +#' +#' @param tree_long A data.frame with columns: Row, Column, Value, id +#' @param deduplicate Logical, if TRUE, will remove duplicate paths +#' @return A data.frame with columns: from_id, to_id, direction, path_length, intermediates +#' @export +#' +traceTreePaths <- function(tree_long, deduplicate = TRUE) { + # Keep only relevant cells (people and path symbols) + path_symbols <- c("|", "-", "+", "v", "^", "y", ",", ".", "`", "!", "~", "x", ")", "(") + tree_long$Value <- gsub("\\s+", "", tree_long$Value) # Remove whitespace + active_cells <- tree_long[!is.na(tree_long$Value) & + (tree_long$Value %in% path_symbols | !is.na(tree_long$id)), ] + + active_cells$key <- paste(active_cells$Row, active_cells$Column, sep = "_") + + edges <- do.call(rbind, lapply(seq_len(nrow(active_cells)), function(i) { + from_key <- active_cells$key[i] + to_keys <- getGridNeighbors(active_cells[i, ], + active_keys = active_cells$key + ) + if (length(to_keys) > 0) { + data.frame(from = from_key, to = to_keys, stringsAsFactors = FALSE) } + })) + + # Create graph + g <- igraph::graph_from_data_frame(edges, directed = FALSE) + + # Map keys to IDs + person_nodes <- active_cells[!is.na(active_cells$id), c("key", "id")] + id_map <- setNames(person_nodes$id, person_nodes$key) + + # Find all pairs of people and trace paths + person_keys <- names(id_map) + result <- data.frame() - # **Spouse Detection** - if (row$Value == "+") { - spouse1 <- tree_long$Value[tree_long$Row == row$Row & tree_long$Column == row$Column - 1] - spouse2 <- tree_long$Value[tree_long$Row == row$Row & tree_long$Column == row$Column + 1] + for (i in seq_along(person_keys)) { + for (j in seq_along(person_keys)) { + if (i == j) next + from_key <- person_keys[i] + to_key <- person_keys[j] - if (!is.na(spouse1) && !is.na(spouse2)) { - relationships$spouseID[relationships$id == spouse1] <- spouse2 - relationships$spouseID[relationships$id == spouse2] <- spouse1 + # skip if either endpoint is not in graph + if (!(from_key %in% igraph::V(g)$name) || !(to_key %in% igraph::V(g)$name)) { + next } + # Find the shortest path between the two keys + sp <- suppressWarnings(igraph::shortest_paths(g, from_key, to_key, output = "vpath")$vpath[[1]]) + if (length(sp) > 1) { + intermediate <- setdiff(names(sp), c(from_key, to_key)) + # Extract values at those intermediate keys + intermediate_values <- sapply(intermediate, function(k) { + cell <- active_cells[active_cells$key == k, ] + if (nrow(cell) > 0) cell$Value else NA + }) + result <- rbind(result, data.frame( + from_id = id_map[[from_key]], + to_id = id_map[[to_key]], + path_length = length(sp) - 1, + intermediates = paste(intermediate, collapse = ";"), + intermediate_values = paste(intermediate_values, collapse = ""), + stringsAsFactors = FALSE + )) + } else { + # If no path found, add a row with NA values + result <- rbind(result, data.frame( + from_id = id_map[[from_key]], + to_id = id_map[[to_key]], + path_length = NA, + intermediates = NA, + intermediate_values = NA, + stringsAsFactors = FALSE + )) + } + } + } + + if (deduplicate == TRUE) { + # Deduplicate pairs + result <- deduplicatePairs(result) + } + + return(result) +} + +#' Build adjacency list (4-way neighbors) +#' +#' @param cell A data frame with columns Row and Column +#' @return A character vector of neighboring cell keys +#' @keywords internal + +getGridNeighbors <- function(cell, active_keys) { + offsets <- list(c(1, 0), c(-1, 0), c(0, 1), c(0, -1)) # down, up, right, left + out <- character() + for (offset in offsets) { + r2 <- cell$Row + offset[1] + c2 <- cell$Column + offset[2] + key2 <- paste(r2, c2, sep = "_") + if (key2 %in% active_keys) { + out <- c(out, key2) } } + return(out) +} + +#' Deduplicate pairs of IDs in a data frame +#' +#' @param df A data frame with columns from_id and to_id +#' @return A data frame with unique pairs of IDs +#' @keywords internal +deduplicatePairs <- function(df) { + # Create a new column with sorted pairs + df$pair <- apply(df[, c("from_id", "to_id")], 1, function(x) paste(sort(x), collapse = "_")) + + # Remove duplicates based on the pair column + df_dedup <- df[!duplicated(df$pair), ] + + # Drop the pair column + df_dedup$pair <- NULL - return(relationships) + return(df_dedup) } diff --git a/R/buildPedigree.R b/R/segmentPedigree.R similarity index 100% rename from R/buildPedigree.R rename to R/segmentPedigree.R diff --git a/man/parseTree.Rd b/man/buildTreeGrid.Rd similarity index 82% rename from man/parseTree.Rd rename to man/buildTreeGrid.Rd index 6982013e..13205cee 100644 --- a/man/parseTree.Rd +++ b/man/buildTreeGrid.Rd @@ -1,10 +1,10 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/readWikifamilytree.R -\name{parseTree} -\alias{parseTree} +\name{buildTreeGrid} +\alias{buildTreeGrid} \title{Parse Tree} \usage{ -parseTree(tree_lines) +buildTreeGrid(tree_lines) } \arguments{ \item{tree_lines}{A character vector containing the lines of the tree structure.} diff --git a/man/calcAllGens.Rd b/man/calcAllGens.Rd index 66a89c25..35871be7 100644 --- a/man/calcAllGens.Rd +++ b/man/calcAllGens.Rd @@ -3,7 +3,7 @@ \name{calcAllGens} \alias{calcAllGens} \alias{allGens} -\title{allGens +\title{calcAllGens A function to calculate the number of individuals in each generation. This is a supporting function for \code{simulatePedigree}.} \usage{ calcAllGens(kpc, Ngen, marR) @@ -21,6 +21,6 @@ allGens(kpc, Ngen, marR) Returns a vector containing the number of individuals in every generation. } \description{ -allGens +calcAllGens A function to calculate the number of individuals in each generation. This is a supporting function for \code{simulatePedigree}. } diff --git a/man/calcFamilySize.Rd b/man/calcFamilySize.Rd index a0128d0c..dda31d20 100644 --- a/man/calcFamilySize.Rd +++ b/man/calcFamilySize.Rd @@ -3,7 +3,7 @@ \name{calcFamilySize} \alias{calcFamilySize} \alias{famSizeCal} -\title{famSizeCal +\title{calcFamilySize A function to calculate the total number of individuals in a pedigree given parameters. This is a supporting function for function \code{simulatePedigree}} \usage{ calcFamilySize(kpc, Ngen, marR) @@ -21,6 +21,6 @@ famSizeCal(kpc, Ngen, marR) Returns a numeric value indicating the total pedigree size. } \description{ -famSizeCal +calcFamilySize A function to calculate the total number of individuals in a pedigree given parameters. This is a supporting function for function \code{simulatePedigree} } diff --git a/man/calcFamilySizeByGen.Rd b/man/calcFamilySizeByGen.Rd index ae3e5e88..f849339c 100644 --- a/man/calcFamilySizeByGen.Rd +++ b/man/calcFamilySizeByGen.Rd @@ -3,7 +3,7 @@ \name{calcFamilySizeByGen} \alias{calcFamilySizeByGen} \alias{sizeAllGens} -\title{sizeAllGens +\title{calcFamilySizeByGen An internal supporting function for \code{simulatePedigree}.} \usage{ calcFamilySizeByGen(kpc, Ngen, marR) @@ -21,6 +21,6 @@ sizeAllGens(kpc, Ngen, marR) Returns a vector including the number of individuals in every generation. } \description{ -sizeAllGens +calcFamilySizeByGen An internal supporting function for \code{simulatePedigree}. } diff --git a/man/compute_parent_adjacency.Rd b/man/computeParentAdjacency.Rd similarity index 90% rename from man/compute_parent_adjacency.Rd rename to man/computeParentAdjacency.Rd index f6364808..353a655a 100644 --- a/man/compute_parent_adjacency.Rd +++ b/man/computeParentAdjacency.Rd @@ -1,10 +1,10 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/convertPedigree.R -\name{compute_parent_adjacency} -\alias{compute_parent_adjacency} +% Please edit documentation in R/constructAdjacency.R +\name{computeParentAdjacency} +\alias{computeParentAdjacency} \title{Compute Parent Adjacency Matrix with Multiple Approaches} \usage{ -compute_parent_adjacency( +computeParentAdjacency( ped, component, adjacency_method = "direct", @@ -44,7 +44,7 @@ compute_parent_adjacency( \item{checkpoint_files}{a list of checkpoint files} -\item{update_rate}{numeric. The rate at which to print progress} +\item{update_rate}{the rate at which to update the progress} \item{parList}{a list of parent-child relationships} diff --git a/man/makeLongTree.Rd b/man/convertGrid2LongTree.Rd similarity index 78% rename from man/makeLongTree.Rd rename to man/convertGrid2LongTree.Rd index 96d4a514..d3bd5b5f 100644 --- a/man/makeLongTree.Rd +++ b/man/convertGrid2LongTree.Rd @@ -1,10 +1,10 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/readWikifamilytree.R -\name{makeLongTree} -\alias{makeLongTree} +\name{convertGrid2LongTree} +\alias{convertGrid2LongTree} \title{Make Long Tree} \usage{ -makeLongTree(tree_df, cols_to_pivot) +convertGrid2LongTree(tree_df, cols_to_pivot) } \arguments{ \item{tree_df}{A data frame containing the tree structure.} diff --git a/man/deduplicatePairs.Rd b/man/deduplicatePairs.Rd new file mode 100644 index 00000000..e133d803 --- /dev/null +++ b/man/deduplicatePairs.Rd @@ -0,0 +1,18 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/readWikifamilytree.R +\name{deduplicatePairs} +\alias{deduplicatePairs} +\title{Deduplicate pairs of IDs in a data frame} +\usage{ +deduplicatePairs(df) +} +\arguments{ +\item{df}{A data frame with columns from_id and to_id} +} +\value{ +A data frame with unique pairs of IDs +} +\description{ +Deduplicate pairs of IDs in a data frame +} +\keyword{internal} diff --git a/man/dot-computeTranspose.Rd b/man/dot-computeTranspose.Rd index 38c8fa82..4d90dcbc 100644 --- a/man/dot-computeTranspose.Rd +++ b/man/dot-computeTranspose.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/convertPedigree.R +% Please edit documentation in R/buildComponent.R \name{.computeTranspose} \alias{.computeTranspose} \title{Compute the transpose multiplication for the relatedness matrix} diff --git a/man/matchMembers.Rd b/man/extractMemberTable.Rd similarity index 82% rename from man/matchMembers.Rd rename to man/extractMemberTable.Rd index 382c05ec..72b0945b 100644 --- a/man/matchMembers.Rd +++ b/man/extractMemberTable.Rd @@ -1,10 +1,10 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/readWikifamilytree.R -\name{matchMembers} -\alias{matchMembers} +\name{extractMemberTable} +\alias{extractMemberTable} \title{Match Members} \usage{ -matchMembers(text) +extractMemberTable(text) } \arguments{ \item{text}{A character string containing the text of a family tree in wiki format.} diff --git a/man/getGridNeighbors.Rd b/man/getGridNeighbors.Rd new file mode 100644 index 00000000..cc1e20fa --- /dev/null +++ b/man/getGridNeighbors.Rd @@ -0,0 +1,18 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/readWikifamilytree.R +\name{getGridNeighbors} +\alias{getGridNeighbors} +\title{Build adjacency list (4-way neighbors)} +\usage{ +getGridNeighbors(cell, active_keys) +} +\arguments{ +\item{cell}{A data frame with columns Row and Column} +} +\value{ +A character vector of neighboring cell keys +} +\description{ +Build adjacency list (4-way neighbors) +} +\keyword{internal} diff --git a/man/extractSummaryText.Rd b/man/getWikiTreeSummary.Rd similarity index 81% rename from man/extractSummaryText.Rd rename to man/getWikiTreeSummary.Rd index 9b12be26..dadc4c33 100644 --- a/man/extractSummaryText.Rd +++ b/man/getWikiTreeSummary.Rd @@ -1,10 +1,10 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/readWikifamilytree.R -\name{extractSummaryText} -\alias{extractSummaryText} +\name{getWikiTreeSummary} +\alias{getWikiTreeSummary} \title{Extract Summary Text} \usage{ -extractSummaryText(text) +getWikiTreeSummary(text) } \arguments{ \item{text}{A character string containing the text of a family tree in wiki format.} diff --git a/man/isChild.Rd b/man/isChild.Rd index 1bd6f1e9..be64125d 100644 --- a/man/isChild.Rd +++ b/man/isChild.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/convertPedigree.R +% Please edit documentation in R/constructAdjacency.R \name{isChild} \alias{isChild} \title{Determine isChild Status, isChild is the 'S' matrix from RAM} diff --git a/man/parseRelationships.Rd b/man/parseTreeRelationships.Rd similarity index 66% rename from man/parseRelationships.Rd rename to man/parseTreeRelationships.Rd index 24e864b5..a29a56da 100644 --- a/man/parseRelationships.Rd +++ b/man/parseTreeRelationships.Rd @@ -1,13 +1,15 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/readWikifamilytree.R -\name{parseRelationships} -\alias{parseRelationships} +\name{parseTreeRelationships} +\alias{parseTreeRelationships} \title{infer relationship from tree template} \usage{ -parseRelationships(tree_long) +parseTreeRelationships(tree_long, tree_paths = NULL) } \arguments{ \item{tree_long}{A data frame containing the tree structure in long format.} + +\item{tree_paths}{Optional. traceTreePaths output. If NULL, it will be calculated.} } \value{ A data frame containing the relationships between family members. diff --git a/man/ped2add.Rd b/man/ped2add.Rd index c2179e99..c3949c68 100644 --- a/man/ped2add.Rd +++ b/man/ped2add.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/convertPedigree.R +% Please edit documentation in R/buildComponent.R \name{ped2add} \alias{ped2add} \title{Take a pedigree and turn it into an additive genetics relatedness matrix} diff --git a/man/ped2ce.Rd b/man/ped2ce.Rd index b4e3db1d..ed22966a 100644 --- a/man/ped2ce.Rd +++ b/man/ped2ce.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/convertPedigree.R +% Please edit documentation in R/buildComponent.R \name{ped2ce} \alias{ped2ce} \title{Take a pedigree and turn it into an extended environmental relatedness matrix} diff --git a/man/ped2cn.Rd b/man/ped2cn.Rd index c738d13b..3178cab6 100644 --- a/man/ped2cn.Rd +++ b/man/ped2cn.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/convertPedigree.R +% Please edit documentation in R/buildComponent.R \name{ped2cn} \alias{ped2cn} \title{Take a pedigree and turn it into a common nuclear environmental relatedness matrix} diff --git a/man/ped2com.Rd b/man/ped2com.Rd index 58c0fc47..b49a6b03 100644 --- a/man/ped2com.Rd +++ b/man/ped2com.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/convertPedigree.R +% Please edit documentation in R/buildComponent.R \name{ped2com} \alias{ped2com} \title{Take a pedigree and turn it into a relatedness matrix} diff --git a/man/ped2fam.Rd b/man/ped2fam.Rd index 2ff7eb0d..3052e568 100644 --- a/man/ped2fam.Rd +++ b/man/ped2fam.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/buildPedigree.R +% Please edit documentation in R/segmentPedigree.R \name{ped2fam} \alias{ped2fam} \title{Segment Pedigree into Extended Families} diff --git a/man/ped2graph.Rd b/man/ped2graph.Rd index 13c191d7..5e7ac7b1 100755 --- a/man/ped2graph.Rd +++ b/man/ped2graph.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/buildPedigree.R +% Please edit documentation in R/segmentPedigree.R \name{ped2graph} \alias{ped2graph} \title{Turn a pedigree into a graph} diff --git a/man/ped2maternal.Rd b/man/ped2maternal.Rd index e54c3e76..03e02311 100755 --- a/man/ped2maternal.Rd +++ b/man/ped2maternal.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/buildPedigree.R +% Please edit documentation in R/segmentPedigree.R \name{ped2maternal} \alias{ped2maternal} \title{Add a maternal line ID variable to a pedigree} diff --git a/man/ped2mit.Rd b/man/ped2mit.Rd index c19e9ba7..2c7dbcb1 100644 --- a/man/ped2mit.Rd +++ b/man/ped2mit.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/convertPedigree.R +% Please edit documentation in R/buildComponent.R \name{ped2mit} \alias{ped2mit} \alias{ped2mt} diff --git a/man/ped2paternal.Rd b/man/ped2paternal.Rd index 16a9e35a..e893ec03 100755 --- a/man/ped2paternal.Rd +++ b/man/ped2paternal.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/buildPedigree.R +% Please edit documentation in R/segmentPedigree.R \name{ped2paternal} \alias{ped2paternal} \title{Add a paternal line ID variable to a pedigree} diff --git a/man/populateParents.Rd b/man/populateParents.Rd new file mode 100644 index 00000000..f0871d6a --- /dev/null +++ b/man/populateParents.Rd @@ -0,0 +1,22 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/readWikifamilytree.R +\name{populateParents} +\alias{populateParents} +\title{Assign Parent} +\usage{ +populateParents(df, child, parent) +} +\arguments{ +\item{df}{A data frame containing the relationships.} + +\item{child}{The ID of the child.} + +\item{parent}{The ID of the parent.} +} +\value{ +A data frame with updated parent information. +} +\description{ +Assign Parent +} +\keyword{internal} diff --git a/man/traceTreePaths.Rd b/man/traceTreePaths.Rd new file mode 100644 index 00000000..5522ed83 --- /dev/null +++ b/man/traceTreePaths.Rd @@ -0,0 +1,19 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/readWikifamilytree.R +\name{traceTreePaths} +\alias{traceTreePaths} +\title{Trace paths between individuals in a family tree grid} +\usage{ +traceTreePaths(tree_long, deduplicate = TRUE) +} +\arguments{ +\item{tree_long}{A data.frame with columns: Row, Column, Value, id} + +\item{deduplicate}{Logical, if TRUE, will remove duplicate paths} +} +\value{ +A data.frame with columns: from_id, to_id, direction, path_length, intermediates +} +\description{ +Trace paths between individuals in a family tree grid +} diff --git a/tests/testthat/test-readWikiTree.R b/tests/testthat/test-readWikiTree.R index 166539d9..e7602e03 100644 --- a/tests/testthat/test-readWikiTree.R +++ b/tests/testthat/test-readWikiTree.R @@ -1,5 +1,62 @@ # readWikifamilytree + +test_that("traceTreePaths works correctly for horizontal tree", { + # Create a mock tree_horizontal data frame + # This is a simplified version of the tree_horizontal data frame + # with Row, Column, Value, and id columns + # The id column is used to identify the nodes in the tree + + tree_horizontal <- data.frame( + Row = rep(1, 3), + Column = 1:3, + Value = c("A", "+", "B"), + stringsAsFactors = FALSE + ) + tree_horizontal$id <- NA + tree_horizontal$id[tree_horizontal$Value %in% c("A", "B")] <- tree_horizontal$Value[tree_horizontal$Value %in% c("A", "B")] + + result <- traceTreePaths(tree_horizontal) + # Check the result + # Check the result + expect_equal(names(result), c("from_id", "to_id", "path_length", "intermediates", "intermediate_values")) + expect_equal(c("A", "B") %in% c(result$from_id, result$to_id), rep(TRUE, 2)) + expect_equal(result$path_length[result$from_id == "A" & result$to_id == "B"], 2) + expect_equal(result$intermediate_values[result$from_id == "A" & result$to_id == "B"], "+") +}) + + +test_that("traceTreePaths works correctly for vertical tree", { + # Create a mock tree_vertical data frame + # This is a simplified version of the tree_vertical data frame + # with Row, Column, Value, and id columns + # The id column is used to identify the nodes in the tree + tree_spouse_child <- data.frame( + Row = c(1, 1, 1, 2, 3, 4, 5), + Column = c(1, 2, 3, 2, 2, 2, 2), + Value = c("A", "+", "B", "|", "y", "|", "C"), + stringsAsFactors = FALSE + ) + tree_spouse_child$id <- NA + tree_spouse_child$id[tree_spouse_child$Value %in% c("A", "B", "C")] <- tree_spouse_child$Value[tree_spouse_child$Value %in% c("A", "B", "C")] + + result <- traceTreePaths(tree_spouse_child) + # Check the result + expect_equal(names(result), c("from_id", "to_id", "path_length", "intermediates", "intermediate_values")) + expect_equal(c("A", "B", "C") %in% c(result$from_id, result$to_id), rep(TRUE, 3)) + expect_equal(result$path_length[result$from_id == "A" & result$to_id == "B"], 2) + expect_equal(result$path_length[result$from_id == "A" & result$to_id == "C"], 5) + expect_equal(result$path_length[result$from_id == "B" & result$to_id == "C"], 5) + expect_equal(result$intermediate_values[result$from_id == "A" & result$to_id == "B"], "+") +}) + + + + + + + + test_that("readWikifamilytree reads a string correctly", { # Create a temporary WikiFamilyTree file for testing # Example usage @@ -25,7 +82,7 @@ test_that("readWikifamilytree reads a string correctly", { expect_equal( result2$summary, - "I have a brother Joe and a little sister: my mom married my dad, and my dad's parents were Grandma and Grandpa; they had another child, Aunt Daisy." + result$summary ) }) diff --git a/vignettes/modelingrelatedness.Rmd b/vignettes/modelingvariancecomponents.Rmd similarity index 98% rename from vignettes/modelingrelatedness.Rmd rename to vignettes/modelingvariancecomponents.Rmd index 6f793690..c82441a9 100644 --- a/vignettes/modelingrelatedness.Rmd +++ b/vignettes/modelingvariancecomponents.Rmd @@ -2,7 +2,7 @@ title: "Modeling variance components" output: rmarkdown::html_vignette vignette: > - %\VignetteIndexEntry{modelingandrelatedness} + %\VignetteIndexEntry{modelingvariancecomponents} %\VignetteEngine{knitr::rmarkdown} %\VignetteEncoding{UTF-8} --- diff --git a/vignettes/modelingvariancecomponents.html b/vignettes/modelingvariancecomponents.html new file mode 100644 index 00000000..7b5df889 --- /dev/null +++ b/vignettes/modelingvariancecomponents.html @@ -0,0 +1,554 @@ + + + + +
+ + + + + + + + + +This vignette provides a detailed guide to specific functions within
+the BGmisc package that aid in the identification and
+fitting of variance component models common in behavior genetics. We
+will explore key functions such as identifyComponentModel,
+providing practical examples and theoretical background. Identification
+ensures a unique set of parameters that define the model-implied
+covariance matrix, preventing free parameters from trading off one
+another.
Ensure that the BGmisc package is installed and
+loaded.
Ensure that the following dependencies are installed before +proceeding as they provide us with behavior genetic data and models:
+EasyMx
OpenMx
Note: If any of the libraries are not installed, you can install them
+using install.packages(“package_name”).
In this section, we will demonstrate core functions related to the +identification and fitting of variance component models.
+comp2vech FunctionThe comp2vech function is used to vectorize a components
+model. The function is often used in conjunction with the identification
+process. In this example, we apply it to a list of matrices:
comp2vech(list(
+ matrix(c(1, .5, .5, 1), 2, 2),
+ matrix(1, 2, 2)
+))
+#> [1] 1.0 0.5 1.0 1.0 1.0 1.0The result showcases how the matrices have been transformed, +reflecting their role in subsequent variance component analysis.
+identifyComponentModel FunctionThe identifyComponentModel function helps determine if a
+variance components model is identified. It accepts relatedness
+component matrices and returns information about identified and
+non-identified parameters.
Here’s an example using the classical twin model with only MZ +twins:
+identifyComponentModel(
+ A = list(matrix(1, 2, 2)),
+ C = list(matrix(1, 2, 2)),
+ E = diag(1, 2)
+)
+#> Component model is not identified.
+#> Non-identified parameters are A, C
+#> $identified
+#> [1] FALSE
+#>
+#> $nidp
+#> [1] "A" "C"As you can see, the model is not identified. We need to add an +additional group so that we have sufficient information. Let us add the +rest of the classical twin model, in this case DZ twins.
+identifyComponentModel(
+ A = list(matrix(c(1, .5, .5, 1), 2, 2), matrix(1, 2, 2)),
+ C = list(matrix(1, 2, 2), matrix(1, 2, 2)),
+ E = diag(1, 4)
+)
+#> Component model is identified.
+#> $identified
+#> [1] TRUE
+#>
+#> $nidp
+#> character(0)As you can see the model is identified, now that we’ve added another +group. Let us confirm by fitting a model. First we prepare the data.
+require(dplyr)
+#> Loading required package: dplyr
+#>
+#> Attaching package: 'dplyr'
+#> The following objects are masked from 'package:BGmisc':
+#>
+#> between, first, last
+#> The following objects are masked from 'package:stats':
+#>
+#> filter, lag
+#> The following objects are masked from 'package:base':
+#>
+#> intersect, setdiff, setequal, union
+# require(purrr)
+
+data(twinData, package = "OpenMx")
+selVars <- c("ht1", "ht2")
+
+mzdzData <- subset(
+ twinData, zyg %in% c(1, 3),
+ c(selVars, "zyg")
+)
+
+mzdzData$RCoef <- c(1, NA, .5)[mzdzData$zyg]
+
+
+mzData <- mzdzData %>% filter(zyg == 1)Let us fit the data with MZ twins by themselves.
+run1 <- emxTwinModel(
+ model = "Cholesky",
+ relatedness = "RCoef",
+ data = mzData,
+ use = selVars,
+ run = TRUE, name = "TwCh"
+)
+#> Running TwCh with 4 parameters
+#> Warning: In model 'TwCh' Optimizer returned a non-zero status code 5. The
+#> Hessian at the solution does not appear to be convex. See
+#> ?mxCheckIdentification for possible diagnosis (Mx status RED).
+
+summary(run1)
+#> Summary of TwCh
+#>
+#> The Hessian at the solution does not appear to be convex. See ?mxCheckIdentification for possible diagnosis (Mx status RED).
+#>
+#> free parameters:
+#> name matrix row col Estimate Std.Error A lbound ubound
+#> 1 sqrtA11 sqrtA 1 1 0.05122646 NA 1e-06
+#> 2 sqrtC11 sqrtC 1 1 0.03518629 NA 0!
+#> 3 sqrtE11 sqrtE 1 1 0.02325722 0.0007017955 ! 0!
+#> 4 Mht1 Means ht1 1 1.62974908 0.0027023907
+#>
+#> Model Statistics:
+#> | Parameters | Degrees of Freedom | Fit (-2lnL units)
+#> Model: 4 1112 -3693.148
+#> Saturated: 5 1111 NA
+#> Independence: 4 1112 NA
+#> Number of observations/statistics: 569/1116
+#>
+#>
+#> ** Information matrix is not positive definite (not at a candidate optimum).
+#> Be suspicious of these results. At minimum, do not trust the standard errors.
+#>
+#> Information Criteria:
+#> | df Penalty | Parameters Penalty | Sample-Size Adjusted
+#> AIC: -5917.148 -3685.148 -3685.078
+#> BIC: -10747.543 -3667.773 -3680.471
+#> To get additional fit indices, see help(mxRefModels)
+#> timestamp: 2025-04-21 16:43:57
+#> Wall clock time: 0.04138112 secs
+#> optimizer: SLSQP
+#> OpenMx version number: 2.21.13
+#> Need help? See help(mxSummary)As you can see the model was unsuccessful because it was not +identified. But when we add another group, so that the model is +identified, the model now fits.
+run2 <- emxTwinModel(
+ model = "Cholesky",
+ relatedness = "RCoef",
+ data = mzdzData,
+ use = selVars,
+ run = TRUE, name = "TwCh"
+)
+#> Running TwCh with 4 parameters
+
+summary(run2)
+#> Summary of TwCh
+#>
+#> free parameters:
+#> name matrix row col Estimate Std.Error A lbound ubound
+#> 1 sqrtA11 sqrtA 1 1 0.06339271 0.0014377690 1e-06
+#> 2 sqrtC11 sqrtC 1 1 0.00000100 0.0250260004 ! 0!
+#> 3 sqrtE11 sqrtE 1 1 0.02330040 0.0007015267 0!
+#> 4 Mht1 Means ht1 1 1.63295540 0.0020511844
+#>
+#> Model Statistics:
+#> | Parameters | Degrees of Freedom | Fit (-2lnL units)
+#> Model: 4 1803 -5507.092
+#> Saturated: 5 1802 NA
+#> Independence: 4 1803 NA
+#> Number of observations/statistics: 920/1807
+#>
+#> Information Criteria:
+#> | df Penalty | Parameters Penalty | Sample-Size Adjusted
+#> AIC: -9113.092 -5499.092 -5499.048
+#> BIC: -17811.437 -5479.794 -5492.498
+#> To get additional fit indices, see help(mxRefModels)
+#> timestamp: 2025-04-21 16:43:57
+#> Wall clock time: 0.03366184 secs
+#> optimizer: SLSQP
+#> OpenMx version number: 2.21.13
+#> Need help? See help(mxSummary)