diff --git a/DESCRIPTION b/DESCRIPTION index b35bfadd..bcdab09f 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: BGmisc Title: An R Package for Extended Behavior Genetics Analysis -Version: 1.3.5.1 +Version: 1.3.6 Authors@R: c( person("S. Mason", "Garrison", , "garrissm@wfu.edu", role = c("aut", "cre"), comment = c(ORCID = "0000-0002-4804-6003")), diff --git a/NEWS.md b/NEWS.md index f0fd33f4..d8bd3dd0 100644 --- a/NEWS.md +++ b/NEWS.md @@ -2,6 +2,7 @@ * revived checkParents function to check for handling phantom parents and missing parents * added tests for checkParents function * added GoT analysis +* reduced complexity of com2links and summarizePedigree with the use of subfunctions # BGmisc 1.3.5.1 * Setting the default for the `sparse` argument in `ped2com()` to TRUE diff --git a/R/buildPedigree.R b/R/buildPedigree.R index 6a89f94e..3d94ecad 100644 --- a/R/buildPedigree.R +++ b/R/buildPedigree.R @@ -1,6 +1,7 @@ #' Segment Pedigree into Extended Families #' -#' This function adds an extended family ID variable to a pedigree by segmenting that dataset into independent extended families +#' This function adds an extended family ID variable to a pedigree by segmenting +#' that dataset into independent extended families #' using the weakly connected components algorithm. #' #' @@ -86,7 +87,8 @@ ped2graph <- function(ped, adjacent = c("parents", "mothers", "fathers"), ...) { # Check ped/data.fram - if (!inherits(ped, "data.frame")) stop("ped should be a data.frame or inherit to a data.frame") + if (!inherits(ped, "data.frame")) { + stop("ped should be a data.frame or inherit to a data.frame")} # Handle adjacent argument adjacent <- match.arg(tolower(adjacent)[1], choices = c( @@ -165,7 +167,7 @@ ped2graph <- function(ped, #' Add a maternal line ID variable to a pedigree #' @inheritParams ped2fam -#' @param matID Character. Maternal line ID variable to be created and added to the pedigree +#' @param matID Character. Maternal line ID variable to be created and added to the pedigree #' @details #' Under various scenarios it is useful to know which people in a pedigree #' belong to the same maternal lines. This function first turns a pedigree @@ -177,9 +179,11 @@ ped2graph <- function(ped, #' @export #' ped2maternal <- function(ped, personID = "ID", - momID = "momID", dadID = "dadID", matID = "matID", ...) { + momID = "momID", dadID = "dadID", + matID = "matID", ...) { # Call to wrapper function - .ped2id(ped = ped, personID = personID, momID = momID, dadID = dadID, famID = matID, type = "mothers") + .ped2id(ped = ped, personID = personID, momID = momID, + dadID = dadID, famID = matID, type = "mothers") } #' Add a paternal line ID variable to a pedigree @@ -199,5 +203,6 @@ ped2paternal <- function(ped, personID = "ID", momID = "momID", dadID = "dadID", patID = "patID", ...) { # Call to wrapper function - .ped2id(ped = ped, personID = personID, momID = momID, dadID = dadID, famID = patID, type = "fathers") + .ped2id(ped = ped, personID = personID, momID = momID, + dadID = dadID, famID = patID, type = "fathers") } diff --git a/R/checkParents.R b/R/checkParents.R index b03a5926..eab8e0ca 100644 --- a/R/checkParents.R +++ b/R/checkParents.R @@ -23,8 +23,7 @@ checkParentIDs <- function(ped, verbose = FALSE, repair = FALSE, repairsex = repair, addphantoms = repair, - parentswithoutrow = repair - ) { + parentswithoutrow = repair) { # Standardize column names in the input dataframe ped_og <- ped ped <- standardizeColnames(ped) @@ -74,7 +73,7 @@ checkParentIDs <- function(ped, verbose = FALSE, repair = FALSE, cat("All parents are listed in the pedigree.\n") } } - validation_results$missing_parents <- validation_results$single_parents&length(rowless_parents) > 0 + validation_results$missing_parents <- validation_results$single_parents & length(rowless_parents) > 0 if (verbose) { cat("Step 2: Determining the if moms are the same sex and dads are same sex\n") @@ -146,11 +145,13 @@ checkParentIDs <- function(ped, verbose = FALSE, repair = FALSE, # Are any parents in both momID and dadID? momdad <- intersect(ped$dadID, ped$momID) - if (!is.na(momdad)&&length(momdad) > 0) { + if (!is.na(momdad) && length(momdad) > 0) { validation_results$parents_in_both <- momdad if (verbose) { - cat(paste("Some individuals appear in both momID and dadID roles.\n", - "These individuals are:\n")) + cat(paste( + "Some individuals appear in both momID and dadID roles.\n", + "These individuals are:\n" + )) print(momdad) } } @@ -162,12 +163,12 @@ checkParentIDs <- function(ped, verbose = FALSE, repair = FALSE, print(validation_results) } return(validation_results) - } else{ - if (verbose) { - cat("Validation Results:\n") - print(validation_results) - cat("Step 3: Attempting to repair missing parents...\n") - } + } else { + if (verbose) { + cat("Validation Results:\n") + print(validation_results) + cat("Step 3: Attempting to repair missing parents...\n") + } cat("REPAIR IN EARLY ALPHA\n") # Initialize a list to track changes made during repair @@ -177,32 +178,32 @@ checkParentIDs <- function(ped, verbose = FALSE, repair = FALSE, phantom_dads_added = c(), phantom_moms_added = c() ) - if(repairsex){ - # Fix sex of existing parents if wrong - mom_indices <- match(ped$momID, ped$ID) - dad_indices <- match(ped$dadID, ped$ID) + if (repairsex) { + # Fix sex of existing parents if wrong + mom_indices <- match(ped$momID, ped$ID) + dad_indices <- match(ped$dadID, ped$ID) - if (!is.na(validation_results$female_var)) { - corrected_moms <- ped$ID[mom_indices[!is.na(mom_indices)]] - ped$sex[mom_indices[!is.na(mom_indices)]] <- validation_results$female_var - changes$corrected_mom_sex <- corrected_moms - if (verbose && length(corrected_moms) > 0) { - cat("Corrected sex of moms for:", paste(corrected_moms, collapse = ", "), "\n") + if (!is.na(validation_results$female_var)) { + corrected_moms <- ped$ID[mom_indices[!is.na(mom_indices)]] + ped$sex[mom_indices[!is.na(mom_indices)]] <- validation_results$female_var + changes$corrected_mom_sex <- corrected_moms + if (verbose && length(corrected_moms) > 0) { + cat("Corrected sex of moms for:", paste(corrected_moms, collapse = ", "), "\n") + } } - } - if (!is.na(validation_results$male_var)) { - corrected_dads <- ped$ID[dad_indices[!is.na(dad_indices)]] - ped$sex[dad_indices[!is.na(dad_indices)]] <- validation_results$male_var - changes$corrected_dad_sex <- corrected_dads - if (verbose && length(corrected_dads) > 0) { - cat("Corrected sex of dads for:", paste(corrected_dads, collapse = ", "), "\n") + if (!is.na(validation_results$male_var)) { + corrected_dads <- ped$ID[dad_indices[!is.na(dad_indices)]] + ped$sex[dad_indices[!is.na(dad_indices)]] <- validation_results$male_var + changes$corrected_dad_sex <- corrected_dads + if (verbose && length(corrected_dads) > 0) { + cat("Corrected sex of dads for:", paste(corrected_dads, collapse = ", "), "\n") + } } } - } -} -if (addphantoms){ + } + if (addphantoms) { # Generate new IDs newIDbase <- if (is.numeric(ped$ID)) max(ped$ID, na.rm = TRUE) + 1 else paste0("phantom-", seq_len(nrow(ped))) new_entries <- data.frame() @@ -248,47 +249,47 @@ if (addphantoms){ if (verbose && length(changes$phantom_moms_added) > 0) { cat("Added phantom moms for:", paste(changes$phantom_moms_added, collapse = ", "), "\n") } -} + } # add phantom parents - if(parentswithoutrow){ - # Add parents who appear in momID or dadID but are missing from ID - listed_parents <- unique(c(ped$momID, ped$dadID)) - listed_parents <- listed_parents[!is.na(listed_parents)] - - existing_ids <- ped$ID - missing_parents <- setdiff(listed_parents, existing_ids) - - if (length(missing_parents) > 0) { - if (verbose) { - cat("Adding parents who were listed in momID/dadID but missing from ID:\n") - print(missing_parents) - } + if (parentswithoutrow) { + # Add parents who appear in momID or dadID but are missing from ID + listed_parents <- unique(c(ped$momID, ped$dadID)) + listed_parents <- listed_parents[!is.na(listed_parents)] + + existing_ids <- ped$ID + missing_parents <- setdiff(listed_parents, existing_ids) + + if (length(missing_parents) > 0) { + if (verbose) { + cat("Adding parents who were listed in momID/dadID but missing from ID:\n") + print(missing_parents) + } - for (pid in missing_parents) { - role <- unique( - c( - if (pid %in% ped$momID) "mom" else NULL, - if (pid %in% ped$dadID) "dad" else NULL + for (pid in missing_parents) { + role <- unique( + c( + if (pid %in% ped$momID) "mom" else NULL, + if (pid %in% ped$dadID) "dad" else NULL + ) ) - ) - inferred_sex <- if ("mom" %in% role) validation_results$female_var else validation_results$male_var - - new_row <- ped[1, ] - new_row$ID <- pid - new_row$dadID <- NA - new_row$momID <- NA - new_row$sex <- inferred_sex - new_entries <- rbind(new_entries, new_row) + inferred_sex <- if ("mom" %in% role) validation_results$female_var else validation_results$male_var + + new_row <- ped[1, ] + new_row$ID <- pid + new_row$dadID <- NA + new_row$momID <- NA + new_row$sex <- inferred_sex + new_entries <- rbind(new_entries, new_row) + } } - } - ped <- merge(ped, new_entries, all = TRUE) + ped <- merge(ped, new_entries, all = TRUE) } - if (verbose) { - cat("Changes Made:\n") - print(changes) - } - return(ped) + if (verbose) { + cat("Changes Made:\n") + print(changes) + } + return(ped) } #' Repair Parent IDs #' diff --git a/R/checkPedigree.R b/R/checkPedigree.R index 1ebe6afa..3c0af179 100644 --- a/R/checkPedigree.R +++ b/R/checkPedigree.R @@ -13,8 +13,10 @@ #' @return List containing detailed validation results. #' @examples #' \dontrun{ -#' results <- checkPedigreeNetwork(ped, personID = "ID", -#' momID = "momID", dadID = "dadID", verbose = TRUE) +#' results <- checkPedigreeNetwork(ped, +#' personID = "ID", +#' momID = "momID", dadID = "dadID", verbose = TRUE +#' ) #' } #' @export checkPedigreeNetwork <- function(ped, personID = "ID", momID = "momID", dadID = "dadID", verbose = FALSE) { diff --git a/R/computeRelatedness.R b/R/computeRelatedness.R index 2533c16f..e3f53b8b 100644 --- a/R/computeRelatedness.R +++ b/R/computeRelatedness.R @@ -133,7 +133,6 @@ calculateH <- function(r1, r2, obsR1, obsR2) { message("Your scale might be reverse coded because you have negative correlations. Please check your data. ") } - # Calculate heritability estimates (H^2) for all pairs heritability_estimates <- (obsR1 - obsR2) / (r1 - r2) @@ -145,7 +144,5 @@ calculateH <- function(r1, r2, obsR1, obsR2) { if (any(heritability_estimates > 1)) { warning("Some calculated heritability values are greater than 1, which may suggest overestimation or errors in the observed correlations or relatedness coefficients.") } - - return(heritability_estimates) } diff --git a/R/convertPedigree.R b/R/convertPedigree.R index 257bc02f..0f454acc 100644 --- a/R/convertPedigree.R +++ b/R/convertPedigree.R @@ -95,7 +95,7 @@ ped2com <- function(ped, component, # standardize colnames if (standardize.colnames) { - ped <- standardizeColnames(ped) + ped <- standardizeColnames(ped, verbose = verbose) } # Load final result if computation was completed @@ -212,7 +212,7 @@ ped2com <- function(ped, component, # isPar is the adjacency matrix. 'A' matrix from RAM if (component %in% c("common nuclear")) { Matrix::diag(isPar) <- 1 - if (sparse==FALSE) { + if (sparse == FALSE) { isPar <- as.matrix(isPar) } return(isPar) @@ -224,15 +224,8 @@ ped2com <- function(ped, component, } else { # isChild is the 'S' matrix from RAM - if (isChild_method == "partialparent") { - isChild <- apply(ped[, c("momID", "dadID")], 1, function(x) { - .5 + .25 * sum(is.na(x)) # 2 parents -> .5, 1 parent -> .75, 0 parents -> 1 - }) - } else { - isChild <- apply(ped[, c("momID", "dadID")], 1, function(x) { - 2^(-!all(is.na(x))) - }) - } + isChild <- isChild(isChild_method=isChild_method, ped=ped) + if (saveable) { saveRDS(isChild, file = checkpoint_files$isChild) } @@ -330,7 +323,7 @@ ped2com <- function(ped, component, # Assign 1 to all nonzero elements for mitochondrial component } - if (sparse==FALSE) { + if (sparse == FALSE) { r <- as.matrix(r) } if (flatten.diag) { # flattens diagonal if you don't want to deal with inbreeding @@ -723,3 +716,22 @@ compute_parent_adjacency <- function(ped, component, } return(list_of_adjacency) } + + +#' Determine isChild Status, isChild is the 'S' matrix from RAM +#' @param isChild_method method to determine isChild status +#' @param ped pedigree data frame +#' @return isChild 'S' matrix +#' + +isChild <- function(isChild_method, ped) { + if (isChild_method == "partialparent") { + isChild <- apply(ped[, c("momID", "dadID")], 1, function(x) { + .5 + .25 * sum(is.na(x)) # 2 parents -> .5, 1 parent -> .75, 0 parents -> 1 + }) + } else { + isChild <- apply(ped[, c("momID", "dadID")], 1, function(x) { + 2^(-!all(is.na(x))) + }) + } +} diff --git a/R/makeLinks.R b/R/makeLinks.R index 9517b533..96768fd5 100644 --- a/R/makeLinks.R +++ b/R/makeLinks.R @@ -46,42 +46,27 @@ com2links <- function( # Ensure that at least one relationship matrix is provided. if (is.null(ad_ped_matrix) && is.null(mit_ped_matrix) && is.null(cn_ped_matrix)) { - stop("At least one of 'ped_matrix', 'mit_ped_matrix', or 'cn_ped_matrix' must be provided.") + stop("At least one of 'ad_ped_matrix', 'mit_ped_matrix', or 'cn_ped_matrix' must be provided.") } # Validate and convert ad_ped_matrix to a sparse dgCMatrix if provided. if (!is.null(ad_ped_matrix)) { - if (!inherits(ad_ped_matrix, c("matrix", "dgCMatrix", "dsCMatrix"))) { - stop("The 'ad_ped_matrix' must be a matrix or dgCMatrix.") - } - # convert to sparse - if (!inherits(ad_ped_matrix, "dgCMatrix")) { - ad_ped_matrix <- methods::as(ad_ped_matrix, "dgCMatrix") - } + ad_ped_matrix <- validate_and_convert_matrix(mat=ad_ped_matrix, + name = "ad_ped_matrix") } # Validate and convert cn_ped_matrix to a sparse dgCMatrix if provided. if (!is.null(cn_ped_matrix)) { - if (!inherits(cn_ped_matrix, c("matrix", "dgCMatrix", "dsCMatrix"))) { - stop("The 'cn_ped_matrix' must be a matrix or dgCMatrix.") - } - # convert to sparse - if (!inherits(cn_ped_matrix, "dgCMatrix")) { - cn_ped_matrix <- methods::as(cn_ped_matrix, "dgCMatrix") - } - # Ensure CN matrix is symmetric. - cn_ped_matrix <- methods::as(cn_ped_matrix, "symmetricMatrix") + cn_ped_matrix <- validate_and_convert_matrix(mat=cn_ped_matrix, + name="cn_ped_matrix", + ensure_symmetric = TRUE) } # Validate and process mit_ped_matrix: convert and ensure binary values. if (!is.null(mit_ped_matrix)) { - if (!inherits(mit_ped_matrix, c("matrix", "dgCMatrix", "dsCMatrix"))) { - stop("The 'mit_ped_matrix' must be a matrix or dgCMatrix.") - } - if (!inherits(mit_ped_matrix, "dgCMatrix")) { - mit_ped_matrix <- methods::as(mit_ped_matrix, "symmetricMatrix") - } - # Ensure mitochondrial matrix values are binary (0/1) - mit_ped_matrix@x[mit_ped_matrix@x > 0] <- 1 + + mit_ped_matrix <- validate_and_convert_matrix(mat=mit_ped_matrix, + name="mit_ped_matrix",force_binary = TRUE, + ensure_symmetric = TRUE) } # --- Build IDs and Prepare Matrix Pointers --- @@ -162,12 +147,7 @@ com2links <- function( # mapa_id_file <- paste0(outcome_name, "_data_mapaID.csv") # Initialize the related pairs file with headers. - df_relpairs <- data.frame( - ID1 = numeric(0), ID2 = numeric(0) - ) - df_relpairs[[relNames[1]]] <- numeric(0) - df_relpairs[[relNames[2]]] <- numeric(0) - df_relpairs[[relNames[3]]] <- numeric(0) + df_relpairs <- initialize_empty_df(relNames = relNames) # Write the headers to the related pairs file. if (writetodisk == TRUE) { @@ -261,10 +241,8 @@ com2links <- function( } } } - if (verbose) { - if (!(j %% update_rate)) { - cat(paste0("Done with ", j, " of ", nc, "\n")) - } + if (verbose && (j %% update_rate == 0L)) { + cat("Done with", j, "of", nc, "\n") } } } else if (sum_nulls == 2) { @@ -309,11 +287,8 @@ com2links <- function( } # Initialize the related pairs file with the appropriate headers. - df_relpairs <- data.frame( - ID1 = numeric(0), ID2 = numeric(0) - ) - df_relpairs[[relNames[1]]] <- numeric(0) - df_relpairs[[relNames[2]]] <- numeric(0) + df_relpairs <- initialize_empty_df(relNames = relNames) + if (writetodisk == TRUE) { utils::write.table( df_relpairs, @@ -386,10 +361,8 @@ com2links <- function( } } } - if (verbose) { - if (!(j %% update_rate)) { - cat(paste0("Done with ", j, " of ", nc, "\n")) - } + if (verbose && (j %% update_rate == 0L)) { + cat("Done with", j, "of", nc, "\n") } } } else if (sum_nulls == 1) { @@ -426,10 +399,8 @@ com2links <- function( } # Initialize the related pairs file. - df_relpairs <- data.frame( - ID1 = numeric(0), ID2 = numeric(0) - ) - df_relpairs[[relNames[1]]] <- numeric(0) + df_relpairs <- initialize_empty_df(relNames = relNames) + if (writetodisk == TRUE) { utils::write.table( df_relpairs, @@ -486,8 +457,8 @@ com2links <- function( } } } - if (verbose && !(j %% update_rate)) { - cat(paste0("Done with ", j, " of ", nc, "\n")) + if (verbose && (j %% update_rate == 0L)) { + cat("Done with", j, "of", nc, "\n") } } } else { @@ -508,6 +479,43 @@ com2links <- function( # return(NULL) } } else if (legacy) { + # --- Legacy Mode --- + # In legacy mode, convert matrices to the expected symmetric formats. + com2links.legacy( + rel_pairs_file = rel_pairs_file, + ad_ped_matrix = ad_ped_matrix, + mit_ped_matrix = mit_ped_matrix, + cn_ped_matrix = cn_ped_matrix, + update_rate = update_rate, + verbose = verbose, + outcome_name = outcome_name + ) + return(NULL) + } + + # --- End of Legacy Mode --- + + # Merge and write the parentage matrices + # df <- full_join(mat_ped_matrix %>% arrange(ID), pat_ped_matrix %>% arrange(ID)) + + # write.table(df, file = mapa_id_file, sep = ",", append = FALSE, row.names = FALSE) +} + +#' Convert Pedigree Matrices to Related Pairs File (Legacy) +#' @description +#' This legacy function converts pedigree matrices into a related pairs file. +#' @inheritParams com2links + +com2links.legacy <- function( + rel_pairs_file = "dataRelatedPairs.csv", + ad_ped_matrix = NULL, + mit_ped_matrix = mt_ped_matrix, + mt_ped_matrix = NULL, + cn_ped_matrix = NULL, + update_rate = 500, + verbose = FALSE, + outcome_name = "data", + ...) { # --- Legacy Mode --- if (verbose) { message("Using legacy mode") @@ -536,8 +544,12 @@ com2links <- function( fname <- paste0(outcome_name, "_dataBiggestRelatedPairsTake2.csv") } # Initialize the output file with headers. - ds <- data.frame(ID1 = numeric(0), ID2 = numeric(0), addRel = numeric(0), mitRel = numeric(0), cnuRel = numeric(0)) - utils::write.table(ds, file = fname, sep = ",", append = FALSE, row.names = FALSE) + ds <- data.frame(ID1 = numeric(0), ID2 = numeric(0), + addRel = numeric(0), + mitRel = numeric(0), cnuRel = numeric(0)) + + utils::write.table(ds, file = fname, sep = ",", + append = FALSE, row.names = FALSE) # Extract IDs from the common nuclear matrix. ids <- as.numeric(dimnames(biggestCnPed)[[1]]) @@ -587,7 +599,8 @@ com2links <- function( # browser() if (cond1 || cond2 || cond3) { ID1 <- ids[u] - tds <- data.frame(ID1 = ID1, ID2 = ID2, addRel = 0, mitRel = 0, cnuRel = 0) + tds <- data.frame(ID1 = ID1, ID2 = ID2, + addRel = 0, mitRel = 0, cnuRel = 0) if (cond1) { tds$addRel[u %in% iss1vv] <- biggestPed@x[vv1] } @@ -597,18 +610,52 @@ com2links <- function( if (cond3) { tds$cnuRel[u %in% iss3vv] <- biggestCnPed@x[vv3] } - utils::write.table(tds, file = fname, row.names = FALSE, col.names = FALSE, append = TRUE, sep = ",") + utils::write.table(tds, file = fname, row.names = FALSE, + col.names = FALSE, append = TRUE, sep = ",") } - if (!(j %% 500)) { + if (!(j %% update_rate)) { cat(paste0("Done with ", j, " of ", nc, "\n")) } } + return(NULL) } +#' @title validate_and_convert_matrix +#' @description +#' This function validates and converts a matrix to a specific format. +#' +#' @param mat The matrix to be validated and converted. +#' @param name The name of the matrix for error messages. +#' @param ensure_symmetric Logical indicating whether to ensure the matrix is symmetric. +#' @param force_binary Logical indicating whether to force the matrix to be binary. +#' +#' @return The validated and converted matrix. +validate_and_convert_matrix <- function(mat, name, ensure_symmetric = FALSE, force_binary = FALSE) { + if (!inherits(mat, c("matrix", "dgCMatrix", "dsCMatrix"))) { + stop(paste0("The '", name, "' must be a matrix or dgCMatrix.")) + } + if (!inherits(mat, "dgCMatrix")) { + mat <- methods::as(mat, if (ensure_symmetric) "symmetricMatrix" else "dgCMatrix") + } + if (force_binary) { + mat@x[mat@x > 0] <- 1 + } + return(mat) +} +#' @title initialize_empty_df +#' @description +#' This function initializes an empty data frame with specified column names. +#' +#' @param relNames A vector of column names to be included in the data frame. +#' +#' @return An empty data frame with specified column names. +#' @keywords internal - # Merge and write the parentage matrices - # df <- full_join(mat_ped_matrix %>% arrange(ID), pat_ped_matrix %>% arrange(ID)) - - # write.table(df, file = mapa_id_file, sep = ",", append = FALSE, row.names = FALSE) +initialize_empty_df <- function(relNames) { + df <- data.frame(ID1 = numeric(0), ID2 = numeric(0)) + for (r in relNames) { + df[[r]] <- numeric(0) + } + return(df) } diff --git a/R/readPedigree.R b/R/readGedcom.R similarity index 53% rename from R/readPedigree.R rename to R/readGedcom.R index d79bb230..d6858e58 100644 --- a/R/readPedigree.R +++ b/R/readGedcom.R @@ -9,6 +9,7 @@ #' @param combine_cols A logical value indicating whether to combine columns with duplicate values. #' @param verbose A logical value indicating whether to print messages. #' @param skinny A logical value indicating whether to return a skinny data frame. +#' @param ... Additional arguments to be passed to the function. #' @return A data frame containing information about individuals, with the following potential columns: #' - `id`: ID of the individual #' - `momID`: ID of the individual's mother @@ -51,7 +52,8 @@ readGedcom <- function(file_path, add_parents = TRUE, remove_empty_cols = TRUE, combine_cols = TRUE, - skinny = FALSE) { + skinny = FALSE, + ...) { # Checks if (!file.exists(file_path)) stop("File does not exist: ", file_path) if (verbose) { @@ -120,38 +122,34 @@ readGedcom <- function(file_path, next } # PERSONAL_NAME_PIECES := NAME | NPFX | GIVN | NICK | SPFX | SURN | NSFX - if (num_rows$num_givn_rows > 0 && grepl(" GIVN", tmpv)) { - vars$name_given_pieces <- extract_info(tmpv, "GIVN") - next - } + result <- process_tag("GIVN", "name_given_pieces", num_rows, tmpv, vars) + vars <- result$vars + if (result$matched) next # npfx := Name Prefix - if (num_rows$num_npfx_rows > 0 && grepl(" NPFX", tmpv)) { - vars$name_npfx <- extract_info(tmpv, "NPFX") - next - } + result <- process_tag("NPFX", "name_npfx", num_rows, tmpv, vars) + vars <- result$vars + if (result$matched) next # NICK := Nickname - if (num_rows$num_nick_rows > 0 && grepl(" NICK", tmpv)) { - vars$name_nick <- extract_info(tmpv, "NICK") - next - } + result <- process_tag("NICK", "name_nick", num_rows, tmpv, vars) + vars <- result$vars + if (result$matched) next # surn := Surname - if (num_rows$num_surn_rows > 0 && grepl(" SURN", tmpv)) { - vars$name_surn_pieces <- extract_info(tmpv, "SURN") - next - } + result <- process_tag("SURN", "name_surn_pieces", num_rows, tmpv, vars) + vars <- result$vars + if (result$matched) next # nsfx := Name suffix - if (num_rows$num_nsfx_rows > 0 && grepl(" NSFX", tmpv)) { - vars$name_nsfx <- extract_info(tmpv, "NSFX") - next - } - if (num_rows$num_marnm_rows > 0 && grepl(" _MARNM", tmpv)) { - vars$name_marriedsurn <- extract_info(tmpv, "_MARNM") - next - } + result <- process_tag("NSFX", "name_nsfx", num_rows, tmpv, vars) + vars <- result$vars + if (result$matched) next + + result <- process_tag("_MARNM", "name_marriedsurn", num_rows, tmpv, vars) + vars <- result$vars + if (result$matched) next + # Birth event related information if (num_rows$num_birt_rows > 0 && grepl(" BIRT", tmpv)) { if (num_rows$num_date_rows > 0 && i + 1 <= file_length) { @@ -190,118 +188,87 @@ readGedcom <- function(file_path, next } - if (grepl(" SEX", tmpv)) { - vars$sex <- extract_info(tmpv, "SEX") - next - } - # Individual Attributes + # Attribute tags using process_tag() + for (tag_field in list( + c("SEX", "sex"), - # CAST caste - # g7:CAST The name of an individual’s rank or status in society which is sometimes based on racial or religious differences, or differences in wealth, inherited rank, profession, or occupation. - if (num_rows$num_cast_rows > 0 && grepl(" CAST", tmpv)) { - vars$attribute_caste <- extract_info(tmpv, "CAST") - next - } - # DSCR physical description - # g7:DSCR The physical characteristics of a person. - if (num_rows$num_dscr_rows > 0 && grepl(" DSCR", tmpv)) { - vars$attribute_description <- extract_info(tmpv, "DSCR") - next - } - # EDUC education - # g7:EDUC Indicator of a level of education attained. - if (num_rows$num_educ_rows > 0 && grepl(" EDUC", tmpv)) { - vars$attribute_education <- extract_info(tmpv, "EDUC") - next - } - # IDNO identifying number - # g7:IDNO A number or other string assigned to identify a person within some significant external system. It must have a TYPE substructure to define what kind of identification number is being provided. - if (num_rows$num_idno_rows > 0 && grepl(" IDNO", tmpv)) { - vars$attribute_idnumber <- extract_info(tmpv, "IDNO") - next - } - # NATI nationality - # g7:NATI An individual’s national heritage or origin, or other folk, house, kindred, lineage, or tribal interest. - if (num_rows$num_nati_rows > 0 && grepl(" NATI", tmpv)) { - vars$attribute_nationality <- extract_info(tmpv, "NATI") - next - } - # NCHI number of children - # g7:INDI-NCHI The number of children that this person is known to be the parent of (all marriages). - if (num_rows$num_nchi_rows > 0 && grepl(" NCHI", tmpv)) { - vars$attribute_children <- extract_info(tmpv, "NCHI") - next - } + # CAST caste + # g7:CAST The name of an individual’s rank or status in society which is sometimes based on racial or religious differences, or differences in wealth, inherited rank, profession, or occupation. + c("CAST", "attribute_caste"), - # NMR number of marriages - # g7:NMR The number of times this person has participated in a family as a spouse or parent. - if (num_rows$num_nmr_rows > 0 && grepl(" NMR", tmpv)) { - vars$attribute_marriages <- extract_info(tmpv, "NMR") - next - } + # DSCR physical description + # g7:DSCR The physical characteristics of a person. + c("DSCR", "attribute_description"), - # OCCU occupation - # g7:OCCU The type of work or profession of an individual. - if (num_rows$num_occu_rows > 0 && grepl(" OCCU", tmpv)) { - vars$attribute_occupation <- extract_info(tmpv, "OCCU") - next - } - # PROP property - # g7:PROP Pertaining to possessions such as real estate or other property of interest. + # EDUC education + # g7:EDUC Indicator of a level of education attained. + c("EDUC", "attribute_education"), - if (num_rows$num_prop_rows > 0 && grepl(" PROP", tmpv)) { - vars$attribute_property <- extract_info(tmpv, "PROP") - next - } + # IDNO identifying number + # g7:IDNO A number or other string assigned to identify a person within some significant external system. It must have a TYPE substructure to define what kind of identification number is being provided. + c("IDNO", "attribute_idnumber"), - # RELI religion - # g7:INDI-RELI A religious denomination to which a person is affiliated or for which a record applies. - if (num_rows$num_reli_rows > 0 && grepl(" RELI", tmpv)) { - vars$attribute_religion <- extract_info(tmpv, "RELI") - next - } - # RESI residence - # g7:INDI-RESI An address or place of residence where an individual resided. - if (num_rows$num_resi_rows > 0 && grepl(" RESI", tmpv)) { - vars$attribute_residence <- extract_info(tmpv, "RESI") - next - } + # NATI nationality + # g7:NATI An individual’s national heritage or origin, or other folk, house, kindred, lineage, or tribal interest. + c("NATI", "attribute_nationality"), - # SSN social security number - # g7:SSN A number assigned by the United States Social Security Administration, used for tax identification purposes. It is a type of IDNO. - if (num_rows$num_ssn_rows > 0 && grepl(" SSN", tmpv)) { - vars$attribute_ssn <- extract_info(tmpv, "SSN") - next - } - # TITL title - # g7:INDI-TITL A formal designation used by an individual in connection with positions of royalty or other social status, such as Grand Duke. - if (num_rows$num_titl_rows > 0 && grepl(" TITL", tmpv)) { - vars$attribute_title <- extract_info(tmpv, "TITL") - next + # NCHI number of children + # g7:INDI-NCHI The number of children that this person is known to be the parent of (all marriages). + c("NCHI", "attribute_children"), + + # NMR number of marriages + # g7:NMR The number of times this person has participated in a family as a spouse or parent. + c("NMR", "attribute_marriages"), + + # OCCU occupation + # g7:OCCU The type of work or profession of an individual. + c("OCCU", "attribute_occupation"), + + # PROP property + # g7:PROP Pertaining to possessions such as real estate or other property of interest. + c("PROP", "attribute_property"), + + # RELI religion + # g7:INDI-RELI A religious denomination to which a person is affiliated or for which a record applies. + c("RELI", "attribute_religion"), + + # RESI residence + # g7:INDI-RESI An address or place of residence where an individual resided. + c("RESI", "attribute_residence"), + + # SSN social security number + # g7:SSN A number assigned by the United States Social Security Administration, used for tax identification purposes. It is a type of IDNO. + c("SSN", "attribute_ssn"), + + # TITL title + # g7:INDI-TITL A formal designation used by an individual in connection with positions of royalty or other social status, such as Grand Duke. + c("TITL", "attribute_title") + )) { + result <- process_tag(tag_field[1], tag_field[2], num_rows, tmpv, vars) + vars <- result$vars + if (result$matched) next } # relationship data # g7:INDI-FAMC ## The family in which an individual appears as a child. It is also used with a g7:FAMC-STAT substructure to show individuals who are not children of the family. See FAMILY_RECORD for more details. - if (num_rows$num_famc_rows > 0 && grepl(" FAMC", tmpv)) { - if (is.na(vars$FAMC)) { - vars$FAMC <- stringr::str_extract(tmpv, "(?<=@.)\\d*(?=@)") - } else { - vars$FAMC <- paste0(vars$FAMC, ", ", stringr::str_extract(tmpv, "(?<=@.)\\d*(?=@)")) - } - next - } + result <- process_tag("FAMC", "FAMC", num_rows, tmpv, vars, + extractor = function(x) stringr::str_extract(x, "(?<=@.)\\d*(?=@)"), + mode = "append" + ) + vars <- result$vars + if (result$matched) next + # FAMS (Family spouse) g7:FAMS # The family in which an individual appears as a partner. See FAMILY_RECORD for more details. - if (num_rows$num_fams_rows > 0 && grepl(" FAMS", tmpv)) { - if (is.na(vars$FAMS)) { - vars$FAMS <- stringr::str_extract(tmpv, "(?<=@.)\\d*(?=@)") - } else { - vars$FAMS <- paste0(vars$FAMS, ", ", stringr::str_extract(tmpv, "(?<=@.)\\d*(?=@)")) - } - next - } + result <- process_tag("FAMS", "FAMS", num_rows, tmpv, vars, + extractor = function(x) stringr::str_extract(x, "(?<=@.)\\d*(?=@)"), + mode = "append" + ) + vars <- result$vars + if (result$matched) next + if (verbose && i %% 1000 == 0) { cat("Processed", i, "lines\n") } @@ -383,37 +350,33 @@ readGedcom <- function(file_path, #' @param df_temp A data frame containing information about individuals. #' @return A list mapping family IDs to parent IDs. #' @keywords internal -createFamilyToParentsMapping <- function(df_temp, datasource) { - if (datasource == "gedcom") { - if (!all(c("FAMS", "sex") %in% colnames(df_temp))) { - warning("The data frame does not contain the necessary columns (FAMS, sex)") - return(NULL) - } - family_to_parents <- list() - for (i in 1:nrow(df_temp)) { - if (!is.na(df_temp$FAMS[i])) { - fams_ids <- unlist(strsplit(df_temp$FAMS[i], ", ")) - for (fams_id in fams_ids) { - if (!is.null(family_to_parents[[fams_id]])) { - if (df_temp$sex[i] == "M") { - family_to_parents[[fams_id]]$father <- df_temp$id[i] - } else if (df_temp$sex[i] == "F") { - family_to_parents[[fams_id]]$mother <- df_temp$id[i] - } - } else { - family_to_parents[[fams_id]] <- list() - if (df_temp$sex[i] == "M") { - family_to_parents[[fams_id]]$father <- df_temp$id[i] - } else if (df_temp$sex[i] == "F") { - family_to_parents[[fams_id]]$mother <- df_temp$id[i] - } +#' +mapFAMS2parents <- function(df_temp) { + if (!all(c("FAMS", "sex") %in% colnames(df_temp))) { + warning("The data frame does not contain the necessary columns (FAMS, sex)") + return(NULL) + } + family_to_parents <- list() + for (i in 1:nrow(df_temp)) { + if (!is.na(df_temp$FAMS[i])) { + fams_ids <- unlist(strsplit(df_temp$FAMS[i], ", ")) + for (fams_id in fams_ids) { + if (!is.null(family_to_parents[[fams_id]])) { + if (df_temp$sex[i] == "M") { + family_to_parents[[fams_id]]$father <- df_temp$id[i] + } else if (df_temp$sex[i] == "F") { + family_to_parents[[fams_id]]$mother <- df_temp$id[i] + } + } else { + family_to_parents[[fams_id]] <- list() + if (df_temp$sex[i] == "M") { + family_to_parents[[fams_id]]$father <- df_temp$id[i] + } else if (df_temp$sex[i] == "F") { + family_to_parents[[fams_id]]$mother <- df_temp$id[i] } } } } - } else if (datasource == "wiki") { - message("The data source is not supported") - return(df_temp) } return(family_to_parents) } @@ -425,33 +388,27 @@ createFamilyToParentsMapping <- function(df_temp, datasource) { #' #' @param df_temp A data frame containing individual information. #' @param family_to_parents A list mapping family IDs to parent IDs. -#' @param datasource A string indicating the data source. Options are "gedcom" and "wiki". #' @return A data frame with added momID and dad_ID columns. #' @keywords internal -assignParentIDs <- function(df_temp, family_to_parents, datasource) { +mapFAMC2parents <- function(df_temp, family_to_parents) { df_temp$momID <- NA_character_ df_temp$dadID <- NA_character_ - if (datasource == "gedcom") { - for (i in 1:nrow(df_temp)) { - if (!is.na(df_temp$FAMC[i])) { - famc_ids <- unlist(strsplit(df_temp$FAMC[i], ", ")) - for (famc_id in famc_ids) { - if (!is.null(family_to_parents[[famc_id]])) { - if (!is.null(family_to_parents[[famc_id]]$father)) { - df_temp$dadID[i] <- family_to_parents[[famc_id]]$father - } - if (!is.null(family_to_parents[[famc_id]]$mother)) { - df_temp$momID[i] <- family_to_parents[[famc_id]]$mother - } + for (i in 1:nrow(df_temp)) { + if (!is.na(df_temp$FAMC[i])) { + famc_ids <- unlist(strsplit(df_temp$FAMC[i], ", ")) + for (famc_id in famc_ids) { + if (!is.null(family_to_parents[[famc_id]])) { + if (!is.null(family_to_parents[[famc_id]]$father)) { + df_temp$dadID[i] <- family_to_parents[[famc_id]]$father + } + if (!is.null(family_to_parents[[famc_id]]$mother)) { + df_temp$momID[i] <- family_to_parents[[famc_id]]$mother } } } } - return(df_temp) - } else if (datasource == "wiki") { - message("No parents information available for wiki data") - return(df_temp) } + return(df_temp) } #' Process parents information @@ -477,11 +434,11 @@ processParents <- function(df_temp, datasource) { return(df_temp) } - family_to_parents <- createFamilyToParentsMapping(df_temp, datasource = datasource) + family_to_parents <- mapFAMS2parents(df_temp) if (is.null(family_to_parents) || length(family_to_parents) == 0) { return(df_temp) } - df_temp <- assignParentIDs(df_temp, family_to_parents, datasource = datasource) + df_temp <- mapFAMC2parents(df_temp, family_to_parents) return(df_temp) } @@ -576,177 +533,35 @@ countPatternRows <- function(file) { return(num_rows) } -#' Read Wiki Family Tree +#' Process a GEDCOM Tag #' -#' @param text A character string containing the text of a family tree in wiki format. -#' @export -readWikifamilytree <- function(text) { - # Extract summary text - - summary_text <- extractSummaryText(text) - # Extract all lines defining the family tree - tree_lines <- unlist(stringr::str_extract_all(text, "\\{\\{familytree.*?\\}\\}")) - tree_lines <- tree_lines[!stringr::str_detect(tree_lines, "start|end")] # Remove start/end markers - tree_lines <- gsub("\\{\\{familytree(.*?)\\}\\}", "\\1", tree_lines) # Remove wrapping markup - - # Convert tree structure into a coordinate grid (preserves symbols!) - tree_df <- parseTree(tree_lines) - - # Identify columns that start with "Y" - cols_to_pivot <- grep("^Y", names(tree_df), value = TRUE) - - # Reshape from wide to long format - tree_long <- makeLongTree(tree_df, cols_to_pivot) - - # Extract member definitions - members_df <- matchMembers(text) - members_df$id <- paste0("P", seq_len(nrow(members_df))) # Assign unique person IDs - - # Merge names into the tree structure (keeping all symbols!) - tree_long <- merge(tree_long, members_df, by.x = "Value", by.y = "identifier", all.x = TRUE) - - tree_long$DisplayName <- ifelse(!is.na(tree_long$name), tree_long$name, tree_long$Value) # Use name if available - - # parse relationships and infer them - - relationships_df <- parseRelationships(tree_long) - - # relationships_df <- processParents(tree_long, datasource = "wiki") - - - - # Return structured table of the family tree (symbols included) - list( - summary = summary_text, - members = members_df, - structure = tree_long, - relationships = relationships_df - ) -} - -#' Make Long Tree -#' @param tree_df A data frame containing the tree structure. -#' @param cols_to_pivot A character vector of column names to pivot. -#' @return A long data frame containing the tree structure. -#' @keywords internal -makeLongTree <- function(tree_df, cols_to_pivot) { - tree_long <- stats::reshape(tree_df, - varying = cols_to_pivot, - v.names = "Value", - timevar = "Column", - times = cols_to_pivot, - idvar = setdiff(names(tree_df), cols_to_pivot), - direction = "long" - ) - - tree_long <- tree_long[!is.na(tree_long$Value), ] - tree_long$Value <- stringr::str_trim(tree_long$Value) - tree_long$Column <- as.numeric(gsub("^Y", "", tree_long$Column)) - return(tree_long) -} - -#' Match Members -#' @inheritParams readWikifamilytree -#' @return A data frame containing information about the members of the family tree. -#' @keywords internal - -matchMembers <- function(text) { - member_matches <- stringr::str_extract_all(text, "\\|\\s*([A-Za-z0-9]+)\\s*=\\s*([^|}]*)")[[1]] - member_matches <- gsub("\\[|\\]|'''", "", member_matches) # Remove formatting - - members_df <- data.frame( - identifier = stringr::str_trim(stringr::str_extract(member_matches, "^[^=]+")), - name = stringr::str_trim(stringr::str_extract(member_matches, "(?<=\\=).*")), - stringsAsFactors = FALSE - ) - - # Remove leading pipes (`|`) from identifiers for consistency - members_df$identifier <- gsub("^\\|\\s*", "", members_df$identifier) - - # remove summary row - members_df <- members_df[members_df$identifier != "summary", ] - - return(members_df) -} - -#' Extract Summary Text -#' @inheritParams readWikifamilytree -#' @return A character string containing the summary text. -#' @keywords internal -#' @export - -extractSummaryText <- function(text) { - summary_match <- stringr::str_match(text, "\\{\\{familytree/start \\|summary=(.*?)\\}\\}") - summary_text <- ifelse(!is.na(summary_match[, 2]), summary_match[, 2], NA) - return(summary_text) -} - -#' Parse Tree -#' @param tree_lines A character vector containing the lines of the tree structure. -#' @return A data frame containing the tree structure. -#' @keywords internal -#' @export - -parseTree <- function(tree_lines) { - tree_matrix <- base::strsplit(tree_lines, "\\|") # Split each row into columns - max_cols <- max(sapply(tree_matrix, length)) # Find the max column count - - # Convert to a data frame (ensures correct structure) - tree_df <- do.call(rbind, lapply(tree_matrix, function(row) { - length(row) <- max_cols # Ensure uniform column length - return(row) - })) - - tree_df <- as.data.frame(tree_df, stringsAsFactors = FALSE) - colnames(tree_df) <- paste0("Y", seq_len(ncol(tree_df))) # Assign column names - tree_df$Row <- seq_len(nrow(tree_df)) # Assign row numbers - return(tree_df) -} - -#' infer relationship from tree template +#' Extracts and assigns a value to a specified field in `vars` if the pattern is present. +#' Returns both the updated variable list and a flag indicating whether the tag was matched. #' -#' @param tree_long A data frame containing the tree structure in long format. -#' @return A data frame containing the relationships between family members. +#' @param tag The GEDCOM tag (e.g., "SEX", "CAST", etc.). +#' @param field_name The name of the variable to assign to in `vars`. +#' @param pattern_rows Output from `countPatternRows()`. +#' @param line The GEDCOM line to parse. +#' @param vars The current list of variables to update. +#' @return A list with updated `vars` and a `matched` flag. #' @keywords internal #' -parseRelationships <- function(tree_long) { - relationships <- data.frame( - id = tree_long$id, - momID = NA_character_, - dadID = NA_character_, - spouseID = NA_character_, - stringsAsFactors = FALSE - ) - - # Loop through rows to find connections - for (i in seq_len(nrow(tree_long))) { - row <- tree_long[i, ] - - # **Parent-Child Detection** - if (row$Value == "y") { - parent <- tree_long$Value[tree_long$Row == row$Row - 1 & tree_long$Column == row$Column] - child <- tree_long$Value[tree_long$Row == row$Row + 1 & tree_long$Column == row$Column] - - if (length(parent) == 0) parent <- NA - if (length(child) == 0) child <- NA - # Assign mom/dad IDs based on tree structure - if (!is.na(parent) && !is.na(child)) { - relationships$momID[relationships$id == child] <- parent - relationships$dadID[relationships$id == child] <- parent # Assuming one parent detected for now - } - } - - # **Spouse Detection** - if (row$Value == "+") { - spouse1 <- tree_long$Value[tree_long$Row == row$Row & tree_long$Column == row$Column - 1] - spouse2 <- tree_long$Value[tree_long$Row == row$Row & tree_long$Column == row$Column + 1] - - if (!is.na(spouse1) && !is.na(spouse2)) { - relationships$spouseID[relationships$id == spouse1] <- spouse2 - relationships$spouseID[relationships$id == spouse2] <- spouse1 - } - } +process_tag <- function(tag, field_name, pattern_rows, line, vars, + extractor = NULL, mode = "replace") { + count_name <- paste0("num_", tolower(tag), "_rows") + matched <- FALSE + if (!is.null(pattern_rows[[count_name]]) && + pattern_rows[[count_name]] > 0 && + grepl(paste0(" ", tag), line)) { + value <- if (is.null(extractor)) extract_info(line, tag) else extractor(line) + + if (mode == "append" && !is.na(vars[[field_name]])) { + vars[[field_name]] <- paste0(vars[[field_name]], ", ", value) + } else { + vars[[field_name]] <- value + } + + matched <- TRUE } - - return(relationships) + return(list(vars = vars, matched = matched)) } diff --git a/R/readWikifamilytree.R b/R/readWikifamilytree.R new file mode 100644 index 00000000..fc0a7521 --- /dev/null +++ b/R/readWikifamilytree.R @@ -0,0 +1,197 @@ +#' Read Wiki Family Tree +#' +#' @param text A character string containing the text of a family tree in wiki format. +#' @param verbose A logical value indicating whether to print messages. +#' @param file_path The path to the file containing the family tree. +#' @param ... Additional arguments (not used). +#' +#' @return A list containing the summary, members, structure, and relationships of the family tree. +#' @export +readWikifamilytree <- function(text = NULL, verbose = FALSE, file_path = NULL, ...) { + # Checks + if (is.null(text) && is.null(file_path)) { + stop("Either 'text' or 'file_path' must be provided.") + } + # read from file if provided + if (!is.null(file_path)) { + if (!file.exists(file_path)) stop("File does not exist: ", file_path) + + if (verbose) { + print(paste("Reading file:", file_path)) + } + file <- data.frame(X1 = readLines(file_path)) + file_length <- nrow(file) + if (verbose) { + print(paste0("File is ", file_length, " lines long")) + } + text <- paste0(file$X1, collapse = "\n") + } + # Extract summary text + + summary_text <- extractSummaryText(text) + # Extract all lines defining the family tree + tree_lines <- unlist(stringr::str_extract_all(text, "\\{\\{familytree.*?\\}\\}")) + tree_lines <- tree_lines[!stringr::str_detect(tree_lines, "start|end")] # Remove start/end markers + tree_lines <- gsub("\\{\\{familytree(.*?)\\}\\}", "\\1", tree_lines) # Remove wrapping markup + + # Convert tree structure into a coordinate grid (preserves symbols!) + tree_df <- parseTree(tree_lines) + + # Identify columns that start with "Y" + cols_to_pivot <- grep("^Y", names(tree_df), value = TRUE) + + # Reshape from wide to long format + tree_long <- makeLongTree(tree_df, cols_to_pivot) + + # Extract member definitions + members_df <- matchMembers(text) + members_df$id <- paste0("P", seq_len(nrow(members_df))) # Assign unique person IDs + + # Merge names into the tree structure (keeping all symbols!) + tree_long <- merge(tree_long, members_df, by.x = "Value", by.y = "identifier", all.x = TRUE) + + tree_long$DisplayName <- ifelse(!is.na(tree_long$name), tree_long$name, tree_long$Value) # Use name if available + + # parse relationships and infer them + + relationships_df <- parseRelationships(tree_long) + + # relationships_df <- processParents(tree_long, datasource = "wiki") + + + + # Return structured table of the family tree (symbols included) + list( + summary = summary_text, + members = members_df, + structure = tree_long, + relationships = relationships_df + ) +} + +#' Make Long Tree +#' @param tree_df A data frame containing the tree structure. +#' @param cols_to_pivot A character vector of column names to pivot. +#' @return A long data frame containing the tree structure. +#' @keywords internal +makeLongTree <- function(tree_df, cols_to_pivot) { + tree_long <- stats::reshape(tree_df, + varying = cols_to_pivot, + v.names = "Value", + timevar = "Column", + times = cols_to_pivot, + idvar = setdiff(names(tree_df), cols_to_pivot), + direction = "long" + ) + + tree_long <- tree_long[!is.na(tree_long$Value), ] + tree_long$Value <- stringr::str_trim(tree_long$Value) + tree_long$Column <- as.numeric(gsub("^Y", "", tree_long$Column)) + return(tree_long) +} + +#' Match Members +#' @inheritParams readWikifamilytree +#' @return A data frame containing information about the members of the family tree. +#' @keywords internal + +matchMembers <- function(text) { + member_matches <- stringr::str_extract_all(text, "\\|\\s*([A-Za-z0-9]+)\\s*=\\s*([^|}]*)")[[1]] + member_matches <- gsub("\\[|\\]|'''", "", member_matches) # Remove formatting + + members_df <- data.frame( + identifier = stringr::str_trim(stringr::str_extract(member_matches, "^[^=]+")), + name = stringr::str_trim(stringr::str_extract(member_matches, "(?<=\\=).*")), + stringsAsFactors = FALSE + ) + + # Remove leading pipes (`|`) from identifiers for consistency + members_df$identifier <- gsub("^\\|\\s*", "", members_df$identifier) + + # remove summary row + members_df <- members_df[members_df$identifier != "summary", ] + + return(members_df) +} + +#' Extract Summary Text +#' @inheritParams readWikifamilytree +#' @return A character string containing the summary text. +#' @keywords internal +#' @export + +extractSummaryText <- function(text) { + summary_match <- stringr::str_match(text, "\\{\\{familytree/start \\|summary=(.*?)\\}\\}") + summary_text <- ifelse(!is.na(summary_match[, 2]), summary_match[, 2], NA) + return(summary_text) +} + +#' Parse Tree +#' @param tree_lines A character vector containing the lines of the tree structure. +#' @return A data frame containing the tree structure. +#' @keywords internal +#' @export + +parseTree <- function(tree_lines) { + tree_matrix <- base::strsplit(tree_lines, "\\|") # Split each row into columns + max_cols <- max(sapply(tree_matrix, length)) # Find the max column count + + # Convert to a data frame (ensures correct structure) + tree_df <- do.call(rbind, lapply(tree_matrix, function(row) { + length(row) <- max_cols # Ensure uniform column length + return(row) + })) + + tree_df <- as.data.frame(tree_df, stringsAsFactors = FALSE) + colnames(tree_df) <- paste0("Y", seq_len(ncol(tree_df))) # Assign column names + tree_df$Row <- seq_len(nrow(tree_df)) # Assign row numbers + return(tree_df) +} + +#' infer relationship from tree template +#' +#' @param tree_long A data frame containing the tree structure in long format. +#' @return A data frame containing the relationships between family members. +#' @keywords internal +#' +parseRelationships <- function(tree_long) { + relationships <- data.frame( + id = tree_long$id, + momID = NA_character_, + dadID = NA_character_, + spouseID = NA_character_, + stringsAsFactors = FALSE + ) + + # Loop through rows to find connections + for (i in seq_len(nrow(tree_long))) { + row <- tree_long[i, ] + + # **Parent-Child Detection** + if (row$Value == "y") { + parent <- tree_long$Value[tree_long$Row == row$Row - 1 & tree_long$Column == row$Column] + child <- tree_long$Value[tree_long$Row == row$Row + 1 & tree_long$Column == row$Column] + + if (length(parent) == 0) parent <- NA + if (length(child) == 0) child <- NA + # Assign mom/dad IDs based on tree structure + if (!is.na(parent) && !is.na(child)) { + relationships$momID[relationships$id == child] <- parent + relationships$dadID[relationships$id == child] <- parent # Assuming one parent detected for now + } + } + + # **Spouse Detection** + if (row$Value == "+") { + spouse1 <- tree_long$Value[tree_long$Row == row$Row & tree_long$Column == row$Column - 1] + spouse2 <- tree_long$Value[tree_long$Row == row$Row & tree_long$Column == row$Column + 1] + + if (!is.na(spouse1) && !is.na(spouse2)) { + relationships$spouseID[relationships$id == spouse1] <- spouse2 + relationships$spouseID[relationships$id == spouse2] <- spouse1 + } + } + } + + return(relationships) +} diff --git a/R/summarizePedigree.R b/R/summarizePedigree.R index 1fbd06ac..6c6564f3 100644 --- a/R/summarizePedigree.R +++ b/R/summarizePedigree.R @@ -104,14 +104,13 @@ summarizePedigrees <- function(ped, famID = "famID", personID = "ID", if (network_checks) { if (verbose) message("Performing network validation checks...") - network_validation_results <- checkPedigreeNetwork( + output$network_validation <- checkPedigreeNetwork( ped, personID = personID, momID = momID, dadID = dadID, verbose = verbose ) - output$network_validation <- network_validation_results } # Calculate summary statistics for families, maternal lines, and paternal lines @@ -191,50 +190,59 @@ summarizePedigrees <- function(ped, famID = "famID", personID = "ID", # repair = FALSE, verbose = verbose) # } - - # Optionally find the superlative lines - # & noldest <= unique(ped_dt[[famID]]) - # determin number of lines - - ## oldest if (!is.null(byr) && noldest > 0) { if (!is.null(n_families) && "families" %in% type) { if (verbose) message("Finding oldest families...") - output$oldest_families <- try_na(family_summary_dt[order(get(byr))][1:min(c(noldest, n_families), - na.rm = TRUE - )]) + output$oldest_families <- findOldest( + foo_summary_dt = family_summary_dt, + byr = byr, + noldest = noldest, + n_foo = n_families + ) } if (!is.null(n_mothers) && "mothers" %in% type) { if (verbose) message("Finding oldest maternal lines...") - output$oldest_maternal <- try_na(maternal_summary_dt[order(get(byr))][1:min(c(noldest, n_mothers), - na.rm = TRUE - )]) + output$oldest_maternal <- findOldest( + foo_summary_dt = maternal_summary_dt, + byr = byr, + noldest = noldest, + n_foo = n_mothers + ) } if (!is.null(n_fathers) && "fathers" %in% type) { if (verbose) message("Finding oldest paternal lines...") - output$oldest_paternal <- try_na(paternal_summary_dt[order(get(byr))][1:min(c(noldest, n_fathers), - na.rm = TRUE - )]) + output$oldest_paternal <- findOldest( + foo_summary_dt = paternal_summary_dt, + byr = byr, + noldest = noldest, + n_foo = n_fathers + ) } } # biggest lines if (!is.null(nbiggest) && nbiggest > 0) { if (!is.null(n_families) && "families" %in% type) { - output$biggest_families <- try_na(family_summary_dt[order(-get("count"))][1:min(c(nbiggest, n_families), - na.rm = TRUE - )]) + output$biggest_families <- findBiggest( + foo_summary_dt = family_summary_dt, + nbiggest = nbiggest, + n_foo = n_families + ) } if (!is.null(n_mothers) && "mothers" %in% type) { - output$biggest_maternal <- try_na(maternal_summary_dt[order(-get("count"))][1:min(c(nbiggest, n_mothers), - na.rm = TRUE - )]) + output$biggest_maternal <- findBiggest( + foo_summary_dt = maternal_summary_dt, + nbiggest = nbiggest, + n_foo = n_mothers + ) } if (!is.null(n_fathers) && "fathers" %in% type) { - output$biggest_paternal <- try_na(paternal_summary_dt[order(-get("count"))][1:min(c(nbiggest, n_fathers), - na.rm = TRUE - )]) + output$biggest_paternal <- findBiggest( + foo_summary_dt = paternal_summary_dt, + nbiggest = nbiggest, + n_foo = n_fathers + ) } } @@ -385,3 +393,30 @@ summarizeFamilies <- function(ped, famID = "famID", personID = "ID", founder_sort_var = founder_sort_var ) } +# Function to find the oldest individuals in a pedigree +#' This function finds the oldest families in a pedigree. It is supposed to be used internally by the \code{summarize_pedigree} function. +#' @inheritParams summarizePedigrees +#' @param foo_summary_dt A data.table containing the summary statistics. +#' @param n_foo An integer specifying the number of individuals in the summary. +#' @returns a data.table containing the oldest families in the pedigree. + +findOldest <- function(foo_summary_dt, byr, noldest, n_foo) { + oldest_foo <- try_na(foo_summary_dt[order(get(byr))][1:min(c(noldest, n_foo), + na.rm = TRUE + )]) + return(oldest_foo) +} + +# Function to find the biggest families in a pedigree +#' This function finds the biggest families in a pedigree. It is supposed to be used internally by the \code{summarize_pedigree} function. +#' @inheritParams findOldest +#' @inheritParams summarizePedigrees +#' @returns a data.table containing the biggest families in the pedigree. + + +findBiggest <- function(foo_summary_dt, nbiggest, n_foo) { + biggest_foo <- try_na(foo_summary_dt[order(-get("count"))][1:min(c(nbiggest, n_foo), + na.rm = TRUE + )]) + return(biggest_foo) +} diff --git a/data-raw/benchmark.R b/data-raw/benchmark.R index 80d6db40..8c4c90b5 100644 --- a/data-raw/benchmark.R +++ b/data-raw/benchmark.R @@ -59,4 +59,5 @@ benchmark_results <- microbenchmark( print(benchmark_results) # Optional: Save results to CSV for later analysis -write.csv(summary(benchmark_results), "benchmark_results.csv", row.names = FALSE) +write.csv(summary(benchmark_results), + "benchmark_results.csv", row.names = FALSE) diff --git a/data-raw/df_ASOIAF.R b/data-raw/df_ASOIAF.R index e5cdc112..9741d04f 100644 --- a/data-raw/df_ASOIAF.R +++ b/data-raw/df_ASOIAF.R @@ -10,7 +10,7 @@ library(BGmisc) ## Create dataframe ASOIAF <- readGedcom("data-raw/ASOIAF.ged") -#ASOIAF <- readGedcom("data-raw/ASOIAF_040725.ged") +# ASOIAF <- readGedcom("data-raw/ASOIAF_040725.ged") df <- ped2fam(ASOIAF, personID = "id") %>% select( @@ -29,7 +29,8 @@ df <- ped2fam(ASOIAF, personID = "id") %>% name = str_remove(name, "/") ) -# pedADD <- ped2com(df , personID = "id", momID = "momID", dadID = "dadID", component = "additive", isChild_method = "partial_parent") +# pedADD <- ped2com(df , personID = "id", momID = "momID", +# dadID = "dadID", component = "additive", isChild_method = "partial_parent") # com2links(ad_ped_matrix=pedADD) # if missing momID or dadID, assign the next available ID diff --git a/data-raw/df_inbreeding.R b/data-raw/df_inbreeding.R index 4419baef..9af10ff0 100644 --- a/data-raw/df_inbreeding.R +++ b/data-raw/df_inbreeding.R @@ -8,5 +8,5 @@ inbreeding <- raw ## # data processing -write.csv(inbreeding, "data-raw/inbreeding.csv", row.names = FALSE) +#write.csv(inbreeding, "data-raw/inbreeding.csv", row.names = FALSE) usethis::use_data(inbreeding, overwrite = TRUE, compress = "xz") diff --git a/data-raw/df_potter.R b/data-raw/df_potter.R index 77c147aa..aa039c81 100644 --- a/data-raw/df_potter.R +++ b/data-raw/df_potter.R @@ -45,7 +45,9 @@ potter <- data.frame( "Molly Weasley", "Lucy Weasley" ), - gen = c(1, 1, 1, 1, 1, 2, 2, 2, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3), + gen = c(1, 1, 1, 1, 1, 2, 2, 2, 1, 1, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3), momID = c( 101, 101, 103, 103, NA, 3, 4, 10, NA, NA, 10, 10, 10, 10, 10, 10, NA, 105, 105, NA, @@ -137,7 +139,8 @@ potter[nrow(potter) + 1, ] <- list( 0 ) -# potter[nrow(potter) + 1,] <- list(personID,fam,name,gen,momID,dadID,spouseID,sex) +# potter[nrow(potter) + 1,] <- list(personID,fam,name,gen, +# momID,dadID,spouseID,sex) write_csv(potter, here("data-raw", "potter.csv")) usethis::use_data(potter, overwrite = TRUE, compress = "xz") @@ -156,18 +159,24 @@ df <- ped2fam(potter_big, personID = "id") %>% -birth_date, -FAMC, -FAMS - ) %>% rename(personID=id) %>% + ) %>% + rename(personID = id) %>% mutate( personID = as.numeric(personID), momID = as.numeric(momID), dadID = as.numeric(dadID), name = str_remove(name, "/") - ) %>% arrange(name) + ) %>% + arrange(name) -potter_clean <- potter %>% arrange(name) %>% mutate(personID = as.numeric(personID)*1000, - momID = as.numeric(momID)*1000, - dadID = as.numeric(dadID)*1000) +potter_clean <- potter %>% + arrange(name) %>% + mutate( + personID = as.numeric(personID) * 1000, + momID = as.numeric(momID) * 1000, + dadID = as.numeric(dadID) * 1000 + ) # merge so that I have all of A and all the Bs that match A @@ -177,166 +186,168 @@ df_clean <- df %>% potter_clean <- potter_clean %>% - mutate(personID = case_match( - personID, - 22000 ~ 15, - 9000 ~ 18, - 6000 ~ 10, - 18000 ~ 29, - 7000 ~ 1, - 17000 ~ 25, - 25000 ~ 27, - 13000 ~ 20, - 19000 ~ 34, - 23000 ~ 16, - 10000 ~ 17, - 14000 ~ 22, - 3000 ~ 8, - 24000 ~ 26, - 1000 ~ 9, - 26000 ~ 30, - 21000 ~ 14, - 11000 ~ 19, - 8000 ~ 13, - 5000 ~ 2, - 15000 ~ 23, - 16000 ~ 24, - 4000 ~ 3, - 2000 ~ -2000, - 101000 ~12, - 102000 ~ 11, - 20000 ~ -20000, - 27000 ~ -27000, - 106000 ~ 32, - 105000 ~ 33, - 29000 ~ 17, - 103000 ~ 7, - 104000 ~ 6, - 12000 ~ 21, - 30000 ~ -30000, - 28000 ~ -28000, - .default = personID - ), dadID = - case_match( - dadID, - 22000 ~ 15, - 9000 ~ 18, - 6000 ~ 10, - 18000 ~ 29, - 7000 ~ 1, - 17000 ~ 25, - 25000 ~ 27, - 13000 ~ 20, - 19000 ~ 34, - 23000 ~ 16, - 10000 ~ 17, - 14000 ~ 22, - 3000 ~ 8, - 24000 ~ 26, - 1000 ~ 9, - 26000 ~ 30, - 21000 ~ 14, - 11000 ~ 19, - 8000 ~ 13, - 5000 ~ 2, - 15000 ~ 23, - 16000 ~ 24, - 4000 ~ 3, - 2000 ~ 2000, - 101000 ~12, - 102000 ~ 11, - 20000 ~ 20000, - 27000 ~ 27000, - 106000 ~ 32, - 105000 ~ 33, - 29000 ~ 17, - 103000 ~ 7, - 104000 ~ 6, - 12000 ~ 21, - 30000 ~ 30000, - 28000 ~ 28000, - .default = dadID - ), momID = - case_match( - momID, - 22000 ~ 15, - 9000 ~ 18, - 6000 ~ 10, - 18000 ~ 29, - 7000 ~ 1, - 17000 ~ 25, - 25000 ~ 27, - 13000 ~ 20, - 19000 ~ 34, - 23000 ~ 16, - 10000 ~ 17, - 14000 ~ 22, - 3000 ~ 8, - 24000 ~ 26, - 1000 ~ 9, - 26000 ~ 30, - 21000 ~ 14, - 11000 ~ 19, - 8000 ~ 13, - 5000 ~ 2, - 15000 ~ 23, - 16000 ~ 24, - 4000 ~ 3, - 2000 ~ 2000, - 101000 ~12, - 102000 ~ 11, - 20000 ~ 20000, - 27000 ~ 27000, - 106000 ~ 32, - 105000 ~ 33, - 29000 ~ 17, - 103000 ~ 7, - 104000 ~ 6, - 12000 ~ 21, - 30000 ~ 30000, - 28000 ~ 28000, - .default = momID - ), spouseID = - case_match( - spouseID, - 22000 ~ 15, - 9000 ~ 18, - 6000 ~ 10, - 18000 ~ 29, - 7000 ~ 1, - 17000 ~ 25, - 25000 ~ 27, - 13000 ~ 20, - 19000 ~ 34, - 23000 ~ 16, - 10000 ~ 17, - 14000 ~ 22, - 3000 ~ 8, + mutate( + personID = case_match( + personID, + 22000 ~ 15, + 9000 ~ 18, + 6000 ~ 10, + 18000 ~ 29, + 7000 ~ 1, + 17000 ~ 25, + 25000 ~ 27, + 13000 ~ 20, + 19000 ~ 34, + 23000 ~ 16, + 10000 ~ 17, + 14000 ~ 22, + 3000 ~ 8, 24000 ~ 26, - 1000 ~ 9, - 26000 ~ 30, + 1000 ~ 9, + 26000 ~ 30, 21000 ~ 14, - 11000 ~ 19, - 8000 ~ 13, + 11000 ~ 19, + 8000 ~ 13, 5000 ~ 2, - 15000 ~ 23, - 16000 ~ 24, + 15000 ~ 23, + 16000 ~ 24, 4000 ~ 3, - 2000 ~ 2000, - 101000 ~12, - 102000 ~ 11, - 20000 ~ 20000, - 27000 ~ 27000, + 2000 ~ -2000, + 101000 ~ 12, + 102000 ~ 11, + 20000 ~ -20000, + 27000 ~ -27000, 106000 ~ 32, - 105000 ~ 33, - 29000 ~ 17, - 103000 ~ 7, - 104000 ~ 6, - 12000 ~ 21, - 30000 ~ 30000, - 28000 ~ 28000, - .default = spouseID - )) + 105000 ~ 33, + 29000 ~ 17, + 103000 ~ 7, + 104000 ~ 6, + 12000 ~ 21, + 30000 ~ -30000, + 28000 ~ -28000, + .default = personID + ), dadID = + case_match( + dadID, + 22000 ~ 15, + 9000 ~ 18, + 6000 ~ 10, + 18000 ~ 29, + 7000 ~ 1, + 17000 ~ 25, + 25000 ~ 27, + 13000 ~ 20, + 19000 ~ 34, + 23000 ~ 16, + 10000 ~ 17, + 14000 ~ 22, + 3000 ~ 8, + 24000 ~ 26, + 1000 ~ 9, + 26000 ~ 30, + 21000 ~ 14, + 11000 ~ 19, + 8000 ~ 13, + 5000 ~ 2, + 15000 ~ 23, + 16000 ~ 24, + 4000 ~ 3, + 2000 ~ 2000, + 101000 ~ 12, + 102000 ~ 11, + 20000 ~ 20000, + 27000 ~ 27000, + 106000 ~ 32, + 105000 ~ 33, + 29000 ~ 17, + 103000 ~ 7, + 104000 ~ 6, + 12000 ~ 21, + 30000 ~ 30000, + 28000 ~ 28000, + .default = dadID + ), momID = + case_match( + momID, + 22000 ~ 15, + 9000 ~ 18, + 6000 ~ 10, + 18000 ~ 29, + 7000 ~ 1, + 17000 ~ 25, + 25000 ~ 27, + 13000 ~ 20, + 19000 ~ 34, + 23000 ~ 16, + 10000 ~ 17, + 14000 ~ 22, + 3000 ~ 8, + 24000 ~ 26, + 1000 ~ 9, + 26000 ~ 30, + 21000 ~ 14, + 11000 ~ 19, + 8000 ~ 13, + 5000 ~ 2, + 15000 ~ 23, + 16000 ~ 24, + 4000 ~ 3, + 2000 ~ 2000, + 101000 ~ 12, + 102000 ~ 11, + 20000 ~ 20000, + 27000 ~ 27000, + 106000 ~ 32, + 105000 ~ 33, + 29000 ~ 17, + 103000 ~ 7, + 104000 ~ 6, + 12000 ~ 21, + 30000 ~ 30000, + 28000 ~ 28000, + .default = momID + ), spouseID = + case_match( + spouseID, + 22000 ~ 15, + 9000 ~ 18, + 6000 ~ 10, + 18000 ~ 29, + 7000 ~ 1, + 17000 ~ 25, + 25000 ~ 27, + 13000 ~ 20, + 19000 ~ 34, + 23000 ~ 16, + 10000 ~ 17, + 14000 ~ 22, + 3000 ~ 8, + 24000 ~ 26, + 1000 ~ 9, + 26000 ~ 30, + 21000 ~ 14, + 11000 ~ 19, + 8000 ~ 13, + 5000 ~ 2, + 15000 ~ 23, + 16000 ~ 24, + 4000 ~ 3, + 2000 ~ 2000, + 101000 ~ 12, + 102000 ~ 11, + 20000 ~ 20000, + 27000 ~ 27000, + 106000 ~ 32, + 105000 ~ 33, + 29000 ~ 17, + 103000 ~ 7, + 104000 ~ 6, + 12000 ~ 21, + 30000 ~ 30000, + 28000 ~ 28000, + .default = spouseID + ) + ) # Left join by name diff --git a/data/royal92.rda b/data/royal92.rda index 357f1aba..4a10b174 100644 Binary files a/data/royal92.rda and b/data/royal92.rda differ diff --git a/man/checkPedigreeNetwork.Rd b/man/checkPedigreeNetwork.Rd index d6e92749..9b8c8d24 100644 --- a/man/checkPedigreeNetwork.Rd +++ b/man/checkPedigreeNetwork.Rd @@ -33,7 +33,9 @@ Checks for structural issues in pedigree networks, including: } \examples{ \dontrun{ -results <- checkPedigreeNetwork(ped, personID = "ID", -momID = "momID", dadID = "dadID", verbose = TRUE) +results <- checkPedigreeNetwork(ped, + personID = "ID", + momID = "momID", dadID = "dadID", verbose = TRUE +) } } diff --git a/man/com2links.legacy.Rd b/man/com2links.legacy.Rd new file mode 100644 index 00000000..757117a1 --- /dev/null +++ b/man/com2links.legacy.Rd @@ -0,0 +1,40 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/makeLinks.R +\name{com2links.legacy} +\alias{com2links.legacy} +\title{Convert Pedigree Matrices to Related Pairs File (Legacy)} +\usage{ +com2links.legacy( + rel_pairs_file = "dataRelatedPairs.csv", + ad_ped_matrix = NULL, + mit_ped_matrix = mt_ped_matrix, + mt_ped_matrix = NULL, + cn_ped_matrix = NULL, + update_rate = 500, + verbose = FALSE, + outcome_name = "data", + ... +) +} +\arguments{ +\item{rel_pairs_file}{File path to write related pairs to (CSV format).} + +\item{ad_ped_matrix}{Matrix of additive genetic relatedness coefficients.} + +\item{mit_ped_matrix}{Matrix of mitochondrial relatedness coefficients. Alias: \code{mt_ped_matrix}.} + +\item{mt_ped_matrix}{Matrix of mitochondrial relatedness coefficients.} + +\item{cn_ped_matrix}{Matrix of common nuclear relatedness coefficients.} + +\item{update_rate}{Numeric. Frequency (in iterations) at which progress messages are printed.} + +\item{verbose}{Logical. If TRUE, prints progress messages.} + +\item{outcome_name}{Character string representing the outcome name (used in file naming).} + +\item{...}{Additional arguments to be passed to \code{\link{com2links}}} +} +\description{ +This legacy function converts pedigree matrices into a related pairs file. +} diff --git a/man/combine_columns.Rd b/man/combine_columns.Rd index 86e6d4f9..43554b0b 100644 --- a/man/combine_columns.Rd +++ b/man/combine_columns.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/readPedigree.R +% Please edit documentation in R/readGedcom.R \name{combine_columns} \alias{combine_columns} \title{Combine Columns} diff --git a/man/countPatternRows.Rd b/man/countPatternRows.Rd index edcdda52..f9100912 100644 --- a/man/countPatternRows.Rd +++ b/man/countPatternRows.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/readPedigree.R +% Please edit documentation in R/readGedcom.R \name{countPatternRows} \alias{countPatternRows} \title{Check for Pattern Rows} diff --git a/man/extractSummaryText.Rd b/man/extractSummaryText.Rd index ad1e90a9..9b12be26 100644 --- a/man/extractSummaryText.Rd +++ b/man/extractSummaryText.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/readPedigree.R +% Please edit documentation in R/readWikifamilytree.R \name{extractSummaryText} \alias{extractSummaryText} \title{Extract Summary Text} diff --git a/man/extract_info.Rd b/man/extract_info.Rd index 0001eb99..3f5bbc1d 100644 --- a/man/extract_info.Rd +++ b/man/extract_info.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/readPedigree.R +% Please edit documentation in R/readGedcom.R \name{extract_info} \alias{extract_info} \title{Extract Information from Line} diff --git a/man/findBiggest.Rd b/man/findBiggest.Rd new file mode 100644 index 00000000..c3b3d23e --- /dev/null +++ b/man/findBiggest.Rd @@ -0,0 +1,21 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/summarizePedigree.R +\name{findBiggest} +\alias{findBiggest} +\title{This function finds the biggest families in a pedigree. It is supposed to be used internally by the \code{summarize_pedigree} function.} +\usage{ +findBiggest(foo_summary_dt, nbiggest, n_foo) +} +\arguments{ +\item{foo_summary_dt}{A data.table containing the summary statistics.} + +\item{nbiggest}{Integer. Number of largest lineages to return (sorted by count).} + +\item{n_foo}{An integer specifying the number of individuals in the summary.} +} +\value{ +a data.table containing the biggest families in the pedigree. +} +\description{ +This function finds the biggest families in a pedigree. It is supposed to be used internally by the \code{summarize_pedigree} function. +} diff --git a/man/findOldest.Rd b/man/findOldest.Rd new file mode 100644 index 00000000..f97fa65c --- /dev/null +++ b/man/findOldest.Rd @@ -0,0 +1,23 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/summarizePedigree.R +\name{findOldest} +\alias{findOldest} +\title{This function finds the oldest families in a pedigree. It is supposed to be used internally by the \code{summarize_pedigree} function.} +\usage{ +findOldest(foo_summary_dt, byr, noldest, n_foo) +} +\arguments{ +\item{foo_summary_dt}{A data.table containing the summary statistics.} + +\item{byr}{Character. Optional column name for birth year. Used to determine the oldest lineages.} + +\item{noldest}{Integer. Number of oldest lineages to return (sorted by birth year).} + +\item{n_foo}{An integer specifying the number of individuals in the summary.} +} +\value{ +a data.table containing the oldest families in the pedigree. +} +\description{ +This function finds the oldest families in a pedigree. It is supposed to be used internally by the \code{summarize_pedigree} function. +} diff --git a/man/initialize_empty_df.Rd b/man/initialize_empty_df.Rd new file mode 100644 index 00000000..637d83cb --- /dev/null +++ b/man/initialize_empty_df.Rd @@ -0,0 +1,18 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/makeLinks.R +\name{initialize_empty_df} +\alias{initialize_empty_df} +\title{initialize_empty_df} +\usage{ +initialize_empty_df(relNames) +} +\arguments{ +\item{relNames}{A vector of column names to be included in the data frame.} +} +\value{ +An empty data frame with specified column names. +} +\description{ +This function initializes an empty data frame with specified column names. +} +\keyword{internal} diff --git a/man/isChild.Rd b/man/isChild.Rd new file mode 100644 index 00000000..1bd6f1e9 --- /dev/null +++ b/man/isChild.Rd @@ -0,0 +1,19 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/convertPedigree.R +\name{isChild} +\alias{isChild} +\title{Determine isChild Status, isChild is the 'S' matrix from RAM} +\usage{ +isChild(isChild_method, ped) +} +\arguments{ +\item{isChild_method}{method to determine isChild status} + +\item{ped}{pedigree data frame} +} +\value{ +isChild 'S' matrix +} +\description{ +Determine isChild Status, isChild is the 'S' matrix from RAM +} diff --git a/man/makeLongTree.Rd b/man/makeLongTree.Rd index e03db2f9..96d4a514 100644 --- a/man/makeLongTree.Rd +++ b/man/makeLongTree.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/readPedigree.R +% Please edit documentation in R/readWikifamilytree.R \name{makeLongTree} \alias{makeLongTree} \title{Make Long Tree} diff --git a/man/assignParentIDs.Rd b/man/mapFAMC2parents.Rd similarity index 66% rename from man/assignParentIDs.Rd rename to man/mapFAMC2parents.Rd index 8bd15170..3ebd6dd7 100644 --- a/man/assignParentIDs.Rd +++ b/man/mapFAMC2parents.Rd @@ -1,17 +1,15 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/readPedigree.R -\name{assignParentIDs} -\alias{assignParentIDs} +% Please edit documentation in R/readGedcom.R +\name{mapFAMC2parents} +\alias{mapFAMC2parents} \title{Assign momID and dadID based on family mapping} \usage{ -assignParentIDs(df_temp, family_to_parents, datasource) +mapFAMC2parents(df_temp, family_to_parents) } \arguments{ \item{df_temp}{A data frame containing individual information.} \item{family_to_parents}{A list mapping family IDs to parent IDs.} - -\item{datasource}{A string indicating the data source. Options are "gedcom" and "wiki".} } \value{ A data frame with added momID and dad_ID columns. diff --git a/man/createFamilyToParentsMapping.Rd b/man/mapFAMS2parents.Rd similarity index 67% rename from man/createFamilyToParentsMapping.Rd rename to man/mapFAMS2parents.Rd index 20d8216d..25d5a9f3 100644 --- a/man/createFamilyToParentsMapping.Rd +++ b/man/mapFAMS2parents.Rd @@ -1,10 +1,10 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/readPedigree.R -\name{createFamilyToParentsMapping} -\alias{createFamilyToParentsMapping} +% Please edit documentation in R/readGedcom.R +\name{mapFAMS2parents} +\alias{mapFAMS2parents} \title{Create a mapping of family IDs to parent IDs} \usage{ -createFamilyToParentsMapping(df_temp, datasource) +mapFAMS2parents(df_temp) } \arguments{ \item{df_temp}{A data frame containing information about individuals.} diff --git a/man/matchMembers.Rd b/man/matchMembers.Rd index e02311af..382c05ec 100644 --- a/man/matchMembers.Rd +++ b/man/matchMembers.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/readPedigree.R +% Please edit documentation in R/readWikifamilytree.R \name{matchMembers} \alias{matchMembers} \title{Match Members} diff --git a/man/parseRelationships.Rd b/man/parseRelationships.Rd index a02a5983..24e864b5 100644 --- a/man/parseRelationships.Rd +++ b/man/parseRelationships.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/readPedigree.R +% Please edit documentation in R/readWikifamilytree.R \name{parseRelationships} \alias{parseRelationships} \title{infer relationship from tree template} diff --git a/man/parseTree.Rd b/man/parseTree.Rd index e429662c..6982013e 100644 --- a/man/parseTree.Rd +++ b/man/parseTree.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/readPedigree.R +% Please edit documentation in R/readWikifamilytree.R \name{parseTree} \alias{parseTree} \title{Parse Tree} diff --git a/man/ped2fam.Rd b/man/ped2fam.Rd index 4dbc9aa5..2ff7eb0d 100644 --- a/man/ped2fam.Rd +++ b/man/ped2fam.Rd @@ -30,7 +30,8 @@ ped2fam( A pedigree dataset with one additional column for the newly created extended family ID } \description{ -This function adds an extended family ID variable to a pedigree by segmenting that dataset into independent extended families +This function adds an extended family ID variable to a pedigree by segmenting +that dataset into independent extended families using the weakly connected components algorithm. } \details{ diff --git a/man/ped2maternal.Rd b/man/ped2maternal.Rd index 88106fb6..e54c3e76 100755 --- a/man/ped2maternal.Rd +++ b/man/ped2maternal.Rd @@ -22,7 +22,7 @@ ped2maternal( \item{dadID}{character. Name of the column in ped for the father ID variable} -\item{matID}{Character. Maternal line ID variable to be created and added to the pedigree} +\item{matID}{Character. Maternal line ID variable to be created and added to the pedigree} \item{...}{additional arguments to be passed to \code{\link{ped2com}}} } diff --git a/man/processParents.Rd b/man/processParents.Rd index e8077472..9aa205a7 100644 --- a/man/processParents.Rd +++ b/man/processParents.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/readPedigree.R +% Please edit documentation in R/readGedcom.R \name{processParents} \alias{processParents} \title{Process parents information} diff --git a/man/process_tag.Rd b/man/process_tag.Rd new file mode 100644 index 00000000..14dfa65e --- /dev/null +++ b/man/process_tag.Rd @@ -0,0 +1,35 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/readGedcom.R +\name{process_tag} +\alias{process_tag} +\title{Process a GEDCOM Tag} +\usage{ +process_tag( + tag, + field_name, + pattern_rows, + line, + vars, + extractor = NULL, + mode = "replace" +) +} +\arguments{ +\item{tag}{The GEDCOM tag (e.g., "SEX", "CAST", etc.).} + +\item{field_name}{The name of the variable to assign to in `vars`.} + +\item{pattern_rows}{Output from `countPatternRows()`.} + +\item{line}{The GEDCOM line to parse.} + +\item{vars}{The current list of variables to update.} +} +\value{ +A list with updated `vars` and a `matched` flag. +} +\description{ +Extracts and assigns a value to a specified field in `vars` if the pattern is present. +Returns both the updated variable list and a flag indicating whether the tag was matched. +} +\keyword{internal} diff --git a/man/readGedcom.Rd b/man/readGedcom.Rd index a5136bc5..78d45773 100644 --- a/man/readGedcom.Rd +++ b/man/readGedcom.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/readPedigree.R +% Please edit documentation in R/readGedcom.R \name{readGedcom} \alias{readGedcom} \title{Read a GEDCOM File} @@ -10,7 +10,8 @@ readGedcom( add_parents = TRUE, remove_empty_cols = TRUE, combine_cols = TRUE, - skinny = FALSE + skinny = FALSE, + ... ) } \arguments{ @@ -25,6 +26,8 @@ readGedcom( \item{combine_cols}{A logical value indicating whether to combine columns with duplicate values.} \item{skinny}{A logical value indicating whether to return a skinny data frame.} + +\item{...}{Additional arguments to be passed to the function.} } \value{ A data frame containing information about individuals, with the following potential columns: diff --git a/man/readWikifamilytree.Rd b/man/readWikifamilytree.Rd index b06acf7b..58a64efd 100644 --- a/man/readWikifamilytree.Rd +++ b/man/readWikifamilytree.Rd @@ -1,13 +1,22 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/readPedigree.R +% Please edit documentation in R/readWikifamilytree.R \name{readWikifamilytree} \alias{readWikifamilytree} \title{Read Wiki Family Tree} \usage{ -readWikifamilytree(text) +readWikifamilytree(text = NULL, verbose = FALSE, file_path = NULL, ...) } \arguments{ \item{text}{A character string containing the text of a family tree in wiki format.} + +\item{verbose}{A logical value indicating whether to print messages.} + +\item{file_path}{The path to the file containing the family tree.} + +\item{...}{Additional arguments (not used).} +} +\value{ +A list containing the summary, members, structure, and relationships of the family tree. } \description{ Read Wiki Family Tree diff --git a/man/summarizeFamilies.Rd b/man/summarizeFamilies.Rd index 54fc978d..8903eb93 100644 --- a/man/summarizeFamilies.Rd +++ b/man/summarizeFamilies.Rd @@ -33,7 +33,7 @@ summarizeFamilies( \item{dadID}{character. Name of the column in ped for the father ID variable} -\item{matID}{Character. Maternal line ID variable to be created and added to the pedigree} +\item{matID}{Character. Maternal line ID variable to be created and added to the pedigree} \item{patID}{Character. Paternal line ID variable to be created and added to the pedigree} diff --git a/man/summarizeMatrilines.Rd b/man/summarizeMatrilines.Rd index 1930858a..2890b622 100644 --- a/man/summarizeMatrilines.Rd +++ b/man/summarizeMatrilines.Rd @@ -33,7 +33,7 @@ summarizeMatrilines( \item{dadID}{character. Name of the column in ped for the father ID variable} -\item{matID}{Character. Maternal line ID variable to be created and added to the pedigree} +\item{matID}{Character. Maternal line ID variable to be created and added to the pedigree} \item{patID}{Character. Paternal line ID variable to be created and added to the pedigree} diff --git a/man/summarizePatrilines.Rd b/man/summarizePatrilines.Rd index 846b0e4b..aed89bcd 100644 --- a/man/summarizePatrilines.Rd +++ b/man/summarizePatrilines.Rd @@ -33,7 +33,7 @@ summarizePatrilines( \item{dadID}{character. Name of the column in ped for the father ID variable} -\item{matID}{Character. Maternal line ID variable to be created and added to the pedigree} +\item{matID}{Character. Maternal line ID variable to be created and added to the pedigree} \item{patID}{Character. Paternal line ID variable to be created and added to the pedigree} diff --git a/man/summarizePedigrees.Rd b/man/summarizePedigrees.Rd index 1a076fdc..b298d326 100644 --- a/man/summarizePedigrees.Rd +++ b/man/summarizePedigrees.Rd @@ -35,7 +35,7 @@ summarizePedigrees( \item{dadID}{character. Name of the column in ped for the father ID variable} -\item{matID}{Character. Maternal line ID variable to be created and added to the pedigree} +\item{matID}{Character. Maternal line ID variable to be created and added to the pedigree} \item{patID}{Character. Paternal line ID variable to be created and added to the pedigree} diff --git a/man/validate_and_convert_matrix.Rd b/man/validate_and_convert_matrix.Rd new file mode 100644 index 00000000..f6bf5c2a --- /dev/null +++ b/man/validate_and_convert_matrix.Rd @@ -0,0 +1,28 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/makeLinks.R +\name{validate_and_convert_matrix} +\alias{validate_and_convert_matrix} +\title{validate_and_convert_matrix} +\usage{ +validate_and_convert_matrix( + mat, + name, + ensure_symmetric = FALSE, + force_binary = FALSE +) +} +\arguments{ +\item{mat}{The matrix to be validated and converted.} + +\item{name}{The name of the matrix for error messages.} + +\item{ensure_symmetric}{Logical indicating whether to ensure the matrix is symmetric.} + +\item{force_binary}{Logical indicating whether to force the matrix to be binary.} +} +\value{ +The validated and converted matrix. +} +\description{ +This function validates and converts a matrix to a specific format. +} diff --git a/tests/testthat/test-calculateFamilySize.R b/tests/testthat/test-calculateFamilySize.R index b6e32c7d..f648c8fd 100644 --- a/tests/testthat/test-calculateFamilySize.R +++ b/tests/testthat/test-calculateFamilySize.R @@ -65,13 +65,12 @@ test_that("allGens behaves with Ngens", { result <- allGens(kpc = kpc, Ngen = Ngen, marR = marR) - expect_equal(result, 2) + expect_equal(result, 2) Ngen <- 0 - expect_error( allGens(kpc = kpc, Ngen = Ngen, marR = marR)) + expect_error(allGens(kpc = kpc, Ngen = Ngen, marR = marR)) - expect_error(allGens(kpc = kpc, Ngen= NULL, marR = marR)) - expect_error(allGens(kpc = kpc, Ngen= -1, marR = marR)) + expect_error(allGens(kpc = kpc, Ngen = NULL, marR = marR)) + expect_error(allGens(kpc = kpc, Ngen = -1, marR = marR)) }) - diff --git a/tests/testthat/test-checkParents.R b/tests/testthat/test-checkParents.R index 86776577..21660c4b 100644 --- a/tests/testthat/test-checkParents.R +++ b/tests/testthat/test-checkParents.R @@ -1,29 +1,27 @@ # Test Case 1: Validate sex coding without repair test_that("checkParentIDs identifies parent coding correctly in potter dataset", { - results <- checkParentIDs(potter,verbose = TRUE, repair = FALSE) + results <- checkParentIDs(potter, verbose = TRUE, repair = FALSE) expect_false("parents_in_both" %in% names(results)) expect_false(results$single_parents) expect_false(results$missing_parents) expect_true(results$female_moms) expect_true(results$male_dads) - expect_equal(results$female_var,0) - expect_equal(results$male_var,1) - expect_equal(results$mom_sex,0) - expect_equal(results$dad_sex,1) + expect_equal(results$female_var, 0) + expect_equal(results$male_var, 1) + expect_equal(results$mom_sex, 0) + expect_equal(results$dad_sex, 1) }) # Test Case 2: Validate sex coding without repair test_that("checksif single parents found correctly in ASOIAF dataset", { data(ASOIAF) df_asoiaf <- ASOIAF - results <- checkParentIDs(df_asoiaf,verbose = FALSE, repair = FALSE) + results <- checkParentIDs(df_asoiaf, verbose = FALSE, repair = FALSE) expect_true(results$single_parents) - single_dads <- length(df_asoiaf$id[!is.na(df_asoiaf$dadID)&is.na(df_asoiaf$momID)]) - single_moms <- length(df_asoiaf$id[is.na(df_asoiaf$dadID)&!is.na(df_asoiaf$momID)]) - expect_equal(single_moms, length(results$missing_fathers)) - expect_equal(single_dads, length(results$missing_mothers)) - repaired_df <- checkParentIDs(df_asoiaf,verbose = FALSE, repair = TRUE) - expect_equal(nrow(repaired_df), nrow(df_asoiaf)+single_moms+single_dads) - + single_dads <- length(df_asoiaf$id[!is.na(df_asoiaf$dadID) & is.na(df_asoiaf$momID)]) + single_moms <- length(df_asoiaf$id[is.na(df_asoiaf$dadID) & !is.na(df_asoiaf$momID)]) + expect_equal(single_moms, length(results$missing_fathers)) + expect_equal(single_dads, length(results$missing_mothers)) + repaired_df <- checkParentIDs(df_asoiaf, verbose = FALSE, repair = TRUE) + expect_equal(nrow(repaired_df), nrow(df_asoiaf) + single_moms + single_dads) }) - diff --git a/tests/testthat/test-computeRelatedness.R b/tests/testthat/test-computeRelatedness.R index 10b0de29..05fb7e62 100644 --- a/tests/testthat/test-computeRelatedness.R +++ b/tests/testthat/test-computeRelatedness.R @@ -25,6 +25,15 @@ test_that("calculateRelatedness function with empirical", { }) +# Test 7: empirical divide by zero + +test_that("calculateH handles divide by zero for empirical", { +expect_error( + calculateRelatedness(generations = 2, + empirical = TRUE, total_a = 0, + total_m = 0)) + +}) test_that("inferRelatedness performs as expected", { result <- inferRelatedness(0, aceA = .9, aceC = 0, sharedC = 0) expect_equal(result, 0) @@ -96,3 +105,4 @@ test_that("calculateH stops for illegal coefficients", { "The observed correlations should be between -1 and 1" ) }) + diff --git a/tests/testthat/test-convertPedigree.R b/tests/testthat/test-convertPedigree.R index fb6e06fe..11dfbeca 100644 --- a/tests/testthat/test-convertPedigree.R +++ b/tests/testthat/test-convertPedigree.R @@ -1,7 +1,7 @@ test_that("ped2add produces correct matrix dims, values, and dimnames for hazard", { tolerance <- 1e-10 data(hazard) - add <- ped2add(hazard,sparse = FALSE) + add <- ped2add(hazard, sparse = FALSE) # Check dimension expect_equal(dim(add), c(nrow(hazard), nrow(hazard))) # Check several values @@ -21,7 +21,7 @@ test_that("ped2add produces correct matrix dims, values, and dimnames for hazard test_that("ped2add produces correct matrix dims, values, and dimnames for alternative transpose", { tolerance <- 1e-10 data(hazard) - add <- ped2add(hazard, tcross.alt.crossprod = TRUE,sparse = FALSE) + add <- ped2add(hazard, tcross.alt.crossprod = TRUE, sparse = FALSE) # Check dimension expect_equal(dim(add), c(nrow(hazard), nrow(hazard)), tolerance = tolerance) # Check several values @@ -42,7 +42,7 @@ test_that("ped2add produces correct matrix dims, values, and dimnames for altern test_that("ped2add produces correct matrix dims, values, and dimnames for inbreeding data", { tolerance <- 1e-10 data(inbreeding) - add <- ped2add(inbreeding,sparse = FALSE) + add <- ped2add(inbreeding, sparse = FALSE) # Check dimension expect_equal(dim(add), c(nrow(inbreeding), nrow(inbreeding)), tolerance = tolerance) # Check several values @@ -62,7 +62,7 @@ test_that("ped2add produces correct matrix dims, values, and dimnames for inbree test_that("ped2add produces correct matrix dims, values, and dimnames for inbreeding data with alternative transpose", { tolerance <- 1e-10 data(inbreeding) - add <- ped2add(inbreeding, transpose_method = "star",sparse = FALSE) + add <- ped2add(inbreeding, transpose_method = "star", sparse = FALSE) # Check dimension expect_equal(dim(add), c(nrow(inbreeding), nrow(inbreeding))) # Check several values @@ -81,7 +81,7 @@ test_that("ped2add produces correct matrix dims, values, and dimnames for inbree test_that("ped2add produces correct matrix dims, values, and dimnames for inbreeding data with 2nd alternative transpose", { tolerance <- 1e-10 data(inbreeding) - add <- ped2add(inbreeding, transpose_method = "crossprod",sparse = FALSE) + add <- ped2add(inbreeding, transpose_method = "crossprod", sparse = FALSE) # Check dimension expect_equal(dim(add), c(nrow(inbreeding), nrow(inbreeding))) # Check several values @@ -101,7 +101,7 @@ test_that("ped2add produces correct matrix dims, values, and dimnames for inbree test_that("ped2add flattens diagonal for inbreeding data", { tolerance <- 1e-10 data(inbreeding) - add <- ped2add(inbreeding, flatten.diag = TRUE,sparse = FALSE) + add <- ped2add(inbreeding, flatten.diag = TRUE, sparse = FALSE) # Check dimension expect_equal(dim(add), c(nrow(inbreeding), nrow(inbreeding)), tolerance = tolerance) # Check several values @@ -121,7 +121,7 @@ test_that("ped2mit produces correct matrix dims, values, and dimnames for inbree tolerance <- 1e-10 # Check dimension data(inbreeding) - mit <- ped2mit(inbreeding,sparse = FALSE) + mit <- ped2mit(inbreeding, sparse = FALSE) # Check dimension expect_equal(dim(mit), c(nrow(inbreeding), nrow(inbreeding))) # Check several values @@ -142,7 +142,7 @@ test_that("ped2mit produces correct matrix dims, values, and dimnames for inbree tolerance <- 1e-10 # Check dimension data(inbreeding) - mit <- ped2mit(inbreeding,sparse = FALSE) + mit <- ped2mit(inbreeding, sparse = FALSE) # Check dimension expect_equal(dim(mit), c(nrow(inbreeding), nrow(inbreeding)), tolerance = tolerance) # Check several values @@ -164,7 +164,7 @@ test_that("ped2cn produces correct matrix dims, values, and dimnames", { # Check dimension data(inbreeding) - cn <- ped2cn(inbreeding,sparse = FALSE) + cn <- ped2cn(inbreeding, sparse = FALSE) expect_equal(dim(cn), c( nrow(inbreeding), nrow(inbreeding) @@ -189,7 +189,7 @@ test_that("ped2cn produces correct matrix dims, values, and dimnames", { test_that("ped2ce produces correct matrix dims, values, and dimnames", { tolerance <- 1e-10 data(inbreeding) - ce <- ped2ce(inbreeding,sparse = FALSE) + ce <- ped2ce(inbreeding, sparse = FALSE) expect_equal(dim(ce), c(nrow(inbreeding), nrow(inbreeding)), tolerance = tolerance) # Check several values # expect_true(all(diag(ce) == 1)) @@ -207,7 +207,7 @@ test_that("ped2ce produces correct matrix dims, values, and dimnames", { test_that("ped2add verbose prints updates", { data(hazard) - expect_output(ped2add(hazard, verbose = TRUE,sparse = FALSE), regexp = "Family Size =") + expect_output(ped2add(hazard, verbose = TRUE, sparse = FALSE), regexp = "Family Size =") }) diff --git a/tests/testthat/test-makeLinks.R b/tests/testthat/test-makeLinks.R index 2edb8377..e8dfe99e 100644 --- a/tests/testthat/test-makeLinks.R +++ b/tests/testthat/test-makeLinks.R @@ -1,7 +1,7 @@ test_that("com2links handles missing matrices properly", { expect_error( com2links(ad_ped_matrix = NULL, mit_ped_matrix = NULL, cn_ped_matrix = NULL), - "At least one of 'ped_matrix', 'mit_ped_matrix', or 'cn_ped_matrix' must be provided." + "At least one of 'ad_ped_matrix', 'mit_ped_matrix', or 'cn_ped_matrix' must be provided." ) }) @@ -131,8 +131,7 @@ test_that("com2links correctly handles missing matrices", { expect_error( com2links(ad_ped_matrix = NULL, mit_ped_matrix = NULL, cn_ped_matrix = NULL), - "At least one of 'ped_matrix', 'mit_ped_matrix', or 'cn_ped_matrix' must be provided." - ) + "At least one of 'ad_ped_matrix', 'mit_ped_matrix', or 'cn_ped_matrix' must be provided." ) expect_error(com2links(ad_ped_matrix = hazard), "The 'ad_ped_matrix' must be a matrix or dgCMatrix.") }) diff --git a/tests/testthat/test-readPedigrees.R b/tests/testthat/test-readPedigrees.R index 94cbcb28..1f098697 100644 --- a/tests/testthat/test-readPedigrees.R +++ b/tests/testthat/test-readPedigrees.R @@ -18,7 +18,8 @@ test_that("readGedcom reads and parses a GEDCOM file correctly", { "1 SEX F", "1 BIRT", "2 DATE 2 FEB 1910", - "2 PLAC Anotherplace" + "2 PLAC Anotherplace", + "1 NCHI 2" ) temp_file <- tempfile(fileext = ".ged") writeLines(gedcom_content, temp_file) @@ -181,7 +182,7 @@ test_that("if file does not exist, readGedcom throws an error", { # readWikifamilytree -test_that("readWikifamilytree reads a simple file correctly", { +test_that("readWikifamilytree reads a string correctly", { # Create a temporary WikiFamilyTree file for testing # Example usage family_tree_text <- "{{familytree/start |summary=I have a brother Joe and a little sister: my mom married my dad, and my dad's parents were Grandma and Grandpa; they had another child, Aunt Daisy.}} @@ -192,16 +193,31 @@ test_that("readWikifamilytree reads a simple file correctly", { {{familytree | JOE | | ME | | SIS | | | JOE=My brother Joe|ME='''Me!'''|SIS=My little sister}} {{familytree/end}}" - result <- readWikifamilytree(family_tree_text) + temp_file <- tempfile(fileext = ".txt") + writeLines(family_tree_text, temp_file) + + + result <- readWikifamilytree(text = family_tree_text) + result2 <- readWikifamilytree(file_path = temp_file) - # list( - # summary = summary_text, - # members = members_df, - # structure = tree_long, - # relationships = relationships_df - # ) expect_equal( result$summary, "I have a brother Joe and a little sister: my mom married my dad, and my dad's parents were Grandma and Grandpa; they had another child, Aunt Daisy." ) + + expect_equal( + result2$summary, + "I have a brother Joe and a little sister: my mom married my dad, and my dad's parents were Grandma and Grandpa; they had another child, Aunt Daisy." + ) }) + + +# read E:/Dropbox/Lab/Research/Projects/2024/BGMiscJoss/BGmisc_main/data-raw/Targaryen tree Dance.txt + +#test_that("readWikifamilytree reads a file correctly", { + # Create a temporary WikiFamilyTree file for testing + # Example usage +# family_tree_file_path <- "data-raw/Targaryen tree Dance.txt" # system.file("extdata", "Targaryen tree Dance.txt", package = "BGmisc") + + # result <- readWikifamilytree(file_path=family_tree_file_path) +#}) diff --git a/tests/testthat/test-summarizePedigrees.R b/tests/testthat/test-summarizePedigrees.R index 13cb46e0..61dae665 100644 --- a/tests/testthat/test-summarizePedigrees.R +++ b/tests/testthat/test-summarizePedigrees.R @@ -51,17 +51,22 @@ test_that("summarizeFamilies() works with additional summary stats", { # Test Case 4: Does this function work for summarizeMatrilines test_that("summarizeMatrilines() works", { nbiggest <- 2 - df <- ped2fam(potter, famID = "newFamID", personID = "personID") %>% ped2maternal(personID = "personID") - df_summarized <- summarizeMatrilines(df, famID = "newFamID", personID = "personID", nbiggest = nbiggest) + df <- ped2fam(potter, famID = "newFamID", personID = "personID") %>% + ped2maternal(personID = "personID") + df_summarized <- summarizeMatrilines(df, famID = "newFamID", + personID = "personID", + nbiggest = nbiggest) # is the total count from the family summary the same as the raw data? result_observed <- sum(df_summarized$maternal_summary$count) result_expected <- nrow(potter) expect_equal(result_observed, result_expected) - # is the count of the summarized data frame equal to the number of unique families in the input data frame? + # is the count of the summarized data frame equal to the number of + # unique families in the input data frame? result_observed <- length(df_summarized$maternal_summary$count) result_expected <- length(unique(df$matID)) expect_equal(result_observed, result_expected) - # is the count of the biggest families equal to the number of unique families in the input data frame? + # is the count of the biggest families equal to the number of + # unique families in the input data frame? result_observed <- nrow(df_summarized$biggest_maternal) expect_equal(result_observed, nbiggest) }) @@ -69,17 +74,22 @@ test_that("summarizeMatrilines() works", { # Test Case 5: Does this function work for summarizePatrilines test_that("summarizePatrilines() works", { nbiggest <- 4 - df <- ped2fam(potter, famID = "newFamID", personID = "personID") %>% ped2paternal(personID = "personID") - df_summarized <- summarizePatrilines(df, famID = "newFamID", personID = "personID", nbiggest = nbiggest) + df <- ped2fam(potter, famID = "newFamID", personID = "personID") %>% + ped2paternal(personID = "personID") + df_summarized <- summarizePatrilines(df, famID = "newFamID", + personID = "personID", + nbiggest = nbiggest) # is the total count from the family summary the same as the raw data? result_observed <- sum(df_summarized$paternal_summary$count) result_expected <- nrow(potter) expect_equal(result_observed, result_expected) - # is the count of the summarized data frame equal to the number of unique families in the input data frame? + # is the count of the summarized data frame equal to the number of + # unique families in the input data frame? result_observed <- length(df_summarized$paternal_summary$count) result_expected <- length(unique(df$patID)) expect_equal(result_observed, result_expected) - # is the count of the biggest families equal to the number of unique families in the input data frame? + # is the count of the biggest families equal to the number of + # unique families in the input data frame? result_observed <- nrow(df_summarized$biggest_paternal) expect_equal(result_observed, nbiggest) }) @@ -101,7 +111,7 @@ test_that("summarizePedigrees() handles missing values correctly", { }) # Test Case 7: When all variables are skipped -test_that("summarizePedigrees() works when all numeric variables are skipped", { +test_that("summarizePedigrees works when all numeric variables are skipped",{ df <- data.frame( ID = 1:5, momID = c(NA, 1, 1, NA, 4), @@ -116,7 +126,8 @@ test_that("summarizePedigrees() works when all numeric variables are skipped", { # Test Case 8: Handling invalid column names test_that("summarizePedigrees() throws error on invalid column names", { - df <- data.frame(ID = 1:5, momID = c(NA, 1, 1, NA, 4), dadID = c(NA, 2, 2, NA, 5), famID = c(1, 1, 1, 2, 2)) + df <- data.frame(ID = 1:5, momID = c(NA, 1, 1, NA, 4), + dadID = c(NA, 2, 2, NA, 5), famID = c(1, 1, 1, 2, 2)) expect_error(summarizePedigrees(df, byr = "unknown_column")) }) diff --git a/tests/testthat/test-tweakPedigree.R b/tests/testthat/test-tweakPedigree.R index 24f81ff9..0b542e2e 100644 --- a/tests/testthat/test-tweakPedigree.R +++ b/tests/testthat/test-tweakPedigree.R @@ -115,7 +115,7 @@ test_that("makeInbreeding - Inbred mates specified by generation and cousin", { expect_error(makeInbreeding(ped, gen_inbred = gen_inbred, - type_inbred = type_inbred,verbose = TRUE + type_inbred = type_inbred, verbose = TRUE ), regexp = "Cousin inbreedin") }) diff --git a/vignettes/ASOIAF.Rmd b/vignettes/ASOIAF.Rmd index 69d72a76..f9dceaa0 100644 --- a/vignettes/ASOIAF.Rmd +++ b/vignettes/ASOIAF.Rmd @@ -42,7 +42,6 @@ df_got <- checkSex(ASOIAF, code_female = 0, verbose = FALSE, repair = TRUE ) - ``` @@ -94,9 +93,9 @@ For interpretability, we convert these square matrices into long-format tables u ```{r} df_links <- com2links( writetodisk = FALSE, - ad_ped_matrix = add, cn_ped_matrix = cn, mit_ped_matrix= mt, + ad_ped_matrix = add, cn_ped_matrix = cn, mit_ped_matrix = mt, drop_upper_triangular = TRUE -)# %>% +) # %>% # filter(ID1 != ID2) ``` @@ -108,7 +107,6 @@ We next identify the rows in the pairwise relatedness table that correspond to J ```{r} - # Find the IDs of Jon Snow and Daenerys Targaryen jon_id <- df_got %>% @@ -123,12 +121,11 @@ dany_id <- df_got %>% Then we isolate their dyad: ```{r} - jon_dany_row <- df_links %>% filter(ID1 == jon_id | ID2 == jon_id) %>% - filter(ID1 %in% dany_id| ID2 %in% dany_id) + filter(ID1 %in% dany_id | ID2 %in% dany_id) -jon_dany_row +jon_dany_row ``` This table contains the additive and nuclear relatedness estimates for Jon and Daenerys. If the pedigree reflects their canonical aunt-nephew relationship and is free from inbreeding, we’d expect to see an additive coefficient close to 0.25. However, the value is `r jon_dany_row$addRel[1]`, indicating a more complex relationship. @@ -146,18 +143,18 @@ Many real-world and fictional pedigrees contain individuals with unknown or part To facilitate plotting, we check for individuals with one known parent but a missing other. For those cases, we assign a placeholder ID to the missing parent. ```{r} - -df_repaired <- checkParentIDs(df_got,addphantoms=TRUE, - repair=TRUE, - parentswithoutrow=FALSE, - repairsex=FALSE - ) %>% mutate(fam=1, - affected = case_when(ID %in% c(jon_id,dany_id, "365") ~ 1, - TRUE ~ 0) - ) - - - +df_repaired <- checkParentIDs(df_got, + addphantoms = TRUE, + repair = TRUE, + parentswithoutrow = FALSE, + repairsex = FALSE +) %>% mutate( + fam = 1, + affected = case_when( + ID %in% c(jon_id, dany_id, "365") ~ 1, + TRUE ~ 0 + ) +) ``` This code creates new IDs for individuals with one known parent and a missing other. It checks if either `momID` or `dadID` is missing, and if so, it assigns a new ID based on the row number. This allows us to visualize the pedigree even when some parental information is incomplete. @@ -167,9 +164,7 @@ This code creates new IDs for individuals with one known parent and a missing ot ## Visualize the Pedigree ```{r} +# fixParents(id=df_got$ID, dadid=df_got$dadID, momid=df_got$momID, sex=df_got$sex, missid = NA) -#fixParents(id=df_got$ID, dadid=df_got$dadID, momid=df_got$momID, sex=df_got$sex, missid = NA) - -plotPedigree(df_repaired,affected=df_repaired$affected,verbose=FALSE) - +plotPedigree(df_repaired, affected = df_repaired$affected, verbose = FALSE) ``` diff --git a/vignettes/ASOIAF.html b/vignettes/ASOIAF.html index 9b925ee3..6762cd1f 100644 --- a/vignettes/ASOIAF.html +++ b/vignettes/ASOIAF.html @@ -360,13 +360,13 @@

Load Packages and Data

structure of the built-in ASOIAF pedigree.

library(BGmisc)
 library(tidyverse)
-
## ── Attaching core tidyverse packages ───── tidyverse 2.0.0 ──
+
## ── Attaching core tidyverse packages ────── tidyverse 2.0.0 ──
 ## ✔ dplyr     1.1.4     ✔ readr     2.1.5
 ## ✔ forcats   1.0.0     ✔ stringr   1.5.1
 ## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
 ## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
 ## ✔ purrr     1.0.4     
-## ── Conflicts ─────────────────────── tidyverse_conflicts() ──
+## ── Conflicts ──────────────────────── tidyverse_conflicts() ──
 ## ✖ dplyr::between()     masks BGmisc::between()
 ## ✖ dplyr::filter()      masks stats::filter()
 ## ✖ dplyr::first()       masks BGmisc::first()
@@ -400,7 +400,7 @@ 

Load Packages and Data

Prepare and Validate Sex Codes

-

Many pedigree-based algorithms rely on valid sex codes for downstream +

Many pedigree-based algorithms rely on biological sex for downstream calculationss and visualization. We use checkSex() to inspect the sex variable, repairing inconsistencies programmatically.

@@ -408,93 +408,129 @@

Prepare and Validate Sex Codes

code_male = 1, code_female = 0, verbose = FALSE, repair = TRUE -)
+)

Compute Relatedness Matrices

-

We now compute the additive genetic relatedness matrix (add) and the -common nuclear relatedness matrix (cn) from the pedigree using ped2com() -and ped2cn(), respectively. The isChild_method argument -specifies how to identify child-parent relationships. We use -“partialparent” to account for missing parent information. The -adjacency_method argument specifies how to construct the -adjacency matrix. We use “direct” for the additive matrix and “indexed” -for the common nuclear matrix. The direct method is much faster. The -sparse argument is set to FALSE to return dense -matrices.

+

With validated pedigree data, we can now compute two distinct +relationship matrices:

+ +

These are derived using ped2add() and ped2cn(), respectively. Both +functions rely on internal graph traversal and adjacency structures. In +this case:

+
add <- ped2com(df_got,
   isChild_method = "partialparent",
   component = "additive",
   adjacency_method = "direct",
-  sparse = FALSE
+  sparse = TRUE
 )
 
-cn <- ped2cn(df_got,
+mt <- ped2com(df_got,
   isChild_method = "partialparent",
-  adjacency_method = "indexed",
-  sparse = FALSE
-)
+ component = "mitochondrial", + adjacency_method = "direct", + sparse = TRUE +) + +cn <- ped2cn(df_got, + isChild_method = "partialparent", + adjacency_method = "indexed", + sparse = TRUE +)

Convert to Pairwise Format

-

We convert the component matrices into a long-format table of -pairwise relationships using com2links(). This gives us a -long dataframe where each row represents a pair of individuals and their -relatedness. The function can return the entire matrix or just the lower -triangular part, which is often sufficient for our purposes. We set -writetodisk = FALSE to keep the data in memory.

+

For interpretability, we convert these square matrices into +long-format tables using com2links(). This function returns +a dataframe where each row represents a unique pair of individuals, +including their additive and common nuclear coefficients.

df_links <- com2links(
   writetodisk = FALSE,
-  ad_ped_matrix = add, cn_ped_matrix = cn,
+  ad_ped_matrix = add, cn_ped_matrix = cn, mit_ped_matrix = mt,
   drop_upper_triangular = TRUE
-)# %>%
-#  filter(ID1 != ID2)
+) # %>%
+
## 'as(<dsCMatrix>, "dgCMatrix")' is deprecated.
+## Use 'as(., "generalMatrix")' instead.
+## See help("Deprecated") and help("Matrix-deprecated").
+
#  filter(ID1 != ID2)
+

The function can return the entire matrix or just the lower +triangular part, which is often sufficient for our purposes. Setting +drop_upper_triangular = TRUE ensures we only retain one +entry per dyad, since the matrices are symmetric. We also keep the data +in memory by setting writetodisk = FALSE.

Locate Jon and Daenerys

-

Next, we extract the IDs corresponding to Jon Snow and Daenerys -Targaryen. We use the filter() function to find the rows in -the df_links dataframe where either ID1 or ID2 corresponds -to Jon Snow, and then filter again to find the row where the other ID -corresponds to Daenerys Targaryen.

-
# Find the IDs of Jon Snow and Daenerys Targaryen
-
-jon_id <- df_got %>%
-  filter(name == "Jon Snow") %>%
-  pull(ID)
-
-dany_id <- df_got %>%
-  filter(name == "Daenerys Targaryen") %>%
-  pull(ID)
-

We then filter the pairwise table to retrieve the row containing -their relationship.

-
jon_dany_row <- df_links %>%
-  filter(ID1 == jon_id | ID2 == jon_id) %>%
-  filter(ID1 %in% dany_id| ID2 %in% dany_id)
-
-jon_dany_row 
-
##   ID1 ID2     addRel cnuRel
-## 1 206 211 0.31274414      0
-## 2 211 304 0.01953125      0
-

This row contains the additive relatedness coefficient between Jon -and Daenerys, which allows us to assess how closely related they are -genetically. We’d expect to see a value of 0.25 for an Aunt-Nephew -relationship, which is what Jon and Daenerys are in the show. However, -the value is 0.3127441, indicating a more complex relationship.

+

We next identify the rows in the pairwise relatedness table that +correspond to Jon Snow and Daenerys Targaryen. First, we retrieve their +individual IDs:

+
# Find the IDs of Jon Snow and Daenerys Targaryen
+
+jon_id <- df_got %>%
+  filter(name == "Jon Snow") %>%
+  pull(ID)
+
+dany_id <- df_got %>%
+  filter(name == "Daenerys Targaryen") %>%
+  pull(ID)
+

Then we isolate their dyad:

+
jon_dany_row <- df_links %>%
+  filter(ID1 == jon_id | ID2 == jon_id) %>%
+  filter(ID1 %in% dany_id | ID2 %in% dany_id)
+
+jon_dany_row
+
##   ID1 ID2     addRel mitRel cnuRel
+## 1 206 211 0.31274414      0      0
+## 2 211 304 0.01953125      0      0
+

This table contains the additive and nuclear relatedness estimates +for Jon and Daenerys. If the pedigree reflects their canonical +aunt-nephew relationship and is free from inbreeding, we’d expect to see +an additive coefficient close to 0.25. However, the value is 0.3127441, +indicating a more complex relationship.

-

Plotting the Pedigree with incomplete parental information

+

Plotting the Pedigree with Incomplete Parental Information

+

Many real-world and fictional pedigrees contain individuals with +unknown or partially known parentage. In such cases, plotting tools +typically fail unless these gaps are handled. We use +checkParentIDs() to:

+ +

-Optionally repair and harmonize parent fields

To facilitate plotting, we check for individuals with one known parent but a missing other. For those cases, we assign a placeholder ID to the missing parent.

-
df_repaired <- checkParentIDs(df_got,addphantoms=TRUE,
-                              repair=TRUE,
-                              parentswithoutrow=FALSE,
-                              repairsex=FALSE
-                              ) %>% mutate(fam=1,
-                                           affected = case_when(ID %in% c(jon_id,dany_id) ~ 1,
-                                                               TRUE ~ 0)
-                              )
+
df_repaired <- checkParentIDs(df_got,
+  addphantoms = TRUE,
+  repair = TRUE,
+  parentswithoutrow = FALSE,
+  repairsex = FALSE
+) %>% mutate(
+  fam = 1,
+  affected = case_when(
+    ID %in% c(jon_id, dany_id, "365") ~ 1,
+    TRUE ~ 0
+  )
+)
## REPAIR IN EARLY ALPHA

This code creates new IDs for individuals with one known parent and a missing other. It checks if either momID or @@ -504,9 +540,9 @@

Plotting the Pedigree with incomplete parental information

Visualize the Pedigree

-
#fixParents(id=df_got$ID, dadid=df_got$dadID, momid=df_got$momID, sex=df_got$sex, missid = NA)
-
-plotPedigree(df_repaired,affected=df_repaired$affected,verbose=FALSE)
+
# fixParents(id=df_got$ID, dadid=df_got$dadID, momid=df_got$momID, sex=df_got$sex, missid = NA)
+
+plotPedigree(df_repaired, affected = df_repaired$affected, verbose = FALSE)

## Did not plot the following people: 85 88 125 142 228 229 258 259 274 275 305 336 357 381 388 405 409 418 420 424 428 451 487
## named list()