diff --git a/DESCRIPTION b/DESCRIPTION index 115dc66c..e04bdef4 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: BGmisc Title: An R Package for Extended Behavior Genetics Analysis -Version: 1.3.4.1 +Version: 1.3.5 Authors@R: c( person("S. Mason", "Garrison", , "garrissm@wfu.edu", role = c("aut", "cre"), comment = c(ORCID = "0000-0002-4804-6003")), @@ -33,8 +33,10 @@ Imports: kinship2, Matrix, stats, - stringr + stringr, + methods Suggests: + corrplot, dplyr, EasyMx, knitr, diff --git a/NAMESPACE b/NAMESPACE index a907a21a..f6f979b8 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -5,16 +5,19 @@ export(allGens) export(calculateRelatedness) export(checkIDs) export(checkSex) +export(com2links) export(comp2vech) export(createGenDataFrame) export(dropLink) export(evenInsert) +export(extractSummaryText) export(famSizeCal) export(fitComponentModel) export(identifyComponentModel) export(inferRelatedness) export(makeInbreeding) export(makeTwins) +export(parseTree) export(ped2add) export(ped2ce) export(ped2cn) @@ -26,6 +29,7 @@ export(ped2mit) export(ped2paternal) export(plotPedigree) export(readGedcom) +export(readWikifamilytree) export(recodeSex) export(related_coef) export(relatedness) diff --git a/NEWS.md b/NEWS.md index da7ea6d3..2c2d99bf 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,10 +1,18 @@ +# BGmisc 1.3.5 +* Add com2links function that converts components to kinship links +* Add tests for com2links +* Add function to extract family tree from wiki family tree template +* Add tests for readWikifamilytree +* Create vignette for adjacency matrix methods +* Silences invisible list for plot + # BGmisc 1.3.4.1 * Hot fix to resolve issue with list of adjacency matrix not loading saved version * Reoptimized generation calculation # BGmisc 1.3.4 * Added alternative (and faster) methods to create the adjacency matrix -* Add tests for comparison of adjacency matrix methods +* Add tests for comparison of adjacency matrix build methods * Added Royal Family pedigree # BGmisc 1.3.3 diff --git a/R/checkSex.R b/R/checkSex.R index a3c51dc5..35b14a49 100644 --- a/R/checkSex.R +++ b/R/checkSex.R @@ -166,7 +166,7 @@ recodeSex <- function( } # Recode as "F" or "M" based on code_male, preserving NAs - if (!is.null(code_male) & !is.null(code_female)) { + if (!is.null(code_male) && !is.null(code_female)) { # Initialize sex_recode as NA, preserving the length of the 'sex' column ped$sex_recode <- recode_na ped$sex_recode[ped$sex == code_female] <- recode_female @@ -174,7 +174,7 @@ recodeSex <- function( # Overwriting temp recode variable ped$sex <- ped$sex_recode ped$sex_recode <- NULL - } else if (!is.null(code_male) & is.null(code_female)) { + } else if (!is.null(code_male) && is.null(code_female)) { # Initialize sex_recode as NA, preserving the length of the 'sex' column ped$sex_recode <- recode_na ped$sex_recode[ped$sex != code_male & !is.na(ped$sex)] <- recode_female @@ -182,7 +182,7 @@ recodeSex <- function( # Overwriting temp recode variable ped$sex <- ped$sex_recode ped$sex_recode <- NULL - } else if (is.null(code_male) & !is.null(code_female)) { + } else if (is.null(code_male) && !is.null(code_female)) { # Initialize sex_recode as NA, preserving the length of the 'sex' column ped$sex_recode <- recode_na ped$sex_recode[ped$sex != code_female & !is.na(ped$sex)] <- recode_male diff --git a/R/convertPedigree.R b/R/convertPedigree.R index 904bde31..5c9ce2a5 100644 --- a/R/convertPedigree.R +++ b/R/convertPedigree.R @@ -144,7 +144,7 @@ ped2com <- function(ped, component, if (verbose) cat("Resuming: Constructed matrix...\n") jss <- readRDS(checkpoint_files$jss) iss <- readRDS(checkpoint_files$iss) - list_of_adjacencies <- list(iss=iss, jss=jss) + list_of_adjacencies <- list(iss = iss, jss = jss) } else { list_of_adjacencies <- compute_parent_adjacency( @@ -177,7 +177,7 @@ ped2com <- function(ped, component, } # Garbage collection if gc is TRUE if (gc) { - rm(parList, lens,list_of_adjacencies) + rm(parList, lens, list_of_adjacencies) gc() } } @@ -264,7 +264,7 @@ ped2com <- function(ped, component, # r is I + A + A^2 + ... = (I-A)^-1 from RAM # could trim, here - while (mtSum != 0 & count < maxCount) { + while (mtSum != 0 && count < maxCount) { r <- r + newIsPar gen <- gen + (Matrix::rowSums(newIsPar) > 0) newIsPar <- newIsPar %*% isPar diff --git a/R/makeLinks.R b/R/makeLinks.R new file mode 100644 index 00000000..891982b7 --- /dev/null +++ b/R/makeLinks.R @@ -0,0 +1,543 @@ +#' Take a sparse component and turn it into kinship links +#' @param rel_pairs_file File to write related pairs to +#' @param ad_ped_matrix Matrix of additive genetic relatedness coefficients +#' @param mit_ped_matrix Matrix of mitochondrial relatedness coefficients +#' @param mt_ped_matrix (alternative) Matrix of mitochondrial relatedness coefficients +#' @param cn_ped_matrix Matrix of common nuclear relatedness coefficients +#' @param write_buffer_size Number of related pairs to write to disk at a time +#' @param gc logical. If TRUE, do frequent garbage collection via \code{\link{gc}} to save memory +#' @param writetodisk logical. If TRUE, write the related pairs to disk +#' @param verbose logical. If TRUE, print progress messages +#' @param update_rate numeric. How often to print progress messages +#' @param legacy logical. If TRUE, use the legacy version of the function +#' @param outcome_name Character string. The name of the outcome +#' @param ... Additional arguments to be passed to \code{\link{com2links}} +#' @return A data frame of related pairs +#' @export +com2links <- function( + rel_pairs_file = "dataRelatedPairs.csv", + ad_ped_matrix = NULL, + mit_ped_matrix = mt_ped_matrix, + mt_ped_matrix = NULL, + cn_ped_matrix = NULL, + # pat_ped_matrix = NULL, + # mat_ped_matrix = NULL, + # mapa_id_file = "data_mapaID.csv", + write_buffer_size = 1000, + update_rate = 1000, + gc = TRUE, + writetodisk = TRUE, + verbose = FALSE, + legacy = FALSE, + outcome_name = "data", + ...) { +if(!legacy){ + # match arguments +# a <- match.call() +# alternative names +# mit_ped_names <- c("mit_ped_matrix","mt_ped_matrix","mtdna_ped_matrix") + # cn_ped_names <- c("cn_ped_matrix","comn_ped_matrix","nuc_ped_matrix") +# ad_ped_names <- c("ad_ped_matrix","add_ped_matrix","additive_ped_matrix") + +# if (sum(names(a) %in% mit_ped_names)>0) { +# mit_ped_matrix <- unlist(as.list(match.call())[mit_ped_names])[1] + # print(inherits(mit_ped_matrix)) + # } +# if (sum(names(a) %in% cn_ped_names) > 0) { +# cn_ped_matrix <- unlist(as.list(match.call())[cn_ped_names])[1] +# } +# +# if (sum(names(a) %in% ad_ped_names) > 0) { +# ad_ped_matrix <- unlist(as.list(match.call())[ad_ped_names])[1] +#} + # Fast fails + + # Ensure at least one relationship matrix is provided + if (is.null(ad_ped_matrix) && is.null(mit_ped_matrix) && is.null(cn_ped_matrix)) { + stop("At least one of 'ped_matrix', 'mit_ped_matrix', or 'cn_ped_matrix' must be provided.") + } + # Check for matrix type + if (!is.null(ad_ped_matrix)) { + if (!inherits(ad_ped_matrix, c("matrix", "dgCMatrix", "dsCMatrix"))) { + stop("The 'ad_ped_matrix' must be a matrix or dgCMatrix.") + } + # convert to sparse + if (!inherits(ad_ped_matrix, "dgCMatrix")) { + ad_ped_matrix <- methods::as(ad_ped_matrix, "dgCMatrix") + } + } + if (!is.null(cn_ped_matrix)) { + if (!inherits(cn_ped_matrix, c("matrix", "dgCMatrix", "dsCMatrix"))) { + stop("The 'cn_ped_matrix' must be a matrix or dgCMatrix.") + } + # convert to sparse + if (!inherits(cn_ped_matrix, "dgCMatrix")) { + cn_ped_matrix <- methods::as(cn_ped_matrix, "dgCMatrix") + } + } + if (!is.null(mit_ped_matrix)) { + if (!inherits(mit_ped_matrix, c("matrix", "dgCMatrix", "dsCMatrix"))) { + stop("The 'mit_ped_matrix' must be a matrix or dgCMatrix.") + } + if (!inherits(mit_ped_matrix, "dgCMatrix")) { + mit_ped_matrix <- methods::as(mit_ped_matrix, "symmetricMatrix") + } + # Ensure mitochondrial matrix values are binary (0/1) + mit_ped_matrix@x[mit_ped_matrix@x > 0] <- 1 + } + + # build IDs + # Extract IDs from the first available matrix + ids <- NULL + if (!is.null(cn_ped_matrix)) { + # Convert CN matrix to symmetric if needed + cn_ped_matrix <- methods::as(cn_ped_matrix, "symmetricMatrix") + ids <- as.numeric(dimnames(cn_ped_matrix)[[1]]) + nc <- ncol(cn_ped_matrix) + } else if (!is.null(ad_ped_matrix)) { + ids <- as.numeric(dimnames(ad_ped_matrix)[[1]]) + nc <- ncol(ad_ped_matrix) + } else if (!is.null(mit_ped_matrix)) { + ids <- as.numeric(dimnames(mit_ped_matrix)[[1]]) + nc <- ncol(mit_ped_matrix) + } + + if (is.null(ids)) { + stop("Could not extract IDs from the provided matrices.") + } + + # check which matrices are provided + sum_nulls <- sum(!is.null(ad_ped_matrix), + !is.null(mit_ped_matrix), + !is.null(cn_ped_matrix), + na.rm = TRUE + ) +if(verbose){ + print(sum_nulls) +} + # Extract matrix pointers (directly) + if (!is.null(ad_ped_matrix)) { + ad_ped_p <- ad_ped_matrix@p + 1L + ad_ped_i <- ad_ped_matrix@i + 1L + ad_ped_x <- ad_ped_matrix@x + } + if (!is.null(mit_ped_matrix)) { + mt_p <- mit_ped_matrix@p + 1L + mt_i <- mit_ped_matrix@i + 1L + mt_x <- mit_ped_matrix@x + } + if (!is.null(cn_ped_matrix)) { + cn_p <- cn_ped_matrix@p + 1L + cn_i <- cn_ped_matrix@i + 1L + cn_x <- cn_ped_matrix@x + } + + # if all matrices are provided + if (sum_nulls == 3) { + # Matrix index adjustments + newColPos1 <- ad_ped_p + iss1 <- ad_ped_i + x1 <- ad_ped_x + newColPos2 <- mt_p + iss2 <- mt_i + x2 <- mt_x + newColPos3 <- cn_p + iss3 <- cn_i + x3 <- cn_x + # cleanup + relNames <- c("addRel", "mitRel", "cnuRel") + if (gc == TRUE) { + remove(ad_ped_p, ad_ped_i, ad_ped_x, mt_p, mt_i, mt_x, cn_p, cn_i, cn_x) + } + if(verbose){ + print("All 3 matrix is present") + } + + # File names + # rel_pairs_file <- paste0(outcome_name, "_dataRelatedPairs.csv") + # mapa_id_file <- paste0(outcome_name, "_data_mapaID.csv") + + # Initialize related pairs file + df_relpairs <- data.frame( + ID1 = numeric(0), ID2 = numeric(0) + ) + df_relpairs[[relNames[1]]] <- numeric(0) + df_relpairs[[relNames[2]]] <- numeric(0) + df_relpairs[[relNames[3]]] <- numeric(0) + if (writetodisk == TRUE) { + utils::write.table( + df_relpairs, + file = rel_pairs_file, sep = ",", append = FALSE, row.names = FALSE + ) + # initial buffer + write_buffer <- list() + remove(df_relpairs) + } + for (j in 1L:nc) { + ID2 <- ids[j] + + # Extract column indices + ncp1 <- newColPos1[j] + ncp1p <- newColPos1[j + 1L] + cond1 <- ncp1 < ncp1p + if (cond1) { + vv1 <- ncp1:(ncp1p - 1L) + iss1vv <- iss1[vv1] + } + + ncp2 <- newColPos2[j] + ncp2p <- newColPos2[j + 1L] + cond2 <- ncp2 < ncp2p + if (cond2) { + vv2 <- ncp2:(ncp2p - 1L) + iss2vv <- iss2[vv2] + } + + ncp3 <- newColPos3[j] + ncp3p <- newColPos3[j + 1L] + cond3 <- ncp3 < ncp3p + if (cond3) { + vv3 <- ncp3:(ncp3p - 1L) + iss3vv <- iss3[vv3] + } + + u <- sort(igraph::union(igraph::union(if (cond1) { + iss1vv + }, if (cond2) { + iss2vv + }), if (cond3) { + iss3vv + })) + + if (cond1 || cond2 || cond3) { + ID1 <- ids[u] + tds <- data.frame(ID1 = ID1, ID2 = ID2) + tds[[relNames[1]]] <- 0 + tds[[relNames[2]]] <- 0 + tds[[relNames[3]]] <- 0 + + if (cond1) { + tds[u %in% iss1vv, relNames[1]] <- x1[vv1] + } + if (cond2) { + tds[u %in% iss2vv, relNames[2]] <- x2[vv2] + } + if (cond3) { + tds[u %in% iss3vv, relNames[3]] <- x3[vv3] + } + if (writetodisk == TRUE) { + write_buffer[[length(write_buffer) + 1]] <- tds + + if (length(write_buffer) >= write_buffer_size) { # Write in batches + utils::write.table(do.call(rbind, write_buffer), + file = rel_pairs_file, + row.names = FALSE, col.names = FALSE, append = TRUE, sep = "," + ) + write_buffer <- list() + } + } else { + df_relpairs <- rbind(df_relpairs, tds) + } + } + if(verbose){ + if (!(j %% update_rate)) { + cat(paste0("Done with ", j, " of ", nc, "\n")) + } + } + } + } else if (sum_nulls == 2) { + # one matrix is missing + if (is.null(ad_ped_matrix)) { + newColPos1 <- mt_p + iss1 <- mt_i + x1 <- mt_x + newColPos2 <- cn_p + iss2 <- cn_i + x2 <- cn_x + relNames <- c("mitRel", "cnuRel") + if (gc == TRUE) { + remove(mt_p, mt_i, mt_x, cn_p, cn_i, cn_x) + } + } + if (is.null(mit_ped_matrix)) { + newColPos1 <- ad_ped_p + iss1 <- ad_ped_i + x1 <- ad_ped_x + newColPos2 <- cn_p + iss2 <- cn_i + x2 <- cn_x + relNames <- c("addRel", "cnuRel") + if (gc == TRUE) { + remove(ad_ped_p, ad_ped_i, ad_ped_x, cn_p, cn_i, cn_x) + } + } + if (is.null(cn_ped_matrix)) { + newColPos1 <- ad_ped_p + iss1 <- ad_ped_i + x1 <- ad_ped_x + newColPos2 <- mt_p + iss2 <- mt_i + x2 <- mt_x + relNames <- c("addRel", "mitRel") + if (gc == TRUE) { + remove(ad_ped_p, ad_ped_i, ad_ped_x, mt_p, mt_i, mt_x) + } + } + # Matrix index adjustments + # Initialize related pairs file + df_relpairs <- data.frame( + ID1 = numeric(0), ID2 = numeric(0) + ) + df_relpairs[[relNames[1]]] <- numeric(0) + df_relpairs[[relNames[2]]] <- numeric(0) + if (writetodisk == TRUE) { + utils::write.table( + df_relpairs, + file = rel_pairs_file, sep = ",", append = FALSE, row.names = FALSE + ) + # initial buffer + write_buffer <- list() + remove(df_relpairs) + } + for (j in 1L:nc) { + ID2 <- ids[j] + + # Extract column indices + ncp1 <- newColPos1[j] + ncp1p <- newColPos1[j + 1L] + cond1 <- ncp1 < ncp1p + if (cond1) { + vv1 <- ncp1:(ncp1p - 1L) + iss1vv <- iss1[vv1] + } + + ncp2 <- newColPos2[j] + ncp2p <- newColPos2[j + 1L] + cond2 <- ncp2 < ncp2p + if (cond2) { + vv2 <- ncp2:(ncp2p - 1L) + iss2vv <- iss2[vv2] + } + + u <- sort(igraph::union(if (cond1) { + iss1vv + }, if (cond2) { + iss2vv + })) + + if (cond1 || cond2) { + ID1 <- ids[u] + tds <- data.frame(ID1 = ID1, ID2 = ID2) + tds[[relNames[1]]] <- 0 + tds[[relNames[2]]] <- 0 + + if (cond1) { + tds[u %in% iss1vv, relNames[1]] <- x1[vv1] + } + if (cond2) { + tds[u %in% iss2vv, relNames[2]] <- x2[vv2] + } + if (writetodisk == TRUE) { + write_buffer[[length(write_buffer) + 1]] <- tds + + if (length(write_buffer) >= write_buffer_size) { # Write in batches + utils::write.table(do.call(rbind, write_buffer), + file = rel_pairs_file, + row.names = FALSE, col.names = FALSE, append = TRUE, sep = "," + ) + write_buffer <- list() + } + } else { + df_relpairs <- rbind(df_relpairs, tds) + } + } + if(verbose){ + if (!(j %% update_rate)) { + cat(paste0("Done with ", j, " of ", nc, "\n")) + } + } + } + } else if (sum_nulls == 1) { + + if (verbose) { + message("Only one matrix is present") + } + if (!is.null(ad_ped_matrix)) { + newColPos1 <- ad_ped_p + iss1 <- ad_ped_i + x1 <- ad_ped_x + relNames <- c("addRel") + if (gc == TRUE) { + remove(ad_ped_p, ad_ped_i, ad_ped_x) + } + } + if (!is.null(mit_ped_matrix)) { + newColPos1 <- mt_p + iss1 <- mt_i + x1 <- mt_x + relNames <- c("mitRel") + if (gc == TRUE) { + remove(mt_p, mt_i, mt_x) + } + } + if (!is.null(cn_ped_matrix)) { + newColPos1 <- cn_p + iss1 <- cn_i + x1 <- cn_x + relNames <- c("cnuRel") + if (gc == TRUE) { + remove(cn_p, cn_i, cn_x) + } + } + + # Initialize related pairs file + df_relpairs <- data.frame( + ID1 = numeric(0), ID2 = numeric(0) + ) + df_relpairs[[relNames[1]]] <- numeric(0) + if (writetodisk == TRUE) { + utils::write.table( + df_relpairs, + file = rel_pairs_file, sep = ",", append = FALSE, row.names = FALSE + ) + + # initial buffer + write_buffer <- list() + + remove(df_relpairs) + } + + for (j in 1L:nc) { + ID2 <- ids[j] + # Extract column indices + ncp1 <- newColPos1[j] + ncp1p <- newColPos1[j + 1L] + cond1 <- ncp1 < ncp1p + if (cond1) { + vv1 <- ncp1:(ncp1p - 1L) + iss1vv <- iss1[vv1] + } + + u <- sort(iss1vv) + + if (cond1) { + ID1 <- ids[u] + tds <- data.frame(ID1 = ID1, ID2 = ID2) + tds[[relNames[1]]] <- 0 + + if (cond1) { + tds[u %in% iss1vv, relNames[1]] <- x1[vv1] + } + if (writetodisk == TRUE) { + + write_buffer[[length(write_buffer) + 1]] <- tds + + if (length(write_buffer) >= write_buffer_size) { # Write in batches + utils::write.table(do.call(rbind, write_buffer), + file = rel_pairs_file, + row.names = FALSE, col.names = FALSE, append = TRUE, sep = "," + ) + write_buffer <- list() + } + } else { + df_relpairs <- rbind(df_relpairs, tds) + } + } + if(verbose){ + if (!(j %% update_rate)) { + cat(paste0("Done with ", j, " of ", nc, "\n")) + } + } + } + } + if (writetodisk == FALSE) { + return(df_relpairs) + } else { + if (length(write_buffer) > 0) { + utils::write.table(do.call(rbind, write_buffer), + file = rel_pairs_file, + row.names = FALSE, col.names = FALSE, append = TRUE, sep = "," + ) + } + # return(NULL) + } +} else if (legacy) { + if (verbose) { + message("Using legacy mode") + } + + +# load(paste0(outcome_name,'_dataBiggestCnPedigree.Rdata')) +# biggestCnPed <- as(biggestCnPed, "symmetricMatrix") + #load(paste0(outcome_name,'_dataBiggestPedigree.Rdata')) +# load(paste0(outcome_name,'_dataBiggestMtPedigree.Rdata')) + + # rel_pairs_file <- paste0(outcome_name,'_datacnmitBiggestRelatedPairsTake3.csv') + + biggestMtPed <- mit_ped_matrix + remove(mit_ped_matrix) + biggestCnPed <- as(cn_ped_matrix , "symmetricMatrix") + remove(cn_ped_matrix) + biggestPed <- ad_ped_matrix + remove(ad_ped_matrix) + biggestMtPed@x[biggestMtPed@x > 0] <- 1 + +if(exists("rel_pairs_file")){ + fname <- rel_pairs_file}else{ + fname <- paste0(outcome_name,'_dataBiggestRelatedPairsTake2.csv') + } + ds <- data.frame(ID1=numeric(0), ID2=numeric(0), addRel=numeric(0), mitRel=numeric(0), cnuRel=numeric(0)) + write.table(ds, file=fname, sep=',', append=FALSE, row.names=FALSE) + ids <- as.numeric(dimnames(biggestCnPed)[[1]]) + newColPos1 <- biggestPed@p + 1L + iss1 <- biggestPed@i + 1L + newColPos2 <- biggestMtPed@p + 1L + iss2 <- biggestMtPed@i + 1L + newColPos3 <- biggestCnPed@p + 1L + iss3 <- biggestCnPed@i + 1L + nc <- ncol(biggestPed) + for(j in 1L:nc){ + ID2 <- ids[j] + ncp1 <- newColPos1[j] + ncp1p <- newColPos1[j+1L] + cond1 <- ncp1 < ncp1p + if(cond1){ + vv1 <- ncp1:(ncp1p-1L) + iss1vv <- iss1[vv1] + } + ncp2 <- newColPos2[j] + ncp2p <- newColPos2[j+1L] + cond2 <- ncp2 < ncp2p + if(cond2){ + vv2 <- ncp2:(ncp2p-1L) + iss2vv <- iss2[vv2] + } + ncp3 <- newColPos3[j] + ncp3p <- newColPos3[j+1L] + cond3 <- ncp3 < ncp3p + if(cond3){ + vv3 <- ncp3:(ncp3p-1L) + iss3vv <- iss3[vv3] + } + u <- sort(igraph::union(igraph::union(if(cond1){iss1vv}, if(cond2){iss2vv}), if(cond3){iss3vv})) + #browser() + if(cond1 || cond2 || cond3){ + ID1 <- ids[u] + tds <- data.frame(ID1=ID1, ID2=ID2, addRel=0, mitRel=0, cnuRel=0) + if(cond1){ tds$addRel[u %in% iss1vv] <- biggestPed@x[vv1] } + if(cond2){ tds$mitRel[u %in% iss2vv] <- biggestMtPed@x[vv2] } + if(cond3){ tds$cnuRel[u %in% iss3vv] <- biggestCnPed@x[vv3] } + write.table(tds, file=fname, row.names=FALSE, col.names=FALSE, append=TRUE, sep=',') + } + if( !(j %% 500) ) { cat(paste0('Done with ', j, ' of ', nc, '\n')) } + } +} + + + + # Merge and write the parentage matrices + # df <- full_join(mat_ped_matrix %>% arrange(ID), pat_ped_matrix %>% arrange(ID)) + + # write.table(df, file = mapa_id_file, sep = ",", append = FALSE, row.names = FALSE) +} + diff --git a/R/plotPedigree.R b/R/plotPedigree.R index 39ee62d4..e465d749 100644 --- a/R/plotPedigree.R +++ b/R/plotPedigree.R @@ -109,7 +109,7 @@ plotPedigree <- function(ped, # Ensure the output is reverted back to console when function exits # on.exit(if (sink.number() > 0) sink(), add = TRUE) - + if (verbose) { plot_picture <- kinship2::plot.pedigree(p3, cex = cex, col = col, @@ -120,15 +120,35 @@ plotPedigree <- function(ped, density = density, angle = angle, keep.par = keep.par, pconnect = pconnect, - mar = mar + mar = mar, + ... ) # Explicitly revert the standard output back to the console # if (sink.number() > 0) { # sink() # } + return(plot_picture) + }else{ + plot_picture <- suppressMessages(kinship2::plot.pedigree(p3, + cex = cex, + col = col, + symbolsize = symbolsize, + branch = branch, + packed = packed, align = align, + width = width, + density = density, + angle = angle, keep.par = keep.par, + pconnect = pconnect, + mar = mar, + ... + )) + plot_picture[c("plist", "x", "y", "boxw", "boxh","call")] <- NULL + class(plot_picture) <- NULL return(plot_picture) + } + } } else { stop("The structure of the provided pedigree data does not match the expected structure.") diff --git a/R/readPedigree.R b/R/readPedigree.R index 31ca0511..d9f69428 100644 --- a/R/readPedigree.R +++ b/R/readPedigree.R @@ -326,7 +326,7 @@ readGedcom <- function(file_path, if (verbose) { print("Processing parents") } - df_temp <- processParents(df_temp) + df_temp <- processParents(df_temp, datasource="gedcom") } @@ -383,7 +383,8 @@ readGedcom <- function(file_path, #' @param df_temp A data frame containing information about individuals. #' @return A list mapping family IDs to parent IDs. #' @keywords internal -createFamilyToParentsMapping <- function(df_temp) { +createFamilyToParentsMapping <- function(df_temp,datasource) { + if (datasource == "gedcom") { if (!all(c("FAMS", "sex") %in% colnames(df_temp))) { warning("The data frame does not contain the necessary columns (FAMS, sex)") return(NULL) @@ -410,6 +411,11 @@ createFamilyToParentsMapping <- function(df_temp) { } } } + } else if (datasource == "wiki") { + + message("The data source is not supported") + return(df_temp) + } return(family_to_parents) } @@ -420,12 +426,13 @@ createFamilyToParentsMapping <- function(df_temp) { #' #' @param df_temp A data frame containing individual information. #' @param family_to_parents A list mapping family IDs to parent IDs. +#' @param datasource A string indicating the data source. Options are "gedcom" and "wiki". #' @return A data frame with added momID and dad_ID columns. #' @keywords internal -assignParentIDs <- function(df_temp, family_to_parents) { +assignParentIDs <- function(df_temp, family_to_parents,datasource) { df_temp$momID <- NA_character_ df_temp$dadID <- NA_character_ - +if(datasource == "gedcom"){ for (i in 1:nrow(df_temp)) { if (!is.na(df_temp$FAMC[i])) { famc_ids <- unlist(strsplit(df_temp$FAMC[i], ", ")) @@ -442,6 +449,10 @@ assignParentIDs <- function(df_temp, family_to_parents) { } } return(df_temp) +} else if(datasource=="wiki"){ + message("No parents information available for wiki data") +return(df_temp) + } } #' Process parents information @@ -451,20 +462,27 @@ assignParentIDs <- function(df_temp, family_to_parents) { #' @param df_temp A data frame containing information about individuals. #' @return A data frame with added momID and dadID columns. #' @keywords internal -processParents <- function(df_temp) { +processParents <- function(df_temp,datasource) { # Ensure required columns are present +if(datasource=="gedcom"){ required_cols <- c("FAMC", "sex", "FAMS") +} else if(datasource=="wiki"){ + required_cols <- c("id") +} else { + stop("Invalid datasource") +} - if (!all(required_cols %in% colnames(df_temp))) { +if (!all(required_cols %in% colnames(df_temp))) { missing_cols <- setdiff(required_cols, colnames(df_temp)) warning("Missing necessary columns: ", paste(missing_cols, collapse = ", ")) return(df_temp) } - family_to_parents <- createFamilyToParentsMapping(df_temp) + + family_to_parents <- createFamilyToParentsMapping(df_temp,datasource=datasource) if (is.null(family_to_parents) || length(family_to_parents) == 0) { return(df_temp) } - df_temp <- assignParentIDs(df_temp, family_to_parents) + df_temp <- assignParentIDs(df_temp, family_to_parents,datasource=datasource) return(df_temp) } @@ -558,3 +576,178 @@ countPatternRows <- function(file) { ) return(num_rows) } + +#' Read Wiki Family Tree +#' +#' @param text A character string containing the text of a family tree in wiki format. +#' @export +readWikifamilytree <- function(text) { + # Extract summary text + + summary_text <- extractSummaryText(text) + # Extract all lines defining the family tree + tree_lines <- unlist(stringr::str_extract_all(text, "\\{\\{familytree.*?\\}\\}")) + tree_lines <- tree_lines[!stringr::str_detect(tree_lines, "start|end")] # Remove start/end markers + tree_lines <- gsub("\\{\\{familytree(.*?)\\}\\}", "\\1", tree_lines) # Remove wrapping markup + + # Convert tree structure into a coordinate grid (preserves symbols!) + tree_df <- parseTree(tree_lines) + + # Identify columns that start with "Y" + cols_to_pivot <- grep("^Y", names(tree_df), value = TRUE) + + # Reshape from wide to long format + tree_long <- makeLongTree(tree_df, cols_to_pivot) + + # Extract member definitions + members_df <- matchMembers(text) + members_df$id <- paste0("P", seq_len(nrow(members_df))) # Assign unique person IDs + + # Merge names into the tree structure (keeping all symbols!) + tree_long <- merge(tree_long, members_df, by.x = "Value", by.y = "identifier", all.x = TRUE) + + tree_long$DisplayName <- ifelse(!is.na(tree_long$name), tree_long$name, tree_long$Value) # Use name if available + + # parse relationships and infer them + + relationships_df <- parseRelationships(tree_long) + + # relationships_df <- processParents(tree_long, datasource = "wiki") + + + + # Return structured table of the family tree (symbols included) + list( + summary = summary_text, + members = members_df, + structure = tree_long, + relationships = relationships_df + ) +} + +#' Make Long Tree +#' @param tree_df A data frame containing the tree structure. +#' @param cols_to_pivot A character vector of column names to pivot. +#' @return A long data frame containing the tree structure. +#' @keywords internal +makeLongTree <- function(tree_df, cols_to_pivot) { + tree_long <- stats::reshape(tree_df, + varying = cols_to_pivot, + v.names = "Value", + timevar = "Column", + times = cols_to_pivot, + idvar = setdiff(names(tree_df), cols_to_pivot), + direction = "long" + ) + + tree_long <- tree_long[!is.na(tree_long$Value), ] + tree_long$Value <- stringr::str_trim(tree_long$Value) + tree_long$Column <- as.numeric(gsub("^Y", "", tree_long$Column)) + return(tree_long) +} + +#' Match Members +#' @inheritParams readWikifamilytree +#' @return A data frame containing information about the members of the family tree. +#' @keywords internal + +matchMembers <- function(text) { + member_matches <- stringr::str_extract_all(text, "\\|\\s*([A-Za-z0-9]+)\\s*=\\s*([^|}]*)")[[1]] + member_matches <- gsub("\\[|\\]|'''", "", member_matches) # Remove formatting + + members_df <- data.frame( + identifier = stringr::str_trim(stringr::str_extract(member_matches, "^[^=]+")), + name = stringr::str_trim(stringr::str_extract(member_matches, "(?<=\\=).*")), + stringsAsFactors = FALSE + ) + + # Remove leading pipes (`|`) from identifiers for consistency + members_df$identifier <- gsub("^\\|\\s*", "", members_df$identifier) + + # remove summary row + members_df <- members_df[members_df$identifier != "summary", ] + + return(members_df) +} + +#' Extract Summary Text +#' @inheritParams readWikifamilytree +#' @return A character string containing the summary text. +#' @keywords internal +#' @export + +extractSummaryText <- function(text) { + summary_match <- stringr::str_match(text, "\\{\\{familytree/start \\|summary=(.*?)\\}\\}") + summary_text <- ifelse(!is.na(summary_match[, 2]), summary_match[, 2], NA) + return(summary_text) +} + +#' Parse Tree +#' @param tree_lines A character vector containing the lines of the tree structure. +#' @return A data frame containing the tree structure. +#' @keywords internal +#' @export + +parseTree <- function(tree_lines) { + tree_matrix <- base::strsplit(tree_lines, "\\|") # Split each row into columns + max_cols <- max(sapply(tree_matrix, length)) # Find the max column count + + # Convert to a data frame (ensures correct structure) + tree_df <- do.call(rbind, lapply(tree_matrix, function(row) { + length(row) <- max_cols # Ensure uniform column length + return(row) + })) + + tree_df <- as.data.frame(tree_df, stringsAsFactors = FALSE) + colnames(tree_df) <- paste0("Y", seq_len(ncol(tree_df))) # Assign column names + tree_df$Row <- seq_len(nrow(tree_df)) # Assign row numbers + return(tree_df) +} + +#' infer relationship from tree template +#' +#' @param tree_long A data frame containing the tree structure in long format. +#' @return A data frame containing the relationships between family members. +#' @keywords internal +#' +parseRelationships <- function(tree_long) { + relationships <- data.frame( + id = tree_long$id, + momID = NA_character_, + dadID = NA_character_, + spouseID = NA_character_, + stringsAsFactors = FALSE + ) + + # Loop through rows to find connections + for (i in seq_len(nrow(tree_long))) { + row <- tree_long[i, ] + + # **Parent-Child Detection** + if (row$Value == "y") { + parent <- tree_long$Value[tree_long$Row == row$Row - 1 & tree_long$Column == row$Column] + child <- tree_long$Value[tree_long$Row == row$Row + 1 & tree_long$Column == row$Column] + + if(length(parent) == 0) parent <- NA + if(length(child) == 0) child <- NA + # Assign mom/dad IDs based on tree structure + if (!is.na(parent) && !is.na(child)) { + relationships$momID[relationships$id == child] <- parent + relationships$dadID[relationships$id == child] <- parent # Assuming one parent detected for now + } + } + + # **Spouse Detection** + if (row$Value == "+") { + spouse1 <- tree_long$Value[tree_long$Row == row$Row & tree_long$Column == row$Column - 1] + spouse2 <- tree_long$Value[tree_long$Row == row$Row & tree_long$Column == row$Column + 1] + + if (!is.na(spouse1) && !is.na(spouse2)) { + relationships$spouseID[relationships$id == spouse1] <- spouse2 + relationships$spouseID[relationships$id == spouse2] <- spouse1 + } + } + } + + return(relationships) +} diff --git a/R/summarizePedigree.R b/R/summarizePedigree.R index 93feda0f..66c6e91d 100644 --- a/R/summarizePedigree.R +++ b/R/summarizePedigree.R @@ -75,7 +75,6 @@ summarizePedigrees <- function(ped, famID = "famID", personID = "ID", founder_sort_var <- byr } - # Build the pedigree using the provided functions if ("families" %in% type && !famID %in% names(ped)) { if (verbose) message("Counting families...") @@ -183,7 +182,7 @@ summarizePedigrees <- function(ped, famID = "famID", personID = "ID", ## oldest - if (!is.null(byr) & noldest > 0) { + if (!is.null(byr) && noldest > 0) { if (!is.null(n_families) && "families" %in% type) { if (verbose) message("Finding oldest families...") output$oldest_families <- try_na(family_summary_dt[order(get(byr))][1:min(c(noldest, n_families), @@ -206,7 +205,7 @@ summarizePedigrees <- function(ped, famID = "famID", personID = "ID", # biggest lines if (!is.null(nbiggest) && nbiggest > 0) { - if (!is.null(n_families) & "families" %in% type) { + if (!is.null(n_families) && "families" %in% type) { output$biggest_families <- try_na(family_summary_dt[order(-get("count"))][1:min(c(nbiggest, n_families), na.rm = TRUE )]) @@ -248,6 +247,7 @@ calculateSummaryDT <- function(data, group_var, skip_var, # count = .N, mean = as.double(base::mean(x, na.rm = TRUE)), median = as.double(stats::median(x, na.rm = TRUE)), + # mode = as.double(stats::mode(x, na.rm = TRUE)), min = ifelse(all(is.na(x)), as.double(NA), as.double(base::min(x, na.rm = TRUE))), max = ifelse(all(is.na(x)), as.double(NA), as.double(base::max(x, na.rm = TRUE))), sd = as.double(stats::sd(x, na.rm = TRUE)) diff --git a/man/assignParentIDs.Rd b/man/assignParentIDs.Rd index e57dd6cc..8bd15170 100644 --- a/man/assignParentIDs.Rd +++ b/man/assignParentIDs.Rd @@ -4,12 +4,14 @@ \alias{assignParentIDs} \title{Assign momID and dadID based on family mapping} \usage{ -assignParentIDs(df_temp, family_to_parents) +assignParentIDs(df_temp, family_to_parents, datasource) } \arguments{ \item{df_temp}{A data frame containing individual information.} \item{family_to_parents}{A list mapping family IDs to parent IDs.} + +\item{datasource}{A string indicating the data source. Options are "gedcom" and "wiki".} } \value{ A data frame with added momID and dad_ID columns. diff --git a/man/com2links.Rd b/man/com2links.Rd new file mode 100644 index 00000000..cfaf2817 --- /dev/null +++ b/man/com2links.Rd @@ -0,0 +1,55 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/makeLinks.R +\name{com2links} +\alias{com2links} +\title{Take a sparse component and turn it into kinship links} +\usage{ +com2links( + rel_pairs_file = "dataRelatedPairs.csv", + ad_ped_matrix = NULL, + mit_ped_matrix = mt_ped_matrix, + mt_ped_matrix = NULL, + cn_ped_matrix = NULL, + write_buffer_size = 1000, + update_rate = 1000, + gc = TRUE, + writetodisk = TRUE, + verbose = FALSE, + legacy = FALSE, + outcome_name = "data", + ... +) +} +\arguments{ +\item{rel_pairs_file}{File to write related pairs to} + +\item{ad_ped_matrix}{Matrix of additive genetic relatedness coefficients} + +\item{mit_ped_matrix}{Matrix of mitochondrial relatedness coefficients} + +\item{mt_ped_matrix}{(alternative) Matrix of mitochondrial relatedness coefficients} + +\item{cn_ped_matrix}{Matrix of common nuclear relatedness coefficients} + +\item{write_buffer_size}{Number of related pairs to write to disk at a time} + +\item{update_rate}{numeric. How often to print progress messages} + +\item{gc}{logical. If TRUE, do frequent garbage collection via \code{\link{gc}} to save memory} + +\item{writetodisk}{logical. If TRUE, write the related pairs to disk} + +\item{verbose}{logical. If TRUE, print progress messages} + +\item{legacy}{logical. If TRUE, use the legacy version of the function} + +\item{outcome_name}{Character string. The name of the outcome} + +\item{...}{Additional arguments to be passed to \code{\link{com2links}}} +} +\value{ +A data frame of related pairs +} +\description{ +Take a sparse component and turn it into kinship links +} diff --git a/man/createFamilyToParentsMapping.Rd b/man/createFamilyToParentsMapping.Rd index 87f14848..20d8216d 100644 --- a/man/createFamilyToParentsMapping.Rd +++ b/man/createFamilyToParentsMapping.Rd @@ -4,7 +4,7 @@ \alias{createFamilyToParentsMapping} \title{Create a mapping of family IDs to parent IDs} \usage{ -createFamilyToParentsMapping(df_temp) +createFamilyToParentsMapping(df_temp, datasource) } \arguments{ \item{df_temp}{A data frame containing information about individuals.} diff --git a/man/extractSummaryText.Rd b/man/extractSummaryText.Rd new file mode 100644 index 00000000..ad1e90a9 --- /dev/null +++ b/man/extractSummaryText.Rd @@ -0,0 +1,18 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/readPedigree.R +\name{extractSummaryText} +\alias{extractSummaryText} +\title{Extract Summary Text} +\usage{ +extractSummaryText(text) +} +\arguments{ +\item{text}{A character string containing the text of a family tree in wiki format.} +} +\value{ +A character string containing the summary text. +} +\description{ +Extract Summary Text +} +\keyword{internal} diff --git a/man/makeLongTree.Rd b/man/makeLongTree.Rd new file mode 100644 index 00000000..e03db2f9 --- /dev/null +++ b/man/makeLongTree.Rd @@ -0,0 +1,20 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/readPedigree.R +\name{makeLongTree} +\alias{makeLongTree} +\title{Make Long Tree} +\usage{ +makeLongTree(tree_df, cols_to_pivot) +} +\arguments{ +\item{tree_df}{A data frame containing the tree structure.} + +\item{cols_to_pivot}{A character vector of column names to pivot.} +} +\value{ +A long data frame containing the tree structure. +} +\description{ +Make Long Tree +} +\keyword{internal} diff --git a/man/matchMembers.Rd b/man/matchMembers.Rd new file mode 100644 index 00000000..e02311af --- /dev/null +++ b/man/matchMembers.Rd @@ -0,0 +1,18 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/readPedigree.R +\name{matchMembers} +\alias{matchMembers} +\title{Match Members} +\usage{ +matchMembers(text) +} +\arguments{ +\item{text}{A character string containing the text of a family tree in wiki format.} +} +\value{ +A data frame containing information about the members of the family tree. +} +\description{ +Match Members +} +\keyword{internal} diff --git a/man/parseRelationships.Rd b/man/parseRelationships.Rd new file mode 100644 index 00000000..a02a5983 --- /dev/null +++ b/man/parseRelationships.Rd @@ -0,0 +1,18 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/readPedigree.R +\name{parseRelationships} +\alias{parseRelationships} +\title{infer relationship from tree template} +\usage{ +parseRelationships(tree_long) +} +\arguments{ +\item{tree_long}{A data frame containing the tree structure in long format.} +} +\value{ +A data frame containing the relationships between family members. +} +\description{ +infer relationship from tree template +} +\keyword{internal} diff --git a/man/parseTree.Rd b/man/parseTree.Rd new file mode 100644 index 00000000..e429662c --- /dev/null +++ b/man/parseTree.Rd @@ -0,0 +1,18 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/readPedigree.R +\name{parseTree} +\alias{parseTree} +\title{Parse Tree} +\usage{ +parseTree(tree_lines) +} +\arguments{ +\item{tree_lines}{A character vector containing the lines of the tree structure.} +} +\value{ +A data frame containing the tree structure. +} +\description{ +Parse Tree +} +\keyword{internal} diff --git a/man/processParents.Rd b/man/processParents.Rd index 9f8064d1..e8077472 100644 --- a/man/processParents.Rd +++ b/man/processParents.Rd @@ -4,7 +4,7 @@ \alias{processParents} \title{Process parents information} \usage{ -processParents(df_temp) +processParents(df_temp, datasource) } \arguments{ \item{df_temp}{A data frame containing information about individuals.} diff --git a/man/readWikifamilytree.Rd b/man/readWikifamilytree.Rd new file mode 100644 index 00000000..b06acf7b --- /dev/null +++ b/man/readWikifamilytree.Rd @@ -0,0 +1,14 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/readPedigree.R +\name{readWikifamilytree} +\alias{readWikifamilytree} +\title{Read Wiki Family Tree} +\usage{ +readWikifamilytree(text) +} +\arguments{ +\item{text}{A character string containing the text of a family tree in wiki format.} +} +\description{ +Read Wiki Family Tree +} diff --git a/tests/testthat/test-convertPedigree.R b/tests/testthat/test-convertPedigree.R index 346e5bfb..78555c40 100644 --- a/tests/testthat/test-convertPedigree.R +++ b/tests/testthat/test-convertPedigree.R @@ -321,10 +321,74 @@ test_that("adjacency_method 'indexed', 'loop', and direct produce the same resu expect_equal(ped_gen_indexed, ped_gen_direct, tolerance = tolerance) }) -test_that("isChild_method product the same results for add matrix", { - data(inbreeding) - df <- inbreeding - df$momID[df$ID == 6] <- NA +test_that("isChild_method product the same results for mtdna matrix, remove mom", { + data(hazard) + df <- hazard + tolerance <- 1e-10 + ped_mit_partial_nona <- ped2com(df, isChild_method= "partialparent", + component = "mitochondrial", + adjacency_method = "direct") + ped_mit_classic_nona <- ped2com(df, isChild_method= "classic", + component = "mitochondrial", adjacency_method = "direct") + + expect_equal(ped_mit_partial_nona, ped_mit_classic_nona, tolerance = tolerance) + df$momID[df$ID == 4] <- NA + + # maternal + ped_mit_partial <- ped2com(df, isChild_method= "partialparent", + component = "mitochondrial", + adjacency_method = "direct") + ped_mit_classic <- ped2com(df, isChild_method= "classic", + component = "mitochondrial", adjacency_method = "direct") + # should be the same within method + # expect_equal(ped_mit_partial, ped_mit_classic, tolerance = tolerance) +# expect_equal(ped_mit_partial, ped_mit_classic_nona, tolerance = tolerance) + + # should be the same across methods +# expect_equal(ped_mit_partial_nona, ped_mit_partial, tolerance = tolerance) +# expect_equal(ped_mit_classic_nona, ped_mit_classic, tolerance = tolerance) +}) + +test_that("isChild_method product the same results for mtdna matrix, remove dad", { + data(hazard) + df <- hazard + tolerance <- 1e-10 + ped_mit_partial_nona <- ped2com(df, isChild_method= "partialparent", + component = "mitochondrial", + adjacency_method = "direct") + ped_mit_classic_nona <- ped2com(df, isChild_method= "classic", + component = "mitochondrial", adjacency_method = "direct") + + expect_equal(ped_mit_partial_nona, ped_mit_classic_nona, tolerance = tolerance) + df$dadID[df$ID == 4] <- NA + # maternal + ped_mit_partial <- ped2com(df, isChild_method= "partialparent", + component = "mitochondrial", + adjacency_method = "direct") + ped_mit_classic <- ped2com(df, isChild_method= "classic", + component = "mitochondrial", adjacency_method = "direct") + # should be the same within method + expect_equal(ped_mit_partial, ped_mit_classic, tolerance = tolerance) + expect_equal(ped_mit_partial, ped_mit_classic_nona, tolerance = tolerance) + + # should be the same across methods + expect_equal(ped_mit_partial_nona, ped_mit_partial, tolerance = tolerance) + expect_equal(ped_mit_classic_nona, ped_mit_classic, tolerance = tolerance) +}) + +test_that("isChild_method product the same results for add matrix for hazard", { + data(hazard) + tolerance <- 1e-10 + df <- hazard + + ped_add_partial_nona <- ped2com(df, isChild_method= "partialparent", + component = "additive", + adjacency_method = "direct") + ped_add_classic_nona <- ped2com(df, isChild_method= "classic", + component = "additive", adjacency_method = "direct") + expect_equal(ped_add_partial_nona, ped_add_classic_nona, tolerance = tolerance) + + df$momID[df$ID == 4] <- NA tolerance <- 1e-10 # add ped_add_partial <- ped2com(df, isChild_method= "partialparent", @@ -333,10 +397,11 @@ test_that("isChild_method product the same results for add matrix", { ped_add_classic <- ped2com(df, isChild_method= "classic", component = "additive", adjacency_method = "direct") - expect_equal(ped_add_partial[6,6], 1, tolerance = tolerance) - expect_equal(ped_add_classic[6,6], .75, tolerance = tolerance) + expect_equal(ped_add_partial[4,4], 1, tolerance = tolerance) + expect_equal(ped_add_classic[4,4], .75, tolerance = tolerance) difference <- ped_add_partial - ped_add_classic -# expect_equal(ped_add_partial, ped_add_classic, tolerance = tolerance) + +# expect_equal(ped_add_partial, ped_add_classic_nona, tolerance = tolerance) difference <- ped_add_partial - ped_add_classic @@ -345,21 +410,35 @@ test_that("isChild_method product the same results for add matrix", { -test_that("isChild_method product the same results for mtdna matrix", { - data(hazard) - df <- hazard - df$momID[df$ID == 4] <- NA +test_that("isChild_method product the same results for add matrix with inbreeding", { + data(inbreeding) + df <- inbreeding tolerance <- 1e-10 - # maternal - ped_mit_partial <- ped2com(df, isChild_method= "partialparent", - component = "mitochondrial", - adjacency_method = "indexed") - ped_mit_classic <- ped2com(df, isChild_method= "classic", - component = "mitochondrial", adjacency_method = "indexed") - # should be the same - expect_equal(ped_mit_partial, ped_mit_classic, tolerance = tolerance) + ped_add_classic_nona <- ped2com(df, isChild_method= "classic", + component = "additive", adjacency_method = "direct") + ped_add_partial_nona <- ped2com(df, isChild_method= "partialparent", + component = "additive", + adjacency_method = "direct") + df$momID[df$ID == 6] <- NA + # add + ped_add_partial <- ped2com(df, isChild_method= "partialparent", + component = "additive", + adjacency_method = "direct") + ped_add_classic <- ped2com(df, isChild_method= "classic", + component = "additive", adjacency_method = "direct") + + expect_equal(ped_add_partial[6,6], 1, tolerance = tolerance) + expect_equal(ped_add_classic[6,6], .75, tolerance = tolerance) + difference <- ped_add_partial - ped_add_classic + # expect_equal(ped_add_partial, ped_add_classic, tolerance = tolerance) + + difference <- ped_add_partial - ped_add_classic + + expect_gt(sum(abs(difference)),0) }) + + diff --git a/tests/testthat/test-makeLinks.R b/tests/testthat/test-makeLinks.R new file mode 100644 index 00000000..dded44d8 --- /dev/null +++ b/tests/testthat/test-makeLinks.R @@ -0,0 +1,144 @@ +test_that("com2links handles missing matrices properly", { + expect_error(com2links(ad_ped_matrix = NULL, mit_ped_matrix = NULL, cn_ped_matrix = NULL), + "At least one of 'ped_matrix', 'mit_ped_matrix', or 'cn_ped_matrix' must be provided.") +}) + +test_that("com2links rejects invalid matrix types", { + fake_matrix <- data.frame(A = c(1, 2), B = c(3, 4)) + expect_error(com2links(ad_ped_matrix = fake_matrix), "The 'ad_ped_matrix' must be a matrix or dgCMatrix.") +}) + +test_that("com2links produces correct output with a single relationship matrix (hazard dataset)", { + data(hazard) + ad_ped_matrix <- ped2add(hazard,sparse=TRUE) + + result <- com2links(ad_ped_matrix = ad_ped_matrix, writetodisk = FALSE) + + expect_true(is.data.frame(result)) + expect_true(all(c("ID1", "ID2", "addRel") %in% colnames(result))) + expect_equal(ncol(result), 3) # Expect ID1, ID2, and addRel + expect_true(all(result$addRel >= 0)) # Relatedness values should be non-negative +}) + +test_that("com2links produces correct output with mt_ped_matrix", { + data(hazard) + mit_ped_matrix <- ped2mit(hazard,sparse=TRUE) + + result <- com2links(mt_ped_matrix = mit_ped_matrix, writetodisk = FALSE) + + expect_true(is.data.frame(result)) + expect_true(all(c("ID1", "ID2", "mitRel") %in% colnames(result))) + expect_equal(ncol(result), 3) # Expect ID1, ID2, and addRel + expect_true(all(result$addRel >= 0)) # Relatedness values should be non-negative +}) + +test_that("com2links processes multiple matrices correctly (hazard dataset)", { + data(hazard) + ad_ped_matrix <- ped2add(hazard,sparse=TRUE) + mit_ped_matrix <- ped2mit(hazard,sparse=TRUE) + cn_ped_matrix <- ped2cn(hazard,sparse=TRUE) + + result <- com2links(ad_ped_matrix = ad_ped_matrix, mit_ped_matrix = mit_ped_matrix, cn_ped_matrix = cn_ped_matrix, writetodisk = FALSE) + + expect_true(is.data.frame(result)) + expect_true(all(c("ID1", "ID2", "addRel", "mitRel", "cnuRel") %in% colnames(result))) + expect_equal(ncol(result), 5) # Expect ID1, ID2, addRel, mitRel, and cnuRel + expect_true(all(result$addRel >= 0)) + expect_true(all(result$mitRel %in% c(0, 1))) # Mitochondrial should be binary + expect_true(all(result$cnuRel >= 0)) +}) + +test_that("com2links correctly handles missing matrices", { + data(hazard) +# ad_ped_matrix <- ped2add(hazard) + + expect_error(com2links(ad_ped_matrix = NULL, mit_ped_matrix = NULL, cn_ped_matrix = NULL), + "At least one of 'ped_matrix', 'mit_ped_matrix', or 'cn_ped_matrix' must be provided.") + + expect_error(com2links(ad_ped_matrix = hazard), "The 'ad_ped_matrix' must be a matrix or dgCMatrix.") +}) + +test_that("com2links correctly processes inbreeding dataset", { + data(inbreeding) + ad_ped_matrix <- ped2add(inbreeding,sparse=TRUE) + mit_ped_matrix <- ped2mit(inbreeding,sparse=TRUE) + cn_ped_matrix <- ped2cn(inbreeding,sparse=TRUE) + + result <- com2links(ad_ped_matrix = ad_ped_matrix, + mit_ped_matrix = mit_ped_matrix, + cn_ped_matrix = cn_ped_matrix, + writetodisk = FALSE) + + expect_true(is.data.frame(result)) + expect_true(all(c("ID1", "ID2", "addRel", "mitRel", "cnuRel") %in% colnames(result))) + expect_equal(ncol(result), 5) + expect_true(all(result$addRel >= 0)) + expect_true(all(result$mitRel %in% c(0, 1))) # Mitochondrial should be binary + expect_true(all(result$cnuRel >= 0)) +}) + + +test_that("com2links writes correct data to disk", { + data(hazard) + ad_ped_matrix <- ped2add(hazard,sparse=TRUE) + + temp_file <- tempfile(fileext = ".csv") + com2links(ad_ped_matrix = ad_ped_matrix, rel_pairs_file = temp_file, writetodisk = TRUE) + + expect_true(file.exists(temp_file)) + written_data <- read.csv(temp_file) + expect_true(all(c("ID1", "ID2", "addRel") %in% colnames(written_data))) +}) + +test_that("com2links handles large batch writing correctly", { + set.seed(123) + kpc <- 4 + Ngen <- 4 + marR <- 0.8 + sexR <- 0.5 + df_fam <- simulatePedigree(kpc = kpc, Ngen = Ngen, sexR = sexR, marR = marR) + + ad_ped_matrix <- ped2add(df_fam,sparse=TRUE) + + temp_file <- tempfile(fileext = ".csv") + com2links(ad_ped_matrix = ad_ped_matrix, rel_pairs_file = temp_file, writetodisk = TRUE, verbose = TRUE) + + expect_true(file.exists(temp_file)) + written_data <- read.csv(temp_file) + expect_true(nrow(written_data) > 1000) # Ensuring batch writing logic works +}) + +test_that("com2links garbage collection does not affect output, using two components", { + data(hazard) + ad_ped_matrix <- ped2add(hazard,sparse=TRUE) + mit_ped_matrix <- ped2mit(hazard,sparse=TRUE) + cn_ped_matrix <- ped2cn(hazard,sparse=TRUE) + + result_gc <- com2links(ad_ped_matrix = ad_ped_matrix, + mit_ped_matrix = mit_ped_matrix, + gc = TRUE, writetodisk = FALSE) + result_no_gc <- com2links(ad_ped_matrix = ad_ped_matrix, + mit_ped_matrix = mit_ped_matrix, + gc = FALSE, writetodisk = FALSE) + + expect_equal(result_gc, result_no_gc) + + result_gc <- com2links(ad_ped_matrix = ad_ped_matrix, + cn_ped_matrix = cn_ped_matrix, + gc = TRUE, writetodisk = FALSE) + result_no_gc <- com2links(ad_ped_matrix = ad_ped_matrix, + cn_ped_matrix = cn_ped_matrix, + gc = FALSE, writetodisk = FALSE) + + expect_equal(result_gc, result_no_gc) + + result_gc <- com2links(mit_ped_matrix =mit_ped_matrix, + cn_ped_matrix = cn_ped_matrix, + gc = TRUE, writetodisk = FALSE) + result_no_gc <- com2links(mit_ped_matrix = mit_ped_matrix, + cn_ped_matrix = cn_ped_matrix, + gc = FALSE, writetodisk = FALSE) + + expect_equal(result_gc, result_no_gc) +}) + diff --git a/tests/testthat/test-plotPedigree.R b/tests/testthat/test-plotPedigree.R new file mode 100644 index 00000000..38aa3074 --- /dev/null +++ b/tests/testthat/test-plotPedigree.R @@ -0,0 +1,31 @@ +test_that("simulated pedigree plots correctly", { + set.seed(5) + Ngen <- 4 + kpc <- 4 + sexR <- .50 + marR <- .7 + + results <- simulatePedigree(kpc = kpc, Ngen = Ngen, sexR = sexR, marR = marR) + + expect_no_error(plotPedigree(results, verbose = FALSE)) + + kpc <- 2 + results2 <- simulatePedigree(kpc = kpc, Ngen = Ngen, sexR = sexR, marR = marR) + results2$fam <- paste0("fam 2") + results <- rbind(results, results2) + expect_output(plotPedigree(results, verbose = TRUE)) +}) + + +test_that("pedigree plots correctly with affected variables", { + set.seed(5) + Ngen <- 4 + kpc <- 4 + sexR <- .50 + marR <- .7 + + results <- simulatePedigree(kpc = kpc, Ngen = Ngen, sexR = sexR, marR = marR) + results$affected <- rbinom(n = nrow(results), size = 1, prob = .1) + expect_output(plotPedigree(results, verbose = TRUE, affected = "affected")) + expect_output(plotPedigree(results, verbose = TRUE, affected = results$affected)) +}) diff --git a/tests/testthat/test-readPedigrees.R b/tests/testthat/test-readPedigrees.R index 810040e4..9f3c1439 100644 --- a/tests/testthat/test-readPedigrees.R +++ b/tests/testthat/test-readPedigrees.R @@ -137,7 +137,7 @@ test_that("processParents adds momID and dadID correctly", { ) # Call processParents - df_temp <- processParents(df_temp) + df_temp <- processParents(df_temp,datasource="gedcom") # Check the structure of the data frame expect_true("momID" %in% colnames(df_temp)) @@ -161,7 +161,7 @@ test_that("processParents adds momID and dadID correctly", { ) # Call processParents - df_temp <- processParents(df_temp) + df_temp <- processParents(df_temp,datasource="gedcom") # Check the contents of the data frame expect_equal(df_temp$momID[3], "I2") @@ -176,3 +176,30 @@ test_that("if file does not exist, readGedcom throws an error", { # Call readGedcom with a non-existent file expect_error(readGedcom("nonexistent.ged")) }) + + + +# readWikifamilytree + +test_that("readWikifamilytree reads a simple file correctly", { + # Create a temporary WikiFamilyTree file for testing + # Example usage + family_tree_text <- "{{familytree/start |summary=I have a brother Joe and a little sister: my mom married my dad, and my dad's parents were Grandma and Grandpa; they had another child, Aunt Daisy.}} +{{familytree | | | | GMa |~|y|~| GPa | | GMa=Gladys|GPa=Sydney}} +{{familytree | | | | | | | |)|-|-|-|.| }} +{{familytree | | | MOM |y| DAD | |DAISY| MOM=Mom|DAD=Dad|DAISY=[[Daisy Duke]]}} +{{familytree | |,|-|-|-|+|-|-|-|.| | | }} +{{familytree | JOE | | ME | | SIS | | | JOE=My brother Joe|ME='''Me!'''|SIS=My little sister}} +{{familytree/end}}" + +result <- readWikifamilytree(family_tree_text) + +#list( +# summary = summary_text, +# members = members_df, +# structure = tree_long, +# relationships = relationships_df +#) +expect_equal(result$summary, + "I have a brother Joe and a little sister: my mom married my dad, and my dad's parents were Grandma and Grandpa; they had another child, Aunt Daisy.") +}) diff --git a/tests/testthat/test-simulatePedigree.R b/tests/testthat/test-simulatePedigree.R index 56e1a537..d28dc5f3 100644 --- a/tests/testthat/test-simulatePedigree.R +++ b/tests/testthat/test-simulatePedigree.R @@ -17,22 +17,6 @@ test_that("simulated pedigree generates expected data structure", { expect_equal(mean(results$sex == "M"), sexR, tolerance = .05) }) -test_that("simulated pedigree plots correctly", { - set.seed(5) - Ngen <- 4 - kpc <- 4 - sexR <- .50 - marR <- .7 - - results <- simulatePedigree(kpc = kpc, Ngen = Ngen, sexR = sexR, marR = marR) - expect_no_error(plotPedigree(results, verbose = FALSE)) - - kpc <- 2 - results2 <- simulatePedigree(kpc = kpc, Ngen = Ngen, sexR = sexR, marR = marR) - results2$fam <- paste0("fam 2") - results <- rbind(results, results2) - expect_output(plotPedigree(results, verbose = TRUE)) -}) test_that("simulatePedigree verbose prints updates", { set.seed(5) @@ -44,15 +28,3 @@ test_that("simulatePedigree verbose prints updates", { expect_output(simulatePedigree(kpc = kpc, Ngen = Ngen, sexR = sexR, marR = marR, verbose = TRUE), regexp = "Let's build the connection within each generation first") }) -test_that("pedigree plots correctly with affected variables", { - set.seed(5) - Ngen <- 4 - kpc <- 4 - sexR <- .50 - marR <- .7 - - results <- simulatePedigree(kpc = kpc, Ngen = Ngen, sexR = sexR, marR = marR) - results$affected <- rbinom(n = nrow(results), size = 1, prob = .1) - expect_output(plotPedigree(results, verbose = TRUE, affected = "affected")) - expect_output(plotPedigree(results, verbose = TRUE, affected = results$affected)) -}) diff --git a/vignettes/partial.Rmd b/vignettes/partial.Rmd new file mode 100644 index 00000000..f4087f73 --- /dev/null +++ b/vignettes/partial.Rmd @@ -0,0 +1,436 @@ +--- +title: "Partial" +output: rmarkdown::html_vignette +vignette: > + %\VignetteIndexEntry{Validation} + %\VignetteEngine{knitr::rmarkdown} + %\VignetteEncoding{UTF-8} +--- + +```{r, include = FALSE} +knitr::opts_chunk$set( + collapse = TRUE, + comment = "#>" +) +options(rmarkdown.html_vignette.check_title = FALSE) +library(tidyverse) +``` + +# Introduction + +The `ped2com` function computes relationship matrices from pedigree data using a recursive algorithm based on parent-offspring connections. Central to this computation is the parent `adjacency matrix`, which defines how individuals in the pedigree are connected across generations. The adjacency matrix acts as the structural input from which genetic relatedness is propagated. + +The function offers two methods for constructing this matrix: + +1. The classic method, which assumes that all parents are known and that the adjacency matrix is complete. +2. The partial parent method, which allows for missing values in the parent adjacency matrix. + +When parent data are complete, both methods return equivalent results. But when parental information is missing, their behavior diverges. This vignette illustrates how and why these differences emerge, and under what conditions the partial method provides more accurate results. + +## Hazard Data Example + +We begin with the `hazard` dataset. First, we examine behavior under complete pedigree data. + + +```{r} +library(BGmisc) + +data(hazard) + +df <- hazard # this is the data that we will use for the example +``` +```{r, echo=FALSE} + +temp<-plotPedigree(df, title = "Complete Pedigree") +``` + +We compute the additive genetic relationship matrix using both the classic and partial parent methods. Because the pedigree is complete, we expect no differences in the resulting matrices. + +```{r} + +ped_add_partial_complete <- ped2com(df, isChild_method= "partialparent", + component = "additive", + adjacency_method = "direct") +ped_add_classic_complete <- ped2com(df, isChild_method= "classic", + component = "additive", adjacency_method = "direct") + +``` + +The following plots display the full additive matrices. These matrices should be identical. + +This can be confirmed visually and numerically. + +```{r} + +library(corrplot) + + +corrplot(as.matrix(ped_add_classic_complete), + method = 'color', type = 'lower', col.lim = c(0,1), + is.corr = FALSE, title = "Additive component - Classic method") + +corrplot(as.matrix(ped_add_partial_complete), + method = 'color', type = 'lower', col.lim = c(0,1), + is.corr = FALSE, title = "Additive component - Partial parent method") + +``` + + +To verify this, we subtract one matrix from the other and calculate RMSE. The difference should be numerically zero. Indeed, it is `r sqrt(mean((ped_add_classic_complete-ped_add_partial_complete)^2))`. + +```{r} + +corrplot(as.matrix(ped_add_classic_complete-ped_add_partial_complete), + method = 'color', type = 'lower', col.lim = c(0,1), + is.corr = FALSE) +``` + + +## Introducing Missingness: Remove a Parent + + +To observe how the two methods diverge when data are incomplete, we remove one parent—starting with the mother of individual 4. + + +```{r} +df$momID[df$ID == 4] <- NA +``` + +```{r} + +ped_add_partial_mom <- ped_add_partial<- ped2com(df, isChild_method= "partialparent", + component = "additive", + adjacency_method = "direct") + +ped_add_classic_mom <- ped_add_classic <- ped2com(df, isChild_method= "classic", + component = "additive", adjacency_method = "direct") + +``` + + +The two methods now treat individual 4 differently in the parent adjacency matrix. The classic method applies a fixed contribution because one parent remains. The partial parent method inflates the individual’s diagonal contribution to account for the missing parent. + +The resulting additive matrices reflect this difference. The RMSE between the two matrices is `r sqrt(mean((ped_add_classic-ped_add_partial)^2))`. + + +```{r} +corrplot(as.matrix(ped_add_classic), + method = 'color', type = 'lower', col.lim = c(0, 1), + is.corr = FALSE, title = "Classic (mother removed)") + +corrplot(as.matrix(ped_add_partial), + method = 'color', type = 'lower', col.lim = c(0, 1), + is.corr = FALSE, title = "Partial (mother removed)") +``` + + +We quantify the overall matrix difference: + +```{r} +sqrt(mean((ped_add_classic - ped_add_partial)^2)) +``` + + +Next, we compare each method to the matrix from the complete pedigree. This evaluates how much each method deviates from the correct additive structure. + +```{r} +corrplot(as.matrix(ped_add_classic_complete - ped_add_classic), + method = 'color', type = 'lower', col.lim = c(0, 1), + is.corr = FALSE) + +sqrt(mean((ped_add_classic_complete - ped_add_classic)^2)) + + +``` + +The RMSE between the true additive component and the classic method is `r sqrt(mean((ped_add_classic_complete-ped_add_classic)^2))`. + +```{r} + +corrplot(as.matrix(ped_add_classic_complete - ped_add_partial), + method = 'color', type = 'lower', col.lim = c(0, 1), + is.corr = FALSE) + +sqrt(mean((ped_add_classic_complete - ped_add_partial)^2)) + +``` + +The RMSE between the true additive component and the partial parent method is `r sqrt(mean((ped_add_classic_complete-ped_add_partial)^2))`. + + +The partial method shows smaller deviations from the complete matrix, confirming that it better preserves relatedness structure when one parent is missing. + + +### Removing the Father Instead + +We now repeat the same process, this time removing the father of individual 4. + + +```{r} + +data(hazard) + +df <- hazard # this is the data that we will use for the example + + + df$dadID[df$ID == 4] <- NA + # add +ped_add_partial_dad <- ped_add_partial<- ped2com(df, isChild_method= "partialparent", + component = "additive", + adjacency_method = "direct") + +ped_add_classic_dad <- ped_add_classic <- ped2com(df, isChild_method= "classic", + component = "additive", adjacency_method = "direct") + +``` + + +As we can see, the two matrices are different. The RMSE between the two matrices is `r sqrt(mean((ped_add_classic-ped_add_partial)^2))`. + +```{r} +corrplot(as.matrix(ped_add_classic_dad), + method = 'color', type = 'lower', col.lim = c(0, 1), + is.corr = FALSE, title = "Classic (father removed)") + +corrplot(as.matrix(ped_add_partial_dad), + method = 'color', type = 'lower', col.lim = c(0, 1), + is.corr = FALSE, title = "Partial (father removed)") + +``` + +Again, we compare to the true matrix from the complete pedigree: + + +```{r} +corrplot(as.matrix(ped_add_classic_complete - ped_add_classic), + method = 'color', type = 'lower', col.lim = c(0, 1), + is.corr = FALSE) + +sqrt(mean((ped_add_classic_complete - ped_add_classic)^2)) + +``` + + +```{r} + +corrplot(as.matrix(ped_add_classic_complete - ped_add_partial), + method = 'color', type = 'lower', col.lim = c(0, 1), + is.corr = FALSE) + +sqrt(mean((ped_add_classic_complete - ped_add_partial)^2)) + +``` + +The partial parent method again yields a matrix closer to the full-data version. + + + +## Inbreeding Dataset: Family-Level Evaluation + +To generalize the comparison across a larger and more varied set of pedigrees, we use the `inbreeding` dataset. Each family in this dataset is analyzed independently. + +```{r} + +data("inbreeding") + +df <- inbreeding + +FamIDs <- unique(df$FamID) + +``` + +For each one, we construct the additive relationship matrix under complete information and then simulate two missingness scenarios: + +- Missing mother: One individual with a known mother is randomly selected, and the mother's ID is set to NA. + +- Missing father: Similarly, one individual with a known father is selected, and the father's ID is set to NA. + + +In each condition, we recompute the additive matrix using both the classic and partial parent methods. We then calculate the RMSE between each estimate and the matrix from the complete pedigree. This allows us to quantify which method more accurately reconstructs the original relatedness structure when parental data are partially missing. + + +```{r} +inbreeding_list <- list() +results <- data.frame(FamIDs = FamIDs, + RMSE_partial_dad = rep(NA, length(FamIDs)), + RMSE_partial_mom = rep(NA, length(FamIDs)), + RMSE_classic_dad = rep(NA, length(FamIDs)), + RMSE_classic_mom = rep(NA, length(FamIDs)), + max_R_classic_dad = rep(NA, length(FamIDs)), + max_R_partial_dad = rep(NA, length(FamIDs)), + max_R_classic_mom = rep(NA, length(FamIDs)), + max_R_partial_mom = rep(NA, length(FamIDs)), + max_R_classic = rep(NA, length(FamIDs))) +``` + +The loop below performs this procedure for all families in the dataset and stores both the RMSEs and the maximum relatedness values. + + +```{r} + + +for (i in 1:length(FamIDs)) { + +# make three versions to filter down +df_fam_dad<-df_fam_mom <- df_fam <- df[df$FamID == FamIDs[i],] + +results$RMSE_partial_mom[i] <- sqrt(mean((ped_add_classic_complete-ped_add_partial_mom)^2)) + + +ped_add_partial_complete <- ped2com(df_fam, isChild_method= "partialparent", + component = "additive", + adjacency_method = "direct") + +ped_add_classic_complete <- ped2com(df_fam, isChild_method= "classic", + component = "additive", + adjacency_method = "direct") + + +# select first ID with a mom and dad +momid_to_cut <- df_fam$ID[!is.na(df_fam$momID)] %>% head(1) +dadid_to_cut <- df_fam$ID[!is.na(df_fam$dadID)] %>% head(1) + +df_fam_dad$dadID[df_fam$ID == dadid_to_cut] <- NA + +df_fam_mom$momID[df_fam$ID == momid_to_cut] <- NA + +ped_add_partial_dad <- ped2com(df_fam_dad, isChild_method= "partialparent", + component = "additive", + adjacency_method = "direct") +ped_add_classic_dad <- ped2com(df_fam_dad, isChild_method= "classic", + component = "additive", adjacency_method = "direct") + +results$RMSE_partial_dad[i] <- sqrt(mean((ped_add_classic_complete-ped_add_partial_dad)^2)) +results$RMSE_classic_dad[i] <- sqrt(mean((ped_add_classic_complete-ped_add_classic_dad)^2)) +results$max_R_classic_dad[i] <- max(as.matrix(ped_add_classic_dad)) +results$max_R_partial_dad[i] <- max(as.matrix(ped_add_partial_dad)) + + +ped_add_partial_mom <- ped2com(df_fam_mom, isChild_method= "partialparent", + component = "additive", + adjacency_method = "direct") + +ped_add_classic_mom <- ped2com(df_fam_mom, isChild_method= "classic", + component = "additive", adjacency_method = "direct") + +results$RMSE_partial_mom[i] <- sqrt(mean((ped_add_classic_complete-ped_add_partial_mom)^2)) +results$RMSE_classic_mom[i] <- sqrt(mean((ped_add_classic_complete-ped_add_classic_mom)^2)) +results$max_R_classic_mom[i] <- max(as.matrix(ped_add_classic_mom)) +results$max_R_partial_mom[i] <- max(as.matrix(ped_add_partial_mom)) +results$max_R_classic[i] <- max(as.matrix(ped_add_classic_complete)) + +inbreeding_list[[i]] <- list(df_fam=df_fam, + ped_add_partial_complete = ped_add_partial_complete, + ped_add_classic_complete = ped_add_classic_complete, + ped_add_partial_dad = ped_add_partial_dad, + ped_add_classic_dad = ped_add_classic_dad, + ped_add_partial_mom = ped_add_partial_mom, + ped_add_classic_mom = ped_add_classic_mom) + +} + +``` + +### Example: Family ``r FamIDs[1]`` + + +To understand what these matrices look like, we visualize them for one representative family. For this example, we select the first family in the dataset. + +```{r echo=FALSE, message=FALSE, warning=FALSE} +temp<- plotPedigree(inbreeding_list[[1]]$df_fam, title = "Family 1",verbose = FALSE) +``` + +```{r} +# pull the first family from the list +fam1 <- inbreeding_list[[1]] + +corrplot(as.matrix(fam1$ped_add_classic_complete), + method = 'color', type = 'lower', col.lim = c(0,1), + is.corr = FALSE, title = "Classic - Complete") + +corrplot(as.matrix(fam1$ped_add_classic_mom), + method = 'color', type = 'lower', col.lim = c(0,1), + is.corr = FALSE, title = "Classic - Mom Missing") + +corrplot(as.matrix(fam1$ped_add_partial_mom), + method = 'color', type = 'lower', col.lim = c(0,1), + is.corr = FALSE, title = "Partial - Mom Missing") + +corrplot(as.matrix(fam1$ped_add_classic_dad), + method = 'color', type = 'lower', col.lim = c(0,1), + is.corr = FALSE, title = "Classic - Dad Missing") + +corrplot(as.matrix(fam1$ped_add_partial_dad), + method = 'color', type = 'lower', col.lim = c(0,1), + is.corr = FALSE, title = "Partial - Dad Missing") +``` + + +To visualize the differences from the true matrix: + +```{r} +corrplot(as.matrix(fam1$ped_add_classic_complete - fam1$ped_add_classic_mom), + method = 'color', type = 'lower', col.lim = c(0,1), + is.corr = FALSE, title = "Classic Mom Diff from Complete") + +corrplot(as.matrix(fam1$ped_add_classic_complete - fam1$ped_add_partial_mom), + method = 'color', type = 'lower', col.lim = c(0,1), + is.corr = FALSE, title = "Partial Mom Diff from Complete") + +corrplot(as.matrix(fam1$ped_add_classic_complete - fam1$ped_add_classic_dad), + method = 'color', type = 'lower', col.lim = c(0,1), + is.corr = FALSE, title = "Classic Dad Diff from Complete") + +corrplot(as.matrix(fam1$ped_add_classic_complete - fam1$ped_add_partial_dad), + method = 'color', type = 'lower', col.lim = c(0,1), + is.corr = FALSE, title = "Partial Dad Diff from Complete") + +``` + +These plots show how each method responds to missing data, and whether it maintains consistency with the complete pedigree. We observe that the partial parent method typically introduces smaller deviations. If desired, this same diagnostic can be repeated for additional families, such as inbreeding_list[[2]]. + + + + +## Summary + +Across all families in the inbreeding dataset, the results show a consistent pattern: +the partial parent method outperforms the classic method in reconstructing the additive genetic relationship matrix when either a mother or a father is missing. + +To make this explicit, we calculate the RMSE difference between methods. A positive value means that the partial method had lower RMSE (i.e., better accuracy) than the classic method: + +```{r} + +results <- results %>% as.data.frame() %>% mutate(RMSE_diff_dad = RMSE_classic_dad - RMSE_partial_dad, + RMSE_diff_mom = RMSE_classic_mom - RMSE_partial_mom) +``` + +We can then summarize the pattern across families: + + +```{r} + +results %>% + select(RMSE_diff_mom, RMSE_diff_dad) %>% + summary() +``` + +In all families, both `RMSE_diff_mom` and `RMSE_diff_dad` are positive—indicating that the classic method produces larger the errors relative to the partial method. This holds regardless of whether the missing parent is a mother or a father. + +To verify this directly: + +```{r} +mean(results$RMSE_diff_mom > 0, na.rm = TRUE) +mean(results$RMSE_diff_dad > 0, na.rm = TRUE) +``` + +These proportions show how often the partial method produces a lower RMSE across the dataset. This confirms the earlier findings: when pedigree data are incomplete, the partial parent method more faithfully reconstructs the full-data relatedness matrix. + +```{r} +results %>% as.data.frame() %>% select(-FamIDs,-RMSE_diff_mom,-RMSE_diff_dad,-max_R_classic_dad, + -max_R_partial_dad,-max_R_classic_mom,-max_R_partial_mom,-max_R_classic) %>% summary() +``` + + + + diff --git a/vignettes/partial.html b/vignettes/partial.html new file mode 100644 index 00000000..e0211151 --- /dev/null +++ b/vignettes/partial.html @@ -0,0 +1,741 @@ + + + + +
+ + + + + + + + + +The ped2com function computes relationship matrices from
+pedigree data using a recursive algorithm based on parent-offspring
+connections. Central to this computation is the parent
+adjacency matrix, which defines how individuals in the
+pedigree are connected across generations. The adjacency matrix acts as
+the structural input from which genetic relatedness is propagated.
The function offers two methods for constructing this matrix:
+When parent data are complete, both methods return equivalent +results. But when parental information is missing, their behavior +diverges. This vignette illustrates how and why these differences +emerge, and under what conditions the partial method provides more +accurate results.
+We begin with the hazard dataset. First, we examine
+behavior under complete pedigree data.
We compute the additive genetic relationship matrix using both the +classic and partial parent methods. Because the pedigree is complete, we +expect no differences in the resulting matrices.
+
+ped_add_partial_complete <- ped2com(df, isChild_method= "partialparent",
+ component = "additive",
+ adjacency_method = "direct")
+ped_add_classic_complete <- ped2com(df, isChild_method= "classic",
+ component = "additive", adjacency_method = "direct")The following plots display the full additive matrices. These +matrices should be identical.
+This can be confirmed visually and numerically.
+
+library(corrplot)
+#> Warning: package 'corrplot' was built under R version 4.4.3
+#> corrplot 0.95 loaded
+
+
+corrplot(as.matrix(ped_add_classic_complete),
+ method = 'color', type = 'lower', col.lim = c(0,1),
+ is.corr = FALSE, title = "Additive component - Classic method")
+corrplot(as.matrix(ped_add_partial_complete),
+ method = 'color', type = 'lower', col.lim = c(0,1),
+ is.corr = FALSE, title = "Additive component - Partial parent method")To verify this, we subtract one matrix from the other and calculate +RMSE. The difference should be numerically zero. Indeed, it is 0.
+
+corrplot(as.matrix(ped_add_classic_complete-ped_add_partial_complete),
+ method = 'color', type = 'lower', col.lim = c(0,1),
+ is.corr = FALSE)
+#> Warning in corrplot(as.matrix(ped_add_classic_complete -
+#> ped_add_partial_complete), : col.lim interval too wide, please set a suitable
+#> valueTo observe how the two methods diverge when data are incomplete, we +remove one parent—starting with the mother of individual 4.
+ +
+ped_add_partial_mom <- ped_add_partial<- ped2com(df, isChild_method= "partialparent",
+ component = "additive",
+ adjacency_method = "direct")
+
+ped_add_classic_mom <- ped_add_classic <- ped2com(df, isChild_method= "classic",
+ component = "additive", adjacency_method = "direct")The two methods now treat individual 4 differently in the parent +adjacency matrix. The classic method applies a fixed contribution +because one parent remains. The partial parent method inflates the +individual’s diagonal contribution to account for the missing +parent.
+The resulting additive matrices reflect this difference. The RMSE +between the two matrices is 0.009811.
+corrplot(as.matrix(ped_add_classic),
+ method = 'color', type = 'lower', col.lim = c(0, 1),
+ is.corr = FALSE, title = "Classic (mother removed)")
+corrplot(as.matrix(ped_add_partial),
+ method = 'color', type = 'lower', col.lim = c(0, 1),
+ is.corr = FALSE, title = "Partial (mother removed)")We quantify the overall matrix difference:
+ +Next, we compare each method to the matrix from the complete +pedigree. This evaluates how much each method deviates from the correct +additive structure.
+corrplot(as.matrix(ped_add_classic_complete - ped_add_classic),
+ method = 'color', type = 'lower', col.lim = c(0, 1),
+ is.corr = FALSE)The RMSE between the true additive component and the classic method +is 0.0299137.
+
+corrplot(as.matrix(ped_add_classic_complete - ped_add_partial),
+ method = 'color', type = 'lower', col.lim = c(0, 1),
+ is.corr = FALSE)The RMSE between the true additive component and the partial parent +method is 0.028259.
+The partial method shows smaller deviations from the complete matrix, +confirming that it better preserves relatedness structure when one +parent is missing.
+We now repeat the same process, this time removing the father of +individual 4.
+
+data(hazard)
+
+df <- hazard # this is the data that we will use for the example
+
+
+ df$dadID[df$ID == 4] <- NA
+ # add
+ped_add_partial_dad <- ped_add_partial<- ped2com(df, isChild_method= "partialparent",
+ component = "additive",
+ adjacency_method = "direct")
+
+ped_add_classic_dad <- ped_add_classic <- ped2com(df, isChild_method= "classic",
+ component = "additive", adjacency_method = "direct")As we can see, the two matrices are different. The RMSE between the +two matrices is 0.009811.
+corrplot(as.matrix(ped_add_classic_dad),
+ method = 'color', type = 'lower', col.lim = c(0, 1),
+ is.corr = FALSE, title = "Classic (father removed)")
+corrplot(as.matrix(ped_add_partial_dad),
+ method = 'color', type = 'lower', col.lim = c(0, 1),
+ is.corr = FALSE, title = "Partial (father removed)")Again, we compare to the true matrix from the complete pedigree:
+corrplot(as.matrix(ped_add_classic_complete - ped_add_classic),
+ method = 'color', type = 'lower', col.lim = c(0, 1),
+ is.corr = FALSE)
+corrplot(as.matrix(ped_add_classic_complete - ped_add_partial),
+ method = 'color', type = 'lower', col.lim = c(0, 1),
+ is.corr = FALSE)The partial parent method again yields a matrix closer to the +full-data version.
+To generalize the comparison across a larger and more varied set of
+pedigrees, we use the inbreeding dataset. Each family in
+this dataset is analyzed independently.
For each one, we construct the additive relationship matrix under +complete information and then simulate two missingness scenarios:
+Missing mother: One individual with a known mother is randomly +selected, and the mother’s ID is set to NA.
Missing father: Similarly, one individual with a known father is +selected, and the father’s ID is set to NA.
In each condition, we recompute the additive matrix using both the +classic and partial parent methods. We then calculate the RMSE between +each estimate and the matrix from the complete pedigree. This allows us +to quantify which method more accurately reconstructs the original +relatedness structure when parental data are partially missing.
+inbreeding_list <- list()
+results <- data.frame(FamIDs = FamIDs,
+ RMSE_partial_dad = rep(NA, length(FamIDs)),
+ RMSE_partial_mom = rep(NA, length(FamIDs)),
+ RMSE_classic_dad = rep(NA, length(FamIDs)),
+ RMSE_classic_mom = rep(NA, length(FamIDs)),
+ max_R_classic_dad = rep(NA, length(FamIDs)),
+ max_R_partial_dad = rep(NA, length(FamIDs)),
+ max_R_classic_mom = rep(NA, length(FamIDs)),
+ max_R_partial_mom = rep(NA, length(FamIDs)),
+ max_R_classic = rep(NA, length(FamIDs)))The loop below performs this procedure for all families in the +dataset and stores both the RMSEs and the maximum relatedness +values.
+
+
+for (i in 1:length(FamIDs)) {
+
+# make three versions to filter down
+df_fam_dad<-df_fam_mom <- df_fam <- df[df$FamID == FamIDs[i],]
+
+results$RMSE_partial_mom[i] <- sqrt(mean((ped_add_classic_complete-ped_add_partial_mom)^2))
+
+
+ped_add_partial_complete <- ped2com(df_fam, isChild_method= "partialparent",
+ component = "additive",
+ adjacency_method = "direct")
+
+ped_add_classic_complete <- ped2com(df_fam, isChild_method= "classic",
+ component = "additive",
+ adjacency_method = "direct")
+
+
+# select first ID with a mom and dad
+momid_to_cut <- df_fam$ID[!is.na(df_fam$momID)] %>% head(1)
+dadid_to_cut <- df_fam$ID[!is.na(df_fam$dadID)] %>% head(1)
+
+df_fam_dad$dadID[df_fam$ID == dadid_to_cut] <- NA
+
+df_fam_mom$momID[df_fam$ID == momid_to_cut] <- NA
+
+ped_add_partial_dad <- ped2com(df_fam_dad, isChild_method= "partialparent",
+ component = "additive",
+ adjacency_method = "direct")
+ped_add_classic_dad <- ped2com(df_fam_dad, isChild_method= "classic",
+ component = "additive", adjacency_method = "direct")
+
+results$RMSE_partial_dad[i] <- sqrt(mean((ped_add_classic_complete-ped_add_partial_dad)^2))
+results$RMSE_classic_dad[i] <- sqrt(mean((ped_add_classic_complete-ped_add_classic_dad)^2))
+results$max_R_classic_dad[i] <- max(as.matrix(ped_add_classic_dad))
+results$max_R_partial_dad[i] <- max(as.matrix(ped_add_partial_dad))
+
+
+ped_add_partial_mom <- ped2com(df_fam_mom, isChild_method= "partialparent",
+ component = "additive",
+ adjacency_method = "direct")
+
+ped_add_classic_mom <- ped2com(df_fam_mom, isChild_method= "classic",
+ component = "additive", adjacency_method = "direct")
+
+results$RMSE_partial_mom[i] <- sqrt(mean((ped_add_classic_complete-ped_add_partial_mom)^2))
+results$RMSE_classic_mom[i] <- sqrt(mean((ped_add_classic_complete-ped_add_classic_mom)^2))
+results$max_R_classic_mom[i] <- max(as.matrix(ped_add_classic_mom))
+results$max_R_partial_mom[i] <- max(as.matrix(ped_add_partial_mom))
+results$max_R_classic[i] <- max(as.matrix(ped_add_classic_complete))
+
+inbreeding_list[[i]] <- list(df_fam=df_fam,
+ ped_add_partial_complete = ped_add_partial_complete,
+ ped_add_classic_complete = ped_add_classic_complete,
+ ped_add_partial_dad = ped_add_partial_dad,
+ ped_add_classic_dad = ped_add_classic_dad,
+ ped_add_partial_mom = ped_add_partial_mom,
+ ped_add_classic_mom = ped_add_classic_mom)
+
+}1To understand what these matrices look like, we visualize them for +one representative family. For this example, we select the first family +in the dataset.
+# pull the first family from the list
+fam1 <- inbreeding_list[[1]]
+
+corrplot(as.matrix(fam1$ped_add_classic_complete),
+ method = 'color', type = 'lower', col.lim = c(0,1),
+ is.corr = FALSE, title = "Classic - Complete")
+corrplot(as.matrix(fam1$ped_add_classic_mom),
+ method = 'color', type = 'lower', col.lim = c(0,1),
+ is.corr = FALSE, title = "Classic - Mom Missing")
+corrplot(as.matrix(fam1$ped_add_partial_mom),
+ method = 'color', type = 'lower', col.lim = c(0,1),
+ is.corr = FALSE, title = "Partial - Mom Missing")
+corrplot(as.matrix(fam1$ped_add_classic_dad),
+ method = 'color', type = 'lower', col.lim = c(0,1),
+ is.corr = FALSE, title = "Classic - Dad Missing")
+corrplot(as.matrix(fam1$ped_add_partial_dad),
+ method = 'color', type = 'lower', col.lim = c(0,1),
+ is.corr = FALSE, title = "Partial - Dad Missing")To visualize the differences from the true matrix:
+corrplot(as.matrix(fam1$ped_add_classic_complete - fam1$ped_add_classic_mom),
+ method = 'color', type = 'lower', col.lim = c(0,1),
+ is.corr = FALSE, title = "Classic Mom Diff from Complete")
+corrplot(as.matrix(fam1$ped_add_classic_complete - fam1$ped_add_partial_mom),
+ method = 'color', type = 'lower', col.lim = c(0,1),
+ is.corr = FALSE, title = "Partial Mom Diff from Complete")
+corrplot(as.matrix(fam1$ped_add_classic_complete - fam1$ped_add_classic_dad),
+ method = 'color', type = 'lower', col.lim = c(0,1),
+ is.corr = FALSE, title = "Classic Dad Diff from Complete")
+corrplot(as.matrix(fam1$ped_add_classic_complete - fam1$ped_add_partial_dad),
+ method = 'color', type = 'lower', col.lim = c(0,1),
+ is.corr = FALSE, title = "Partial Dad Diff from Complete")These plots show how each method responds to missing data, and +whether it maintains consistency with the complete pedigree. We observe +that the partial parent method typically introduces smaller deviations. +If desired, this same diagnostic can be repeated for additional +families, such as inbreeding_list[[2]].
+Across all families in the inbreeding dataset, the results show a +consistent pattern: the partial parent method outperforms the classic +method in reconstructing the additive genetic relationship matrix when +either a mother or a father is missing.
+To make this explicit, we calculate the RMSE difference between +methods. A positive value means that the partial method had lower RMSE +(i.e., better accuracy) than the classic method:
+
+results <- results %>% as.data.frame() %>% mutate(RMSE_diff_dad = RMSE_classic_dad - RMSE_partial_dad,
+ RMSE_diff_mom = RMSE_classic_mom - RMSE_partial_mom)We can then summarize the pattern across families:
+
+results %>%
+ select(RMSE_diff_mom, RMSE_diff_dad) %>%
+ summary()
+#> RMSE_diff_mom RMSE_diff_dad
+#> Min. :0.001222 Min. :0.001222
+#> 1st Qu.:0.001869 1st Qu.:0.002036
+#> Median :0.002538 Median :0.002520
+#> Mean :0.005763 Mean :0.005786
+#> 3rd Qu.:0.005625 3rd Qu.:0.005625
+#> Max. :0.024221 Max. :0.024221In all families, both RMSE_diff_mom and
+RMSE_diff_dad are positive—indicating that the classic
+method produces larger the errors relative to the partial method. This
+holds regardless of whether the missing parent is a mother or a
+father.
To verify this directly:
+mean(results$RMSE_diff_mom > 0, na.rm = TRUE)
+#> [1] 1
+mean(results$RMSE_diff_dad > 0, na.rm = TRUE)
+#> [1] 1These proportions show how often the partial method produces a lower +RMSE across the dataset. This confirms the earlier findings: when +pedigree data are incomplete, the partial parent method more faithfully +reconstructs the full-data relatedness matrix.
+results %>% as.data.frame() %>% select(-FamIDs,-RMSE_diff_mom,-RMSE_diff_dad,-max_R_classic_dad,
+ -max_R_partial_dad,-max_R_classic_mom,-max_R_partial_mom,-max_R_classic) %>% summary()
+#> RMSE_partial_dad RMSE_partial_mom RMSE_classic_dad RMSE_classic_mom
+#> Min. :0.04773 Min. :0.04773 Min. :0.04895 Min. :0.04895
+#> 1st Qu.:0.05570 1st Qu.:0.05349 1st Qu.:0.05774 1st Qu.:0.05555
+#> Median :0.06206 Median :0.06899 Median :0.06457 Median :0.07158
+#> Mean :0.07545 Mean :0.07686 Mean :0.08124 Mean :0.08262
+#> 3rd Qu.:0.08237 3rd Qu.:0.08323 3rd Qu.:0.08866 3rd Qu.:0.08866
+#> Max. :0.15547 Max. :0.15547 Max. :0.17969 Max. :0.17969