From 883ffac05e5dc45df6795a2e4844bd94ab92c8c8 Mon Sep 17 00:00:00 2001 From: Mason Garrison Date: Fri, 9 May 2025 22:11:48 -0400 Subject: [PATCH 01/18] Update calcCoordinates.R --- R/calcCoordinates.R | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/R/calcCoordinates.R b/R/calcCoordinates.R index ff349bd4..47ea883c 100644 --- a/R/calcCoordinates.R +++ b/R/calcCoordinates.R @@ -83,7 +83,9 @@ calculateCoordinates <- function(ped, personID = "ID", momID = "momID", # Flatten coordinate matrix - pos_vector <- as.vector(pos$pos) + # pos_vector <- as.vector(pos$pos) +# spouse_vector <- as.vector(pos$spouse) + # Initialize coordinate columns in the data frame ped$nid <- NA @@ -100,11 +102,22 @@ calculateCoordinates <- function(ped, personID = "ID", momID = "momID", y_coords <- rep(NA, length(nid_vector)) x_pos <- rep(NA, length(nid_vector)) + # Initialize spouse vector + spouse_vector <- rep(NA, length(nid_vector)) + + #' A matrix with values + #' 1 = subject plotted to the immediate right is a spouse + #' 2 = subject plotted to the immediate right is an inbred spouse + #' 0 = not a spouse + + # Populate coordinates from nid positions for (i in seq_along(nid_vector)) { y_coords[i] <- nid_pos[i, "row"] x_coords[i] <- nid_pos[i, "col"] x_pos[i] <- pos$pos[nid_pos[i, "row"], nid_pos[i, "col"]] + spouse_vector[i] <- pos$spouse[nid_pos[i, "row"], nid_pos[i, "col"]] + } # ----- @@ -121,6 +134,8 @@ calculateCoordinates <- function(ped, personID = "ID", momID = "momID", ped$y_order <- y_coords[tmp] ped$x_pos <- x_pos[tmp] ped$y_pos <- y_coords[tmp] + ped$spousehint <- spouse_vector[tmp] + # Detect multiple layout positions for the same individual # This can happen if the same individual appears multiple times in the pedigree From f8de18f66ac209fe1e39df36d6165fc328898f82 Mon Sep 17 00:00:00 2001 From: Mason Garrison Date: Sat, 10 May 2025 14:13:23 -0400 Subject: [PATCH 02/18] smarter Update plots_morecomplexity.xmd lets you chose different labels --- .gitignore | 1 + R/calcCoordinates.R | 128 ++++++++++++++++++----------- R/ggpedigree.R | 48 ++++++++--- vignettes/plots.Rmd | 28 +++++++ vignettes/plots_morecomplexity.xmd | 1 + 5 files changed, 143 insertions(+), 63 deletions(-) diff --git a/.gitignore b/.gitignore index 1138d94f..7ec8657e 100644 --- a/.gitignore +++ b/.gitignore @@ -5,3 +5,4 @@ *.lnk docs *.code-workspace +R/plot.ped.R diff --git a/R/calcCoordinates.R b/R/calcCoordinates.R index 47ea883c..8c9c1da1 100644 --- a/R/calcCoordinates.R +++ b/R/calcCoordinates.R @@ -251,17 +251,44 @@ calculateConnections <- function(ped, # Construct base connection frame # This will be used for all joins + if ("x_otherself" %in% names(ped)) { - connections <- dplyr::select( - .data = ped, - "personID", - "x_pos", "y_pos", - "dadID", "momID", - "spouseID", - "famID", - "x_otherself", "y_otherself" - ) - } else { + connections <- dplyr::select( + .data = ped, + "personID", + "x_pos", "y_pos", + "dadID", "momID", + "spouseID", + "famID", + "x_otherself", "y_otherself", + "extra","link_as_mom", "link_as_dad", "link_as_spouse" + ) + + + connections_moms <- dplyr::filter(connections, .data$extra==FALSE | .data$link_as_mom == TRUE) %>% + dplyr::select( + -"extra", + -"link_as_mom", + -"link_as_dad", + -"link_as_spouse" + ) + + connections_dads <- dplyr::filter(connections, .data$extra==FALSE | .data$link_as_dad == TRUE) %>% + dplyr::select( + -"extra", + -"link_as_mom", + -"link_as_dad", + -"link_as_spouse" + ) + connections_spouses <- dplyr::filter(connections, .data$extra==FALSE | .data$link_as_spouse == TRUE) %>% + dplyr::select( + -"extra", + -"link_as_mom", + -"link_as_dad", + -"link_as_spouse" + ) + + } else { connections <- dplyr::select( .data = ped, "personID", @@ -269,13 +296,21 @@ calculateConnections <- function(ped, "dadID", "momID", "spouseID", "famID" + ) +# no duplications, so just use the same connections + connections_spouses <- connections_dads <- connections_moms <- connections } + + + + + # Get mom's coordinates mom_connections <- getRelativeCoordinates( ped = ped, - connections = connections, + connections = connections_moms, relativeIDvar = "momID", x_name = "x_mom", y_name = "y_mom" @@ -284,7 +319,7 @@ calculateConnections <- function(ped, # Get dad's coordinates dad_connections <- getRelativeCoordinates( ped = ped, - connections = connections, + connections = connections_dads, relativeIDvar = "dadID", x_name = "x_dad", y_name = "y_dad" @@ -296,7 +331,7 @@ calculateConnections <- function(ped, "personID", "x_pos", "y_pos", "spouseID" ) |> - dplyr::left_join(ped, + dplyr::left_join(connections_spouses, by = c("spouseID" = "personID"), suffix = c("", "_spouse"), multiple = "any" @@ -756,17 +791,19 @@ processExtras <- function(ped, config = list()) { mom_closer = dplyr::case_when( .data$dist_mom < .data$dist_mom_other ~ TRUE, .data$dist_mom_other < .data$dist_mom ~ FALSE, - TRUE ~ NA + TRUE ~ TRUE ), dad_closer = dplyr::case_when( .data$dist_dad < .data$dist_dad_other ~ TRUE, .data$dist_dad_other < .data$dist_dad ~ FALSE, - TRUE ~ NA + TRUE ~ TRUE ), spouse_closer = dplyr::case_when( .data$dist_spouse < .data$dist_spouse_other ~ TRUE, .data$dist_spouse_other < .data$dist_spouse ~ FALSE, - TRUE ~ NA + # is.na(.data$dist_spouse) ~ FALSE, + # !is.na(.data$dist_spouse) & is.na(.data$dist_spouse_other) ~ TRUE, + TRUE ~ TRUE ) ) @@ -790,18 +827,11 @@ processExtras <- function(ped, config = list()) { extras <- extras |> dplyr::mutate( - keep_parents = dplyr::case_when( - c("mom", "dad") %in% .data$closest_relative & .data$mom_closer == TRUE & .data$dad_closer == TRUE ~ TRUE, - c("mom", "dad") %in% .data$closest_relative & .data$mom_closer == TRUE & .data$dad_closer == FALSE ~ TRUE, - c("mom", "dad") %in% .data$closest_relative & .data$mom_closer == FALSE & .data$dad_closer == TRUE ~ TRUE, - TRUE ~ FALSE - ), - keep_spouse = dplyr::case_when( - c("spouse") %in% .data$closest_relative & .data$spouse_closer == TRUE ~ TRUE, - c("spouse") %in% .data$closest_relative & .data$spouse_closer == FALSE ~ FALSE, - TRUE ~ FALSE + link_as_mom = .data$closest_relative %in% c("mom", "dad") & .data$mom_closer, + link_as_dad = .data$closest_relative %in% c("mom", "dad") & .data$dad_closer, + link_as_spouse = .data$closest_relative == "spouse" & .data$spouse_closer ) - ) + # ----- # Final subset of relevant decision columns @@ -810,9 +840,9 @@ processExtras <- function(ped, config = list()) { skinnyextras <- extras |> dplyr::select( .data$newID, - .data$closest_relative, - .data$keep_parents, - .data$keep_spouse, + .data$link_as_dad, + .data$link_as_mom, + .data$link_as_spouse, .data$x_otherself, .data$y_otherself ) @@ -825,31 +855,29 @@ processExtras <- function(ped, config = list()) { ped <- ped |> dplyr::left_join(skinnyextras, - by = c("newID"), suffix = c("", "_"), - relationship = "one-to-one" + by = c("newID"), suffix = c("", "_")#, + # relationship = "one-to-one" + ) |> + dplyr::select( + -"newID" ) |> + # set the connection columns to TRUE if not kept dplyr::mutate( - spouseID = dplyr::case_when( - .data$keep_spouse == TRUE ~ .data$spouseID, - is.na(.data$closest_relative) ~ .data$spouseID, - TRUE ~ NA_real_ + link_as_mom = case_when( + is.na(.data$link_as_mom) ~ TRUE, + .data$link_as_mom == TRUE ~ TRUE, + .data$link_as_mom == FALSE ~ FALSE ), - momID = dplyr::case_when( - .data$keep_parents == TRUE ~ .data$momID, - is.na(.data$closest_relative) ~ .data$momID, - TRUE ~ NA_real_ + link_as_dad = case_when( + is.na(.data$link_as_dad) ~ TRUE, + .data$link_as_dad == TRUE ~ TRUE, + .data$link_as_dad == FALSE ~ FALSE ), - dadID = dplyr::case_when( - .data$keep_parents == TRUE ~ .data$dadID, - is.na(.data$closest_relative) ~ .data$dadID, - TRUE ~ NA_real_ + link_as_spouse = case_when( + is.na(.data$link_as_spouse) ~ TRUE, + .data$link_as_spouse == TRUE ~ TRUE, + .data$link_as_spouse == FALSE ~ FALSE ) - ) |> - dplyr::select( - -"newID", - -"extra", - -"closest_relative" ) - return(ped) } diff --git a/R/ggpedigree.R b/R/ggpedigree.R index 977975a8..0123c693 100644 --- a/R/ggpedigree.R +++ b/R/ggpedigree.R @@ -15,7 +15,7 @@ #' \describe{ #' \item{code_male}{Integer or string. Value identifying males in the sex column. (typically 0 or 1) Default: 1.} #' \item{spouse_segment_color, self_segment_color, sibling_segment_color, parent_segment_color, offspring_segment_color}{Character. Line colors for respective connection types.} -#' \item{text_size, point_size, line_width}{Numeric. Controls text size, point size, and line thickness.} +#' \item{label_text_size, point_size, line_width}{Numeric. Controls text size, point size, and line thickness.} #' \item{generation_gap}{Numeric. Vertical spacing multiplier between generations. Default: 1.} #' \item{unknown_shape, female_shape, male_shape, affected_shape}{Integers. Shape codes for plotting each group.} #' \item{sex_shape_labs}{Character vector of labels for the sex variable. (default: c("Female", "Male", "Unknown")} @@ -53,8 +53,10 @@ ggPedigree <- function(ped, famID = "famID", sibling_segment_color = "black", parent_segment_color = "black", offspring_segment_color = "black", + include_labels = TRUE, + label_method="ggrepel", code_male = 1, - text_size = 3, + label_text_size = 2, point_size = 4, line_width = 0.5, generation_gap = 1, @@ -67,7 +69,7 @@ ggPedigree <- function(ped, famID = "famID", affected = "affected", sex_color = TRUE, status_vals = c(1, 0), - max_overlaps = 100, + max_overlaps = 15, id_segment_color = NA ) @@ -76,7 +78,7 @@ ggPedigree <- function(ped, famID = "famID", config <- utils::modifyList(default_config, config) # Set additional internal config values based on other entries - config$status_labs <- c(paste0(config$affected), paste0(config$unaffected)) + config$status_labs <- c(config$affected, config$unaffected) config$shape_vals <- c(config$female_shape, config$male_shape, config$unknown_shape) # ----- @@ -111,6 +113,7 @@ ggPedigree <- function(ped, famID = "famID", ) } + # ----- # STEP 3: Sex Recode # ----- @@ -293,15 +296,34 @@ ggPedigree <- function(ped, famID = "famID", # STEP 9: Add Labels # ----- # Add labels to the points using ggrepel for better visibility - p <- p + - ggrepel::geom_text_repel(ggplot2::aes(label = .data$personID), - nudge_y = -.15 * config$generation_gap, - size = config$text_size, - na.rm = TRUE, - max.overlaps = config$max_overlaps, - segment.size = config$line_width * .5, - segment.color = config$id_segment_color, - ) + if(config$include_labels == TRUE && config$label_method=="ggrepel"){ + p <- p + + ggrepel::geom_text_repel(ggplot2::aes(label = .data$personID), + nudge_y = -.10 * config$generation_gap, + size = config$label_text_size, + na.rm = TRUE, + max.overlaps = config$max_overlaps, + segment.size = config$line_width *.5, + segment.color = config$id_segment_color + ) + } else if(config$include_labels == TRUE && config$label_method=="geom_label"){ + p <- p + + ggplot2::geom_label(ggplot2::aes(label = .data$personID), + nudge_y = -.25 * config$generation_gap, + size = config$label_text_size, + na.rm = TRUE + ) + + } else if(config$include_labels == TRUE || config$label_method=="geom_text"){ + p <- p + + ggplot2::geom_text(ggplot2::aes(label = .data$personID), + nudge_y = -.25 * config$generation_gap, + size = config$label_text_size, + na.rm = TRUE + ) + } + + # ----- # STEP 10: Scales, Theme diff --git a/vignettes/plots.Rmd b/vignettes/plots.Rmd index 9b5c74b7..e5dd4509 100644 --- a/vignettes/plots.Rmd +++ b/vignettes/plots.Rmd @@ -170,3 +170,31 @@ p + labels = c("Female", "Male", "Unknown") ) ``` + +# More complex examples + +```{r, message=FALSE, warning=FALSE} +data("inbreeding") + +df <- inbreeding + + +p <- ggPedigree( + df, + famID = "famID", + personID = "ID", + status_col = "proband", + config = list( + code_male = 0, + sex_color = F, + label_method="geom_text", + affected = TRUE, + unaffected = FALSE, + generation_gap = 2, + affected_shape = 4 + + ) +) + facet_wrap(famID, scales= "free") + +p +``` diff --git a/vignettes/plots_morecomplexity.xmd b/vignettes/plots_morecomplexity.xmd index 870caf42..85216c53 100644 --- a/vignettes/plots_morecomplexity.xmd +++ b/vignettes/plots_morecomplexity.xmd @@ -128,6 +128,7 @@ ggPedigree(df_repaired, affected = 1, affected_shape = 4, point_size = 3, + label_text_size =1, spouse_segment_color = "pink", sibling_segment_color = "blue", parent_segment_color = "green", From b60206fc134cbf631cfb2c6c0451f2f9e923555a Mon Sep 17 00:00:00 2001 From: Mason Garrison Date: Sat, 10 May 2025 14:30:02 -0400 Subject: [PATCH 03/18] more complex family --- .gitignore | 1 + R/calcCoordinates.R | 26 +++++++++++++---------- man/ggPedigree.Rd | 4 ++-- vignettes/plots.R | 28 +++++++++++++++++++++++-- vignettes/plots.Rmd | 7 +++---- vignettes/plots.html | 50 ++++++++++++++++++++++++++++++++------------ 6 files changed, 84 insertions(+), 32 deletions(-) diff --git a/.gitignore b/.gitignore index 7ec8657e..acb3bdec 100644 --- a/.gitignore +++ b/.gitignore @@ -6,3 +6,4 @@ docs *.code-workspace R/plot.ped.R +R/plot.ped.X diff --git a/R/calcCoordinates.R b/R/calcCoordinates.R index 8c9c1da1..7b825769 100644 --- a/R/calcCoordinates.R +++ b/R/calcCoordinates.R @@ -25,12 +25,16 @@ utils::globalVariables(c(":=")) #' \item `nid`: Internal numeric identifier for layout mapping. #' \item `extra`: Logical flag indicating whether this row is a secondary appearance. #' } +#' #' @export + calculateCoordinates <- function(ped, personID = "ID", momID = "momID", dadID = "dadID", spouseID = "spouseID", sexVar = "sex", code_male = NULL, - config = list()) { + config = list()) + { + if (!inherits(ped, "data.frame")) { stop("ped should be a data.frame or inherit to a data.frame") } @@ -105,10 +109,10 @@ calculateCoordinates <- function(ped, personID = "ID", momID = "momID", # Initialize spouse vector spouse_vector <- rep(NA, length(nid_vector)) - #' A matrix with values - #' 1 = subject plotted to the immediate right is a spouse - #' 2 = subject plotted to the immediate right is an inbred spouse - #' 0 = not a spouse + # A matrix with values + # 1 = subject plotted to the immediate right is a spouse + # 2 = subject plotted to the immediate right is an inbred spouse + # 0 = not a spouse # Populate coordinates from nid positions @@ -265,7 +269,7 @@ calculateConnections <- function(ped, ) - connections_moms <- dplyr::filter(connections, .data$extra==FALSE | .data$link_as_mom == TRUE) %>% + connections_moms <- dplyr::filter(connections, .data$extra==FALSE | .data$link_as_mom == TRUE) |> dplyr::select( -"extra", -"link_as_mom", @@ -273,14 +277,14 @@ calculateConnections <- function(ped, -"link_as_spouse" ) - connections_dads <- dplyr::filter(connections, .data$extra==FALSE | .data$link_as_dad == TRUE) %>% + connections_dads <- dplyr::filter(connections, .data$extra==FALSE | .data$link_as_dad == TRUE)|> dplyr::select( -"extra", -"link_as_mom", -"link_as_dad", -"link_as_spouse" ) - connections_spouses <- dplyr::filter(connections, .data$extra==FALSE | .data$link_as_spouse == TRUE) %>% + connections_spouses <- dplyr::filter(connections, .data$extra==FALSE | .data$link_as_spouse == TRUE) |> dplyr::select( -"extra", -"link_as_mom", @@ -863,17 +867,17 @@ processExtras <- function(ped, config = list()) { ) |> # set the connection columns to TRUE if not kept dplyr::mutate( - link_as_mom = case_when( + link_as_mom = dplyr::case_when( is.na(.data$link_as_mom) ~ TRUE, .data$link_as_mom == TRUE ~ TRUE, .data$link_as_mom == FALSE ~ FALSE ), - link_as_dad = case_when( + link_as_dad = dplyr::case_when( is.na(.data$link_as_dad) ~ TRUE, .data$link_as_dad == TRUE ~ TRUE, .data$link_as_dad == FALSE ~ FALSE ), - link_as_spouse = case_when( + link_as_spouse = dplyr::case_when( is.na(.data$link_as_spouse) ~ TRUE, .data$link_as_spouse == TRUE ~ TRUE, .data$link_as_spouse == FALSE ~ FALSE diff --git a/man/ggPedigree.Rd b/man/ggPedigree.Rd index 0d6664d9..a83109f4 100644 --- a/man/ggPedigree.Rd +++ b/man/ggPedigree.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/ggPedigree.R +% Please edit documentation in R/ggpedigree.R \name{ggPedigree} \alias{ggPedigree} \alias{ggpedigree} @@ -53,7 +53,7 @@ ggped( \describe{ \item{code_male}{Integer or string. Value identifying males in the sex column. (typically 0 or 1) Default: 1.} \item{spouse_segment_color, self_segment_color, sibling_segment_color, parent_segment_color, offspring_segment_color}{Character. Line colors for respective connection types.} - \item{text_size, point_size, line_width}{Numeric. Controls text size, point size, and line thickness.} + \item{label_text_size, point_size, line_width}{Numeric. Controls text size, point size, and line thickness.} \item{generation_gap}{Numeric. Vertical spacing multiplier between generations. Default: 1.} \item{unknown_shape, female_shape, male_shape, affected_shape}{Integers. Shape codes for plotting each group.} \item{sex_shape_labs}{Character vector of labels for the sex variable. (default: c("Female", "Male", "Unknown")} diff --git a/vignettes/plots.R b/vignettes/plots.R index 37029abb..1c47ea7c 100644 --- a/vignettes/plots.R +++ b/vignettes/plots.R @@ -62,7 +62,7 @@ p <- ggPedigree( p ## ----------------------------------------------------------------------------- -p <- ggPedigree( +ggPedigree( hazard, famID = "famID", personID = "ID", @@ -75,7 +75,7 @@ p <- ggPedigree( ) ) -p + ## ----------------------------------------------------------------------------- p + @@ -100,3 +100,27 @@ p + labels = c("Female", "Male", "Unknown") ) +## ----message=FALSE, warning=FALSE--------------------------------------------- +library(BGmisc) # helper utilities & example data +data("inbreeding") + +df <- inbreeding + + +p <- ggPedigree( + df, + famID = "famID", + personID = "ID", + status_col = "proband", + config = list( + code_male = 0, + sex_color = F, + label_method="geom_text", + affected = TRUE, + unaffected = FALSE, + generation_gap = 2, + affected_shape = 4) +) + facet_wrap(~famID, scales= "free") + +p + diff --git a/vignettes/plots.Rmd b/vignettes/plots.Rmd index e5dd4509..d94c1552 100644 --- a/vignettes/plots.Rmd +++ b/vignettes/plots.Rmd @@ -174,6 +174,7 @@ p + # More complex examples ```{r, message=FALSE, warning=FALSE} +library(BGmisc) # helper utilities & example data data("inbreeding") df <- inbreeding @@ -191,10 +192,8 @@ p <- ggPedigree( affected = TRUE, unaffected = FALSE, generation_gap = 2, - affected_shape = 4 - - ) -) + facet_wrap(famID, scales= "free") + affected_shape = 4) +) + facet_wrap(~famID, scales= "free") p ``` diff --git a/vignettes/plots.html b/vignettes/plots.html index 8a990059..36f798d3 100644 --- a/vignettes/plots.html +++ b/vignettes/plots.html @@ -369,7 +369,7 @@

Basic usage

famID = "famID", personID = "personID" ) -

+

ggPedigree() automatically:

  1. reshapes the data by family (ped2fam()),

  2. @@ -399,7 +399,7 @@

    Customizing the plot

    offspring_segment_color = "black" ) ) -

    +

    Because the result is just a ggplot object, regular layering applies:

    ggPedigree(potter,
    @@ -407,7 +407,7 @@ 

    Customizing the plot

    personID = "personID" ) + theme_bw(base_size = 12)
    -

    +

    Additional customization

    @@ -442,10 +442,10 @@

    Additional customization

    ) p
    -

    +

    If you set sex_color to FALSE, the affected values will be filled with the default color palette.

    -
    p <- ggPedigree(
    +
    ggPedigree(
       hazard,
       famID = "famID",
       personID = "ID",
    @@ -456,10 +456,8 @@ 

    Additional customization

    affected = TRUE, unaffected = FALSE ) -) - -p
    -

    +)
    +

    Multiple families in one graphic

    @@ -472,7 +470,7 @@

    Multiple families in one graphic

    x-axis scale.

    p +
       facet_wrap(~famID, scales = "free_x")
    -

    +

    Changing the layout

    @@ -494,9 +492,35 @@

    Changing the layout

    labels = c("Female", "Male", "Unknown") ) #> Scale for colour is already present. -#> Adding another scale for colour, which will replace the existing -#> scale.
    -

    +#> Adding another scale for colour, which will replace the +#> existing scale. +

    + +
    +

    More complex examples

    +
    library(BGmisc) # helper utilities & example data
    +data("inbreeding")
    +
    +df <- inbreeding 
    +
    +
    +p <- ggPedigree(
    +  df,
    +  famID = "famID",
    +  personID = "ID",
    +  status_col = "proband",
    +  config = list(
    +    code_male = 0,
    +    sex_color = F,
    +    label_method="geom_text",
    +    affected = TRUE,
    +    unaffected = FALSE,
    +    generation_gap = 2,
    +    affected_shape = 4)
    +) + facet_wrap(~famID, scales= "free")
    +
    +p
    +

    From 6c520e6ea94590cf4daee56579866419e172338f Mon Sep 17 00:00:00 2001 From: Mason Garrison Date: Mon, 12 May 2025 20:05:53 -0400 Subject: [PATCH 04/18] smarter with duplicates --- R/calcCoordinates.R | 108 ++++++++++++++++++++++++++++++++++++-------- R/ggpedigree.R | 66 +++++++++++++++------------ vignettes/plots.Rmd | 6 ++- 3 files changed, 130 insertions(+), 50 deletions(-) diff --git a/R/calcCoordinates.R b/R/calcCoordinates.R index 7b825769..7e7024f4 100644 --- a/R/calcCoordinates.R +++ b/R/calcCoordinates.R @@ -188,6 +188,7 @@ calculateCoordinates <- function(ped, personID = "ID", momID = "momID", ped <- rbind(ped, ped_extra) } else { ped_extra <- NULL + ped$extra <- FALSE } return(ped) @@ -248,7 +249,7 @@ calculateConnections <- function(ped, } # If duplicated appearances exist, resolve which connections to keep - if ("extra" %in% names(ped)) { + if (sum(ped$extra) > 0) { ped <- processExtras(ped, config = config) } @@ -299,18 +300,14 @@ calculateConnections <- function(ped, "x_pos", "y_pos", "dadID", "momID", "spouseID", - "famID" + "famID", + "extra" ) # no duplications, so just use the same connections connections_spouses <- connections_dads <- connections_moms <- connections } - - - - - # Get mom's coordinates mom_connections <- getRelativeCoordinates( ped = ped, @@ -395,10 +392,15 @@ calculateConnections <- function(ped, # Calculate sibling group midpoints sibling_midpoints <- connections |> - dplyr::filter(!is.na(.data$dadID) & !is.na(.data$momID)) |> + dplyr::filter( + !is.na(.data$momID) & !is.na(.data$dadID) & # biological parents defined + !is.na(.data$x_mom) & !is.na(.data$y_mom) & # mom’s coordinates linked + !is.na(.data$x_dad) & !is.na(.data$y_dad) # dad’s coordinates linked + ) |> dplyr::group_by( - .data$dadID, - .data$momID + .data$momID, .data$dadID, + .data$x_mom, .data$y_mom, + .data$x_dad, .data$y_dad ) |> dplyr::summarize( x_mid_sib = mean(.data$x_pos), @@ -416,17 +418,18 @@ calculateConnections <- function(ped, by = c("spouseID") ) |> dplyr::left_join(sibling_midpoints, - by = c("dadID", "momID") + by = c("dadID", "momID","x_mom", "y_mom", + "x_dad", "y_dad") ) |> dplyr::mutate( x_mid_sib = dplyr::case_when( - is.na(.data$x_mid_sib) & !is.na(.data$dadID) & !is.na(.data$momID) ~ .data$x_pos, !is.na(.data$x_mid_sib) ~ .data$x_mid_sib, + (!is.na(.data$momID) & !is.na(.data$x_mom)) | (!is.na(.data$dadID) & !is.na(.data$x_dad)) ~ .data$x_pos, TRUE ~ NA_real_ ), y_mid_sib = dplyr::case_when( - is.na(.data$y_mid_sib) & !is.na(.data$dadID) & !is.na(.data$momID) ~ .data$y_pos, !is.na(.data$y_mid_sib) ~ .data$y_mid_sib, + (!is.na(.data$momID) & !is.na(.data$y_mom)) | (!is.na(.data$dadID) & !is.na(.data$y_dad)) ~ .data$y_pos, TRUE ~ NA_real_ ) ) @@ -748,16 +751,47 @@ processExtras <- function(ped, config = list()) { # - mom, dad, spouse # - same individual in other location (otherself) # These will be used to choose the "closest" relationship. + # minkowski distance could be used here as well aka "city block" distance # ----- extras <- extras |> dplyr::mutate( - dist_mom = sqrt((.data$x_pos - .data$x_mom)^2 + (.data$y_pos - .data$y_mom)^2), - dist_mom_other = sqrt((.data$x_otherself - .data$x_mom)^2 + (.data$y_otherself - .data$y_mom)^2), - dist_dad = sqrt((.data$x_pos - .data$x_dad)^2 + (.data$y_pos - .data$y_dad)^2), - dist_dad_other = sqrt((.data$x_otherself - .data$x_dad)^2 + (.data$y_otherself - .data$y_dad)^2), - dist_spouse = sqrt((.data$x_pos - .data$x_spouse)^2 + (.data$y_pos - .data$y_spouse)^2), - dist_spouse_other = sqrt((.data$x_otherself - .data$x_spouse)^2 + (.data$y_otherself - .data$y_spouse)^2), - dist_otherself = sqrt((.data$x_pos - .data$x_otherself)^2 + (.data$y_pos - .data$y_otherself)^2) + dist_mom = computeDistance(method = "cityblock", + x1 = .data$x_pos, + y1 = .data$y_pos, + x2 = .data$x_mom, + y2 = .data$y_mom), + + dist_mom_other = computeDistance(method = "cityblock", + x1 = .data$x_otherself, + y1 = .data$y_otherself, + x2 = .data$x_mom, + y2 = .data$y_mom), + dist_dad = computeDistance(method = "cityblock", + x1 = .data$x_pos, + y1 = .data$y_pos, + x2 = .data$x_dad, + y2 = .data$y_dad), + dist_dad_other = computeDistance(method = "cityblock", + x1 = .data$x_otherself, + y1 = .data$y_otherself, + x2 = .data$x_dad, + y2 = .data$y_dad), + # spouse distance + dist_spouse = computeDistance(method = "cityblock", + x1 = .data$x_pos, + y1 = .data$y_pos, + x2 = .data$x_spouse, + y2 = .data$y_spouse), + dist_spouse_other = computeDistance(method = "cityblock", + x1 = .data$x_otherself, + y1 = .data$y_otherself, + x2 = .data$x_spouse, + y2 = .data$y_spouse), + dist_otherself = computeDistance(method = "cityblock", + x1 = .data$x_pos, + y1 = .data$y_pos, + x2 = .data$x_otherself, + y2 = .data$y_otherself) ) # ----- @@ -885,3 +919,37 @@ processExtras <- function(ped, config = list()) { ) return(ped) } +#' Compute distance between two points +#' +#' This function calculates the distance between two points in a 2D space using +#' Minkowski distance. It can be used to compute Euclidean or Manhattan distance. +#' It is a utility function for calculating distances in pedigree layouts. +#' Defaults to Euclidean distance if no method is specified. +#' +#' +#' @param x1 Numeric. X-coordinate of the first point. +#' @param y1 Numeric. Y-coordinate of the first point. +#' @param x2 Numeric. X-coordinate of the second point. +#' @param y2 Numeric. Y-coordinate of the second point. +#' @param method Character. Method of distance calculation. Options are "euclidean", "cityblock", and "Minkowski". +#' @param p Numeric. The order of the Minkowski distance. If NULL, defaults to 2 for Euclidean and 1 for Manhattan. If +#' Minkowski method is used, p should be specified. + +computeDistance <- function(x1, y1, x2, y2, + method = "euclidean", p = NULL) { + + method <- tolower(method) + + if(is.null(p)) { + p <- switch(method, + euclidean = 2, + cityblock = 1, + stop("Invalid distance method. Choose from 'euclidean', 'cityblock', or specify p.") + ) + } + # Calculate Minkowski distance + + ((abs(x1 - x2))^p + (abs(y1 - y2))^p)^(1 / p) + +} + diff --git a/R/ggpedigree.R b/R/ggpedigree.R index 0123c693..b90cdc1e 100644 --- a/R/ggpedigree.R +++ b/R/ggpedigree.R @@ -161,21 +161,6 @@ ggPedigree <- function(ped, famID = "famID", # STEP 7: Add Segments # ----- - # Self-segment (for duplicate layout appearances of same person) - if ("x_otherself" %in% names(connections)) { - p <- p + ggplot2::geom_segment( - data = connections, - ggplot2::aes( - x = .data$x_otherself, - xend = .data$x_pos, - y = .data$y_otherself, - yend = .data$y_pos - ), - linewidth = config$line_width, - color = config$self_segment_color, - na.rm = TRUE - ) - } # Spouse link between two parents p <- p + ggplot2::geom_segment( @@ -189,20 +174,22 @@ ggPedigree <- function(ped, famID = "famID", linewidth = config$line_width, color = config$spouse_segment_color, na.rm = TRUE - ) + - # Parent-child stub (child to mid-sibling point) - ggplot2::geom_segment( - data = connections, - ggplot2::aes( - x = .data$x_mid_sib, - xend = .data$x_midparent, - y = .data$y_mid_sib - gap_off, - yend = .data$y_midparent - ), - linewidth = config$line_width, - color = config$parent_segment_color, - na.rm = TRUE - ) + + ) + + # Parent-child stub (child to mid-sibling point) + + p <- p + ggplot2::geom_segment( + data = connections, + ggplot2::aes( + x = .data$x_mid_sib, + xend = .data$x_midparent, + y = .data$y_mid_sib - gap_off, + yend = .data$y_midparent + ), + linewidth = config$line_width, + color = config$parent_segment_color, + na.rm = TRUE + ) + # Mid-sibling to parents midpoint ggplot2::geom_segment( data = connections, @@ -232,6 +219,7 @@ ggPedigree <- function(ped, famID = "famID", + # ----- # STEP 8: Add Points (nodes) # ----- @@ -324,6 +312,26 @@ ggPedigree <- function(ped, famID = "famID", } + # Self-segment (for duplicate layout appearances of same person) + if ("x_otherself" %in% names(connections)) { + p <- p + ggplot2::geom_curve( + data = dplyr::filter(connections,extra==TRUE), + ggplot2::aes( + x = .data$x_otherself, + xend = .data$x_pos, + y = .data$y_otherself, + yend = .data$y_pos + ), + linewidth = config$line_width, + color = config$self_segment_color, + angle = 90, + curvature = -0.2, + na.rm = TRUE + ) + + } + + # ----- # STEP 10: Scales, Theme diff --git a/vignettes/plots.Rmd b/vignettes/plots.Rmd index d94c1552..1700f359 100644 --- a/vignettes/plots.Rmd +++ b/vignettes/plots.Rmd @@ -192,7 +192,11 @@ p <- ggPedigree( affected = TRUE, unaffected = FALSE, generation_gap = 2, - affected_shape = 4) + affected_shape = 4, + spouse_segment_color = "pink", + sibling_segment_color = "blue", + parent_segment_color = "green", + offspring_segment_color = "black") ) + facet_wrap(~famID, scales= "free") p From c229d087324fe65f6a1800024df06fe9597fe5c7 Mon Sep 17 00:00:00 2001 From: Mason Garrison Date: Tue, 13 May 2025 21:46:39 -0400 Subject: [PATCH 05/18] more features --- R/calcCoordinates.R | 384 +++++++++++++------------- R/calcCoordinatesHelpers.R | 150 ++++++++++ R/ggpedigree.R | 38 ++- tests/testthat/test-calcCoordinates.R | 20 ++ vignettes/plots.Rmd | 17 +- 5 files changed, 398 insertions(+), 211 deletions(-) create mode 100644 R/calcCoordinatesHelpers.R diff --git a/R/calcCoordinates.R b/R/calcCoordinates.R index 7e7024f4..24bd32a4 100644 --- a/R/calcCoordinates.R +++ b/R/calcCoordinates.R @@ -69,6 +69,25 @@ calculateCoordinates <- function(ped, personID = "ID", momID = "momID", sex = ped_recode[[sexVar]], ) + if(!is.null(config$hints)) { + + + autohint <- tryCatch(kinship2::autohint(ped_ped,config$hints, + align = config$ped_align, + packed = config$ped_packed), + error = function(e) kinship2::autohint(ped_ped, + align = config$ped_align, + packed = config$ped_packed) + , + finally = warning("Your hints caused an error and were not used, using default hints instead")) + } else { + autohint <- kinship2::autohint(ped_ped, + align = config$ped_align, + packed = config$ped_packed) + } + + + # ----- # Extract layout information # ----- @@ -78,7 +97,8 @@ calculateCoordinates <- function(ped, personID = "ID", momID = "momID", pos <- kinship2::align.pedigree(ped_ped, packed = config$ped_packed, align = config$ped_align, - width = config$ped_width + width = config$ped_width, + hints = autohint ) # Extract layout information @@ -248,6 +268,18 @@ calculateConnections <- function(ped, ped$famID <- 1 } + # create a unique parenthash for each individual + # this will be used to identify siblings + if (!all("parenthash" %in% names(ped))) { + ped <- ped |> + dplyr::mutate( + parenthash = paste0(.data$momID, ".", .data$dadID) + ) |> + dplyr::mutate( + parenthash = gsub("NA.NA", NA, .data$parenthash) + ) + } + # If duplicated appearances exist, resolve which connections to keep if (sum(ped$extra) > 0) { ped <- processExtras(ped, config = config) @@ -262,50 +294,69 @@ calculateConnections <- function(ped, .data = ped, "personID", "x_pos", "y_pos", - "dadID", "momID", + "dadID", "momID", "parenthash", "spouseID", "famID", "x_otherself", "y_otherself", - "extra","link_as_mom", "link_as_dad", "link_as_spouse" - ) + "extra","link_as_mom", "link_as_dad", "link_as_spouse", + "link_as_sibling" + ) |> unique() - connections_moms <- dplyr::filter(connections, .data$extra==FALSE | .data$link_as_mom == TRUE) |> + connections_moms <- dplyr::filter(connections, .data$link_as_mom == TRUE) |> dplyr::select( -"extra", -"link_as_mom", -"link_as_dad", - -"link_as_spouse" + -"link_as_spouse", + -"link_as_sibling" ) - connections_dads <- dplyr::filter(connections, .data$extra==FALSE | .data$link_as_dad == TRUE)|> + connections_dads <- dplyr::filter(connections, .data$link_as_dad == TRUE)|> dplyr::select( -"extra", -"link_as_mom", -"link_as_dad", - -"link_as_spouse" + -"link_as_spouse", + -"link_as_sibling" ) - connections_spouses <- dplyr::filter(connections, .data$extra==FALSE | .data$link_as_spouse == TRUE) |> + connections_spouses <- dplyr::filter(connections, .data$link_as_spouse == TRUE) |> dplyr::select( -"extra", -"link_as_mom", -"link_as_dad", - -"link_as_spouse" + -"link_as_spouse", + -"link_as_sibling" + ) + connections_sibs <- dplyr::filter(connections, .data$link_as_sibling == TRUE) |> + dplyr::select( + -"extra", + -"link_as_mom", + -"link_as_dad", + -"link_as_spouse", + -"link_as_sibling" ) - } else { connections <- dplyr::select( .data = ped, "personID", "x_pos", "y_pos", - "dadID", "momID", + "dadID", "momID", "parenthash", "spouseID", "famID", "extra" + ) |> unique() |> + dplyr::mutate( + link_as_mom = TRUE, + link_as_dad = TRUE, + link_as_spouse = TRUE, + link_as_sibling = TRUE + ) + + - ) # no duplications, so just use the same connections - connections_spouses <- connections_dads <- connections_moms <- connections + connections_sibs <- connections_spouses <- connections_dads <- connections_moms <- connections } # Get mom's coordinates @@ -344,7 +395,7 @@ calculateConnections <- function(ped, dplyr::select( "personID", "spouseID", "x_spouse", "y_spouse" - ) + ) |> unique() # Combine mom, dad, and spouse coordinates connections <- connections |> @@ -356,12 +407,13 @@ calculateConnections <- function(ped, ) |> dplyr::left_join(spouse_connections, by = c("personID", "spouseID") - ) + ) |> unique() + + # Calculate midpoints between mom and dad in child row - # Calculate midpoints between mom and dad parent_midpoints <- connections |> dplyr::filter(!is.na(.data$dadID) & !is.na(.data$momID)) |> - dplyr::group_by(.data$dadID, .data$momID) |> + dplyr::group_by(.data$parenthash) |> dplyr::summarize( x_midparent = mean(c( dplyr::first(.data$x_dad), @@ -372,7 +424,7 @@ calculateConnections <- function(ped, dplyr::first(.data$y_mom) )), .groups = "drop" - ) + ) |> unique() # Calculate midpoints between spouses spouse_midpoints <- connections |> @@ -388,17 +440,17 @@ calculateConnections <- function(ped, dplyr::first(.data$y_spouse) )), .groups = "drop" - ) + ) |> unique() # Calculate sibling group midpoints - sibling_midpoints <- connections |> + sibling_midpoints <- connections|> dplyr::filter( !is.na(.data$momID) & !is.na(.data$dadID) & # biological parents defined !is.na(.data$x_mom) & !is.na(.data$y_mom) & # mom’s coordinates linked !is.na(.data$x_dad) & !is.na(.data$y_dad) # dad’s coordinates linked ) |> dplyr::group_by( - .data$momID, .data$dadID, + .data$parenthash, .data$x_mom, .data$y_mom, .data$x_dad, .data$y_dad ) |> @@ -406,33 +458,36 @@ calculateConnections <- function(ped, x_mid_sib = mean(.data$x_pos), y_mid_sib = dplyr::first(.data$y_pos), .groups = "drop" - ) + ) |> unique() # Merge midpoints into connections connections <- connections |> dplyr::left_join(parent_midpoints, - by = c("dadID", "momID") + by = c("parenthash") ) |> dplyr::left_join(spouse_midpoints, by = c("spouseID") ) |> dplyr::left_join(sibling_midpoints, - by = c("dadID", "momID","x_mom", "y_mom", + by = c("parenthash","x_mom", "y_mom", "x_dad", "y_dad") ) |> - dplyr::mutate( - x_mid_sib = dplyr::case_when( - !is.na(.data$x_mid_sib) ~ .data$x_mid_sib, - (!is.na(.data$momID) & !is.na(.data$x_mom)) | (!is.na(.data$dadID) & !is.na(.data$x_dad)) ~ .data$x_pos, + dplyr::mutate( + x_mid_sib = dplyr::case_when( + is.na(.data$x_dad) & is.na(.data$x_mom) ~ NA_real_, + !is.na(.data$x_mid_sib) ~ .data$x_mid_sib, + (!is.na(.data$momID) & !is.na(.data$x_mom)) | (!is.na(.data$dadID) & !is.na(.data$x_dad)) ~ .data$x_pos, TRUE ~ NA_real_ - ), + ), y_mid_sib = dplyr::case_when( + is.na(.data$y_dad) & is.na(.data$y_mom) ~ NA_real_, + !is.na(.data$y_mid_sib) ~ .data$y_mid_sib, - (!is.na(.data$momID) & !is.na(.data$y_mom)) | (!is.na(.data$dadID) & !is.na(.data$y_dad)) ~ .data$y_pos, - TRUE ~ NA_real_ - ) - ) + (!is.na(.data$momID) & !is.na(.data$y_mom)) | (!is.na(.data$dadID) & !is.na(.data$y_dad)) ~ .data$y_pos, + TRUE ~ NA_real_ + ) + ) |> unique() return(connections) } @@ -448,6 +503,7 @@ calculateConnections <- function(ped, #' @param x_name Character. Name of the new column to store the x-coordinate of the relative. #' @param y_name Character. Name of the new column to store the y-coordinate of the relative. #' @param multiple Character. Specifies how to handle multiple matches. Options are "all" or "any". +#' @param only_unique Logical. If TRUE, return only unique rows. Defaults to TRUE. #' #' @return A `data.frame` with columns: #' \itemize{ @@ -461,7 +517,8 @@ calculateConnections <- function(ped, getRelativeCoordinates <- function(ped, connections, relativeIDvar, x_name, y_name, # relationship = "one-to-one", personID = "personID", - multiple = "all") { + multiple = "all", + only_unique = TRUE) { # Filter only rows where the relative ID is not missing # and join with the main pedigree data frame rel_connections <- connections |> @@ -498,126 +555,14 @@ getRelativeCoordinates <- function(ped, connections, relativeIDvar, x_name, y_na !!y_name ) } + if(only_unique == TRUE){ + rel_connections <- unique(rel_connections) + } return(rel_connections) } -#' Compute midpoints across grouped coordinates -#' -#' A flexible utility function to compute x and y midpoints for groups of individuals -#' using a specified method. Used to support positioning logic for sibling groups, -#' parental dyads, or spousal pairs in pedigree layouts. -#' @param data A `data.frame` containing the coordinate and grouping variables. -#' @param group_vars Character vector. Names of the grouping variables. -#' @param x_vars Character vector. Names of the x-coordinate variables to be averaged. -#' @param y_vars Character vector. Names of the y-coordinate variables to be averaged. -#' @param x_out Character. Name of the output column for the x-coordinate midpoint. -#' @param y_out Character. Name of the output column for the y-coordinate midpoint. -#' @param method Character. Method for calculating midpoints. Options include: -#' \itemize{ -#' \item `"mean"`: Arithmetic mean of the coordinates. -#' \item `"median"`: Median of the coordinates. -#' \item `"weighted_mean"`: Weighted mean of the coordinates. -#' \item `"first_pair"`: Mean of the first pair of coordinates. -#' \item `"meanxfirst"`: Mean of the x-coordinates and first y-coordinate. -#' \item `"meanyfirst"`: Mean of the y-coordinates and first x-coordinate. -#' } -#' @param require_non_missing Character vector. Names of variables that must not be missing for the row to be included. - -#' @return A `data.frame` grouped by `group_vars` with new columns `x_out` and `y_out` containing midpoint coordinates. -#' @keywords internal - -getMidpoints <- function(data, group_vars, - x_vars, y_vars, - x_out, y_out, method = "mean", - require_non_missing = group_vars) { - # ----- - # Filter for complete data if requested - if (!is.null(require_non_missing)) { - data <- data |> - dplyr::filter( - dplyr::if_all(!!!rlang::syms(require_non_missing), ~ !is.na(.)) - ) - } - - # ----- - # Apply selected midpoint method - # ----- - if (method == "mean") { - # Average all xs and Average of all y values - - data |> - dplyr::group_by(!!!rlang::syms(group_vars)) |> - dplyr::summarize( - !!x_out := mean(c(!!!rlang::syms(x_vars)), na.rm = TRUE), - !!y_out := mean(c(!!!rlang::syms(y_vars)), na.rm = TRUE), - .groups = "drop" - ) - } else if (method == "median") { - # Median of all xs and Median of all y values - data |> - dplyr::group_by(!!!rlang::syms(group_vars)) |> - dplyr::summarize( - !!x_out := stats::median(c(!!!rlang::syms(x_vars)), na.rm = TRUE), - !!y_out := stats::median(c(!!!rlang::syms(y_vars)), na.rm = TRUE), - .groups = "drop" - ) - } else if (method == "weighted_mean") { - # Weighted average (same weight for all unless specified externally) - - data |> - dplyr::group_by(!!!rlang::syms(group_vars)) |> - dplyr::summarize( - !!x_out := stats::weighted.mean(c(!!!rlang::syms(x_vars)), na.rm = TRUE), - !!y_out := stats::weighted.mean(c(!!!rlang::syms(y_vars)), na.rm = TRUE), - .groups = "drop" - ) - } else if (method == "first_pair") { - # Use only the first value in each pair of x/y coordinates - # This is useful for spousal pairs or sibling groups - data |> - dplyr::group_by(!!!rlang::syms(group_vars)) |> - dplyr::summarize( - !!x_out := mean(c( - dplyr::first(.data[[x_vars[1]]]), - dplyr::first(.data[[x_vars[2]]]) - ), na.rm = TRUE), - !!y_out := mean(c( - dplyr::first(.data[[y_vars[1]]]), - dplyr::first(.data[[y_vars[2]]]) - ), na.rm = TRUE), - .groups = "drop" - ) - } else if (method == "meanxfirst") { - # Use the mean of all x coordinates and the first y coordinate - data |> - dplyr::group_by(!!!rlang::syms(group_vars)) |> - dplyr::summarize( - !!x_out := mean(c(!!!rlang::syms(x_vars)), na.rm = TRUE), - !!y_out := mean(c( - dplyr::first(.data[[y_vars[1]]]), - dplyr::first(.data[[y_vars[2]]]) - ), na.rm = TRUE), - .groups = "drop" - ) - } else if (method == "meanyfirst") { - # First x, mean of all y - data |> - dplyr::group_by(!!!rlang::syms(group_vars)) |> - dplyr::summarize( - !!x_out := mean(c( - dplyr::first(.data[[x_vars[1]]]), - dplyr::first(.data[[x_vars[2]]]) - ), na.rm = TRUE), - !!y_out := mean(c(!!!rlang::syms(y_vars)), na.rm = TRUE), - .groups = "drop" - ) - } else { - # Handle unsupported method argument - stop("Unsupported method.") - } -} #' Process duplicate appearances of individuals in a pedigree layout #' @@ -671,7 +616,7 @@ processExtras <- function(ped, config = list()) { "newID", "personID", "x_pos", "y_pos", - "dadID", "momID", + "dadID", "momID","parenthash", "spouseID" ) @@ -710,6 +655,36 @@ processExtras <- function(ped, config = list()) { multiple = "all" ) + # Parenthash coordinates + parenthash_coords <- extras |> # need to get mom and dad coordinates + dplyr::left_join(mom_coords, by = c("newID", "personID", "momID")) |> + dplyr::left_join(dad_coords, by = c("newID", "personID", "dadID")) |> + dplyr::left_join( + ped, + by = c("parenthash"), + suffix = c("", "_sib"), + multiple = "all" + ) |> + dplyr::filter(!is.na(.data$parenthash)) |> + dplyr::mutate( + x_parenthash = mean(c( + .data$x_dad, + .data$x_mom + )), + y_parenthash = mean(c( + .data$y_dad, + .data$y_mom + )) + ) |> + dplyr::select( + .data$newID, + .data$personID, + .data$parenthash, + .data$x_parenthash, + .data$y_parenthash + ) + + # Coordinates of the individual's other appearance ("self") self_coords <- extras |> dplyr::left_join( @@ -743,9 +718,13 @@ processExtras <- function(ped, config = list()) { dplyr::left_join(spouse_coords, by = c("newID", "personID", "spouseID"), multiple = "all" + ) |> + dplyr::left_join(parenthash_coords, + by = c("newID", "personID", "parenthash"), + multiple = "all" ) - +#print(extras) # ----- # Compute Euclidean distances between this appearance and: # - mom, dad, spouse @@ -791,9 +770,22 @@ processExtras <- function(ped, config = list()) { x1 = .data$x_pos, y1 = .data$y_pos, x2 = .data$x_otherself, - y2 = .data$y_otherself) + y2 = .data$y_otherself), + dist_parenthash = computeDistance(method = "cityblock", + x1 = .data$x_pos, + y1 = .data$y_pos, + x2 = .data$x_parenthash, + y2 = .data$y_parenthash), + dist_parenthash_other = computeDistance(method = "cityblock", + x1 = .data$x_otherself, + y1 = .data$y_otherself, + x2 = .data$x_parenthash, + y2 = .data$y_parenthash) + + ) + # ----- # When there are multiple spouses, keep only the appearance # where the individual is closest to one of their spouses. @@ -842,6 +834,11 @@ processExtras <- function(ped, config = list()) { # is.na(.data$dist_spouse) ~ FALSE, # !is.na(.data$dist_spouse) & is.na(.data$dist_spouse_other) ~ TRUE, TRUE ~ TRUE + ), + parenthash_closer = dplyr::case_when( + .data$dist_parenthash < .data$dist_parenthash_other ~ TRUE, + .data$dist_parenthash_other < .data$dist_parenthash ~ FALSE, + TRUE ~ TRUE ) ) @@ -850,14 +847,14 @@ processExtras <- function(ped, config = list()) { # - Determine which of mom, dad, or spouse is closest in absolute terms # - Use that to decide whether to retain connections to that relative - extras <- extras |> - dplyr::mutate( - closest_relative = dplyr::case_when( - .data$dist_mom <= .data$dist_dad & .data$dist_mom <= .data$dist_spouse ~ "mom", - .data$dist_dad < .data$dist_mom & .data$dist_dad <= .data$dist_spouse ~ "dad", - TRUE ~ "spouse" - ) - ) +# extras <- extras |> +# dplyr::mutate( +# closest_relative = dplyr::case_when( +# .data$dist_mom <= .data$dist_dad & .data$dist_mom <= .data$dist_spouse ~ "mom", +# .data$dist_dad < .data$dist_mom & .data$dist_dad <= .data$dist_spouse ~ "dad", +# TRUE ~ "spouse" +# ) +# ) # ----- # Based on which relative is closest, determine which links to keep @@ -865,9 +862,16 @@ processExtras <- function(ped, config = list()) { extras <- extras |> dplyr::mutate( - link_as_mom = .data$closest_relative %in% c("mom", "dad") & .data$mom_closer, - link_as_dad = .data$closest_relative %in% c("mom", "dad") & .data$dad_closer, - link_as_spouse = .data$closest_relative == "spouse" & .data$spouse_closer + link_as_mom = .data$mom_closer, + link_as_dad = .data$dad_closer, + link_as_spouse = .data$spouse_closer, + link_as_sibling = .data$parenthash_closer + ) %>% dplyr::mutate( + link_any = dplyr::case_when( + .data$link_as_mom == TRUE | .data$link_as_dad == TRUE | + .data$link_as_sibling == TRUE| .data$link_as_spouse == TRUE ~ TRUE, + TRUE ~ FALSE + ) ) @@ -881,6 +885,8 @@ processExtras <- function(ped, config = list()) { .data$link_as_dad, .data$link_as_mom, .data$link_as_spouse, + .data$link_as_sibling, + .data$link_any, .data$x_otherself, .data$y_otherself ) @@ -912,44 +918,24 @@ processExtras <- function(ped, config = list()) { .data$link_as_dad == FALSE ~ FALSE ), link_as_spouse = dplyr::case_when( + .data$link_as_spouse == FALSE ~ FALSE, is.na(.data$link_as_spouse) ~ TRUE, - .data$link_as_spouse == TRUE ~ TRUE, - .data$link_as_spouse == FALSE ~ FALSE + .data$link_as_spouse == TRUE ~ TRUE), + link_as_sibling = dplyr::case_when( + is.na(.data$link_as_sibling) ~ TRUE, + .data$link_as_sibling == TRUE ~ TRUE, + .data$link_as_sibling == FALSE ~ FALSE), + link_any = dplyr::case_when( + is.na(.data$link_any) ~ TRUE, + .data$link_any == TRUE ~ TRUE, + .data$link_any == FALSE ~ FALSE) + ) |> filter( + .data$link_any == TRUE ) - ) - return(ped) -} -#' Compute distance between two points -#' -#' This function calculates the distance between two points in a 2D space using -#' Minkowski distance. It can be used to compute Euclidean or Manhattan distance. -#' It is a utility function for calculating distances in pedigree layouts. -#' Defaults to Euclidean distance if no method is specified. -#' -#' -#' @param x1 Numeric. X-coordinate of the first point. -#' @param y1 Numeric. Y-coordinate of the first point. -#' @param x2 Numeric. X-coordinate of the second point. -#' @param y2 Numeric. Y-coordinate of the second point. -#' @param method Character. Method of distance calculation. Options are "euclidean", "cityblock", and "Minkowski". -#' @param p Numeric. The order of the Minkowski distance. If NULL, defaults to 2 for Euclidean and 1 for Manhattan. If -#' Minkowski method is used, p should be specified. - -computeDistance <- function(x1, y1, x2, y2, - method = "euclidean", p = NULL) { - - method <- tolower(method) - - if(is.null(p)) { - p <- switch(method, - euclidean = 2, - cityblock = 1, - stop("Invalid distance method. Choose from 'euclidean', 'cityblock', or specify p.") - ) - } - # Calculate Minkowski distance - ((abs(x1 - x2))^p + (abs(y1 - y2))^p)^(1 / p) + + return(ped) } + diff --git a/R/calcCoordinatesHelpers.R b/R/calcCoordinatesHelpers.R new file mode 100644 index 00000000..0ddd90f2 --- /dev/null +++ b/R/calcCoordinatesHelpers.R @@ -0,0 +1,150 @@ +#' Compute distance between two points +#' +#' This function calculates the distance between two points in a 2D space using +#' Minkowski distance. It can be used to compute Euclidean or Manhattan distance. +#' It is a utility function for calculating distances in pedigree layouts. +#' Defaults to Euclidean distance if no method is specified. +#' +#' +#' @param x1 Numeric. X-coordinate of the first point. +#' @param y1 Numeric. Y-coordinate of the first point. +#' @param x2 Numeric. X-coordinate of the second point. +#' @param y2 Numeric. Y-coordinate of the second point. +#' @param method Character. Method of distance calculation. Options are "euclidean", "cityblock", and "Minkowski". +#' @param p Numeric. The order of the Minkowski distance. If NULL, defaults to 2 for Euclidean and 1 for Manhattan. If +#' Minkowski method is used, p should be specified. + +computeDistance <- function(x1, y1, x2, y2, + method = "euclidean", p = NULL) { + + method <- tolower(method) + + if(is.null(p)) { + p <- switch(method, + euclidean = 2, + cityblock = 1, + stop("Invalid distance method. Choose from 'euclidean', 'cityblock', or specify p.") + ) + } + # Calculate Minkowski distance + + ((abs(x1 - x2))^p + (abs(y1 - y2))^p)^(1 / p) + +} + +#' Compute midpoints across grouped coordinates +#' +#' A flexible utility function to compute x and y midpoints for groups of individuals +#' using a specified method. Used to support positioning logic for sibling groups, +#' parental dyads, or spousal pairs in pedigree layouts. +#' @param data A `data.frame` containing the coordinate and grouping variables. +#' @param group_vars Character vector. Names of the grouping variables. +#' @param x_vars Character vector. Names of the x-coordinate variables to be averaged. +#' @param y_vars Character vector. Names of the y-coordinate variables to be averaged. +#' @param x_out Character. Name of the output column for the x-coordinate midpoint. +#' @param y_out Character. Name of the output column for the y-coordinate midpoint. +#' @param method Character. Method for calculating midpoints. Options include: +#' \itemize{ +#' \item `"mean"`: Arithmetic mean of the coordinates. +#' \item `"median"`: Median of the coordinates. +#' \item `"weighted_mean"`: Weighted mean of the coordinates. +#' \item `"first_pair"`: Mean of the first pair of coordinates. +#' \item `"meanxfirst"`: Mean of the x-coordinates and first y-coordinate. +#' \item `"meanyfirst"`: Mean of the y-coordinates and first x-coordinate. +#' } +#' @param require_non_missing Character vector. Names of variables that must not be missing for the row to be included. + +#' @return A `data.frame` grouped by `group_vars` with new columns `x_out` and `y_out` containing midpoint coordinates. +#' @keywords internal + +getMidpoints <- function(data, group_vars, + x_vars, y_vars, + x_out, y_out, method = "mean", + require_non_missing = group_vars) { + # ----- + # Filter for complete data if requested + if (!is.null(require_non_missing)) { + data <- data |> + dplyr::filter( + dplyr::if_all(!!!rlang::syms(require_non_missing), ~ !is.na(.)) + ) + } + + # ----- + # Apply selected midpoint method + # ----- + + if (method == "mean") { + # Average all xs and Average of all y values + + data |> + dplyr::group_by(!!!rlang::syms(group_vars)) |> + dplyr::summarize( + !!x_out := mean(c(!!!rlang::syms(x_vars)), na.rm = TRUE), + !!y_out := mean(c(!!!rlang::syms(y_vars)), na.rm = TRUE), + .groups = "drop" + ) + } else if (method == "median") { + # Median of all xs and Median of all y values + data |> + dplyr::group_by(!!!rlang::syms(group_vars)) |> + dplyr::summarize( + !!x_out := stats::median(c(!!!rlang::syms(x_vars)), na.rm = TRUE), + !!y_out := stats::median(c(!!!rlang::syms(y_vars)), na.rm = TRUE), + .groups = "drop" + ) + } else if (method == "weighted_mean") { + # Weighted average (same weight for all unless specified externally) + + data |> + dplyr::group_by(!!!rlang::syms(group_vars)) |> + dplyr::summarize( + !!x_out := stats::weighted.mean(c(!!!rlang::syms(x_vars)), na.rm = TRUE), + !!y_out := stats::weighted.mean(c(!!!rlang::syms(y_vars)), na.rm = TRUE), + .groups = "drop" + ) + } else if (method == "first_pair") { + # Use only the first value in each pair of x/y coordinates + # This is useful for spousal pairs or sibling groups + data |> + dplyr::group_by(!!!rlang::syms(group_vars)) |> + dplyr::summarize( + !!x_out := mean(c( + dplyr::first(.data[[x_vars[1]]]), + dplyr::first(.data[[x_vars[2]]]) + ), na.rm = TRUE), + !!y_out := mean(c( + dplyr::first(.data[[y_vars[1]]]), + dplyr::first(.data[[y_vars[2]]]) + ), na.rm = TRUE), + .groups = "drop" + ) + } else if (method == "meanxfirst") { + # Use the mean of all x coordinates and the first y coordinate + data |> + dplyr::group_by(!!!rlang::syms(group_vars)) |> + dplyr::summarize( + !!x_out := mean(c(!!!rlang::syms(x_vars)), na.rm = TRUE), + !!y_out := mean(c( + dplyr::first(.data[[y_vars[1]]]), + dplyr::first(.data[[y_vars[2]]]) + ), na.rm = TRUE), + .groups = "drop" + ) + } else if (method == "meanyfirst") { + # First x, mean of all y + data |> + dplyr::group_by(!!!rlang::syms(group_vars)) |> + dplyr::summarize( + !!x_out := mean(c( + dplyr::first(.data[[x_vars[1]]]), + dplyr::first(.data[[x_vars[2]]]) + ), na.rm = TRUE), + !!y_out := mean(c(!!!rlang::syms(y_vars)), na.rm = TRUE), + .groups = "drop" + ) + } else { + # Handle unsupported method argument + stop("Unsupported method.") + } +} diff --git a/R/ggpedigree.R b/R/ggpedigree.R index b90cdc1e..98fec2af 100644 --- a/R/ggpedigree.R +++ b/R/ggpedigree.R @@ -11,6 +11,7 @@ #' @param momID Character string specifying the column name for mother IDs. Defaults to "momID". #' @param dadID Character string specifying the column name for father IDs. Defaults to "dadID". #' @param status_col Character string specifying the column name for affected status. Defaults to NULL. +#' @param debug Logical. If TRUE, prints debugging information. Default: FALSE. #' @param config A list of configuration options for customizing the plot. The list can include: #' \describe{ #' \item{code_male}{Integer or string. Value identifying males in the sex column. (typically 0 or 1) Default: 1.} @@ -41,7 +42,8 @@ ggPedigree <- function(ped, famID = "famID", momID = "momID", dadID = "dadID", status_col = NULL, - config = list()) { + config = list(), + debug = FALSE) { # ----- # STEP 1: Configuration and Preparation # ----- @@ -70,9 +72,13 @@ ggPedigree <- function(ped, famID = "famID", sex_color = TRUE, status_vals = c(1, 0), max_overlaps = 15, - id_segment_color = NA + id_segment_color = NA, + hints = NULL ) + + + # Merge with user-specified overrides # This allows the user to override any of the default values config <- utils::modifyList(default_config, config) @@ -144,7 +150,8 @@ ggPedigree <- function(ped, famID = "famID", # ----- # Generate a connection table for plotting lines (parents, spouses, etc.) - connections <- calculateConnections(ds, config = config) + connections <- calculateConnections(ds, config = config) |> + unique() # remove duplicates # ----- # STEP 6: Initialize Plot @@ -192,7 +199,11 @@ ggPedigree <- function(ped, famID = "famID", ) + # Mid-sibling to parents midpoint ggplot2::geom_segment( - data = connections, + data = dplyr::filter( + connections, + !is.na(.data$x_mom) & !is.na(.data$y_mom) & + !is.na(.data$x_dad) & !is.na(.data$y_dad) + ), ggplot2::aes( x = .data$x_pos, xend = .data$x_mid_sib, @@ -205,7 +216,11 @@ ggPedigree <- function(ped, famID = "famID", ) + # Sibling vertical drop line ggplot2::geom_segment( - data = connections, + data = dplyr::filter( + connections, + !is.na(.data$x_mom) & !is.na(.data$y_mom) & + !is.na(.data$x_dad) & !is.na(.data$y_dad) + ), ggplot2::aes( x = .data$x_pos, xend = .data$x_pos, @@ -384,8 +399,17 @@ ggPedigree <- function(ped, famID = "famID", } else { p <- p + ggplot2::labs(shape = "Sex") } - - return(p) +if (debug==TRUE) { + return(list( + plot = p, + data = ds, + connections = connections, + config = config + )) + } else { + # If debug is FALSE, return only the plot + return(p) + } } diff --git a/tests/testthat/test-calcCoordinates.R b/tests/testthat/test-calcCoordinates.R index 495f004e..9eb79ec9 100644 --- a/tests/testthat/test-calcCoordinates.R +++ b/tests/testthat/test-calcCoordinates.R @@ -120,3 +120,23 @@ test_that("getRelativeCoordinates returns expected coordinates for mother", { expect_false("A" %in% mom_coords$personID) expect_false("B" %in% mom_coords$personID) }) + + +test_that("broken hints doesn't cause a fatal error", { + library(BGmisc) + data("potter") + potter$momID[1] <- NA + potter$dadID[1] <- NA + + # Test with hints + expect_warning( + ggPedigree(potter, + famID = "famID", + personID = "personID", + config= list(hints = TRUE) + ) + ) +} + ) + + diff --git a/vignettes/plots.Rmd b/vignettes/plots.Rmd index 1700f359..1dcfef69 100644 --- a/vignettes/plots.Rmd +++ b/vignettes/plots.Rmd @@ -44,7 +44,8 @@ library(tidyverse) # for data wrangling data("potter") ggPedigree(potter, famID = "famID", - personID = "personID" + personID = "personID", + config= list(hints = TRUE) ) ``` @@ -173,7 +174,7 @@ p + # More complex examples -```{r, message=FALSE, warning=FALSE} +```{r message=FALSE, warning=FALSE} library(BGmisc) # helper utilities & example data data("inbreeding") @@ -181,10 +182,13 @@ df <- inbreeding p <- ggPedigree( - df, + df %>% filter( + famID %in% c(5,7), + ), famID = "famID", personID = "ID", status_col = "proband", + debug = TRUE, config = list( code_male = 0, sex_color = F, @@ -197,7 +201,10 @@ p <- ggPedigree( sibling_segment_color = "blue", parent_segment_color = "green", offspring_segment_color = "black") -) + facet_wrap(~famID, scales= "free") +)# + facet_wrap(~famID, scales= "free") -p +p$connections%>%filter(personID ==61) %>% nrow() +p$connections%>%filter(personID ==61) %>% unique() + +p$plot ``` From 43c95425184961ee3d0190edd0cb3fcf4e39b9fb Mon Sep 17 00:00:00 2001 From: Mason Garrison Date: Wed, 14 May 2025 10:11:26 -0400 Subject: [PATCH 06/18] Process duplicates moved to own file --- R/calcCoordinates.R | 376 +------------------------------------------- R/processExtras.R | 375 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 376 insertions(+), 375 deletions(-) create mode 100644 R/processExtras.R diff --git a/R/calcCoordinates.R b/R/calcCoordinates.R index 24bd32a4..eec457e4 100644 --- a/R/calcCoordinates.R +++ b/R/calcCoordinates.R @@ -71,7 +71,7 @@ calculateCoordinates <- function(ped, personID = "ID", momID = "momID", if(!is.null(config$hints)) { - +#' Check if hints are provided autohint <- tryCatch(kinship2::autohint(ped_ped,config$hints, align = config$ped_align, packed = config$ped_packed), @@ -564,378 +564,4 @@ getRelativeCoordinates <- function(ped, connections, relativeIDvar, x_name, y_na -#' Process duplicate appearances of individuals in a pedigree layout -#' -#' Resolves layout conflicts when the same individual appears in multiple places -#' (e.g., due to inbreeding loops). Keeps the layout point that is closest to a relevant -#' relative (mom, dad, or spouse) and removes links to others to avoid confusion in visualization. -#' -#' @param ped A data.frame containing pedigree layout info with columns including: -#' `personID`, `x_pos`, `y_pos`, `dadID`, `momID`, and a logical column `extra`. -#' @param config A list of configuration options. Currently unused but passed through to internal helpers. -#' -#' @return A modified `ped` data.frame with updated coordinates and removed duplicates. -#' -#' @keywords internal - -processExtras <- function(ped, config = list()) { - # ----- - # Check inputs - # ----- - if (!inherits(ped, "data.frame")) { - stop("ped should be a data.frame or inherit to a data.frame") - } - if (!all(c("personID", "x_pos", "y_pos", "dadID", "momID") %in% names(ped))) { - stop("ped must contain personID, x_pos, y_pos, dadID, and momID columns") - } - - # default config - default_config <- list() - config <- utils::modifyList(default_config, config) - - # ----- - # Identify duplicated individuals - # ----- - - # Find all individuals with extra appearances - idsextras <- dplyr::filter(ped, .data$extra == TRUE) |> - dplyr::select("personID") |> - dplyr::pull() |> - unique() - - - # Assign unique ID per row for later use - ped$newID <- 1:nrow(ped) - - - # ----- - # Subset to duplicated entries only # note that tidyselect hates .data pronouns - # ----- - extras <- dplyr::filter(ped, .data$personID %in% idsextras) |> - dplyr::select( - "newID", - "personID", - "x_pos", "y_pos", - "dadID", "momID","parenthash", - "spouseID" - ) - - # ----- - # Get coordinate positions of relatives and other-self - # ----- - - # Mother's coordinates - mom_coords <- getRelativeCoordinates( - ped = ped, - connections = extras, - relativeIDvar = "momID", - x_name = "x_mom", - y_name = "y_mom", - multiple = "any" - ) - - # Father's coordinates - - dad_coords <- getRelativeCoordinates( - ped = ped, - connections = extras, - relativeIDvar = "dadID", - x_name = "x_dad", - y_name = "y_dad", - multiple = "any" - ) - - # Spouse's coordinates - spouse_coords <- getRelativeCoordinates( - ped = ped, - connections = extras, - relativeIDvar = "spouseID", - x_name = "x_spouse", - y_name = "y_spouse", - multiple = "all" - ) - - # Parenthash coordinates - parenthash_coords <- extras |> # need to get mom and dad coordinates - dplyr::left_join(mom_coords, by = c("newID", "personID", "momID")) |> - dplyr::left_join(dad_coords, by = c("newID", "personID", "dadID")) |> - dplyr::left_join( - ped, - by = c("parenthash"), - suffix = c("", "_sib"), - multiple = "all" - ) |> - dplyr::filter(!is.na(.data$parenthash)) |> - dplyr::mutate( - x_parenthash = mean(c( - .data$x_dad, - .data$x_mom - )), - y_parenthash = mean(c( - .data$y_dad, - .data$y_mom - )) - ) |> - dplyr::select( - .data$newID, - .data$personID, - .data$parenthash, - .data$x_parenthash, - .data$y_parenthash - ) - - - # Coordinates of the individual's other appearance ("self") - self_coords <- extras |> - dplyr::left_join( - ped, - by = c("personID"), - suffix = c("", "_other"), - # relationship = relationship, - multiple = "all" - ) |> - dplyr::filter(.data$newID != .data$newID_other) |> - dplyr::mutate( - x_otherself = .data$x_pos_other, - y_otherself = .data$y_pos_other - ) |> - dplyr::select( - .data$newID, - .data$personID, - .data$newID_other, - .data$x_otherself, - .data$y_otherself - ) - - # ----- - # Merge coordinates into the extra rows - # ----- - - extras <- extras |> - dplyr::left_join(mom_coords, by = c("newID", "personID", "momID")) |> - dplyr::left_join(dad_coords, by = c("newID", "personID", "dadID")) |> - dplyr::left_join(self_coords, by = c("newID", "personID")) |> - dplyr::left_join(spouse_coords, - by = c("newID", "personID", "spouseID"), - multiple = "all" - ) |> - dplyr::left_join(parenthash_coords, - by = c("newID", "personID", "parenthash"), - multiple = "all" - ) - -#print(extras) - # ----- - # Compute Euclidean distances between this appearance and: - # - mom, dad, spouse - # - same individual in other location (otherself) - # These will be used to choose the "closest" relationship. - # minkowski distance could be used here as well aka "city block" distance - # ----- - extras <- extras |> - dplyr::mutate( - dist_mom = computeDistance(method = "cityblock", - x1 = .data$x_pos, - y1 = .data$y_pos, - x2 = .data$x_mom, - y2 = .data$y_mom), - - dist_mom_other = computeDistance(method = "cityblock", - x1 = .data$x_otherself, - y1 = .data$y_otherself, - x2 = .data$x_mom, - y2 = .data$y_mom), - dist_dad = computeDistance(method = "cityblock", - x1 = .data$x_pos, - y1 = .data$y_pos, - x2 = .data$x_dad, - y2 = .data$y_dad), - dist_dad_other = computeDistance(method = "cityblock", - x1 = .data$x_otherself, - y1 = .data$y_otherself, - x2 = .data$x_dad, - y2 = .data$y_dad), - # spouse distance - dist_spouse = computeDistance(method = "cityblock", - x1 = .data$x_pos, - y1 = .data$y_pos, - x2 = .data$x_spouse, - y2 = .data$y_spouse), - dist_spouse_other = computeDistance(method = "cityblock", - x1 = .data$x_otherself, - y1 = .data$y_otherself, - x2 = .data$x_spouse, - y2 = .data$y_spouse), - dist_otherself = computeDistance(method = "cityblock", - x1 = .data$x_pos, - y1 = .data$y_pos, - x2 = .data$x_otherself, - y2 = .data$y_otherself), - dist_parenthash = computeDistance(method = "cityblock", - x1 = .data$x_pos, - y1 = .data$y_pos, - x2 = .data$x_parenthash, - y2 = .data$y_parenthash), - dist_parenthash_other = computeDistance(method = "cityblock", - x1 = .data$x_otherself, - y1 = .data$y_otherself, - x2 = .data$x_parenthash, - y2 = .data$y_parenthash) - - - ) - - - # ----- - # When there are multiple spouses, keep only the appearance - # where the individual is closest to one of their spouses. - # ----- - - extras <- extras |> - dplyr::group_by(.data$newID, .data$personID) |> - dplyr::mutate( - min_spouse = min(.data$dist_spouse, na.rm = TRUE), - num_spouse = dplyr::n() - ) |> - dplyr::ungroup() - extras <- extras |> - dplyr::filter(.data$num_spouse == 1 | .data$dist_spouse == .data$min_spouse) |> - dplyr::select( - -.data$min_spouse, - -.data$num_spouse - ) - - - # ----- - # Determine the "closest relative" to this duplicated row - # ----- - - - - # For each duplicated appearance, we now ask: - # - Is this appearance closer to mom than the otherself copy is? - # - Same for dad? For spouse? - - extras <- extras |> - dplyr::mutate( - mom_closer = dplyr::case_when( - .data$dist_mom < .data$dist_mom_other ~ TRUE, - .data$dist_mom_other < .data$dist_mom ~ FALSE, - TRUE ~ TRUE - ), - dad_closer = dplyr::case_when( - .data$dist_dad < .data$dist_dad_other ~ TRUE, - .data$dist_dad_other < .data$dist_dad ~ FALSE, - TRUE ~ TRUE - ), - spouse_closer = dplyr::case_when( - .data$dist_spouse < .data$dist_spouse_other ~ TRUE, - .data$dist_spouse_other < .data$dist_spouse ~ FALSE, - # is.na(.data$dist_spouse) ~ FALSE, - # !is.na(.data$dist_spouse) & is.na(.data$dist_spouse_other) ~ TRUE, - TRUE ~ TRUE - ), - parenthash_closer = dplyr::case_when( - .data$dist_parenthash < .data$dist_parenthash_other ~ TRUE, - .data$dist_parenthash_other < .data$dist_parenthash ~ FALSE, - TRUE ~ TRUE - ) - ) - - - # Then: - # - Determine which of mom, dad, or spouse is closest in absolute terms - # - Use that to decide whether to retain connections to that relative - -# extras <- extras |> -# dplyr::mutate( -# closest_relative = dplyr::case_when( -# .data$dist_mom <= .data$dist_dad & .data$dist_mom <= .data$dist_spouse ~ "mom", -# .data$dist_dad < .data$dist_mom & .data$dist_dad <= .data$dist_spouse ~ "dad", -# TRUE ~ "spouse" -# ) -# ) - - # ----- - # Based on which relative is closest, determine which links to keep - # ----- - - extras <- extras |> - dplyr::mutate( - link_as_mom = .data$mom_closer, - link_as_dad = .data$dad_closer, - link_as_spouse = .data$spouse_closer, - link_as_sibling = .data$parenthash_closer - ) %>% dplyr::mutate( - link_any = dplyr::case_when( - .data$link_as_mom == TRUE | .data$link_as_dad == TRUE | - .data$link_as_sibling == TRUE| .data$link_as_spouse == TRUE ~ TRUE, - TRUE ~ FALSE - ) - ) - - - # ----- - # Final subset of relevant decision columns - # ----- - - skinnyextras <- extras |> - dplyr::select( - .data$newID, - .data$link_as_dad, - .data$link_as_mom, - .data$link_as_spouse, - .data$link_as_sibling, - .data$link_any, - .data$x_otherself, - .data$y_otherself - ) - - - # ----- - # Apply decisions to main pedigree - # Removes connection references for non-kept parents/spouses - # ----- - - ped <- ped |> - dplyr::left_join(skinnyextras, - by = c("newID"), suffix = c("", "_")#, - # relationship = "one-to-one" - ) |> - dplyr::select( - -"newID" - ) |> - # set the connection columns to TRUE if not kept - dplyr::mutate( - link_as_mom = dplyr::case_when( - is.na(.data$link_as_mom) ~ TRUE, - .data$link_as_mom == TRUE ~ TRUE, - .data$link_as_mom == FALSE ~ FALSE - ), - link_as_dad = dplyr::case_when( - is.na(.data$link_as_dad) ~ TRUE, - .data$link_as_dad == TRUE ~ TRUE, - .data$link_as_dad == FALSE ~ FALSE - ), - link_as_spouse = dplyr::case_when( - .data$link_as_spouse == FALSE ~ FALSE, - is.na(.data$link_as_spouse) ~ TRUE, - .data$link_as_spouse == TRUE ~ TRUE), - link_as_sibling = dplyr::case_when( - is.na(.data$link_as_sibling) ~ TRUE, - .data$link_as_sibling == TRUE ~ TRUE, - .data$link_as_sibling == FALSE ~ FALSE), - link_any = dplyr::case_when( - is.na(.data$link_any) ~ TRUE, - .data$link_any == TRUE ~ TRUE, - .data$link_any == FALSE ~ FALSE) - ) |> filter( - .data$link_any == TRUE - ) - - - - return(ped) -} - diff --git a/R/processExtras.R b/R/processExtras.R new file mode 100644 index 00000000..10f3f9b9 --- /dev/null +++ b/R/processExtras.R @@ -0,0 +1,375 @@ + + +#' Process duplicate appearances of individuals in a pedigree layout +#' +#' Resolves layout conflicts when the same individual appears in multiple places +#' (e.g., due to inbreeding loops). Keeps the layout point that is closest to a relevant +#' relative (mom, dad, or spouse) and removes links to others to avoid confusion in visualization. +#' +#' @param ped A data.frame containing pedigree layout info with columns including: +#' `personID`, `x_pos`, `y_pos`, `dadID`, `momID`, and a logical column `extra`. +#' @param config A list of configuration options. Currently unused but passed through to internal helpers. +#' +#' @return A modified `ped` data.frame with updated coordinates and removed duplicates. +#' +#' @keywords internal + +processExtras <- function(ped, config = list()) { + # ----- + # Check inputs + # ----- + if (!inherits(ped, "data.frame")) { + stop("ped should be a data.frame or inherit to a data.frame") + } + if (!all(c("personID", "x_pos", "y_pos", "dadID", "momID") %in% names(ped))) { + stop("ped must contain personID, x_pos, y_pos, dadID, and momID columns") + } + + # default config + default_config <- list() + config <- utils::modifyList(default_config, config) + + # ----- + # Identify duplicated individuals + # ----- + + # Find all individuals with extra appearances + idsextras <- dplyr::filter(ped, .data$extra == TRUE) |> + dplyr::select("personID") |> + dplyr::pull() |> + unique() + + + # Assign unique ID per row for later use + ped$newID <- 1:nrow(ped) + + + # ----- + # Subset to duplicated entries only # note that tidyselect hates .data pronouns + # ----- + extras <- dplyr::filter(ped, .data$personID %in% idsextras) |> + dplyr::select( + "newID", + "personID", + "x_pos", "y_pos", + "dadID", "momID","parenthash", + "spouseID" + ) + + # ----- + # Get coordinate positions of relatives and other-self + # ----- + + # Mother's coordinates + mom_coords <- getRelativeCoordinates( + ped = ped, + connections = extras, + relativeIDvar = "momID", + x_name = "x_mom", + y_name = "y_mom", + multiple = "any" + ) + + # Father's coordinates + + dad_coords <- getRelativeCoordinates( + ped = ped, + connections = extras, + relativeIDvar = "dadID", + x_name = "x_dad", + y_name = "y_dad", + multiple = "any" + ) + + # Spouse's coordinates + spouse_coords <- getRelativeCoordinates( + ped = ped, + connections = extras, + relativeIDvar = "spouseID", + x_name = "x_spouse", + y_name = "y_spouse", + multiple = "all" + ) + + # Parenthash coordinates + parenthash_coords <- extras |> # need to get mom and dad coordinates + dplyr::left_join(mom_coords, by = c("newID", "personID", "momID")) |> + dplyr::left_join(dad_coords, by = c("newID", "personID", "dadID")) |> + dplyr::left_join( + ped, + by = c("parenthash"), + suffix = c("", "_sib"), + multiple = "all" + ) |> + dplyr::filter(!is.na(.data$parenthash)) |> + dplyr::mutate( + x_parenthash = mean(c( + .data$x_dad, + .data$x_mom + )), + y_parenthash = mean(c( + .data$y_dad, + .data$y_mom + )) + ) |> + dplyr::select( + .data$newID, + .data$personID, + .data$parenthash, + .data$x_parenthash, + .data$y_parenthash + ) + + + # Coordinates of the individual's other appearance ("self") + self_coords <- extras |> + dplyr::left_join( + ped, + by = c("personID"), + suffix = c("", "_other"), + # relationship = relationship, + multiple = "all" + ) |> + dplyr::filter(.data$newID != .data$newID_other) |> + dplyr::mutate( + x_otherself = .data$x_pos_other, + y_otherself = .data$y_pos_other + ) |> + dplyr::select( + .data$newID, + .data$personID, + .data$newID_other, + .data$x_otherself, + .data$y_otherself + ) + + # ----- + # Merge coordinates into the extra rows + # ----- + + extras <- extras |> + dplyr::left_join(mom_coords, by = c("newID", "personID", "momID")) |> + dplyr::left_join(dad_coords, by = c("newID", "personID", "dadID")) |> + dplyr::left_join(self_coords, by = c("newID", "personID")) |> + dplyr::left_join(spouse_coords, + by = c("newID", "personID", "spouseID"), + multiple = "all" + ) |> + dplyr::left_join(parenthash_coords, + by = c("newID", "personID", "parenthash"), + multiple = "all" + ) + +#print(extras) + # ----- + # Compute Euclidean distances between this appearance and: + # - mom, dad, spouse + # - same individual in other location (otherself) + # These will be used to choose the "closest" relationship. + # minkowski distance could be used here as well aka "city block" distance + # ----- + extras <- extras |> + dplyr::mutate( + dist_mom = computeDistance(method = "cityblock", + x1 = .data$x_pos, + y1 = .data$y_pos, + x2 = .data$x_mom, + y2 = .data$y_mom), + + dist_mom_other = computeDistance(method = "cityblock", + x1 = .data$x_otherself, + y1 = .data$y_otherself, + x2 = .data$x_mom, + y2 = .data$y_mom), + dist_dad = computeDistance(method = "cityblock", + x1 = .data$x_pos, + y1 = .data$y_pos, + x2 = .data$x_dad, + y2 = .data$y_dad), + dist_dad_other = computeDistance(method = "cityblock", + x1 = .data$x_otherself, + y1 = .data$y_otherself, + x2 = .data$x_dad, + y2 = .data$y_dad), + # spouse distance + dist_spouse = computeDistance(method = "cityblock", + x1 = .data$x_pos, + y1 = .data$y_pos, + x2 = .data$x_spouse, + y2 = .data$y_spouse), + dist_spouse_other = computeDistance(method = "cityblock", + x1 = .data$x_otherself, + y1 = .data$y_otherself, + x2 = .data$x_spouse, + y2 = .data$y_spouse), + dist_otherself = computeDistance(method = "cityblock", + x1 = .data$x_pos, + y1 = .data$y_pos, + x2 = .data$x_otherself, + y2 = .data$y_otherself), + dist_parenthash = computeDistance(method = "cityblock", + x1 = .data$x_pos, + y1 = .data$y_pos, + x2 = .data$x_parenthash, + y2 = .data$y_parenthash), + dist_parenthash_other = computeDistance(method = "cityblock", + x1 = .data$x_otherself, + y1 = .data$y_otherself, + x2 = .data$x_parenthash, + y2 = .data$y_parenthash) + + + ) + + + # ----- + # When there are multiple spouses, keep only the appearance + # where the individual is closest to one of their spouses. + # ----- + + extras <- extras |> + dplyr::group_by(.data$newID, .data$personID) |> + dplyr::mutate( + min_spouse = min(.data$dist_spouse, na.rm = TRUE), + num_spouse = dplyr::n() + ) |> + dplyr::ungroup() + extras <- extras |> + dplyr::filter(.data$num_spouse == 1 | .data$dist_spouse == .data$min_spouse) |> + dplyr::select( + -.data$min_spouse, + -.data$num_spouse + ) + + + # ----- + # Determine the "closest relative" to this duplicated row + # ----- + + + + # For each duplicated appearance, we now ask: + # - Is this appearance closer to mom than the otherself copy is? + # - Same for dad? For spouse? + + extras <- extras |> + dplyr::mutate( + mom_closer = dplyr::case_when( + .data$dist_mom < .data$dist_mom_other ~ TRUE, + .data$dist_mom_other < .data$dist_mom ~ FALSE, + TRUE ~ TRUE + ), + dad_closer = dplyr::case_when( + .data$dist_dad < .data$dist_dad_other ~ TRUE, + .data$dist_dad_other < .data$dist_dad ~ FALSE, + TRUE ~ TRUE + ), + spouse_closer = dplyr::case_when( + .data$dist_spouse < .data$dist_spouse_other ~ TRUE, + .data$dist_spouse_other < .data$dist_spouse ~ FALSE, + # is.na(.data$dist_spouse) ~ FALSE, + # !is.na(.data$dist_spouse) & is.na(.data$dist_spouse_other) ~ TRUE, + TRUE ~ TRUE + ), + parenthash_closer = dplyr::case_when( + .data$dist_parenthash < .data$dist_parenthash_other ~ TRUE, + .data$dist_parenthash_other < .data$dist_parenthash ~ FALSE, + TRUE ~ TRUE + ) + ) + + + # Then: + # - Determine which of mom, dad, or spouse is closest in absolute terms + # - Use that to decide whether to retain connections to that relative + +# extras <- extras |> +# dplyr::mutate( +# closest_relative = dplyr::case_when( +# .data$dist_mom <= .data$dist_dad & .data$dist_mom <= .data$dist_spouse ~ "mom", +# .data$dist_dad < .data$dist_mom & .data$dist_dad <= .data$dist_spouse ~ "dad", +# TRUE ~ "spouse" +# ) +# ) + + # ----- + # Based on which relative is closest, determine which links to keep + # ----- + + extras <- extras |> + dplyr::mutate( + link_as_mom = .data$mom_closer, + link_as_dad = .data$dad_closer, + link_as_spouse = .data$spouse_closer, + link_as_sibling = .data$parenthash_closer + ) %>% dplyr::mutate( + link_any = dplyr::case_when( + .data$link_as_mom == TRUE | .data$link_as_dad == TRUE | + .data$link_as_sibling == TRUE| .data$link_as_spouse == TRUE ~ TRUE, + TRUE ~ FALSE + ) + ) + + + # ----- + # Final subset of relevant decision columns + # ----- + + skinnyextras <- extras |> + dplyr::select( + .data$newID, + .data$link_as_dad, + .data$link_as_mom, + .data$link_as_spouse, + .data$link_as_sibling, + .data$link_any, + .data$x_otherself, + .data$y_otherself + ) + + + # ----- + # Apply decisions to main pedigree + # Removes connection references for non-kept parents/spouses + # ----- + + ped <- ped |> + dplyr::left_join(skinnyextras, + by = c("newID"), suffix = c("", "_")#, + # relationship = "one-to-one" + ) |> + dplyr::select( + -"newID" + ) |> + # set the connection columns to TRUE if not kept + dplyr::mutate( + link_as_mom = dplyr::case_when( + is.na(.data$link_as_mom) ~ TRUE, + .data$link_as_mom == TRUE ~ TRUE, + .data$link_as_mom == FALSE ~ FALSE + ), + link_as_dad = dplyr::case_when( + is.na(.data$link_as_dad) ~ TRUE, + .data$link_as_dad == TRUE ~ TRUE, + .data$link_as_dad == FALSE ~ FALSE + ), + link_as_spouse = dplyr::case_when( + .data$link_as_spouse == FALSE ~ FALSE, + is.na(.data$link_as_spouse) ~ TRUE, + .data$link_as_spouse == TRUE ~ TRUE), + link_as_sibling = dplyr::case_when( + is.na(.data$link_as_sibling) ~ TRUE, + .data$link_as_sibling == TRUE ~ TRUE, + .data$link_as_sibling == FALSE ~ FALSE), + link_any = dplyr::case_when( + is.na(.data$link_any) ~ TRUE, + .data$link_any == TRUE ~ TRUE, + .data$link_any == FALSE ~ FALSE) + ) |> filter( + .data$link_any == TRUE + ) + + + + return(ped) +} \ No newline at end of file From 4270f15aed33be65ab29057d00477ddedf2d8eb9 Mon Sep 17 00:00:00 2001 From: Mason Garrison Date: Wed, 14 May 2025 10:12:37 -0400 Subject: [PATCH 07/18] moving getRelativeCoordinates --- R/calcCoordinates.R | 69 ------------------------------------ R/calcCoordinatesHelpers.R | 72 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 72 insertions(+), 69 deletions(-) diff --git a/R/calcCoordinates.R b/R/calcCoordinates.R index eec457e4..810c9e52 100644 --- a/R/calcCoordinates.R +++ b/R/calcCoordinates.R @@ -491,76 +491,7 @@ calculateConnections <- function(ped, return(connections) } -#' Get coordinate positions of relatives for each individual -#' -#' Helper function used to retrieve the x and y coordinates of a specified relative -#' (e.g., mom, dad, spouse) and join them into the main connection table. This supports -#' relative-specific positioning in downstream layout functions like `calculateConnections()`. -#' -#' @inheritParams ggpedigree -#' @param connections A `data.frame` containing the individuals and their associated relative IDs. -#' @param relativeIDvar Character. Name of the column in `connections` for the relative ID variable. -#' @param x_name Character. Name of the new column to store the x-coordinate of the relative. -#' @param y_name Character. Name of the new column to store the y-coordinate of the relative. -#' @param multiple Character. Specifies how to handle multiple matches. Options are "all" or "any". -#' @param only_unique Logical. If TRUE, return only unique rows. Defaults to TRUE. -#' -#' @return A `data.frame` with columns: -#' \itemize{ -#' \item `personID`, `relativeIDvar` -#' \item `x_name`, `y_name`: Coordinates of the specified relative -#' \item Optionally, `newID` if present in `ped` -#' } -#' @keywords internal - - -getRelativeCoordinates <- function(ped, connections, relativeIDvar, x_name, y_name, - # relationship = "one-to-one", - personID = "personID", - multiple = "all", - only_unique = TRUE) { - # Filter only rows where the relative ID is not missing - # and join with the main pedigree data frame - rel_connections <- connections |> - dplyr::filter(!is.na(.data[[relativeIDvar]])) |> - # Join in the relative's coordinates from `ped`, based on relative ID - dplyr::left_join( - ped, - by = stats::setNames(personID, relativeIDvar), - suffix = c("", "_rel"), - # relationship = relationship, - multiple = multiple - ) |> - # Rename the joined coordinate columns to the specified x/y output names - dplyr::rename( - !!x_name := "x_pos_rel", - !!y_name := "y_pos_rel" - ) - # If the ped includes a 'newID' column (used to track duplicates), retain it in the result - if ("newID" %in% names(ped)) { - rel_connections <- rel_connections |> - dplyr::select( - !!personID, - "newID", - !!relativeIDvar, - !!x_name, - !!y_name - ) - } else { - rel_connections <- rel_connections |> - dplyr::select( - !!personID, - !!relativeIDvar, - !!x_name, - !!y_name - ) - } - if(only_unique == TRUE){ - rel_connections <- unique(rel_connections) - } - return(rel_connections) -} diff --git a/R/calcCoordinatesHelpers.R b/R/calcCoordinatesHelpers.R index 0ddd90f2..6b6e3f98 100644 --- a/R/calcCoordinatesHelpers.R +++ b/R/calcCoordinatesHelpers.R @@ -148,3 +148,75 @@ getMidpoints <- function(data, group_vars, stop("Unsupported method.") } } + + +#' Get coordinate positions of relatives for each individual +#' +#' Helper function used to retrieve the x and y coordinates of a specified relative +#' (e.g., mom, dad, spouse) and join them into the main connection table. This supports +#' relative-specific positioning in downstream layout functions like `calculateConnections()`. +#' +#' @inheritParams ggpedigree +#' @param connections A `data.frame` containing the individuals and their associated relative IDs. +#' @param relativeIDvar Character. Name of the column in `connections` for the relative ID variable. +#' @param x_name Character. Name of the new column to store the x-coordinate of the relative. +#' @param y_name Character. Name of the new column to store the y-coordinate of the relative. +#' @param multiple Character. Specifies how to handle multiple matches. Options are "all" or "any". +#' @param only_unique Logical. If TRUE, return only unique rows. Defaults to TRUE. +#' +#' @return A `data.frame` with columns: +#' \itemize{ +#' \item `personID`, `relativeIDvar` +#' \item `x_name`, `y_name`: Coordinates of the specified relative +#' \item Optionally, `newID` if present in `ped` +#' } +#' @keywords internal + + +getRelativeCoordinates <- function(ped, connections, relativeIDvar, x_name, y_name, + # relationship = "one-to-one", + personID = "personID", + multiple = "all", + only_unique = TRUE) { + # Filter only rows where the relative ID is not missing + # and join with the main pedigree data frame + rel_connections <- connections |> + dplyr::filter(!is.na(.data[[relativeIDvar]])) |> + # Join in the relative's coordinates from `ped`, based on relative ID + dplyr::left_join( + ped, + by = stats::setNames(personID, relativeIDvar), + suffix = c("", "_rel"), + # relationship = relationship, + multiple = multiple + ) |> + # Rename the joined coordinate columns to the specified x/y output names + dplyr::rename( + !!x_name := "x_pos_rel", + !!y_name := "y_pos_rel" + ) + # If the ped includes a 'newID' column (used to track duplicates), retain it in the result + if ("newID" %in% names(ped)) { + rel_connections <- rel_connections |> + dplyr::select( + !!personID, + "newID", + !!relativeIDvar, + !!x_name, + !!y_name + ) + } else { + rel_connections <- rel_connections |> + dplyr::select( + !!personID, + !!relativeIDvar, + !!x_name, + !!y_name + ) + } + if(only_unique == TRUE){ + rel_connections <- unique(rel_connections) + } + + return(rel_connections) +} \ No newline at end of file From 1f16adef13c27eceac23e709401e483f37c48a55 Mon Sep 17 00:00:00 2001 From: Mason Garrison Date: Wed, 14 May 2025 10:26:09 -0400 Subject: [PATCH 08/18] rename to reflect where it's used --- R/{calcCoordinatesHelpers.R => calcConnectionsHelpers.R} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename R/{calcCoordinatesHelpers.R => calcConnectionsHelpers.R} (100%) diff --git a/R/calcCoordinatesHelpers.R b/R/calcConnectionsHelpers.R similarity index 100% rename from R/calcCoordinatesHelpers.R rename to R/calcConnectionsHelpers.R From c8ca44deb8f0dc8d974ba72f0de6db502e802ca6 Mon Sep 17 00:00:00 2001 From: Mason Garrison Date: Wed, 14 May 2025 10:28:55 -0400 Subject: [PATCH 09/18] splitcalcs --- R/calcConnections.R | 282 ++++++++++++++++++++++++++++++++++++++++++++ R/calcCoordinates.R | 281 ------------------------------------------- 2 files changed, 282 insertions(+), 281 deletions(-) create mode 100644 R/calcConnections.R diff --git a/R/calcConnections.R b/R/calcConnections.R new file mode 100644 index 00000000..f6844289 --- /dev/null +++ b/R/calcConnections.R @@ -0,0 +1,282 @@ +#' Calculate connections for a pedigree dataset +#' +#' Computes graphical connection paths for a pedigree layout, including parent-child, +#' sibling, and spousal connections. Optionally processes duplicate appearances +#' of individuals (marked as `extra`) to ensure relational accuracy. +#' +#' @inheritParams ggpedigree +#' @param config List of configuration parameters. Currently unused but passed through to internal helpers. +#' @return A `data.frame` containing connection points and midpoints for graphical rendering. Includes: +#' \itemize{ +#' \item `x_pos`, `y_pos`: positions of focal individual +#' \item `x_dad`, `y_dad`, `x_mom`, `y_mom`: parental positions (if available) +#' \item `x_spouse`, `y_spouse`: spousal positions (if available) +#' \item `x_midparent`, `y_midparent`: midpoint between parents +#' \item `x_mid_sib`, `y_mid_sib`: sibling group midpoint +#' \item `x_mid_spouse`, `y_mid_spouse`: midpoint between spouses +#' } +#' +#' @export + +calculateConnections <- function(ped, + config = list()) { + # Check inputs ----------------------------------------------------------- + if (!inherits(ped, "data.frame")) { + stop("ped should be a data.frame or inherit to a data.frame") + } + if (!all(c("personID", "x_pos", "y_pos", "dadID", "momID") %in% names(ped))) { + stop("ped must contain personID, x_pos, y_pos, dadID, and momID columns") + } + + # Default configuration placeholder + default_config <- list() + config <- utils::modifyList(default_config, config) + + + # Add spouseID if missing + if (!all("spouseID" %in% names(ped))) { + ped$spouseID <- NA + # Attempt to infer spouse based on parenthood (not always reliable) + # this will give you the mom that is the spouse of the dad + # ped$spouseID <- ped$momID[match(ped$personID, ped$dadID)] + # this will give you the dad that is the spouse of the mom + # ped$spouseID <- ped$dadID[match(ped$personID, ped$momID)] + + ped$spouseID <- ifelse(!is.na(ped$momID[match(ped$personID, ped$dadID)]), + ped$momID[match(ped$personID, ped$dadID)], + ped$dadID[match(ped$personID, ped$momID)] + ) + } + # Add famID if missing (used for grouping) + if (!all("famID" %in% names(ped))) { + ped$famID <- 1 + } + + # create a unique parenthash for each individual + # this will be used to identify siblings + if (!all("parenthash" %in% names(ped))) { + ped <- ped |> + dplyr::mutate( + parenthash = paste0(.data$momID, ".", .data$dadID) + ) |> + dplyr::mutate( + parenthash = gsub("NA.NA", NA, .data$parenthash) + ) + } + + # If duplicated appearances exist, resolve which connections to keep + if (sum(ped$extra) > 0) { + ped <- processExtras(ped, config = config) + } + + # Construct base connection frame + # This will be used for all joins + + + if ("x_otherself" %in% names(ped)) { + connections <- dplyr::select( + .data = ped, + "personID", + "x_pos", "y_pos", + "dadID", "momID", "parenthash", + "spouseID", + "famID", + "x_otherself", "y_otherself", + "extra","link_as_mom", "link_as_dad", "link_as_spouse", + "link_as_sibling" + ) |> unique() + + + connections_moms <- dplyr::filter(connections, .data$link_as_mom == TRUE) |> + dplyr::select( + -"extra", + -"link_as_mom", + -"link_as_dad", + -"link_as_spouse", + -"link_as_sibling" + ) + + connections_dads <- dplyr::filter(connections, .data$link_as_dad == TRUE)|> + dplyr::select( + -"extra", + -"link_as_mom", + -"link_as_dad", + -"link_as_spouse", + -"link_as_sibling" + ) + connections_spouses <- dplyr::filter(connections, .data$link_as_spouse == TRUE) |> + dplyr::select( + -"extra", + -"link_as_mom", + -"link_as_dad", + -"link_as_spouse", + -"link_as_sibling" + ) + connections_sibs <- dplyr::filter(connections, .data$link_as_sibling == TRUE) |> + dplyr::select( + -"extra", + -"link_as_mom", + -"link_as_dad", + -"link_as_spouse", + -"link_as_sibling" + ) + } else { + connections <- dplyr::select( + .data = ped, + "personID", + "x_pos", "y_pos", + "dadID", "momID", "parenthash", + "spouseID", + "famID", + "extra" + ) |> unique() |> + dplyr::mutate( + link_as_mom = TRUE, + link_as_dad = TRUE, + link_as_spouse = TRUE, + link_as_sibling = TRUE + ) + + + +# no duplications, so just use the same connections + connections_sibs <- connections_spouses <- connections_dads <- connections_moms <- connections + } + + # Get mom's coordinates + mom_connections <- getRelativeCoordinates( + ped = ped, + connections = connections_moms, + relativeIDvar = "momID", + x_name = "x_mom", + y_name = "y_mom" + ) + + # Get dad's coordinates + dad_connections <- getRelativeCoordinates( + ped = ped, + connections = connections_dads, + relativeIDvar = "dadID", + x_name = "x_dad", + y_name = "y_dad" + ) + + # Get spouse coordinates + spouse_connections <- ped |> + dplyr::select( + "personID", "x_pos", + "y_pos", "spouseID" + ) |> + dplyr::left_join(connections_spouses, + by = c("spouseID" = "personID"), + suffix = c("", "_spouse"), + multiple = "any" + ) |> + dplyr::rename( + x_spouse = "x_pos_spouse", + y_spouse = "y_pos_spouse" + ) |> + dplyr::select( + "personID", "spouseID", + "x_spouse", "y_spouse" + ) |> unique() + + # Combine mom, dad, and spouse coordinates + connections <- connections |> + dplyr::left_join(mom_connections, + by = c("personID", "momID") + ) |> + dplyr::left_join(dad_connections, + by = c("personID", "dadID") + ) |> + dplyr::left_join(spouse_connections, + by = c("personID", "spouseID") + ) |> unique() + + # Calculate midpoints between mom and dad in child row + + parent_midpoints <- connections |> + dplyr::filter(!is.na(.data$dadID) & !is.na(.data$momID)) |> + dplyr::group_by(.data$parenthash) |> + dplyr::summarize( + x_midparent = mean(c( + dplyr::first(.data$x_dad), + dplyr::first(.data$x_mom) + )), + y_midparent = mean(c( + dplyr::first(.data$y_dad), + dplyr::first(.data$y_mom) + )), + .groups = "drop" + ) |> unique() + + # Calculate midpoints between spouses + spouse_midpoints <- connections |> + dplyr::filter(!is.na(.data$spouseID)) |> + dplyr::group_by(.data$spouseID) |> + dplyr::summarize( + x_mid_spouse = mean(c( + dplyr::first(.data$x_pos), + dplyr::first(.data$x_spouse) + )), + y_mid_spouse = mean(c( + dplyr::first(.data$y_pos), + dplyr::first(.data$y_spouse) + )), + .groups = "drop" + ) |> unique() + + # Calculate sibling group midpoints + sibling_midpoints <- connections|> + dplyr::filter( + !is.na(.data$momID) & !is.na(.data$dadID) & # biological parents defined + !is.na(.data$x_mom) & !is.na(.data$y_mom) & # mom’s coordinates linked + !is.na(.data$x_dad) & !is.na(.data$y_dad) # dad’s coordinates linked + ) |> + dplyr::group_by( + .data$parenthash, + .data$x_mom, .data$y_mom, + .data$x_dad, .data$y_dad + ) |> + dplyr::summarize( + x_mid_sib = mean(.data$x_pos), + y_mid_sib = dplyr::first(.data$y_pos), + .groups = "drop" + ) |> unique() + + + # Merge midpoints into connections + connections <- connections |> + dplyr::left_join(parent_midpoints, + by = c("parenthash") + ) |> + dplyr::left_join(spouse_midpoints, + by = c("spouseID") + ) |> + dplyr::left_join(sibling_midpoints, + by = c("parenthash","x_mom", "y_mom", + "x_dad", "y_dad") + ) |> + dplyr::mutate( + x_mid_sib = dplyr::case_when( + is.na(.data$x_dad) & is.na(.data$x_mom) ~ NA_real_, + !is.na(.data$x_mid_sib) ~ .data$x_mid_sib, + (!is.na(.data$momID) & !is.na(.data$x_mom)) | (!is.na(.data$dadID) & !is.na(.data$x_dad)) ~ .data$x_pos, + TRUE ~ NA_real_ + ), + y_mid_sib = dplyr::case_when( + is.na(.data$y_dad) & is.na(.data$y_mom) ~ NA_real_, + + !is.na(.data$y_mid_sib) ~ .data$y_mid_sib, + (!is.na(.data$momID) & !is.na(.data$y_mom)) | (!is.na(.data$dadID) & !is.na(.data$y_dad)) ~ .data$y_pos, + TRUE ~ NA_real_ + ) + ) |> unique() + + return(connections) +} + + + + + diff --git a/R/calcCoordinates.R b/R/calcCoordinates.R index 810c9e52..f7048372 100644 --- a/R/calcCoordinates.R +++ b/R/calcCoordinates.R @@ -214,285 +214,4 @@ calculateCoordinates <- function(ped, personID = "ID", momID = "momID", return(ped) } -#' Calculate connections for a pedigree dataset -#' -#' Computes graphical connection paths for a pedigree layout, including parent-child, -#' sibling, and spousal connections. Optionally processes duplicate appearances -#' of individuals (marked as `extra`) to ensure relational accuracy. -#' -#' @inheritParams ggpedigree -#' @param config List of configuration parameters. Currently unused but passed through to internal helpers. -#' @return A `data.frame` containing connection points and midpoints for graphical rendering. Includes: -#' \itemize{ -#' \item `x_pos`, `y_pos`: positions of focal individual -#' \item `x_dad`, `y_dad`, `x_mom`, `y_mom`: parental positions (if available) -#' \item `x_spouse`, `y_spouse`: spousal positions (if available) -#' \item `x_midparent`, `y_midparent`: midpoint between parents -#' \item `x_mid_sib`, `y_mid_sib`: sibling group midpoint -#' \item `x_mid_spouse`, `y_mid_spouse`: midpoint between spouses -#' } -#' -#' @export - -calculateConnections <- function(ped, - config = list()) { - # Check inputs ----------------------------------------------------------- - if (!inherits(ped, "data.frame")) { - stop("ped should be a data.frame or inherit to a data.frame") - } - if (!all(c("personID", "x_pos", "y_pos", "dadID", "momID") %in% names(ped))) { - stop("ped must contain personID, x_pos, y_pos, dadID, and momID columns") - } - - # Default configuration placeholder - default_config <- list() - config <- utils::modifyList(default_config, config) - - - # Add spouseID if missing - if (!all("spouseID" %in% names(ped))) { - ped$spouseID <- NA - # Attempt to infer spouse based on parenthood (not always reliable) - # this will give you the mom that is the spouse of the dad - # ped$spouseID <- ped$momID[match(ped$personID, ped$dadID)] - # this will give you the dad that is the spouse of the mom - # ped$spouseID <- ped$dadID[match(ped$personID, ped$momID)] - - ped$spouseID <- ifelse(!is.na(ped$momID[match(ped$personID, ped$dadID)]), - ped$momID[match(ped$personID, ped$dadID)], - ped$dadID[match(ped$personID, ped$momID)] - ) - } - # Add famID if missing (used for grouping) - if (!all("famID" %in% names(ped))) { - ped$famID <- 1 - } - - # create a unique parenthash for each individual - # this will be used to identify siblings - if (!all("parenthash" %in% names(ped))) { - ped <- ped |> - dplyr::mutate( - parenthash = paste0(.data$momID, ".", .data$dadID) - ) |> - dplyr::mutate( - parenthash = gsub("NA.NA", NA, .data$parenthash) - ) - } - - # If duplicated appearances exist, resolve which connections to keep - if (sum(ped$extra) > 0) { - ped <- processExtras(ped, config = config) - } - - # Construct base connection frame - # This will be used for all joins - - - if ("x_otherself" %in% names(ped)) { - connections <- dplyr::select( - .data = ped, - "personID", - "x_pos", "y_pos", - "dadID", "momID", "parenthash", - "spouseID", - "famID", - "x_otherself", "y_otherself", - "extra","link_as_mom", "link_as_dad", "link_as_spouse", - "link_as_sibling" - ) |> unique() - - - connections_moms <- dplyr::filter(connections, .data$link_as_mom == TRUE) |> - dplyr::select( - -"extra", - -"link_as_mom", - -"link_as_dad", - -"link_as_spouse", - -"link_as_sibling" - ) - - connections_dads <- dplyr::filter(connections, .data$link_as_dad == TRUE)|> - dplyr::select( - -"extra", - -"link_as_mom", - -"link_as_dad", - -"link_as_spouse", - -"link_as_sibling" - ) - connections_spouses <- dplyr::filter(connections, .data$link_as_spouse == TRUE) |> - dplyr::select( - -"extra", - -"link_as_mom", - -"link_as_dad", - -"link_as_spouse", - -"link_as_sibling" - ) - connections_sibs <- dplyr::filter(connections, .data$link_as_sibling == TRUE) |> - dplyr::select( - -"extra", - -"link_as_mom", - -"link_as_dad", - -"link_as_spouse", - -"link_as_sibling" - ) - } else { - connections <- dplyr::select( - .data = ped, - "personID", - "x_pos", "y_pos", - "dadID", "momID", "parenthash", - "spouseID", - "famID", - "extra" - ) |> unique() |> - dplyr::mutate( - link_as_mom = TRUE, - link_as_dad = TRUE, - link_as_spouse = TRUE, - link_as_sibling = TRUE - ) - - - -# no duplications, so just use the same connections - connections_sibs <- connections_spouses <- connections_dads <- connections_moms <- connections - } - - # Get mom's coordinates - mom_connections <- getRelativeCoordinates( - ped = ped, - connections = connections_moms, - relativeIDvar = "momID", - x_name = "x_mom", - y_name = "y_mom" - ) - - # Get dad's coordinates - dad_connections <- getRelativeCoordinates( - ped = ped, - connections = connections_dads, - relativeIDvar = "dadID", - x_name = "x_dad", - y_name = "y_dad" - ) - - # Get spouse coordinates - spouse_connections <- ped |> - dplyr::select( - "personID", "x_pos", - "y_pos", "spouseID" - ) |> - dplyr::left_join(connections_spouses, - by = c("spouseID" = "personID"), - suffix = c("", "_spouse"), - multiple = "any" - ) |> - dplyr::rename( - x_spouse = "x_pos_spouse", - y_spouse = "y_pos_spouse" - ) |> - dplyr::select( - "personID", "spouseID", - "x_spouse", "y_spouse" - ) |> unique() - - # Combine mom, dad, and spouse coordinates - connections <- connections |> - dplyr::left_join(mom_connections, - by = c("personID", "momID") - ) |> - dplyr::left_join(dad_connections, - by = c("personID", "dadID") - ) |> - dplyr::left_join(spouse_connections, - by = c("personID", "spouseID") - ) |> unique() - - # Calculate midpoints between mom and dad in child row - - parent_midpoints <- connections |> - dplyr::filter(!is.na(.data$dadID) & !is.na(.data$momID)) |> - dplyr::group_by(.data$parenthash) |> - dplyr::summarize( - x_midparent = mean(c( - dplyr::first(.data$x_dad), - dplyr::first(.data$x_mom) - )), - y_midparent = mean(c( - dplyr::first(.data$y_dad), - dplyr::first(.data$y_mom) - )), - .groups = "drop" - ) |> unique() - - # Calculate midpoints between spouses - spouse_midpoints <- connections |> - dplyr::filter(!is.na(.data$spouseID)) |> - dplyr::group_by(.data$spouseID) |> - dplyr::summarize( - x_mid_spouse = mean(c( - dplyr::first(.data$x_pos), - dplyr::first(.data$x_spouse) - )), - y_mid_spouse = mean(c( - dplyr::first(.data$y_pos), - dplyr::first(.data$y_spouse) - )), - .groups = "drop" - ) |> unique() - - # Calculate sibling group midpoints - sibling_midpoints <- connections|> - dplyr::filter( - !is.na(.data$momID) & !is.na(.data$dadID) & # biological parents defined - !is.na(.data$x_mom) & !is.na(.data$y_mom) & # mom’s coordinates linked - !is.na(.data$x_dad) & !is.na(.data$y_dad) # dad’s coordinates linked - ) |> - dplyr::group_by( - .data$parenthash, - .data$x_mom, .data$y_mom, - .data$x_dad, .data$y_dad - ) |> - dplyr::summarize( - x_mid_sib = mean(.data$x_pos), - y_mid_sib = dplyr::first(.data$y_pos), - .groups = "drop" - ) |> unique() - - - # Merge midpoints into connections - connections <- connections |> - dplyr::left_join(parent_midpoints, - by = c("parenthash") - ) |> - dplyr::left_join(spouse_midpoints, - by = c("spouseID") - ) |> - dplyr::left_join(sibling_midpoints, - by = c("parenthash","x_mom", "y_mom", - "x_dad", "y_dad") - ) |> - dplyr::mutate( - x_mid_sib = dplyr::case_when( - is.na(.data$x_dad) & is.na(.data$x_mom) ~ NA_real_, - !is.na(.data$x_mid_sib) ~ .data$x_mid_sib, - (!is.na(.data$momID) & !is.na(.data$x_mom)) | (!is.na(.data$dadID) & !is.na(.data$x_dad)) ~ .data$x_pos, - TRUE ~ NA_real_ - ), - y_mid_sib = dplyr::case_when( - is.na(.data$y_dad) & is.na(.data$y_mom) ~ NA_real_, - - !is.na(.data$y_mid_sib) ~ .data$y_mid_sib, - (!is.na(.data$momID) & !is.na(.data$y_mom)) | (!is.na(.data$dadID) & !is.na(.data$y_dad)) ~ .data$y_pos, - TRUE ~ NA_real_ - ) - ) |> unique() - - return(connections) -} - - - - From 48aab55a9365d1c3556c1338b77bdedd8ef76319 Mon Sep 17 00:00:00 2001 From: Mason Garrison Date: Wed, 14 May 2025 12:29:46 -0400 Subject: [PATCH 10/18] slowly building each segment as its own function --- R/calcConnections.R | 104 +++++++++++--- R/calcConnections.X | 329 ++++++++++++++++++++++++++++++++++++++++++++ R/ggpedigree.R | 26 ++-- 3 files changed, 426 insertions(+), 33 deletions(-) create mode 100644 R/calcConnections.X diff --git a/R/calcConnections.R b/R/calcConnections.R index f6844289..9940747a 100644 --- a/R/calcConnections.R +++ b/R/calcConnections.R @@ -67,13 +67,6 @@ calculateConnections <- function(ped, # If duplicated appearances exist, resolve which connections to keep if (sum(ped$extra) > 0) { ped <- processExtras(ped, config = config) - } - - # Construct base connection frame - # This will be used for all joins - - - if ("x_otherself" %in% names(ped)) { connections <- dplyr::select( .data = ped, "personID", @@ -87,7 +80,7 @@ calculateConnections <- function(ped, ) |> unique() - connections_moms <- dplyr::filter(connections, .data$link_as_mom == TRUE) |> + connections_for_moms <- dplyr::filter(connections, .data$link_as_mom == TRUE) |> dplyr::select( -"extra", -"link_as_mom", @@ -96,7 +89,7 @@ calculateConnections <- function(ped, -"link_as_sibling" ) - connections_dads <- dplyr::filter(connections, .data$link_as_dad == TRUE)|> + connections_for_dads <- dplyr::filter(connections, .data$link_as_dad == TRUE)|> dplyr::select( -"extra", -"link_as_mom", @@ -104,7 +97,7 @@ calculateConnections <- function(ped, -"link_as_spouse", -"link_as_sibling" ) - connections_spouses <- dplyr::filter(connections, .data$link_as_spouse == TRUE) |> + connections_for_spouses <- dplyr::filter(connections, .data$link_as_spouse == TRUE) |> dplyr::select( -"extra", -"link_as_mom", @@ -112,7 +105,7 @@ calculateConnections <- function(ped, -"link_as_spouse", -"link_as_sibling" ) - connections_sibs <- dplyr::filter(connections, .data$link_as_sibling == TRUE) |> + connections_for_sibs <- dplyr::filter(connections, .data$link_as_sibling == TRUE) |> dplyr::select( -"extra", -"link_as_mom", @@ -140,13 +133,13 @@ calculateConnections <- function(ped, # no duplications, so just use the same connections - connections_sibs <- connections_spouses <- connections_dads <- connections_moms <- connections + connections_for_sibs <- connections_for_spouses <- connections_for_dads <- connections_for_moms <- connections } # Get mom's coordinates mom_connections <- getRelativeCoordinates( ped = ped, - connections = connections_moms, + connections = connections_for_moms, relativeIDvar = "momID", x_name = "x_mom", y_name = "y_mom" @@ -155,7 +148,7 @@ calculateConnections <- function(ped, # Get dad's coordinates dad_connections <- getRelativeCoordinates( ped = ped, - connections = connections_dads, + connections = connections_for_dads, relativeIDvar = "dadID", x_name = "x_dad", y_name = "y_dad" @@ -167,7 +160,7 @@ calculateConnections <- function(ped, "personID", "x_pos", "y_pos", "spouseID" ) |> - dplyr::left_join(connections_spouses, + dplyr::left_join(connections_for_spouses, by = c("spouseID" = "personID"), suffix = c("", "_spouse"), multiple = "any" @@ -273,10 +266,89 @@ calculateConnections <- function(ped, ) ) |> unique() - return(connections) + plot_connections <- list( + connections = connections, + connections_spouse_segment = build_connections_spouse_segment( + ped = ped, + connections_for_spouses = connections_for_spouses + ) + ) + + return(plot_connections) } +build_connections_spouse_segment <- function(ped,connections_for_FOO) { + + # spouses + # Get spouse coordinates + spouse_connections <- ped |> + dplyr::select( + "personID", "x_pos", + "y_pos", "spouseID" + ) |> dplyr::filter(!is.na(.data$spouseID)) |> + dplyr::left_join(connections_for_FOO, + by = c("spouseID" = "personID"), + suffix = c("", "_spouse"), + multiple = "any" + ) |> + dplyr::rename( + x_spouse = "x_pos_spouse", + y_spouse = "y_pos_spouse" + ) |> unique() |> + dplyr::mutate( + x_start = .data$x_spouse, + x_end = .data$x_pos, + y_start = .data$y_spouse, + y_end = .data$y_pos + ) |> select( + -"spouseID_spouse" + ) + return(spouse_connections) +} +build_connections_parent_segment <- function(ped,connections_for_FOO) { + # ggplot2::aes( + # x = .data$x_mid_sib, +# xend = .data$x_midparent, + #y = .data$y_mid_sib - gap_off, + # yend = .data$y_midparent +# ) + parent_midpoints <- connections |> + dplyr::filter(!is.na(.data$dadID) & !is.na(.data$momID)) |> + dplyr::group_by(.data$parenthash) |> + dplyr::summarize( + x_midparent = mean(c( + dplyr::first(.data$x_dad), + dplyr::first(.data$x_mom) + )), + y_midparent = mean(c( + dplyr::first(.data$y_dad), + dplyr::first(.data$y_mom) + )), + .groups = "drop" + ) |> unique() + + # Calculate sibling group midpoints + sibling_midpoints <- connections|> + dplyr::filter( + !is.na(.data$momID) & !is.na(.data$dadID) & # biological parents defined + !is.na(.data$x_mom) & !is.na(.data$y_mom) & # mom’s coordinates linked + !is.na(.data$x_dad) & !is.na(.data$y_dad) # dad’s coordinates linked + ) |> + dplyr::group_by( + .data$parenthash, + .data$x_mom, .data$y_mom, + .data$x_dad, .data$y_dad + ) |> + dplyr::summarize( + x_mid_sib = mean(.data$x_pos), + y_mid_sib = dplyr::first(.data$y_pos), + .groups = "drop" + ) |> unique() + + + return(spouse_connections) +} diff --git a/R/calcConnections.X b/R/calcConnections.X new file mode 100644 index 00000000..a1e8eb48 --- /dev/null +++ b/R/calcConnections.X @@ -0,0 +1,329 @@ +#' Calculate connections for a pedigree dataset +#' +#' Computes graphical connection paths for a pedigree layout, including parent-child, +#' sibling, and spousal connections. Optionally processes duplicate appearances +#' of individuals (marked as `extra`) to ensure relational accuracy. +#' +#' @inheritParams ggpedigree +#' @param config List of configuration parameters. Currently unused but passed through to internal helpers. +#' @return A `data.frame` containing connection points and midpoints for graphical rendering. Includes: +#' \itemize{ +#' \item `x_pos`, `y_pos`: positions of focal individual +#' \item `x_dad`, `y_dad`, `x_mom`, `y_mom`: parental positions (if available) +#' \item `x_spouse`, `y_spouse`: spousal positions (if available) +#' \item `x_midparent`, `y_midparent`: midpoint between parents +#' \item `x_mid_sib`, `y_mid_sib`: sibling group midpoint +#' \item `x_mid_spouse`, `y_mid_spouse`: midpoint between spouses +#' } +#' +#' @export + +calculateConnections <- function(ped, + config = list()) { + # Check inputs ----------------------------------------------------------- + if (!inherits(ped, "data.frame")) { + stop("ped should be a data.frame or inherit to a data.frame") + } + if (!all(c("personID", "x_pos", "y_pos", "dadID", "momID") %in% names(ped))) { + stop("ped must contain personID, x_pos, y_pos, dadID, and momID columns") + } + + # Default configuration placeholder + default_config <- list() + config <- utils::modifyList(default_config, config) + + + # Add spouseID if missing + if (!all("spouseID" %in% names(ped))) { + ped$spouseID <- NA + # Attempt to infer spouse based on parenthood (not always reliable) + # this will give you the mom that is the spouse of the dad + # ped$spouseID <- ped$momID[match(ped$personID, ped$dadID)] + # this will give you the dad that is the spouse of the mom + # ped$spouseID <- ped$dadID[match(ped$personID, ped$momID)] + + ped$spouseID <- ifelse(!is.na(ped$momID[match(ped$personID, ped$dadID)]), + ped$momID[match(ped$personID, ped$dadID)], + ped$dadID[match(ped$personID, ped$momID)] + ) + } + # Add famID if missing (used for grouping) + if (!all("famID" %in% names(ped))) { + ped$famID <- 1 + } + + # create a unique parenthash for each individual + # this will be used to identify siblings + if (!all("parenthash" %in% names(ped))) { + ped <- ped |> + dplyr::mutate( + parenthash = paste0(.data$momID, ".", .data$dadID) + ) |> + dplyr::mutate( + parenthash = gsub("NA.NA", NA, .data$parenthash) + ) + } + + # If duplicated appearances exist, resolve which connections to keep + if (sum(ped$extra) > 0) { + ped <- processExtras(ped, config = config) + + # Construct base connection frame + # This will be used for all joins + connections_core <- dplyr::select( + .data = ped, + "personID", + "x_pos", "y_pos", + "dadID", "momID", "parenthash", + "spouseID", + "famID", + "x_otherself", "y_otherself", + "extra","link_as_mom", "link_as_dad", "link_as_spouse", + "link_as_sibling" + ) |> unique() + + + connections_for_moms <- dplyr::filter(connections_core, .data$link_as_mom == TRUE) |> + dplyr::select( + -"extra", + -"link_as_mom", + -"link_as_dad", + -"link_as_spouse", + -"link_as_sibling" + ) + + connections_for_dads <- dplyr::filter(connections_core, .data$link_as_dad == TRUE)|> + dplyr::select( + -"extra", + -"link_as_mom", + -"link_as_dad", + -"link_as_spouse", + -"link_as_sibling" + ) + connections_for_spouses <- dplyr::filter(connections_core, .data$link_as_spouse == TRUE) |> + dplyr::select( + -"extra", + -"link_as_mom", + -"link_as_dad", + -"link_as_spouse", + -"link_as_sibling" + ) + connections_for_sibs <- dplyr::filter(connections_core, .data$link_as_sibling == TRUE) |> + dplyr::select( + -"extra", + -"link_as_mom", + -"link_as_dad", + -"link_as_spouse", + -"link_as_sibling" + ) + } else { # no duplicates + connections_core <- dplyr::select( + .data = ped, + "personID", + "x_pos", "y_pos", + "dadID", "momID", "parenthash", + "spouseID", + "famID", + "extra" + ) |> unique() + + # no duplications, so just use the same connections + connections_for_sibs <- connections_for_spouses <- connections_for_dads <- connections_for_moms <- connections_core + +connections_core <- connections_core |> + dplyr::mutate( + link_as_mom = TRUE, + link_as_dad = TRUE, + link_as_spouse = TRUE, + link_as_sibling = TRUE + ) + + + + } + + # Get mom's coordinates + mom_connections <- getRelativeCoordinates( + ped = ped, + connections = connections_for_moms, + relativeIDvar = "momID", + x_name = "x_mom", + y_name = "y_mom" + ) + + # Get dad's coordinates + dad_connections <- getRelativeCoordinates( + ped = ped, + connections = connections_for_dads, + relativeIDvar = "dadID", + x_name = "x_dad", + y_name = "y_dad" + ) + +# combine for parents + parent_connections <- connections_core |> + dplyr::left_join(mom_connections, + by = c("personID", "momID") + ) |> + dplyr::left_join(dad_connections, + by = c("personID", "dadID") + ) + + # Calculate midpoints between mom and dad in child row + + parent_midpoints <- parent_connections |> + dplyr::filter(!is.na(.data$dadID) & !is.na(.data$momID)) |> + dplyr::group_by(.data$parenthash) |> + dplyr::summarize( + x_midparent = mean(c( + dplyr::first(.data$x_dad), # faster + dplyr::first(.data$x_mom) + )), + y_midparent = mean(c( + dplyr::first(.data$y_dad), + dplyr::first(.data$y_mom) + )), + .groups = "drop" + ) |> unique() + + +# spouses + # Get spouse coordinates + spouse_connections <- build_connections_spouse_segment( + ped, + connections_for_spouses + ) +print(spouse_connections) + # Calculate midpoints between spouses + + +spouse_midpoints <- connections_core |> + dplyr::left_join(spouse_connections, + by = c("spouseID") + ) |> + dplyr::filter(!is.na(.data$spouseID)) |> + dplyr::group_by(.data$spouseID) |> + dplyr::summarize( + x_mid_spouse = mean(c( + dplyr::first(.data$x_pos), + dplyr::first(.data$x_spouse) + )), + y_mid_spouse = mean(c( + dplyr::first(.data$y_pos), + dplyr::first(.data$y_spouse) + )), + .groups = "drop" + ) |> unique() + + + # Combine mom, dad, and spouse coordinates + connections <- connections_core |> + dplyr::left_join(mom_connections, + by = c("personID", "momID") + ) |> + dplyr::left_join(dad_connections, + by = c("personID", "dadID") + ) |> + dplyr::left_join(spouse_connections, + by = c("personID", "spouseID") + ) |> unique() + + + + # Calculate sibling group midpoints + sibling_midpoints <- connections |> + dplyr::filter( + !is.na(.data$momID) & !is.na(.data$dadID) & # biological parents defined + !is.na(.data$x_mom) & !is.na(.data$y_mom) & # mom’s coordinates linked + !is.na(.data$x_dad) & !is.na(.data$y_dad) # dad’s coordinates linked + ) |> + dplyr::group_by( + .data$parenthash, + .data$x_mom, .data$y_mom, + .data$x_dad, .data$y_dad + ) |> + dplyr::summarize( + x_mid_sib = mean(.data$x_pos), + y_mid_sib = dplyr::first(.data$y_pos), + .groups = "drop" + ) |> unique() + + + # Merge midpoints into connections + connections <- connections |> + dplyr::left_join(parent_midpoints, + by = c("parenthash") + ) |> + dplyr::left_join(spouse_midpoints, + by = c("spouseID") + ) |> + dplyr::left_join(sibling_midpoints, + by = c("parenthash","x_mom", "y_mom", + "x_dad", "y_dad") + ) |> + dplyr::mutate( + x_mid_sib = dplyr::case_when( + is.na(.data$x_dad) & is.na(.data$x_mom) ~ NA_real_, + !is.na(.data$x_mid_sib) ~ .data$x_mid_sib, + (!is.na(.data$momID) & !is.na(.data$x_mom)) | (!is.na(.data$dadID) & !is.na(.data$x_dad)) ~ .data$x_pos, + TRUE ~ NA_real_ + ), + y_mid_sib = dplyr::case_when( + is.na(.data$y_dad) & is.na(.data$y_mom) ~ NA_real_, + + !is.na(.data$y_mid_sib) ~ .data$y_mid_sib, + (!is.na(.data$momID) & !is.na(.data$y_mom)) | (!is.na(.data$dadID) & !is.na(.data$y_dad)) ~ .data$y_pos, + TRUE ~ NA_real_ + ) + ) |> unique() + + plotting_connections <- list( + connections = connections, + connections_core = connections_core, + connections_spouse_segment = connections_spouse_segment, # Spouse link between two parents, needs + # x = .data$x_spouse, + # xend = .data$x_pos, + # y = .data$y_spouse, + # yend = .data$y_pos + connections_parent_segment = NULL, # Parent-child stub (child to mid-sibling point) + connections_offspring_segment = NULL, # Mid-sibling to parents midpoint + connections_sibling_segment = NULL, # Sibling vertical drop line + ped = ped + ) + + + return(connections) +} + + +build_connections_spouse_segment <- function(ped,connections_for_spouses) { + + # spouses + # Get spouse coordinates + spouse_connections <- ped |> + dplyr::select( + "personID", "x_pos", + "y_pos", "spouseID" + ) |> dplyr::filter(!is.na(.data$spouseID)) |> + dplyr::left_join(connections_for_spouses, + by = c("spouseID" = "personID"), + suffix = c("", "_spouse"), + multiple = "any" + ) |> + dplyr::rename( + x_spouse = "x_pos_spouse", + y_spouse = "y_pos_spouse" + ) |> unique() |> + dplyr::mutate( + x_start = .data$x_spouse, + x_end = .data$x_pos, + y_start = .data$y_spouse, + y_end = .data$y_pos + ) |> select( + -"spouseID_spouse" + ) + + return(spouse_connections) +} + + diff --git a/R/ggpedigree.R b/R/ggpedigree.R index 98fec2af..728c6b07 100644 --- a/R/ggpedigree.R +++ b/R/ggpedigree.R @@ -150,9 +150,9 @@ ggPedigree <- function(ped, famID = "famID", # ----- # Generate a connection table for plotting lines (parents, spouses, etc.) - connections <- calculateConnections(ds, config = config) |> - unique() # remove duplicates + plot_connections <- calculateConnections(ds, config = config) + connections <- plot_connections$connections # ----- # STEP 6: Initialize Plot # ----- @@ -171,12 +171,12 @@ ggPedigree <- function(ped, famID = "famID", # Spouse link between two parents p <- p + ggplot2::geom_segment( - data = connections, + data = plot_connections$connections_spouse_segment, ggplot2::aes( - x = .data$x_spouse, - xend = .data$x_pos, - y = .data$y_spouse, - yend = .data$y_pos + x = .data$x_start, + xend = .data$x_end, + y = .data$y_start, + yend = .data$y_end ), linewidth = config$line_width, color = config$spouse_segment_color, @@ -199,11 +199,7 @@ ggPedigree <- function(ped, famID = "famID", ) + # Mid-sibling to parents midpoint ggplot2::geom_segment( - data = dplyr::filter( - connections, - !is.na(.data$x_mom) & !is.na(.data$y_mom) & - !is.na(.data$x_dad) & !is.na(.data$y_dad) - ), + data = connections, ggplot2::aes( x = .data$x_pos, xend = .data$x_mid_sib, @@ -216,11 +212,7 @@ ggPedigree <- function(ped, famID = "famID", ) + # Sibling vertical drop line ggplot2::geom_segment( - data = dplyr::filter( - connections, - !is.na(.data$x_mom) & !is.na(.data$y_mom) & - !is.na(.data$x_dad) & !is.na(.data$y_dad) - ), + data = connections, ggplot2::aes( x = .data$x_pos, xend = .data$x_pos, From 9e4f255372b841fa23c0c0b981d7b417ca243a80 Mon Sep 17 00:00:00 2001 From: Mason Garrison Date: Wed, 14 May 2025 18:36:28 -0400 Subject: [PATCH 11/18] trying alternative spec --- R/calcConnections.R | 155 ++++++++++++++++++---------------- R/calcConnectionsHelpers.R | 9 +- R/ggpedigree.R | 15 ++++ R/processExtras.R | 167 +++++++++++++++++++++++++------------ vignettes/plots.Rmd | 5 +- 5 files changed, 223 insertions(+), 128 deletions(-) diff --git a/R/calcConnections.R b/R/calcConnections.R index 9940747a..8c4be745 100644 --- a/R/calcConnections.R +++ b/R/calcConnections.R @@ -52,26 +52,30 @@ calculateConnections <- function(ped, ped$famID <- 1 } - # create a unique parenthash for each individual + # create a unique parent_hash for each individual # this will be used to identify siblings - if (!all("parenthash" %in% names(ped))) { + if (!all("parent_hash" %in% names(ped))) { ped <- ped |> dplyr::mutate( - parenthash = paste0(.data$momID, ".", .data$dadID) - ) |> + parent_hash = symKey(.data$momID, .data$dadID), + couple_hash = symKey(.data$personID, .data$spouseID) + ) |> dplyr::mutate( - parenthash = gsub("NA.NA", NA, .data$parenthash) + parent_hash = gsub("NA.NA", NA, .data$parent_hash), + couple_hash = gsub("NA.NA", NA, .data$couple_hash) ) } # If duplicated appearances exist, resolve which connections to keep if (sum(ped$extra) > 0) { - ped <- processExtras(ped, config = config) + ped <- processExtras(ped, config = config) |> unique() + + # Create a unique parent_hash for each individual connections <- dplyr::select( .data = ped, "personID", "x_pos", "y_pos", - "dadID", "momID", "parenthash", + "dadID", "momID", "parent_hash", "couple_hash", "spouseID", "famID", "x_otherself", "y_otherself", @@ -87,7 +91,7 @@ calculateConnections <- function(ped, -"link_as_dad", -"link_as_spouse", -"link_as_sibling" - ) + ) |> unique() connections_for_dads <- dplyr::filter(connections, .data$link_as_dad == TRUE)|> dplyr::select( @@ -96,7 +100,7 @@ calculateConnections <- function(ped, -"link_as_dad", -"link_as_spouse", -"link_as_sibling" - ) + ) |> unique() connections_for_spouses <- dplyr::filter(connections, .data$link_as_spouse == TRUE) |> dplyr::select( -"extra", @@ -112,13 +116,13 @@ calculateConnections <- function(ped, -"link_as_dad", -"link_as_spouse", -"link_as_sibling" - ) + ) |> unique() } else { connections <- dplyr::select( .data = ped, "personID", "x_pos", "y_pos", - "dadID", "momID", "parenthash", + "dadID", "momID", "parent_hash", "couple_hash", "spouseID", "famID", "extra" @@ -158,12 +162,12 @@ calculateConnections <- function(ped, spouse_connections <- ped |> dplyr::select( "personID", "x_pos", - "y_pos", "spouseID" + "y_pos", "spouseID", "couple_hash" ) |> dplyr::left_join(connections_for_spouses, by = c("spouseID" = "personID"), suffix = c("", "_spouse"), - multiple = "any" + multiple = "all" ) |> dplyr::rename( x_spouse = "x_pos_spouse", @@ -171,7 +175,7 @@ calculateConnections <- function(ped, ) |> dplyr::select( "personID", "spouseID", - "x_spouse", "y_spouse" + "x_spouse", "y_spouse","couple_hash" ) |> unique() # Combine mom, dad, and spouse coordinates @@ -183,14 +187,15 @@ calculateConnections <- function(ped, by = c("personID", "dadID") ) |> dplyr::left_join(spouse_connections, - by = c("personID", "spouseID") + by = c("personID", "spouseID","couple_hash") ) |> unique() # Calculate midpoints between mom and dad in child row parent_midpoints <- connections |> - dplyr::filter(!is.na(.data$dadID) & !is.na(.data$momID)) |> - dplyr::group_by(.data$parenthash) |> + dplyr::filter(.data$link_as_sibling & + !is.na(.data$dadID) & !is.na(.data$momID)) |> + dplyr::group_by(.data$parent_hash) |> dplyr::summarize( x_midparent = mean(c( dplyr::first(.data$x_dad), @@ -205,8 +210,9 @@ calculateConnections <- function(ped, # Calculate midpoints between spouses spouse_midpoints <- connections |> - dplyr::filter(!is.na(.data$spouseID)) |> - dplyr::group_by(.data$spouseID) |> + dplyr::filter(.data$link_as_spouse, + !is.na(.data$spouseID)) |> + dplyr::group_by(.data$spouseID,.data$couple_hash) |> dplyr::summarize( x_mid_spouse = mean(c( dplyr::first(.data$x_pos), @@ -222,12 +228,13 @@ calculateConnections <- function(ped, # Calculate sibling group midpoints sibling_midpoints <- connections|> dplyr::filter( + .data$link_as_sibling, !is.na(.data$momID) & !is.na(.data$dadID) & # biological parents defined !is.na(.data$x_mom) & !is.na(.data$y_mom) & # mom’s coordinates linked !is.na(.data$x_dad) & !is.na(.data$y_dad) # dad’s coordinates linked ) |> dplyr::group_by( - .data$parenthash, + .data$parent_hash, .data$x_mom, .data$y_mom, .data$x_dad, .data$y_dad ) |> @@ -241,13 +248,15 @@ calculateConnections <- function(ped, # Merge midpoints into connections connections <- connections |> dplyr::left_join(parent_midpoints, - by = c("parenthash") + by = c("parent_hash", + "x_mom", "y_mom", + "x_dad", "y_dad") ) |> dplyr::left_join(spouse_midpoints, - by = c("spouseID") + by = c("spouseID", "couple_hash") ) |> dplyr::left_join(sibling_midpoints, - by = c("parenthash","x_mom", "y_mom", + by = c("parent_hash","x_mom", "y_mom", "x_dad", "y_dad") ) |> dplyr::mutate( @@ -259,18 +268,25 @@ calculateConnections <- function(ped, ), y_mid_sib = dplyr::case_when( is.na(.data$y_dad) & is.na(.data$y_mom) ~ NA_real_, - !is.na(.data$y_mid_sib) ~ .data$y_mid_sib, (!is.na(.data$momID) & !is.na(.data$y_mom)) | (!is.na(.data$dadID) & !is.na(.data$y_dad)) ~ .data$y_pos, TRUE ~ NA_real_ ) - ) |> unique() + ) |> unique() |> + dplyr::mutate( + x_mid_sib = dplyr::if_else(link_as_sibling, x_mid_sib, NA_real_), + y_mid_sib = dplyr::if_else(link_as_sibling, y_mid_sib, NA_real_) + ) + + + + plot_connections <- list( connections = connections, connections_spouse_segment = build_connections_spouse_segment( ped = ped, - connections_for_spouses = connections_for_spouses + connections_for_FOO = connections_for_spouses ) ) @@ -278,7 +294,45 @@ calculateConnections <- function(ped, } -build_connections_spouse_segment <- function(ped,connections_for_FOO) { +build_connections_spouse_segment <- function(ped,connections_for_FOO, use_hash = TRUE){ +if(use_hash==TRUE){ + # I want to make segments for each hash, because some people have multiple spouses + # this is to add those missing segments + parent_hash_connections <- ped |> dplyr::select("parent_hash") |> + mutate(parent1 = # needs to be the first part of the string + stringr::str_extract(.data$parent_hash, "^[^.]+"), + parent2 = # needs to be the second part of the string\ + stringr::str_extract(.data$parent_hash, "(?<=\\.)[^.]+") + ) |> + dplyr::left_join(connections_for_FOO |> mutate(personID = paste0(.data$personID)), + by = c("parent1" = "personID"), + suffix = c("", "_parent1"), + multiple = "any" + ) |> dplyr::left_join(connections_for_FOO |> mutate(personID = paste0(.data$personID)), + by = c("parent2" = "personID"), + suffix = c("", "_parent2"), + multiple = "any" + ) |> + dplyr::mutate( + x_start = .data$x_pos, + x_end = .data$x_pos_parent2, + y_start = .data$y_pos, + y_end = .data$y_pos_parent2 + ) |> select( + -"parent_hash", + -"parent1", + -"parent2", + -"x_pos", + -"y_pos", + -"x_pos_parent2", + -"y_pos_parent2" + ) + + + + # Get spouse coordinates + +}else{ # spouses # Get spouse coordinates @@ -304,51 +358,8 @@ build_connections_spouse_segment <- function(ped,connections_for_FOO) { ) |> select( -"spouseID_spouse" ) - - return(spouse_connections) } - -build_connections_parent_segment <- function(ped,connections_for_FOO) { - # ggplot2::aes( - # x = .data$x_mid_sib, -# xend = .data$x_midparent, - #y = .data$y_mid_sib - gap_off, - # yend = .data$y_midparent -# ) - - parent_midpoints <- connections |> - dplyr::filter(!is.na(.data$dadID) & !is.na(.data$momID)) |> - dplyr::group_by(.data$parenthash) |> - dplyr::summarize( - x_midparent = mean(c( - dplyr::first(.data$x_dad), - dplyr::first(.data$x_mom) - )), - y_midparent = mean(c( - dplyr::first(.data$y_dad), - dplyr::first(.data$y_mom) - )), - .groups = "drop" - ) |> unique() - - # Calculate sibling group midpoints - sibling_midpoints <- connections|> - dplyr::filter( - !is.na(.data$momID) & !is.na(.data$dadID) & # biological parents defined - !is.na(.data$x_mom) & !is.na(.data$y_mom) & # mom’s coordinates linked - !is.na(.data$x_dad) & !is.na(.data$y_dad) # dad’s coordinates linked - ) |> - dplyr::group_by( - .data$parenthash, - .data$x_mom, .data$y_mom, - .data$x_dad, .data$y_dad - ) |> - dplyr::summarize( - x_mid_sib = mean(.data$x_pos), - y_mid_sib = dplyr::first(.data$y_pos), - .groups = "drop" - ) |> unique() + return(parent_hash_connections) +} - return(spouse_connections) -} diff --git a/R/calcConnectionsHelpers.R b/R/calcConnectionsHelpers.R index 6b6e3f98..bd12eb1c 100644 --- a/R/calcConnectionsHelpers.R +++ b/R/calcConnectionsHelpers.R @@ -219,4 +219,11 @@ getRelativeCoordinates <- function(ped, connections, relativeIDvar, x_name, y_na } return(rel_connections) -} \ No newline at end of file +} + + +symKey <- function(id1, id2, sep = ".") { + dplyr::if_else(id1 < id2, + paste0(id1, sep, id2), + paste0(id2, sep, id1)) +} diff --git a/R/ggpedigree.R b/R/ggpedigree.R index 728c6b07..256ffca0 100644 --- a/R/ggpedigree.R +++ b/R/ggpedigree.R @@ -169,6 +169,7 @@ ggPedigree <- function(ped, famID = "famID", # ----- # Spouse link between two parents + if(FALSE){ p <- p + ggplot2::geom_segment( data = plot_connections$connections_spouse_segment, @@ -182,6 +183,20 @@ ggPedigree <- function(ped, famID = "famID", color = config$spouse_segment_color, na.rm = TRUE ) + } + p <- p + + ggplot2::geom_segment( + data = connections, + ggplot2::aes( + x = .data$x_spouse, + xend = .data$x_pos, + y = .data$y_spouse, + yend = .data$y_pos + ), + linewidth = config$line_width, + color = config$spouse_segment_color, + na.rm = TRUE + ) # Parent-child stub (child to mid-sibling point) diff --git a/R/processExtras.R b/R/processExtras.R index 10f3f9b9..484d944f 100644 --- a/R/processExtras.R +++ b/R/processExtras.R @@ -21,14 +21,18 @@ processExtras <- function(ped, config = list()) { if (!inherits(ped, "data.frame")) { stop("ped should be a data.frame or inherit to a data.frame") } - if (!all(c("personID", "x_pos", "y_pos", "dadID", "momID") %in% names(ped))) { + + req_cols <- c("personID", "x_pos", "y_pos", "dadID", "momID") + if (!all(req_cols %in% names(ped))) { stop("ped must contain personID, x_pos, y_pos, dadID, and momID columns") } - # default config default_config <- list() config <- utils::modifyList(default_config, config) + # Assign unique ID per row for later use + ped$newID <- seq_len(nrow(ped)) + # ----- # Identify duplicated individuals # ----- @@ -39,20 +43,24 @@ processExtras <- function(ped, config = list()) { dplyr::pull() |> unique() - - # Assign unique ID per row for later use - ped$newID <- 1:nrow(ped) + ped <- ped |> # flag anyone with extra appearances + dplyr::mutate(extra = dplyr::case_when(.data$personID %in% idsextras ~ TRUE, + .data$momID %in% idsextras ~ TRUE, + .data$dadID %in% idsextras ~ TRUE, + .data$spouseID %in% idsextras ~ TRUE, + TRUE ~ .data$extra)) # ----- # Subset to duplicated entries only # note that tidyselect hates .data pronouns # ----- + extras <- dplyr::filter(ped, .data$personID %in% idsextras) |> dplyr::select( "newID", "personID", "x_pos", "y_pos", - "dadID", "momID","parenthash", + "dadID", "momID","parent_hash", "couple_hash", "spouseID" ) @@ -71,7 +79,6 @@ processExtras <- function(ped, config = list()) { ) # Father's coordinates - dad_coords <- getRelativeCoordinates( ped = ped, connections = extras, @@ -91,23 +98,23 @@ processExtras <- function(ped, config = list()) { multiple = "all" ) - # Parenthash coordinates - parenthash_coords <- extras |> # need to get mom and dad coordinates + # parent_hash coordinates + parent_hash_coords <- extras |> # need to get mom and dad coordinates dplyr::left_join(mom_coords, by = c("newID", "personID", "momID")) |> dplyr::left_join(dad_coords, by = c("newID", "personID", "dadID")) |> dplyr::left_join( ped, - by = c("parenthash"), + by = c("parent_hash"), suffix = c("", "_sib"), multiple = "all" ) |> - dplyr::filter(!is.na(.data$parenthash)) |> + dplyr::filter(!is.na(.data$parent_hash)) |> dplyr::mutate( - x_parenthash = mean(c( + x_parent_hash = mean(c( .data$x_dad, .data$x_mom )), - y_parenthash = mean(c( + y_parent_hash = mean(c( .data$y_dad, .data$y_mom )) @@ -115,9 +122,10 @@ processExtras <- function(ped, config = list()) { dplyr::select( .data$newID, .data$personID, - .data$parenthash, - .data$x_parenthash, - .data$y_parenthash + .data$parent_hash, + .data$couple_hash, + .data$x_parent_hash, + .data$y_parent_hash ) @@ -155,8 +163,8 @@ processExtras <- function(ped, config = list()) { by = c("newID", "personID", "spouseID"), multiple = "all" ) |> - dplyr::left_join(parenthash_coords, - by = c("newID", "personID", "parenthash"), + dplyr::left_join(parent_hash_coords, + by = c("newID", "personID", "parent_hash"), multiple = "all" ) @@ -207,16 +215,16 @@ processExtras <- function(ped, config = list()) { y1 = .data$y_pos, x2 = .data$x_otherself, y2 = .data$y_otherself), - dist_parenthash = computeDistance(method = "cityblock", + dist_parent_hash = computeDistance(method = "cityblock", x1 = .data$x_pos, y1 = .data$y_pos, - x2 = .data$x_parenthash, - y2 = .data$y_parenthash), - dist_parenthash_other = computeDistance(method = "cityblock", + x2 = .data$x_parent_hash, + y2 = .data$y_parent_hash), + dist_parent_hash_other = computeDistance(method = "cityblock", x1 = .data$x_otherself, y1 = .data$y_otherself, - x2 = .data$x_parenthash, - y2 = .data$y_parenthash) + x2 = .data$x_parent_hash, + y2 = .data$y_parent_hash) ) @@ -227,19 +235,19 @@ processExtras <- function(ped, config = list()) { # where the individual is closest to one of their spouses. # ----- - extras <- extras |> - dplyr::group_by(.data$newID, .data$personID) |> - dplyr::mutate( - min_spouse = min(.data$dist_spouse, na.rm = TRUE), - num_spouse = dplyr::n() - ) |> - dplyr::ungroup() - extras <- extras |> - dplyr::filter(.data$num_spouse == 1 | .data$dist_spouse == .data$min_spouse) |> - dplyr::select( - -.data$min_spouse, - -.data$num_spouse - ) +# extras <- extras |> +# dplyr::group_by(.data$newID, .data$personID) |> +# dplyr::mutate( +# min_spouse = min(.data$dist_spouse, na.rm = TRUE), +# num_spouse = dplyr::n() +# ) |> +# dplyr::ungroup() +# extras <- extras |> +# dplyr::filter(.data$num_spouse == 1 | .data$dist_spouse == .data$min_spouse) |> +# dplyr::select( +# -.data$min_spouse, +# -.data$num_spouse +# ) # ----- @@ -257,24 +265,29 @@ processExtras <- function(ped, config = list()) { mom_closer = dplyr::case_when( .data$dist_mom < .data$dist_mom_other ~ TRUE, .data$dist_mom_other < .data$dist_mom ~ FALSE, - TRUE ~ TRUE + .data$dist_mom == .data$dist_mom_other & + .data$newID < .data$newID_other ~ TRUE, + TRUE ~ FALSE ), dad_closer = dplyr::case_when( .data$dist_dad < .data$dist_dad_other ~ TRUE, .data$dist_dad_other < .data$dist_dad ~ FALSE, + .data$dist_dad == .data$dist_dad_other & + .data$newID < .data$newID_other ~ TRUE, TRUE ~ TRUE ), spouse_closer = dplyr::case_when( .data$dist_spouse < .data$dist_spouse_other ~ TRUE, .data$dist_spouse_other < .data$dist_spouse ~ FALSE, - # is.na(.data$dist_spouse) ~ FALSE, - # !is.na(.data$dist_spouse) & is.na(.data$dist_spouse_other) ~ TRUE, - TRUE ~ TRUE + .data$dist_spouse == .data$dist_spouse_other & + .data$newID < .data$newID_other ~ TRUE, + + TRUE ~ FALSE ), - parenthash_closer = dplyr::case_when( - .data$dist_parenthash < .data$dist_parenthash_other ~ TRUE, - .data$dist_parenthash_other < .data$dist_parenthash ~ FALSE, - TRUE ~ TRUE + parent_hash_closer = dplyr::case_when( + .data$dist_parent_hash < .data$dist_parent_hash_other ~ TRUE, + .data$dist_parent_hash_other < .data$dist_parent_hash ~ FALSE, + TRUE ~ FALSE ) ) @@ -301,15 +314,63 @@ processExtras <- function(ped, config = list()) { link_as_mom = .data$mom_closer, link_as_dad = .data$dad_closer, link_as_spouse = .data$spouse_closer, - link_as_sibling = .data$parenthash_closer - ) %>% dplyr::mutate( - link_any = dplyr::case_when( - .data$link_as_mom == TRUE | .data$link_as_dad == TRUE | - .data$link_as_sibling == TRUE| .data$link_as_spouse == TRUE ~ TRUE, - TRUE ~ FALSE - ) + link_as_sibling = .data$link_as_mom | .data$link_as_dad +#.data$parent_hash_closer ) + ### per‑spouse keeper ---------------------------------------- + extras <- extras |> + dplyr::mutate( + dist_spouse_fix = dplyr::if_else(is.na(.data$dist_spouse), + Inf, .data$dist_spouse) + ) |> + dplyr::group_by(.data$personID) |> + dplyr::mutate( + keep_spouse = (.data$dist_spouse_fix == + min(.data$dist_spouse_fix, na.rm = TRUE)) #& + # (dplyr::row_number() == 1) + ) |> + dplyr::ungroup() |> + dplyr::mutate( + link_as_spouse = .data$link_as_spouse & .data$keep_spouse + ) |> + dplyr::select(-"dist_spouse_fix", -"keep_spouse") |> unique() + + ### END INSERT -------------------------------------------------------------- + + # --- Keep ONE appearance per person for parent / sibling links ---- + extras <- extras |> + dplyr::mutate( + total_parent_dist = dplyr::if_else( + is.na(.data$dist_mom + .data$dist_dad), + Inf, + .data$dist_mom + .data$dist_dad + ) + ) |> + dplyr::group_by(.data$personID) |> + dplyr::mutate( + min_total_parent_dist = min(.data$total_parent_dist, na.rm = TRUE), + keep_links = (.data$total_parent_dist == .data$min_total_parent_dist)# & + # (dplyr::row_number() == 1) + ) |> + dplyr::ungroup() |> + dplyr::mutate( + link_as_mom = .data$link_as_mom & .data$keep_links, + link_as_dad = .data$link_as_dad & .data$keep_links, + link_as_sibling = .data$link_as_sibling & .data$keep_links + ) |> + dplyr::select( + -"total_parent_dist", + -"min_total_parent_dist", + -"keep_links" + ) |> dplyr::mutate( + link_any = dplyr::case_when( + .data$link_as_mom == TRUE | .data$link_as_dad == TRUE | + .data$link_as_sibling == TRUE | .data$link_as_spouse == TRUE ~ TRUE, + TRUE ~ FALSE + ) + ) + # ----- # Final subset of relevant decision columns @@ -372,4 +433,4 @@ processExtras <- function(ped, config = list()) { return(ped) -} \ No newline at end of file +} diff --git a/vignettes/plots.Rmd b/vignettes/plots.Rmd index 1dcfef69..9f603c77 100644 --- a/vignettes/plots.Rmd +++ b/vignettes/plots.Rmd @@ -44,8 +44,7 @@ library(tidyverse) # for data wrangling data("potter") ggPedigree(potter, famID = "famID", - personID = "personID", - config= list(hints = TRUE) + personID = "personID" ) ``` @@ -206,5 +205,7 @@ p <- ggPedigree( p$connections%>%filter(personID ==61) %>% nrow() p$connections%>%filter(personID ==61) %>% unique() +p$connections%>%filter(personID ==65) %>% unique() + p$plot ``` From a0bed1ffbd0fc57dc41129b3fbaa3a56833b0d6a Mon Sep 17 00:00:00 2001 From: Mason Garrison Date: Wed, 14 May 2025 20:40:58 -0400 Subject: [PATCH 12/18] making progross by reconnecting making progross by reconnecting --- R/calcConnections.R | 73 +++--------- R/ggpedigree.R | 15 --- R/processExtras.R | 2 +- R/processExtras_replaceIDs.R | 220 +++++++++++++++++++++++++++++++++++ 4 files changed, 238 insertions(+), 72 deletions(-) create mode 100644 R/processExtras_replaceIDs.R diff --git a/R/calcConnections.R b/R/calcConnections.R index 8c4be745..8e65f308 100644 --- a/R/calcConnections.R +++ b/R/calcConnections.R @@ -69,55 +69,14 @@ calculateConnections <- function(ped, # If duplicated appearances exist, resolve which connections to keep if (sum(ped$extra) > 0) { ped <- processExtras(ped, config = config) |> unique() +} else { + + ped <- ped |> + dplyr::mutate( + coreID = .data$personID + ) +} - # Create a unique parent_hash for each individual - connections <- dplyr::select( - .data = ped, - "personID", - "x_pos", "y_pos", - "dadID", "momID", "parent_hash", "couple_hash", - "spouseID", - "famID", - "x_otherself", "y_otherself", - "extra","link_as_mom", "link_as_dad", "link_as_spouse", - "link_as_sibling" - ) |> unique() - - - connections_for_moms <- dplyr::filter(connections, .data$link_as_mom == TRUE) |> - dplyr::select( - -"extra", - -"link_as_mom", - -"link_as_dad", - -"link_as_spouse", - -"link_as_sibling" - ) |> unique() - - connections_for_dads <- dplyr::filter(connections, .data$link_as_dad == TRUE)|> - dplyr::select( - -"extra", - -"link_as_mom", - -"link_as_dad", - -"link_as_spouse", - -"link_as_sibling" - ) |> unique() - connections_for_spouses <- dplyr::filter(connections, .data$link_as_spouse == TRUE) |> - dplyr::select( - -"extra", - -"link_as_mom", - -"link_as_dad", - -"link_as_spouse", - -"link_as_sibling" - ) - connections_for_sibs <- dplyr::filter(connections, .data$link_as_sibling == TRUE) |> - dplyr::select( - -"extra", - -"link_as_mom", - -"link_as_dad", - -"link_as_spouse", - -"link_as_sibling" - ) |> unique() - } else { connections <- dplyr::select( .data = ped, "personID", @@ -126,7 +85,13 @@ calculateConnections <- function(ped, "spouseID", "famID", "extra" - ) |> unique() |> + ) |> unique() + + # no duplications, so just use the same connections + connections_for_sibs <- connections_for_spouses <- connections_for_dads <- connections_for_moms <- connections + + + connections <- connections |> dplyr::mutate( link_as_mom = TRUE, link_as_dad = TRUE, @@ -136,9 +101,7 @@ calculateConnections <- function(ped, -# no duplications, so just use the same connections - connections_for_sibs <- connections_for_spouses <- connections_for_dads <- connections_for_moms <- connections - } + # Get mom's coordinates mom_connections <- getRelativeCoordinates( @@ -244,13 +207,11 @@ calculateConnections <- function(ped, .groups = "drop" ) |> unique() - +#print(parent_midpoints) # Merge midpoints into connections connections <- connections |> dplyr::left_join(parent_midpoints, - by = c("parent_hash", - "x_mom", "y_mom", - "x_dad", "y_dad") + by = c("parent_hash") ) |> dplyr::left_join(spouse_midpoints, by = c("spouseID", "couple_hash") diff --git a/R/ggpedigree.R b/R/ggpedigree.R index 256ffca0..6ca094de 100644 --- a/R/ggpedigree.R +++ b/R/ggpedigree.R @@ -169,21 +169,6 @@ ggPedigree <- function(ped, famID = "famID", # ----- # Spouse link between two parents - if(FALSE){ - p <- p + - ggplot2::geom_segment( - data = plot_connections$connections_spouse_segment, - ggplot2::aes( - x = .data$x_start, - xend = .data$x_end, - y = .data$y_start, - yend = .data$y_end - ), - linewidth = config$line_width, - color = config$spouse_segment_color, - na.rm = TRUE - ) - } p <- p + ggplot2::geom_segment( data = connections, diff --git a/R/processExtras.R b/R/processExtras.R index 484d944f..3c486a36 100644 --- a/R/processExtras.R +++ b/R/processExtras.R @@ -14,7 +14,7 @@ #' #' @keywords internal -processExtras <- function(ped, config = list()) { +processExtrasx <- function(ped, config = list()) { # ----- # Check inputs # ----- diff --git a/R/processExtras_replaceIDs.R b/R/processExtras_replaceIDs.R new file mode 100644 index 00000000..c0298f38 --- /dev/null +++ b/R/processExtras_replaceIDs.R @@ -0,0 +1,220 @@ + + +#' Process duplicate appearances of individuals in a pedigree layout +#' +#' Resolves layout conflicts when the same individual appears in multiple places +#' (e.g., due to inbreeding loops). Keeps the layout point that is closest to a relevant +#' relative (mom, dad, or spouse) and removes links to others to avoid confusion in visualization. +#' +#' @param ped A data.frame containing pedigree layout info with columns including: +#' `personID`, `x_pos`, `y_pos`, `dadID`, `momID`, and a logical column `extra`. +#' @param config A list of configuration options. Currently unused but passed through to internal helpers. +#' +#' @return A modified `ped` data.frame with updated coordinates and removed duplicates. +#' +#' @keywords internal + +processExtras <- function(ped, config = list()) { + + + # ---- sanity checks ------------------------------------------------------- + if (!inherits(ped, "data.frame")) { + stop("ped must be a data.frame") + } + + req_cols <- c("personID", "x_pos", "y_pos", + "momID", "dadID", "spouseID", "extra") + miss <- setdiff(req_cols, names(ped)) + if (length(miss)) + stop("ped is missing columns: ", paste(miss, collapse = ", ")) + + # ---- 1. ensure a unique row key ---- + + ped$newID <- seq_len(nrow(ped)) + + idsextras <- dplyr::filter(ped, .data$extra == TRUE) |> + dplyr::select("personID") |> + dplyr::pull() |> + unique() + + + # ---- 2. give every extra appearance a unique numeric personID ----------- + ped <- ped |> + dplyr::arrange(.data$personID, .data$newID) |> + dplyr::mutate( + coreID = .data$personID, + personID = dplyr::if_else( + .data$extra, + .data$personID + .data$newID / 1000, # numeric, unique + .data$personID + ) + ) + + ped <- ped |> # flag anyone with extra appearances + dplyr::mutate(extra = dplyr::case_when(.data$coreID %in% idsextras ~ TRUE, + .data$momID %in% idsextras ~ TRUE, + .data$dadID %in% idsextras ~ TRUE, + .data$spouseID %in% idsextras ~ TRUE, + TRUE ~ .data$extra)) + + + # ---- 3. isolate duplicates for distance logic --------------------------- + extras <- dplyr::filter(ped, .data$extra) + + # ---- 3a. attach relative coordinates (same helpers you use) ------------- + # Mother's coordinates + mom_coords <- getRelativeCoordinates( + ped = ped, + connections = extras, + relativeIDvar = "momID", + x_name = "x_mom", + y_name = "y_mom", + multiple = "any" + ) + + # Father's coordinates + dad_coords <- getRelativeCoordinates( + ped = ped, + connections = extras, + relativeIDvar = "dadID", + x_name = "x_dad", + y_name = "y_dad", + multiple = "any" + ) + + # Spouse's coordinates + spouse_coords <- getRelativeCoordinates( + ped = ped, + connections = extras, + relativeIDvar = "spouseID", + x_name = "x_spouse", + y_name = "y_spouse", + multiple = "all" + ) + + parent_hash_coords <- extras |> + dplyr::left_join(mom_coords, by = c("newID", "personID", "momID")) |> + dplyr::left_join(dad_coords, by = c("newID", "personID", "dadID")) |> + dplyr::filter(!is.na(.data$parent_hash)) |> + dplyr::mutate( + x_parent_hash = mean(c(.data$x_dad, .data$x_mom), na.rm = TRUE), + y_parent_hash = mean(c(.data$y_dad, .data$y_mom), na.rm = TRUE) + ) |> + dplyr::select(.data$newID, .data$personID, + .data$x_parent_hash, .data$y_parent_hash) + + extras <- extras |> + dplyr::left_join(mom_coords, by = c("newID", "personID", "momID")) |> + dplyr::left_join(dad_coords, by = c("newID", "personID", "dadID")) |> + dplyr::left_join(spouse_coords, by = c("newID", "personID", "spouseID")) |> + dplyr::left_join(parent_hash_coords, by = c("newID", "personID")) + + # ---- 3b. compute distance metrics -------------- + extras <- extras |> + dplyr::mutate( + dist_mom = computeDistance(method = "cityblock", + x1 = .data$x_pos, y1 = .data$y_pos, + x2 = .data$x_mom, y2 = .data$y_mom), + dist_dad = computeDistance(method = "cityblock", + x1 = .data$x_pos, y1 = .data$y_pos, + x2 = .data$x_dad, y2 = .data$y_dad), + dist_spouse = computeDistance(method = "cityblock", + x1 = .data$x_pos, y1 = .data$y_pos, + x2 = .data$x_spouse, y2 = .data$y_spouse), + + total_parent_dist = computeDistance(method = "cityblock", + x1 = .data$x_pos, y1 = .data$y_pos, + x2 = .data$x_parent_hash, y2 = .data$y_parent_hash), + + total_parent_dist2 = .data$dist_mom + .data$dist_dad + ) + + # ---- 4. choose winning duplicate per relationship ----------------------- + spouse_winner <- extras |> + dplyr::group_by(.data$coreID) |> + dplyr::slice_min(.data$dist_spouse, n = 1, with_ties = FALSE) |> + dplyr::ungroup() |> + dplyr::select(coreID, spouse_choice = .data$personID) + + parent_winner <- extras |> + dplyr::group_by(.data$coreID) |> + dplyr::slice_min(.data$total_parent_dist, n = 1, with_ties = FALSE) |> + dplyr::ungroup() |> + dplyr::select(coreID, parent_choice = .data$personID) + + # ---- 5. row‑wise relink using nearest appearance ------------------------- + + # lookup table: every appearance of every coreID + dup_xy <- ped |> + dplyr::select(personID, coreID, x_pos, y_pos) + + closest_dup <- function(target_core, x0, y0) { + cand <- dup_xy[dup_xy$coreID == target_core, ] + if (nrow(cand) == 0L) return(NA_real_) + cand$personID[ + which.min( + computeDistance(method = "cityblock", + x1= x0, y1=y0, + x2=cand$x_pos, y2=cand$y_pos) + ) + ] + } + + relink <- function(df, col) { + df |> + dplyr::rowwise() |> + dplyr::mutate( + "{col}" := { + tgt <- .data[[col]] + if (is.na(tgt)) NA_real_ + else closest_dup(tgt, .data$x_pos, .data$y_pos) + } + ) |> + dplyr::ungroup() + } + + ped <- ped |> + relink("spouseID") |> + relink("momID") |> + relink("dadID") + + # remove parent ids from all but the closest coreID, + # if there's no choice to be made, then keep existing momID +if(T){ + ped <- ped |> + dplyr::left_join(spouse_winner, by = "coreID") |> + dplyr::left_join(parent_winner, by = "coreID") |> + dplyr::mutate( + momID = dplyr::case_when(.data$personID == .data$parent_choice ~ .data$momID, + !is.na(.data$parent_choice) ~ NA_real_, + TRUE ~ .data$momID ), + + dadID = dplyr::case_when(.data$personID == .data$parent_choice ~ .data$dadID, + !is.na(.data$parent_choice) ~ NA_real_, + TRUE ~ .data$dadID ), + spouseID = dplyr::case_when(.data$personID == .data$spouse_choice ~ .data$spouseID, + !is.na(.data$spouse_choice) ~ NA_real_, + TRUE ~ .data$spouseID ) + + ) |> dplyr::select(-.data$parent_choice, -.data$spouse_choice) +} + + # rehash + ped <- ped |> + dplyr::mutate( + parent_hash = symKey(.data$momID, .data$dadID), + couple_hash = symKey(.data$personID, .data$spouseID) + ) |> + dplyr::mutate( + parent_hash = gsub("NA.NA", NA_real_, .data$parent_hash), + couple_hash = gsub("NA.NA", NA_real_, .data$couple_hash) + ) + + +print(ped) + return(ped) + } + + + + From b7bbc189e4ba222ca648be331154e621d96ee1b3 Mon Sep 17 00:00:00 2001 From: Mason Garrison Date: Wed, 14 May 2025 23:38:43 -0400 Subject: [PATCH 13/18] works in all but the oddest scenarios --- R/calcConnections.R | 22 ++++-- R/ggpedigree.R | 38 +++++++--- R/processExtras_replaceIDs.R | 142 +++++++++++++++++++++++++++++------ vignettes/plots.Rmd | 11 ++- 4 files changed, 168 insertions(+), 45 deletions(-) diff --git a/R/calcConnections.R b/R/calcConnections.R index 8e65f308..00dd9f80 100644 --- a/R/calcConnections.R +++ b/R/calcConnections.R @@ -68,7 +68,10 @@ calculateConnections <- function(ped, # If duplicated appearances exist, resolve which connections to keep if (sum(ped$extra) > 0) { - ped <- processExtras(ped, config = config) |> unique() + full_extra <- processExtras(ped, config = config) + + ped <- full_extra$ped |> unique() + } else { ped <- ped |> @@ -76,7 +79,6 @@ calculateConnections <- function(ped, coreID = .data$personID ) } - connections <- dplyr::select( .data = ped, "personID", @@ -239,18 +241,26 @@ calculateConnections <- function(ped, y_mid_sib = dplyr::if_else(link_as_sibling, y_mid_sib, NA_real_) ) - - - - +if(exists("full_extra")){ plot_connections <- list( connections = connections, + self_coords = full_extra$self_coords, connections_spouse_segment = build_connections_spouse_segment( ped = ped, connections_for_FOO = connections_for_spouses ) ) +}else{ + plot_connections <- list( + connections = connections, + self_coords = FALSE, + connections_spouse_segment = build_connections_spouse_segment( + ped = ped, + connections_for_FOO = connections_for_spouses + ) + ) +} return(plot_connections) } diff --git a/R/ggpedigree.R b/R/ggpedigree.R index 6ca094de..180f5bff 100644 --- a/R/ggpedigree.R +++ b/R/ggpedigree.R @@ -17,7 +17,7 @@ #' \item{code_male}{Integer or string. Value identifying males in the sex column. (typically 0 or 1) Default: 1.} #' \item{spouse_segment_color, self_segment_color, sibling_segment_color, parent_segment_color, offspring_segment_color}{Character. Line colors for respective connection types.} #' \item{label_text_size, point_size, line_width}{Numeric. Controls text size, point size, and line thickness.} -#' \item{generation_gap}{Numeric. Vertical spacing multiplier between generations. Default: 1.} +#' \item{generation_height}{Numeric. Vertical spacing multiplier between generations. Default: 1.} #' \item{unknown_shape, female_shape, male_shape, affected_shape}{Integers. Shape codes for plotting each group.} #' \item{sex_shape_labs}{Character vector of labels for the sex variable. (default: c("Female", "Male", "Unknown")} #' \item{unaffected, affected}{Values indicating unaffected/affected status.} @@ -61,7 +61,8 @@ ggPedigree <- function(ped, famID = "famID", label_text_size = 2, point_size = 4, line_width = 0.5, - generation_gap = 1, + generation_height = 1, + generation_width = 1, unknown_shape = 18, female_shape = 16, male_shape = 15, @@ -140,11 +141,14 @@ ggPedigree <- function(ped, famID = "famID", config = config ) - # Apply vertical spacing factor if generation_gap ≠ 1 - if (!isTRUE(all.equal(config$generation_gap, 1))) { - ds$y_pos <- ds$y_pos * config$generation_gap # expand/contract generations + # Apply vertical spacing factor if generation_height ≠ 1 + if (!isTRUE(all.equal(config$generation_height, 1))) { + ds$y_pos <- ds$y_pos * config$generation_height # expand/contract generations + } + # Apply horizontal spacing factor if generation_width ≠ 1 + if (!isTRUE(all.equal(config$generation_width, 1))) { + ds$x_pos <- ds$x_pos * config$generation_width # expand/contract generations } - # ----- # STEP 5: Compute Relationship Connections # ----- @@ -156,7 +160,7 @@ ggPedigree <- function(ped, famID = "famID", # ----- # STEP 6: Initialize Plot # ----- - gap_off <- 0.5 * config$generation_gap # single constant for all “stub” offsets + gap_off <- 0.5 * config$generation_height # single constant for all “stub” offsets p <- ggplot2::ggplot(ds, ggplot2::aes( x = .data$x_pos, @@ -294,7 +298,7 @@ ggPedigree <- function(ped, famID = "famID", if(config$include_labels == TRUE && config$label_method=="ggrepel"){ p <- p + ggrepel::geom_text_repel(ggplot2::aes(label = .data$personID), - nudge_y = -.10 * config$generation_gap, + nudge_y = -.10 * config$generation_height, size = config$label_text_size, na.rm = TRUE, max.overlaps = config$max_overlaps, @@ -304,7 +308,7 @@ ggPedigree <- function(ped, famID = "famID", } else if(config$include_labels == TRUE && config$label_method=="geom_label"){ p <- p + ggplot2::geom_label(ggplot2::aes(label = .data$personID), - nudge_y = -.25 * config$generation_gap, + nudge_y = -.25 * config$generation_height, size = config$label_text_size, na.rm = TRUE ) @@ -312,7 +316,7 @@ ggPedigree <- function(ped, famID = "famID", } else if(config$include_labels == TRUE || config$label_method=="geom_text"){ p <- p + ggplot2::geom_text(ggplot2::aes(label = .data$personID), - nudge_y = -.25 * config$generation_gap, + nudge_y = -.25 * config$generation_height, size = config$label_text_size, na.rm = TRUE ) @@ -320,9 +324,19 @@ ggPedigree <- function(ped, famID = "famID", # Self-segment (for duplicate layout appearances of same person) - if ("x_otherself" %in% names(connections)) { + if (inherits(plot_connections$self_coords, "data.frame")) { + + otherself <- plot_connections$self_coords |> filter(!is.na(.data$x_otherself)) |> + mutate( + otherself_xkey = symKey(.data$x_otherself, .data$x_pos)#, + # otherself_ykey = symKey(.data$y_otherself, .data$y_pos) + ) |> + # unique combinations of x_otherself and x_pos and y_otherself and y_pos + dplyr::distinct(.data$otherself_xkey, .keep_all = TRUE) + + p <- p + ggplot2::geom_curve( - data = dplyr::filter(connections,extra==TRUE), + data = otherself, ggplot2::aes( x = .data$x_otherself, xend = .data$x_pos, diff --git a/R/processExtras_replaceIDs.R b/R/processExtras_replaceIDs.R index c0298f38..4aacc149 100644 --- a/R/processExtras_replaceIDs.R +++ b/R/processExtras_replaceIDs.R @@ -37,6 +37,36 @@ processExtras <- function(ped, config = list()) { dplyr::pull() |> unique() + # check if momID == spouseID + if (any(ped$momID == ped$spouseID, na.rm = TRUE)) { + + ped <- ped |> + dplyr::mutate( + momSpouse = dplyr::if_else(.data$spouseID == .data$momID, + TRUE, + FALSE) + ) + } else { + ped <- ped |> + dplyr::mutate( + momSpouse = FALSE + ) + } + if (any(ped$dadID == ped$spouseID, na.rm = TRUE)) { + + ped <- ped |> + dplyr::mutate( + dadSpouse = dplyr::if_else(.data$spouseID == .data$dadID, + TRUE, + FALSE) + ) + } else { + ped <- ped |> + dplyr::mutate( + dadSpouse = FALSE + ) + } + # ---- 2. give every extra appearance a unique numeric personID ----------- ped <- ped |> @@ -47,8 +77,10 @@ processExtras <- function(ped, config = list()) { .data$extra, .data$personID + .data$newID / 1000, # numeric, unique .data$personID - ) - ) + ), + total_blue = .data$dadSpouse | .data$momSpouse + ) |> + select(-.data$dadSpouse, -.data$momSpouse) ped <- ped |> # flag anyone with extra appearances dplyr::mutate(extra = dplyr::case_when(.data$coreID %in% idsextras ~ TRUE, @@ -122,7 +154,7 @@ processExtras <- function(ped, config = list()) { x1 = .data$x_pos, y1 = .data$y_pos, x2 = .data$x_spouse, y2 = .data$y_spouse), - total_parent_dist = computeDistance(method = "cityblock", + total_parent_dist_cityblock = computeDistance(method = "cityblock", x1 = .data$x_pos, y1 = .data$y_pos, x2 = .data$x_parent_hash, y2 = .data$y_parent_hash), @@ -130,34 +162,66 @@ processExtras <- function(ped, config = list()) { ) # ---- 4. choose winning duplicate per relationship ----------------------- + spouse_winner <- extras |> - dplyr::group_by(.data$coreID) |> + dplyr::group_by(.data$coreID,.data$spouseID) |> dplyr::slice_min(.data$dist_spouse, n = 1, with_ties = FALSE) |> dplyr::ungroup() |> dplyr::select(coreID, spouse_choice = .data$personID) + +if(sum(ped$total_blue,na.rm = TRUE) == 0){ parent_winner <- extras |> dplyr::group_by(.data$coreID) |> - dplyr::slice_min(.data$total_parent_dist, n = 1, with_ties = FALSE) |> + dplyr::slice_min(.data$total_parent_dist_cityblock, n = 1, with_ties = FALSE) |> dplyr::ungroup() |> dplyr::select(coreID, parent_choice = .data$personID) - +} else{ + + # if there are spouseID == momID or spouseID == dadID, then parent choice needs to be the 2nd closest + parent_winner <- extras |> + dplyr::group_by(coreID) |> + dplyr::arrange(total_parent_dist2, .by_group = TRUE) |> + dplyr::mutate( + rank = dplyr::row_number(), # 1 = closest, 2 = second‑closest … + pick_rank = ifelse(any(.data$total_blue), 2L, 1L) # group‑level choice + ) |> + dplyr::filter(.data$rank == .data$pick_rank) |> + dplyr::ungroup() |> + dplyr::select(coreID, parent_choice = .data$personID) +} # ---- 5. row‑wise relink using nearest appearance ------------------------- + + + # lookup table: every appearance of every coreID dup_xy <- ped |> - dplyr::select(personID, coreID, x_pos, y_pos) + dplyr::select(personID, coreID, x_pos, y_pos, total_blue) closest_dup <- function(target_core, x0, y0) { cand <- dup_xy[dup_xy$coreID == target_core, ] if (nrow(cand) == 0L) return(NA_real_) - cand$personID[ - which.min( - computeDistance(method = "cityblock", - x1= x0, y1=y0, - x2=cand$x_pos, y2=cand$y_pos) - ) - ] + # compute Manhattan (“city‑block”) distance for all candidates + d <- computeDistance( + method = "cityblock", + x1 = x0, y1 = y0, + x2 = cand$x_pos, y2 = cand$y_pos + ) + ord <- order(d) # ascending distance + pick <- if (any(cand$total_blue, na.rm = TRUE)) 2L else 1L # 2nd if blue present, else 1st + + if (length(ord) < pick) pick <- 1L + + cand$personID[ord[pick]] + + # cand$personID[ + # which.min( + # computeDistance(method = "cityblock", + # x1= x0, y1=y0, + # x2=cand$x_pos, y2=cand$y_pos) + # ) + # ] } relink <- function(df, col) { @@ -173,14 +237,11 @@ processExtras <- function(ped, config = list()) { dplyr::ungroup() } - ped <- ped |> - relink("spouseID") |> - relink("momID") |> - relink("dadID") + # remove parent ids from all but the closest coreID, # if there's no choice to be made, then keep existing momID -if(T){ + ped <- ped |> dplyr::left_join(spouse_winner, by = "coreID") |> dplyr::left_join(parent_winner, by = "coreID") |> @@ -196,8 +257,14 @@ if(T){ !is.na(.data$spouse_choice) ~ NA_real_, TRUE ~ .data$spouseID ) - ) |> dplyr::select(-.data$parent_choice, -.data$spouse_choice) -} + ) |> dplyr::select(-.data$parent_choice, -.data$spouse_choice, + -starts_with("newID")) + ped <- ped |> + relink("spouseID") |> + relink("momID") |> + relink("dadID") + + # # rehash ped <- ped |> @@ -209,10 +276,39 @@ if(T){ parent_hash = gsub("NA.NA", NA_real_, .data$parent_hash), couple_hash = gsub("NA.NA", NA_real_, .data$couple_hash) ) + # ---- 6. remove duplicates and return ------------------------------------ + + # Coordinates of the individual's other appearance ("self") + self_coords <- extras |> + dplyr::left_join( + ped, + by = c("coreID"), + suffix = c("", "_other"), + # relationship = relationship, + multiple = "all" + ) |> + dplyr::filter(.data$personID != .data$personID_other) |> + dplyr::mutate( + x_otherself = .data$x_pos_other, + y_otherself = .data$y_pos_other + ) |> + dplyr::select( + .data$personID, + # .data$coreID, + .data$x_pos, + .data$y_pos, + .data$x_otherself, + .data$y_otherself + ) |> unique() + + +full_extra <- list( + ped = ped, + self_coords = self_coords + ) -print(ped) - return(ped) + return(full_extra) } diff --git a/vignettes/plots.Rmd b/vignettes/plots.Rmd index 9f603c77..785c8c1d 100644 --- a/vignettes/plots.Rmd +++ b/vignettes/plots.Rmd @@ -194,7 +194,8 @@ p <- ggPedigree( label_method="geom_text", affected = TRUE, unaffected = FALSE, - generation_gap = 2, + generation_height = 2, + generation_width = 1, affected_shape = 4, spouse_segment_color = "pink", sibling_segment_color = "blue", @@ -202,10 +203,12 @@ p <- ggPedigree( offspring_segment_color = "black") )# + facet_wrap(~famID, scales= "free") -p$connections%>%filter(personID ==61) %>% nrow() -p$connections%>%filter(personID ==61) %>% unique() +#p$connections%>%filter(personID ==60) %>% nrow() +#p$connections%>%filter(personID ==66) %>% unique() +#p$connections%>%filter(personID ==65) %>% unique() -p$connections%>%filter(personID ==65) %>% unique() +#p$connections%>%filter(personID >=61 & + # personID <62 ) %>% unique() p$plot ``` From f981366bbb9be6c081312e0b9971fc554e8a0f4d Mon Sep 17 00:00:00 2001 From: Mason Garrison Date: Thu, 15 May 2025 10:07:05 -0400 Subject: [PATCH 14/18] move --- R/{processExtras.R => processExtras.X} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename R/{processExtras.R => processExtras.X} (100%) diff --git a/R/processExtras.R b/R/processExtras.X similarity index 100% rename from R/processExtras.R rename to R/processExtras.X From 83186bc26a3133aa91e707e1f2d227c02de03752 Mon Sep 17 00:00:00 2001 From: Mason Garrison Date: Thu, 15 May 2025 10:07:36 -0400 Subject: [PATCH 15/18] rename --- R/{processExtras_replaceIDs.R => processExtras.R} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename R/{processExtras_replaceIDs.R => processExtras.R} (100%) diff --git a/R/processExtras_replaceIDs.R b/R/processExtras.R similarity index 100% rename from R/processExtras_replaceIDs.R rename to R/processExtras.R From 698d3f404a82e7b273823d06e14eaa139bf82361 Mon Sep 17 00:00:00 2001 From: Mason Garrison Date: Thu, 15 May 2025 16:43:35 -0400 Subject: [PATCH 16/18] primetime? --- DESCRIPTION | 1 + R/calcConnections.R | 306 +++++++++-------- R/calcConnectionsHelpers.R | 19 +- R/calcCoordinates.R | 44 +-- R/ggpedigree.R | 98 +++--- R/processExtras.R | 463 +++++++++++++------------- man/calculateConnections.Rd | 2 +- man/computeDistance.Rd | 28 ++ man/getMidpoints.Rd | 2 +- man/getRelativeCoordinates.Rd | 7 +- man/ggPedigree.Rd | 21 +- man/processExtras.Rd | 2 +- tests/testthat/test-calcCoordinates.R | 15 +- vignettes/plots.R | 122 ++++++- vignettes/plots.Rmd | 125 ++++++- vignettes/plots.html | 151 +++++++-- vignettes/plots_morecomplexity.xmd | 4 +- 17 files changed, 900 insertions(+), 510 deletions(-) create mode 100644 man/computeDistance.Rd diff --git a/DESCRIPTION b/DESCRIPTION index 9028ee0f..496c76b6 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -30,6 +30,7 @@ Suggests: OpenMx, NlsyLinks, rmarkdown, + tibble, testthat (>= 3.0.0) VignetteBuilder: knitr diff --git a/R/calcConnections.R b/R/calcConnections.R index 00dd9f80..eaf01859 100644 --- a/R/calcConnections.R +++ b/R/calcConnections.R @@ -55,51 +55,49 @@ calculateConnections <- function(ped, # create a unique parent_hash for each individual # this will be used to identify siblings if (!all("parent_hash" %in% names(ped))) { - ped <- ped |> - dplyr::mutate( - parent_hash = symKey(.data$momID, .data$dadID), - couple_hash = symKey(.data$personID, .data$spouseID) - ) |> - dplyr::mutate( - parent_hash = gsub("NA.NA", NA, .data$parent_hash), - couple_hash = gsub("NA.NA", NA, .data$couple_hash) - ) + ped <- ped |> + dplyr::mutate( + parent_hash = symKey(.data$momID, .data$dadID), + couple_hash = symKey(.data$personID, .data$spouseID) + ) |> + dplyr::mutate( + parent_hash = gsub("NA.NA", NA, .data$parent_hash), + couple_hash = gsub("NA.NA", NA, .data$couple_hash) + ) } # If duplicated appearances exist, resolve which connections to keep if (sum(ped$extra) > 0) { full_extra <- processExtras(ped, config = config) - ped <- full_extra$ped |> unique() + ped <- full_extra$ped |> unique() + } else { + ped <- ped |> + dplyr::mutate( + coreID = .data$personID + ) + } + connections <- dplyr::select( + .data = ped, + "personID", + "x_pos", "y_pos", + "dadID", "momID", "parent_hash", "couple_hash", + "spouseID", + "famID", + "extra" + ) |> unique() + + # no duplications, so just use the same connections + connections_for_sibs <- connections_for_spouses <- connections_for_dads <- connections_for_moms <- connections -} else { - ped <- ped |> + connections <- connections |> dplyr::mutate( - coreID = .data$personID + link_as_mom = TRUE, + link_as_dad = TRUE, + link_as_spouse = TRUE, + link_as_sibling = TRUE ) -} - connections <- dplyr::select( - .data = ped, - "personID", - "x_pos", "y_pos", - "dadID", "momID", "parent_hash", "couple_hash", - "spouseID", - "famID", - "extra" - ) |> unique() - - # no duplications, so just use the same connections - connections_for_sibs <- connections_for_spouses <- connections_for_dads <- connections_for_moms <- connections - - - connections <- connections |> - dplyr::mutate( - link_as_mom = TRUE, - link_as_dad = TRUE, - link_as_spouse = TRUE, - link_as_sibling = TRUE - ) @@ -140,8 +138,9 @@ calculateConnections <- function(ped, ) |> dplyr::select( "personID", "spouseID", - "x_spouse", "y_spouse","couple_hash" - ) |> unique() + "x_spouse", "y_spouse", "couple_hash" + ) |> + unique() # Combine mom, dad, and spouse coordinates connections <- connections |> @@ -152,8 +151,9 @@ calculateConnections <- function(ped, by = c("personID", "dadID") ) |> dplyr::left_join(spouse_connections, - by = c("personID", "spouseID","couple_hash") - ) |> unique() + by = c("personID", "spouseID", "couple_hash") + ) |> + unique() # Calculate midpoints between mom and dad in child row @@ -171,13 +171,16 @@ calculateConnections <- function(ped, dplyr::first(.data$y_mom) )), .groups = "drop" - ) |> unique() + ) |> + unique() # Calculate midpoints between spouses spouse_midpoints <- connections |> - dplyr::filter(.data$link_as_spouse, - !is.na(.data$spouseID)) |> - dplyr::group_by(.data$spouseID,.data$couple_hash) |> + dplyr::filter( + .data$link_as_spouse, + !is.na(.data$spouseID) + ) |> + dplyr::group_by(.data$spouseID, .data$couple_hash) |> dplyr::summarize( x_mid_spouse = mean(c( dplyr::first(.data$x_pos), @@ -188,15 +191,16 @@ calculateConnections <- function(ped, dplyr::first(.data$y_spouse) )), .groups = "drop" - ) |> unique() + ) |> + unique() # Calculate sibling group midpoints - sibling_midpoints <- connections|> + sibling_midpoints <- connections |> dplyr::filter( .data$link_as_sibling, - !is.na(.data$momID) & !is.na(.data$dadID) & # biological parents defined - !is.na(.data$x_mom) & !is.na(.data$y_mom) & # mom’s coordinates linked - !is.na(.data$x_dad) & !is.na(.data$y_dad) # dad’s coordinates linked + !is.na(.data$momID) & !is.na(.data$dadID) & # biological parents defined + !is.na(.data$x_mom) & !is.na(.data$y_mom) & # mom’s coordinates linked + !is.na(.data$x_dad) & !is.na(.data$y_dad) # dad’s coordinates linked ) |> dplyr::group_by( .data$parent_hash, @@ -207,9 +211,10 @@ calculateConnections <- function(ped, x_mid_sib = mean(.data$x_pos), y_mid_sib = dplyr::first(.data$y_pos), .groups = "drop" - ) |> unique() + ) |> + unique() -#print(parent_midpoints) + # print(parent_midpoints) # Merge midpoints into connections connections <- connections |> dplyr::left_join(parent_midpoints, @@ -219,118 +224,125 @@ calculateConnections <- function(ped, by = c("spouseID", "couple_hash") ) |> dplyr::left_join(sibling_midpoints, - by = c("parent_hash","x_mom", "y_mom", - "x_dad", "y_dad") + by = c( + "parent_hash", "x_mom", "y_mom", + "x_dad", "y_dad" + ) ) |> - dplyr::mutate( - x_mid_sib = dplyr::case_when( - is.na(.data$x_dad) & is.na(.data$x_mom) ~ NA_real_, - !is.na(.data$x_mid_sib) ~ .data$x_mid_sib, - (!is.na(.data$momID) & !is.na(.data$x_mom)) | (!is.na(.data$dadID) & !is.na(.data$x_dad)) ~ .data$x_pos, + dplyr::mutate( + x_mid_sib = dplyr::case_when( + is.na(.data$x_dad) & is.na(.data$x_mom) ~ NA_real_, + !is.na(.data$x_mid_sib) ~ .data$x_mid_sib, + (!is.na(.data$momID) & !is.na(.data$x_mom)) | (!is.na(.data$dadID) & !is.na(.data$x_dad)) ~ .data$x_pos, TRUE ~ NA_real_ - ), + ), y_mid_sib = dplyr::case_when( - is.na(.data$y_dad) & is.na(.data$y_mom) ~ NA_real_, + is.na(.data$y_dad) & is.na(.data$y_mom) ~ NA_real_, !is.na(.data$y_mid_sib) ~ .data$y_mid_sib, - (!is.na(.data$momID) & !is.na(.data$y_mom)) | (!is.na(.data$dadID) & !is.na(.data$y_dad)) ~ .data$y_pos, - TRUE ~ NA_real_ - ) - ) |> unique() |> + (!is.na(.data$momID) & !is.na(.data$y_mom)) | (!is.na(.data$dadID) & !is.na(.data$y_dad)) ~ .data$y_pos, + TRUE ~ NA_real_ + ) + ) |> + unique() |> dplyr::mutate( - x_mid_sib = dplyr::if_else(link_as_sibling, x_mid_sib, NA_real_), - y_mid_sib = dplyr::if_else(link_as_sibling, y_mid_sib, NA_real_) + x_mid_sib = dplyr::if_else(.data$link_as_sibling, .data$x_mid_sib, NA_real_), + y_mid_sib = dplyr::if_else(.data$link_as_sibling, .data$y_mid_sib, NA_real_) ) -if(exists("full_extra")){ - plot_connections <- list( - connections = connections, - self_coords = full_extra$self_coords, - connections_spouse_segment = build_connections_spouse_segment( - ped = ped, - connections_for_FOO = connections_for_spouses - ) + if (exists("full_extra")) { + plot_connections <- list( + connections = connections, + self_coords = full_extra$self_coords, + connections_spouse_segment = build_connections_spouse_segment( + ped = ped, + connections_for_FOO = connections_for_spouses + ) ) -}else{ - plot_connections <- list( - connections = connections, - self_coords = FALSE, - connections_spouse_segment = build_connections_spouse_segment( - ped = ped, - connections_for_FOO = connections_for_spouses + } else { + plot_connections <- list( + connections = connections, + self_coords = FALSE, + connections_spouse_segment = build_connections_spouse_segment( + ped = ped, + connections_for_FOO = connections_for_spouses + ) ) - ) - -} + } return(plot_connections) } -build_connections_spouse_segment <- function(ped,connections_for_FOO, use_hash = TRUE){ -if(use_hash==TRUE){ - # I want to make segments for each hash, because some people have multiple spouses - # this is to add those missing segments - parent_hash_connections <- ped |> dplyr::select("parent_hash") |> - mutate(parent1 = # needs to be the first part of the string - stringr::str_extract(.data$parent_hash, "^[^.]+"), - parent2 = # needs to be the second part of the string\ - stringr::str_extract(.data$parent_hash, "(?<=\\.)[^.]+") - ) |> - dplyr::left_join(connections_for_FOO |> mutate(personID = paste0(.data$personID)), - by = c("parent1" = "personID"), - suffix = c("", "_parent1"), - multiple = "any" - ) |> dplyr::left_join(connections_for_FOO |> mutate(personID = paste0(.data$personID)), - by = c("parent2" = "personID"), - suffix = c("", "_parent2"), - multiple = "any" - ) |> - dplyr::mutate( - x_start = .data$x_pos, - x_end = .data$x_pos_parent2, - y_start = .data$y_pos, - y_end = .data$y_pos_parent2 - ) |> select( - -"parent_hash", - -"parent1", - -"parent2", - -"x_pos", - -"y_pos", - -"x_pos_parent2", - -"y_pos_parent2" - ) - - +build_connections_spouse_segment <- function(ped, connections_for_FOO, use_hash = TRUE) { + if (use_hash == TRUE) { + # I want to make segments for each hash, because some people have multiple spouses + # this is to add those missing segments + parent_hash_connections <- ped |> + dplyr::select("parent_hash") |> + dplyr::mutate( + parent1 = # needs to be the first part of the string + stringr::str_extract(.data$parent_hash, "^[^.]+"), + parent2 = # needs to be the second part of the string\ + stringr::str_extract(.data$parent_hash, "(?<=\\.)[^.]+") + ) |> + dplyr::left_join(connections_for_FOO |> + dplyr::mutate(personID = paste0(.data$personID)), + by = c("parent1" = "personID"), + suffix = c("", "_parent1"), + multiple = "any" + ) |> + dplyr::left_join(connections_for_FOO |> + dplyr::mutate(personID = paste0(.data$personID)), + by = c("parent2" = "personID"), + suffix = c("", "_parent2"), + multiple = "any" + ) |> + dplyr::mutate( + x_start = .data$x_pos, + x_end = .data$x_pos_parent2, + y_start = .data$y_pos, + y_end = .data$y_pos_parent2 + ) |> + dplyr::select( + -"parent_hash", + -"parent1", + -"parent2", + -"x_pos", + -"y_pos", + -"x_pos_parent2", + -"y_pos_parent2" + ) - # Get spouse coordinates -}else{ - # spouses - # Get spouse coordinates - spouse_connections <- ped |> - dplyr::select( - "personID", "x_pos", - "y_pos", "spouseID" - ) |> dplyr::filter(!is.na(.data$spouseID)) |> - dplyr::left_join(connections_for_FOO, - by = c("spouseID" = "personID"), - suffix = c("", "_spouse"), - multiple = "any" - ) |> - dplyr::rename( - x_spouse = "x_pos_spouse", - y_spouse = "y_pos_spouse" - ) |> unique() |> - dplyr::mutate( - x_start = .data$x_spouse, - x_end = .data$x_pos, - y_start = .data$y_spouse, - y_end = .data$y_pos - ) |> select( - -"spouseID_spouse" - ) -} + # Get spouse coordinates + } else { + # spouses + # Get spouse coordinates + spouse_connections <- ped |> + dplyr::select( + "personID", "x_pos", + "y_pos", "spouseID" + ) |> + dplyr::filter(!is.na(.data$spouseID)) |> + dplyr::left_join(connections_for_FOO, + by = c("spouseID" = "personID"), + suffix = c("", "_spouse"), + multiple = "any" + ) |> + dplyr::rename( + x_spouse = "x_pos_spouse", + y_spouse = "y_pos_spouse" + ) |> + unique() |> + dplyr::mutate( + x_start = .data$x_spouse, + x_end = .data$x_pos, + y_start = .data$y_spouse, + y_end = .data$y_pos + ) |> + dplyr::select( + -"spouseID_spouse" + ) + } return(parent_hash_connections) } - - diff --git a/R/calcConnectionsHelpers.R b/R/calcConnectionsHelpers.R index bd12eb1c..a651e7de 100644 --- a/R/calcConnectionsHelpers.R +++ b/R/calcConnectionsHelpers.R @@ -16,20 +16,18 @@ computeDistance <- function(x1, y1, x2, y2, method = "euclidean", p = NULL) { - method <- tolower(method) - if(is.null(p)) { + if (is.null(p)) { p <- switch(method, - euclidean = 2, - cityblock = 1, - stop("Invalid distance method. Choose from 'euclidean', 'cityblock', or specify p.") + euclidean = 2, + cityblock = 1, + stop("Invalid distance method. Choose from 'euclidean', 'cityblock', or specify p.") ) } # Calculate Minkowski distance ((abs(x1 - x2))^p + (abs(y1 - y2))^p)^(1 / p) - } #' Compute midpoints across grouped coordinates @@ -214,8 +212,8 @@ getRelativeCoordinates <- function(ped, connections, relativeIDvar, x_name, y_na !!y_name ) } - if(only_unique == TRUE){ - rel_connections <- unique(rel_connections) + if (only_unique == TRUE) { + rel_connections <- unique(rel_connections) } return(rel_connections) @@ -224,6 +222,7 @@ getRelativeCoordinates <- function(ped, connections, relativeIDvar, x_name, y_na symKey <- function(id1, id2, sep = ".") { dplyr::if_else(id1 < id2, - paste0(id1, sep, id2), - paste0(id2, sep, id1)) + paste0(id1, sep, id2), + paste0(id2, sep, id1) + ) } diff --git a/R/calcCoordinates.R b/R/calcCoordinates.R index f7048372..ad06104b 100644 --- a/R/calcCoordinates.R +++ b/R/calcCoordinates.R @@ -32,9 +32,7 @@ calculateCoordinates <- function(ped, personID = "ID", momID = "momID", dadID = "dadID", spouseID = "spouseID", sexVar = "sex", code_male = NULL, - config = list()) - { - + config = list()) { if (!inherits(ped, "data.frame")) { stop("ped should be a data.frame or inherit to a data.frame") } @@ -68,22 +66,27 @@ calculateCoordinates <- function(ped, personID = "ID", momID = "momID", momid = ped[[momID]], sex = ped_recode[[sexVar]], ) - - if(!is.null(config$hints)) { - -#' Check if hints are provided - autohint <- tryCatch(kinship2::autohint(ped_ped,config$hints, - align = config$ped_align, - packed = config$ped_packed), - error = function(e) kinship2::autohint(ped_ped, - align = config$ped_align, - packed = config$ped_packed) - , - finally = warning("Your hints caused an error and were not used, using default hints instead")) +# + if ("hints" %in% names(config)) { + # Check if hints are provided + autohint <- tryCatch( + kinship2::autohint(ped_ped, config$hints, + align = config$ped_align, + packed = config$ped_packed + ), + error = function(e) { + warning("Your hints caused an error and were not used. Using default hints instead.") + kinship2::autohint( + ped_ped, + align = config$ped_align, + packed = config$ped_packed + )} + ) } else { autohint <- kinship2::autohint(ped_ped, - align = config$ped_align, - packed = config$ped_packed) + align = config$ped_align, + packed = config$ped_packed + ) } @@ -107,8 +110,8 @@ calculateCoordinates <- function(ped, personID = "ID", momID = "momID", # Flatten coordinate matrix - # pos_vector <- as.vector(pos$pos) -# spouse_vector <- as.vector(pos$spouse) + # pos_vector <- as.vector(pos$pos) + # spouse_vector <- as.vector(pos$spouse) # Initialize coordinate columns in the data frame @@ -141,7 +144,6 @@ calculateCoordinates <- function(ped, personID = "ID", momID = "momID", x_coords[i] <- nid_pos[i, "col"] x_pos[i] <- pos$pos[nid_pos[i, "row"], nid_pos[i, "col"]] spouse_vector[i] <- pos$spouse[nid_pos[i, "row"], nid_pos[i, "col"]] - } # ----- @@ -213,5 +215,3 @@ calculateCoordinates <- function(ped, personID = "ID", momID = "momID", return(ped) } - - diff --git a/R/ggpedigree.R b/R/ggpedigree.R index 180f5bff..197a2daa 100644 --- a/R/ggpedigree.R +++ b/R/ggpedigree.R @@ -12,6 +12,7 @@ #' @param dadID Character string specifying the column name for father IDs. Defaults to "dadID". #' @param status_col Character string specifying the column name for affected status. Defaults to NULL. #' @param debug Logical. If TRUE, prints debugging information. Default: FALSE. +#' @param hints Data frame with hints for layout adjustments. Default: NULL. #' @param config A list of configuration options for customizing the plot. The list can include: #' \describe{ #' \item{code_male}{Integer or string. Value identifying males in the sex column. (typically 0 or 1) Default: 1.} @@ -43,20 +44,25 @@ ggPedigree <- function(ped, famID = "famID", dadID = "dadID", status_col = NULL, config = list(), - debug = FALSE) { + debug = FALSE, + hints = NULL, + ...) { # ----- # STEP 1: Configuration and Preparation # ----- # Set default styling and layout parameters default_config <- list( + apply_default_theme = TRUE, + apply_default_scales = TRUE, spouse_segment_color = "black", self_segment_color = "purple", sibling_segment_color = "black", parent_segment_color = "black", offspring_segment_color = "black", include_labels = TRUE, - label_method="ggrepel", + label_method = "ggrepel", + label_text_angle = 0, code_male = 1, label_text_size = 2, point_size = 4, @@ -73,8 +79,8 @@ ggPedigree <- function(ped, famID = "famID", sex_color = TRUE, status_vals = c(1, 0), max_overlaps = 15, - id_segment_color = NA, - hints = NULL + id_segment_color = NA#, + # hints = NULL ) @@ -102,10 +108,10 @@ ggPedigree <- function(ped, famID = "famID", # Clean duplicated famID columns if present if ("famID.y" %in% names(ds_ped)) { - ds_ped <- dplyr::select(ds_ped, -.data$famID.y) + ds_ped <- dplyr::select(.data=ds_ped, -"famID.y") } if ("famID.x" %in% names(ds_ped)) { - ds_ped <- dplyr::rename(ds_ped, famID = .data$famID.x) + ds_ped <- dplyr::rename(.data=ds_ped, famID = "famID.x") } # If personID is not "personID", rename to "personID" internally @@ -180,7 +186,7 @@ ggPedigree <- function(ped, famID = "famID", x = .data$x_spouse, xend = .data$x_pos, y = .data$y_spouse, - yend = .data$y_pos + yend = .data$y_pos ), linewidth = config$line_width, color = config$spouse_segment_color, @@ -200,7 +206,7 @@ ggPedigree <- function(ped, famID = "famID", linewidth = config$line_width, color = config$parent_segment_color, na.rm = TRUE - ) + + ) + # Mid-sibling to parents midpoint ggplot2::geom_segment( data = connections, @@ -295,29 +301,32 @@ ggPedigree <- function(ped, famID = "famID", # STEP 9: Add Labels # ----- # Add labels to the points using ggrepel for better visibility - if(config$include_labels == TRUE && config$label_method=="ggrepel"){ + if (config$include_labels == TRUE && config$label_method == "ggrepel") { p <- p + ggrepel::geom_text_repel(ggplot2::aes(label = .data$personID), - nudge_y = -.10 * config$generation_height, + nudge_y = -.10*config$generation_height, size = config$label_text_size, na.rm = TRUE, max.overlaps = config$max_overlaps, - segment.size = config$line_width *.5, + segment.size = config$line_width * .5, + angle = config$label_text_angle, segment.color = config$id_segment_color ) - } else if(config$include_labels == TRUE && config$label_method=="geom_label"){ + } else if (config$include_labels == TRUE && config$label_method == "geom_label") { p <- p + ggplot2::geom_label(ggplot2::aes(label = .data$personID), - nudge_y = -.25 * config$generation_height, + nudge_y = -.10 * config$generation_height, size = config$label_text_size, + angle = config$label_text_angle, na.rm = TRUE - ) - } else if(config$include_labels == TRUE || config$label_method=="geom_text"){ + ) + } else if (config$include_labels == TRUE || config$label_method == "geom_text") { p <- p + ggplot2::geom_text(ggplot2::aes(label = .data$personID), - nudge_y = -.25 * config$generation_height, + nudge_y = -.10*config$generation_height, size = config$label_text_size, + angle = config$label_text_angle, na.rm = TRUE ) } @@ -325,11 +334,11 @@ ggPedigree <- function(ped, famID = "famID", # Self-segment (for duplicate layout appearances of same person) if (inherits(plot_connections$self_coords, "data.frame")) { - - otherself <- plot_connections$self_coords |> filter(!is.na(.data$x_otherself)) |> - mutate( - otherself_xkey = symKey(.data$x_otherself, .data$x_pos)#, - # otherself_ykey = symKey(.data$y_otherself, .data$y_pos) + otherself <- plot_connections$self_coords |> + dplyr::filter(!is.na(.data$x_otherself)) |> + dplyr::mutate( + otherself_xkey = symKey(.data$x_otherself, .data$x_pos) # , + # otherself_ykey = symKey(.data$y_otherself, .data$y_pos) ) |> # unique combinations of x_otherself and x_pos and y_otherself and y_pos dplyr::distinct(.data$otherself_xkey, .keep_all = TRUE) @@ -349,7 +358,6 @@ ggPedigree <- function(ped, famID = "famID", curvature = -0.2, na.rm = TRUE ) - } @@ -359,29 +367,35 @@ ggPedigree <- function(ped, famID = "famID", # ----- p <- p + - ggplot2::scale_y_reverse() + - ggplot2::theme_minimal() + - ggplot2::theme( - axis.title.y = ggplot2::element_blank(), - axis.text.y = ggplot2::element_blank(), - axis.ticks.y = ggplot2::element_blank(), - panel.grid.major = ggplot2::element_blank(), - panel.grid.minor = ggplot2::element_blank(), - panel.background = ggplot2::element_blank(), - axis.title.x = ggplot2::element_blank(), - axis.text.x = ggplot2::element_blank(), - axis.ticks.x = ggplot2::element_blank() - ) + ggplot2::scale_y_reverse() + + if(config$apply_default_theme == TRUE) { + p <- p + + ggplot2::theme_minimal() + + ggplot2::theme( + axis.title.y = ggplot2::element_blank(), + axis.text.y = ggplot2::element_blank(), + axis.ticks.y = ggplot2::element_blank(), + panel.grid.major = ggplot2::element_blank(), + panel.grid.minor = ggplot2::element_blank(), + panel.background = ggplot2::element_blank(), + axis.title.x = ggplot2::element_blank(), + axis.text.x = ggplot2::element_blank(), + axis.ticks.x = ggplot2::element_blank() + ) +} + # ----- # STEP 11: Final Legend Adjustments # ----- # Adjust legend labels and colors based on the configuration - p <- p + ggplot2::scale_shape_manual( - values = config$shape_vals, - labels = config$shape_labs - ) + if(config$apply_default_scales == TRUE) { + p <- p + ggplot2::scale_shape_manual( + values = config$shape_vals, + labels = config$shape_labs + ) # Add alpha scale for affected status if applicable if (!is.null(status_col) && config$sex_color == TRUE) { @@ -405,7 +419,9 @@ ggPedigree <- function(ped, famID = "famID", } else { p <- p + ggplot2::labs(shape = "Sex") } -if (debug==TRUE) { + } + + if (debug == TRUE) { return(list( plot = p, data = ds, @@ -416,8 +432,8 @@ if (debug==TRUE) { # If debug is FALSE, return only the plot return(p) } -} +} #' @rdname ggPedigree #' @export diff --git a/R/processExtras.R b/R/processExtras.R index 4aacc149..abe6ff11 100644 --- a/R/processExtras.R +++ b/R/processExtras.R @@ -1,5 +1,3 @@ - - #' Process duplicate appearances of individuals in a pedigree layout #' #' Resolves layout conflicts when the same individual appears in multiple places @@ -15,86 +13,89 @@ #' @keywords internal processExtras <- function(ped, config = list()) { + # ---- sanity checks ------------------------------------------------------- + if (!inherits(ped, "data.frame")) { + stop("ped must be a data.frame") + } + req_cols <- c( + "personID", "x_pos", "y_pos", + "momID", "dadID", "spouseID", "extra" + ) + miss <- setdiff(req_cols, names(ped)) + if (length(miss)) { + stop("ped is missing columns: ", paste(miss, collapse = ", ")) + } - # ---- sanity checks ------------------------------------------------------- - if (!inherits(ped, "data.frame")) { - stop("ped must be a data.frame") - } - - req_cols <- c("personID", "x_pos", "y_pos", - "momID", "dadID", "spouseID", "extra") - miss <- setdiff(req_cols, names(ped)) - if (length(miss)) - stop("ped is missing columns: ", paste(miss, collapse = ", ")) - - # ---- 1. ensure a unique row key ---- - - ped$newID <- seq_len(nrow(ped)) + # ---- 1. ensure a unique row key ---- - idsextras <- dplyr::filter(ped, .data$extra == TRUE) |> - dplyr::select("personID") |> - dplyr::pull() |> - unique() + ped$newID <- seq_len(nrow(ped)) - # check if momID == spouseID - if (any(ped$momID == ped$spouseID, na.rm = TRUE)) { + idsextras <- dplyr::filter(ped, .data$extra == TRUE) |> + dplyr::select("personID") |> + dplyr::pull() |> + unique() + # check if momID == spouseID + if (any(ped$momID == ped$spouseID, na.rm = TRUE)) { ped <- ped |> dplyr::mutate( momSpouse = dplyr::if_else(.data$spouseID == .data$momID, - TRUE, - FALSE) - ) - } else { - ped <- ped |> - dplyr::mutate( - momSpouse = FALSE - ) - } - if (any(ped$dadID == ped$spouseID, na.rm = TRUE)) { - - ped <- ped |> - dplyr::mutate( - dadSpouse = dplyr::if_else(.data$spouseID == .data$dadID, - TRUE, - FALSE) + TRUE, + FALSE ) - } else { - ped <- ped |> - dplyr::mutate( - dadSpouse = FALSE + ) + } else { + ped <- ped |> + dplyr::mutate( + momSpouse = FALSE + ) + } + if (any(ped$dadID == ped$spouseID, na.rm = TRUE)) { + ped <- ped |> + dplyr::mutate( + dadSpouse = dplyr::if_else(.data$spouseID == .data$dadID, + TRUE, + FALSE ) - } - - - # ---- 2. give every extra appearance a unique numeric personID ----------- + ) + } else { ped <- ped |> - dplyr::arrange(.data$personID, .data$newID) |> dplyr::mutate( - coreID = .data$personID, - personID = dplyr::if_else( - .data$extra, - .data$personID + .data$newID / 1000, # numeric, unique - .data$personID - ), - total_blue = .data$dadSpouse | .data$momSpouse - ) |> - select(-.data$dadSpouse, -.data$momSpouse) + dadSpouse = FALSE + ) + } + + + # ---- 2. give every extra appearance a unique numeric personID ----------- + ped <- ped |> + dplyr::arrange(.data$personID, .data$newID) |> + dplyr::mutate( + coreID = .data$personID, + personID = dplyr::if_else( + .data$extra, + .data$personID + .data$newID / 1000, # numeric, unique + .data$personID + ), + total_blue = .data$dadSpouse | .data$momSpouse + ) |> + dplyr::select(-.data$dadSpouse, -.data$momSpouse) - ped <- ped |> # flag anyone with extra appearances - dplyr::mutate(extra = dplyr::case_when(.data$coreID %in% idsextras ~ TRUE, - .data$momID %in% idsextras ~ TRUE, - .data$dadID %in% idsextras ~ TRUE, - .data$spouseID %in% idsextras ~ TRUE, - TRUE ~ .data$extra)) + ped <- ped |> # flag anyone with extra appearances + dplyr::mutate(extra = dplyr::case_when( + .data$coreID %in% idsextras ~ TRUE, + .data$momID %in% idsextras ~ TRUE, + .data$dadID %in% idsextras ~ TRUE, + .data$spouseID %in% idsextras ~ TRUE, + TRUE ~ .data$extra + )) - # ---- 3. isolate duplicates for distance logic --------------------------- - extras <- dplyr::filter(ped, .data$extra) + # ---- 3. isolate duplicates for distance logic --------------------------- + extras <- dplyr::filter(ped, .data$extra) - # ---- 3a. attach relative coordinates (same helpers you use) ------------- - # Mother's coordinates + # ---- 3a. attach relative coordinates (same helpers you use) ------------- + # Mother's coordinates mom_coords <- getRelativeCoordinates( ped = ped, connections = extras, @@ -124,193 +125,209 @@ processExtras <- function(ped, config = list()) { multiple = "all" ) - parent_hash_coords <- extras |> - dplyr::left_join(mom_coords, by = c("newID", "personID", "momID")) |> - dplyr::left_join(dad_coords, by = c("newID", "personID", "dadID")) |> - dplyr::filter(!is.na(.data$parent_hash)) |> - dplyr::mutate( - x_parent_hash = mean(c(.data$x_dad, .data$x_mom), na.rm = TRUE), - y_parent_hash = mean(c(.data$y_dad, .data$y_mom), na.rm = TRUE) - ) |> - dplyr::select(.data$newID, .data$personID, - .data$x_parent_hash, .data$y_parent_hash) + parent_hash_coords <- extras |> + dplyr::left_join(mom_coords, by = c("newID", "personID", "momID")) |> + dplyr::left_join(dad_coords, by = c("newID", "personID", "dadID")) |> + dplyr::filter(!is.na(.data$parent_hash)) |> + dplyr::mutate( + x_parent_hash = mean(c(.data$x_dad, .data$x_mom), na.rm = TRUE), + y_parent_hash = mean(c(.data$y_dad, .data$y_mom), na.rm = TRUE) + ) |> + dplyr::select( + .data$newID, .data$personID, + .data$x_parent_hash, .data$y_parent_hash + ) - extras <- extras |> - dplyr::left_join(mom_coords, by = c("newID", "personID", "momID")) |> - dplyr::left_join(dad_coords, by = c("newID", "personID", "dadID")) |> - dplyr::left_join(spouse_coords, by = c("newID", "personID", "spouseID")) |> - dplyr::left_join(parent_hash_coords, by = c("newID", "personID")) + extras <- extras |> + dplyr::left_join(mom_coords, by = c("newID", "personID", "momID")) |> + dplyr::left_join(dad_coords, by = c("newID", "personID", "dadID")) |> + dplyr::left_join(spouse_coords, by = c("newID", "personID", "spouseID")) |> + dplyr::left_join(parent_hash_coords, by = c("newID", "personID")) - # ---- 3b. compute distance metrics -------------- - extras <- extras |> - dplyr::mutate( - dist_mom = computeDistance(method = "cityblock", - x1 = .data$x_pos, y1 = .data$y_pos, - x2 = .data$x_mom, y2 = .data$y_mom), - dist_dad = computeDistance(method = "cityblock", - x1 = .data$x_pos, y1 = .data$y_pos, - x2 = .data$x_dad, y2 = .data$y_dad), - dist_spouse = computeDistance(method = "cityblock", - x1 = .data$x_pos, y1 = .data$y_pos, - x2 = .data$x_spouse, y2 = .data$y_spouse), - - total_parent_dist_cityblock = computeDistance(method = "cityblock", - x1 = .data$x_pos, y1 = .data$y_pos, - x2 = .data$x_parent_hash, y2 = .data$y_parent_hash), - - total_parent_dist2 = .data$dist_mom + .data$dist_dad - ) + # ---- 3b. compute distance metrics -------------- + extras <- extras |> + dplyr::mutate( + dist_mom = computeDistance( + method = "cityblock", + x1 = .data$x_pos, y1 = .data$y_pos, + x2 = .data$x_mom, y2 = .data$y_mom + ), + dist_dad = computeDistance( + method = "cityblock", + x1 = .data$x_pos, y1 = .data$y_pos, + x2 = .data$x_dad, y2 = .data$y_dad + ), + dist_spouse = computeDistance( + method = "cityblock", + x1 = .data$x_pos, y1 = .data$y_pos, + x2 = .data$x_spouse, y2 = .data$y_spouse + ), + total_parent_dist_cityblock = computeDistance( + method = "cityblock", + x1 = .data$x_pos, y1 = .data$y_pos, + x2 = .data$x_parent_hash, y2 = .data$y_parent_hash + ), + total_parent_dist2 = .data$dist_mom + .data$dist_dad + ) - # ---- 4. choose winning duplicate per relationship ----------------------- + # ---- 4. choose winning duplicate per relationship ----------------------- - spouse_winner <- extras |> - dplyr::group_by(.data$coreID,.data$spouseID) |> - dplyr::slice_min(.data$dist_spouse, n = 1, with_ties = FALSE) |> - dplyr::ungroup() |> - dplyr::select(coreID, spouse_choice = .data$personID) + spouse_winner <- extras |> + dplyr::group_by(.data$coreID, .data$spouseID) |> + dplyr::slice_min(.data$dist_spouse, n = 1, with_ties = FALSE) |> + dplyr::ungroup() |> + dplyr::select("coreID", spouse_choice = .data$personID) -if(sum(ped$total_blue,na.rm = TRUE) == 0){ + if (sum(ped$total_blue, na.rm = TRUE) == 0) { parent_winner <- extras |> dplyr::group_by(.data$coreID) |> dplyr::slice_min(.data$total_parent_dist_cityblock, n = 1, with_ties = FALSE) |> dplyr::ungroup() |> - dplyr::select(coreID, parent_choice = .data$personID) -} else{ - - # if there are spouseID == momID or spouseID == dadID, then parent choice needs to be the 2nd closest - parent_winner <- extras |> - dplyr::group_by(coreID) |> - dplyr::arrange(total_parent_dist2, .by_group = TRUE) |> - dplyr::mutate( - rank = dplyr::row_number(), # 1 = closest, 2 = second‑closest … - pick_rank = ifelse(any(.data$total_blue), 2L, 1L) # group‑level choice - ) |> - dplyr::filter(.data$rank == .data$pick_rank) |> - dplyr::ungroup() |> - dplyr::select(coreID, parent_choice = .data$personID) -} - # ---- 5. row‑wise relink using nearest appearance ------------------------- + dplyr::select("coreID", parent_choice = .data$personID) + } else { + # if there are spouseID == momID or spouseID == dadID, then parent choice needs to be the 2nd closest + parent_winner <- extras |> + dplyr::group_by(coreID) |> + dplyr::arrange(.data$total_parent_dist2, .by_group = TRUE) |> + dplyr::mutate( + rank = dplyr::row_number(), # 1 = closest, 2 = second‑closest … + pick_rank = base::ifelse(any(.data$total_blue), 2L, 1L) # group‑level choice + ) |> + dplyr::filter(.data$rank == .data$pick_rank) |> + dplyr::ungroup() |> + dplyr::select("coreID", parent_choice = .data$personID) + } + # ---- 5. row‑wise relink using nearest appearance ------------------------- - # lookup table: every appearance of every coreID - dup_xy <- ped |> - dplyr::select(personID, coreID, x_pos, y_pos, total_blue) + # lookup table: every appearance of every coreID + dup_xy <- ped |> + dplyr::select("personID", "coreID", "x_pos", "y_pos", "total_blue") - closest_dup <- function(target_core, x0, y0) { - cand <- dup_xy[dup_xy$coreID == target_core, ] - if (nrow(cand) == 0L) return(NA_real_) - # compute Manhattan (“city‑block”) distance for all candidates - d <- computeDistance( - method = "cityblock", - x1 = x0, y1 = y0, - x2 = cand$x_pos, y2 = cand$y_pos - ) - ord <- order(d) # ascending distance - pick <- if (any(cand$total_blue, na.rm = TRUE)) 2L else 1L # 2nd if blue present, else 1st + closest_dup <- function(target_core, x0, y0) { + cand <- dup_xy[dup_xy$coreID == target_core, ] + if (nrow(cand) == 0L) { + return(NA_real_) + } + # compute Manhattan (“city‑block”) distance for all candidates + d <- computeDistance( + method = "cityblock", + x1 = x0, y1 = y0, + x2 = cand$x_pos, y2 = cand$y_pos + ) + ord <- order(d) # ascending distance + pick <- if(any(cand$total_blue, na.rm = TRUE)){ + 2L }else{ 1L} # 2nd if blue present, else 1st - if (length(ord) < pick) pick <- 1L + if(length(ord) < pick) pick <- 1L - cand$personID[ord[pick]] + cand$personID[ord[pick]] # cand$personID[ # which.min( # computeDistance(method = "cityblock", - # x1= x0, y1=y0, - # x2=cand$x_pos, y2=cand$y_pos) - # ) + # x1= x0, y1=y0, + # x2=cand$x_pos, y2=cand$y_pos) + # ) # ] - } + } - relink <- function(df, col) { - df |> - dplyr::rowwise() |> - dplyr::mutate( - "{col}" := { - tgt <- .data[[col]] - if (is.na(tgt)) NA_real_ - else closest_dup(tgt, .data$x_pos, .data$y_pos) + relink <- function(df, col) { + df |> + dplyr::rowwise() |> + dplyr::mutate( + "{col}" := { + tgt <- .data[[col]] + if (is.na(tgt)) { + NA_real_ + } else { + closest_dup(tgt, .data$x_pos, .data$y_pos) } - ) |> - dplyr::ungroup() - } - - + } + ) |> + dplyr::ungroup() + } - # remove parent ids from all but the closest coreID, - # if there's no choice to be made, then keep existing momID - ped <- ped |> - dplyr::left_join(spouse_winner, by = "coreID") |> - dplyr::left_join(parent_winner, by = "coreID") |> - dplyr::mutate( - momID = dplyr::case_when(.data$personID == .data$parent_choice ~ .data$momID, - !is.na(.data$parent_choice) ~ NA_real_, - TRUE ~ .data$momID ), - - dadID = dplyr::case_when(.data$personID == .data$parent_choice ~ .data$dadID, - !is.na(.data$parent_choice) ~ NA_real_, - TRUE ~ .data$dadID ), - spouseID = dplyr::case_when(.data$personID == .data$spouse_choice ~ .data$spouseID, - !is.na(.data$spouse_choice) ~ NA_real_, - TRUE ~ .data$spouseID ) - - ) |> dplyr::select(-.data$parent_choice, -.data$spouse_choice, - -starts_with("newID")) - ped <- ped |> - relink("spouseID") |> - relink("momID") |> - relink("dadID") - # + # remove parent ids from all but the closest coreID, + # if there's no choice to be made, then keep existing momID - # rehash - ped <- ped |> - dplyr::mutate( - parent_hash = symKey(.data$momID, .data$dadID), - couple_hash = symKey(.data$personID, .data$spouseID) - ) |> - dplyr::mutate( - parent_hash = gsub("NA.NA", NA_real_, .data$parent_hash), - couple_hash = gsub("NA.NA", NA_real_, .data$couple_hash) + ped <- ped |> + dplyr::left_join(spouse_winner, by = "coreID") |> + dplyr::left_join(parent_winner, by = "coreID") |> + dplyr::mutate( + momID = dplyr::case_when( + .data$personID == .data$parent_choice ~ .data$momID, + !is.na(.data$parent_choice) ~ NA_real_, + TRUE ~ .data$momID + ), + dadID = dplyr::case_when( + .data$personID == .data$parent_choice ~ .data$dadID, + !is.na(.data$parent_choice) ~ NA_real_, + TRUE ~ .data$dadID + ), + spouseID = dplyr::case_when( + .data$personID == .data$spouse_choice ~ .data$spouseID, + !is.na(.data$spouse_choice) ~ NA_real_, + TRUE ~ .data$spouseID ) - # ---- 6. remove duplicates and return ------------------------------------ - - # Coordinates of the individual's other appearance ("self") - self_coords <- extras |> - dplyr::left_join( - ped, - by = c("coreID"), - suffix = c("", "_other"), - # relationship = relationship, - multiple = "all" - ) |> - dplyr::filter(.data$personID != .data$personID_other) |> - dplyr::mutate( - x_otherself = .data$x_pos_other, - y_otherself = .data$y_pos_other - ) |> - dplyr::select( - .data$personID, - # .data$coreID, - .data$x_pos, - .data$y_pos, - .data$x_otherself, - .data$y_otherself - ) |> unique() - - - -full_extra <- list( - ped = ped, - self_coords = self_coords + ) |> + dplyr::select( + -.data$parent_choice, -.data$spouse_choice, + -dplyr::starts_with("newID") ) + ped <- ped |> + relink("spouseID") |> + relink("momID") |> + relink("dadID") - return(full_extra) - } - - + # + # rehash + ped <- ped |> + dplyr::mutate( + parent_hash = symKey(.data$momID, .data$dadID), + couple_hash = symKey(.data$personID, .data$spouseID) + ) |> + dplyr::mutate( + parent_hash = gsub("NA.NA", NA_real_, .data$parent_hash), + couple_hash = gsub("NA.NA", NA_real_, .data$couple_hash) + ) + # ---- 6. remove duplicates and return ------------------------------------ + + # Coordinates of the individual's other appearance ("self") + self_coords <- extras |> + dplyr::left_join( + ped, + by = c("coreID"), + suffix = c("", "_other"), + # relationship = relationship, + multiple = "all" + ) |> + dplyr::filter(.data$personID != .data$personID_other) |> + dplyr::mutate( + x_otherself = .data$x_pos_other, + y_otherself = .data$y_pos_other + ) |> + # dplyr::select( + # .data$personID, + # # .data$coreID, + # .data$x_pos, + # .data$y_pos, + # .data$x_otherself, + # .data$y_otherself, + # + # ) |> + unique() + + full_extra <- list( + ped = ped, + self_coords = self_coords + ) + return(full_extra) +} diff --git a/man/calculateConnections.Rd b/man/calculateConnections.Rd index 8f334678..6ec2293e 100644 --- a/man/calculateConnections.Rd +++ b/man/calculateConnections.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/calcCoordinates.R +% Please edit documentation in R/calcConnections.R \name{calculateConnections} \alias{calculateConnections} \title{Calculate connections for a pedigree dataset} diff --git a/man/computeDistance.Rd b/man/computeDistance.Rd new file mode 100644 index 00000000..e42f244a --- /dev/null +++ b/man/computeDistance.Rd @@ -0,0 +1,28 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/calcConnectionsHelpers.R +\name{computeDistance} +\alias{computeDistance} +\title{Compute distance between two points} +\usage{ +computeDistance(x1, y1, x2, y2, method = "euclidean", p = NULL) +} +\arguments{ +\item{x1}{Numeric. X-coordinate of the first point.} + +\item{y1}{Numeric. Y-coordinate of the first point.} + +\item{x2}{Numeric. X-coordinate of the second point.} + +\item{y2}{Numeric. Y-coordinate of the second point.} + +\item{method}{Character. Method of distance calculation. Options are "euclidean", "cityblock", and "Minkowski".} + +\item{p}{Numeric. The order of the Minkowski distance. If NULL, defaults to 2 for Euclidean and 1 for Manhattan. If +Minkowski method is used, p should be specified.} +} +\description{ +This function calculates the distance between two points in a 2D space using +Minkowski distance. It can be used to compute Euclidean or Manhattan distance. +It is a utility function for calculating distances in pedigree layouts. +Defaults to Euclidean distance if no method is specified. +} diff --git a/man/getMidpoints.Rd b/man/getMidpoints.Rd index 1e74d3f0..f1d7b2eb 100644 --- a/man/getMidpoints.Rd +++ b/man/getMidpoints.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/calcCoordinates.R +% Please edit documentation in R/calcConnectionsHelpers.R \name{getMidpoints} \alias{getMidpoints} \title{Compute midpoints across grouped coordinates} diff --git a/man/getRelativeCoordinates.Rd b/man/getRelativeCoordinates.Rd index 60a7da73..b280cc0c 100644 --- a/man/getRelativeCoordinates.Rd +++ b/man/getRelativeCoordinates.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/calcCoordinates.R +% Please edit documentation in R/calcConnectionsHelpers.R \name{getRelativeCoordinates} \alias{getRelativeCoordinates} \title{Get coordinate positions of relatives for each individual} @@ -11,7 +11,8 @@ getRelativeCoordinates( x_name, y_name, personID = "personID", - multiple = "all" + multiple = "all", + only_unique = TRUE ) } \arguments{ @@ -28,6 +29,8 @@ getRelativeCoordinates( \item{personID}{Character string specifying the column name for individual IDs.} \item{multiple}{Character. Specifies how to handle multiple matches. Options are "all" or "any".} + +\item{only_unique}{Logical. If TRUE, return only unique rows. Defaults to TRUE.} } \value{ A `data.frame` with columns: diff --git a/man/ggPedigree.Rd b/man/ggPedigree.Rd index a83109f4..ff5c4ec0 100644 --- a/man/ggPedigree.Rd +++ b/man/ggPedigree.Rd @@ -13,7 +13,10 @@ ggPedigree( momID = "momID", dadID = "dadID", status_col = NULL, - config = list() + config = list(), + debug = FALSE, + hints = NULL, + ... ) ggpedigree( @@ -23,7 +26,10 @@ ggpedigree( momID = "momID", dadID = "dadID", status_col = NULL, - config = list() + config = list(), + debug = FALSE, + hints = NULL, + ... ) ggped( @@ -33,7 +39,10 @@ ggped( momID = "momID", dadID = "dadID", status_col = NULL, - config = list() + config = list(), + debug = FALSE, + hints = NULL, + ... ) } \arguments{ @@ -54,7 +63,7 @@ ggped( \item{code_male}{Integer or string. Value identifying males in the sex column. (typically 0 or 1) Default: 1.} \item{spouse_segment_color, self_segment_color, sibling_segment_color, parent_segment_color, offspring_segment_color}{Character. Line colors for respective connection types.} \item{label_text_size, point_size, line_width}{Numeric. Controls text size, point size, and line thickness.} - \item{generation_gap}{Numeric. Vertical spacing multiplier between generations. Default: 1.} + \item{generation_height}{Numeric. Vertical spacing multiplier between generations. Default: 1.} \item{unknown_shape, female_shape, male_shape, affected_shape}{Integers. Shape codes for plotting each group.} \item{sex_shape_labs}{Character vector of labels for the sex variable. (default: c("Female", "Male", "Unknown")} \item{unaffected, affected}{Values indicating unaffected/affected status.} @@ -62,6 +71,10 @@ ggped( \item{max_overlaps}{Maximum number of overlaps allowed in repelled labels.} \item{id_segment_color}{Color used for label connector lines.} }} + +\item{debug}{Logical. If TRUE, prints debugging information. Default: FALSE.} + +\item{hints}{Data frame with hints for layout adjustments. Default: NULL.} } \value{ A `ggplot` object rendering the pedigree diagram. diff --git a/man/processExtras.Rd b/man/processExtras.Rd index 57183be4..ab0ec543 100644 --- a/man/processExtras.Rd +++ b/man/processExtras.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/calcCoordinates.R +% Please edit documentation in R/processExtras.R \name{processExtras} \alias{processExtras} \title{Process duplicate appearances of individuals in a pedigree layout} diff --git a/tests/testthat/test-calcCoordinates.R b/tests/testthat/test-calcCoordinates.R index 9eb79ec9..64306503 100644 --- a/tests/testthat/test-calcCoordinates.R +++ b/tests/testthat/test-calcCoordinates.R @@ -61,7 +61,7 @@ test_that("calculateConnections returns expected structure", { "x_mid_sib", "y_mid_sib" ) - expect_true(all(expected_cols %in% names(conns))) + expect_true(all(expected_cols %in% names(conns$connections))) }) @@ -125,18 +125,13 @@ test_that("getRelativeCoordinates returns expected coordinates for mother", { test_that("broken hints doesn't cause a fatal error", { library(BGmisc) data("potter") - potter$momID[1] <- NA - potter$dadID[1] <- NA # Test with hints expect_warning( ggPedigree(potter, - famID = "famID", - personID = "personID", - config= list(hints = TRUE) + famID = "famID", + personID = "personID", + config = list(hints = TRUE) ) ) -} - ) - - +}) diff --git a/vignettes/plots.R b/vignettes/plots.R index 1c47ea7c..345f50a1 100644 --- a/vignettes/plots.R +++ b/vignettes/plots.R @@ -75,8 +75,6 @@ ggPedigree( ) ) - - ## ----------------------------------------------------------------------------- p + facet_wrap(~famID, scales = "free_x") @@ -104,7 +102,9 @@ p + library(BGmisc) # helper utilities & example data data("inbreeding") -df <- inbreeding +df <- inbreeding %>% filter( + famID %in% c(5, 7), + ) p <- ggPedigree( @@ -112,15 +112,121 @@ p <- ggPedigree( famID = "famID", personID = "ID", status_col = "proband", +# debug = TRUE, config = list( code_male = 0, sex_color = F, - label_method="geom_text", + # label_method = "geom_text", affected = TRUE, unaffected = FALSE, - generation_gap = 2, - affected_shape = 4) -) + facet_wrap(~famID, scales= "free") + generation_height = 2, + generation_width = 1, + affected_shape = 4, + spouse_segment_color = "pink", + sibling_segment_color = "blue", + parent_segment_color = "green", + offspring_segment_color = "black" + ) +) + +# p$connections%>%filter(personID ==60) %>% nrow() +# p$connections%>%filter(personID ==66) %>% unique() +# p$connections%>%filter(personID ==65) %>% unique() + +# p$connections%>%filter(personID >=61 & +# personID <62 ) %>% unique() + +p + facet_wrap(~famID, scales= "free") #+ scale_color_viridis( + # discrete = TRUE, + # labels = c("TRUE", "FALSE") +# ) + theme_bw(base_size = 14) + guides(colour="none", shape="none") + +## ----------------------------------------------------------------------------- +library(tibble) + +pedigree_df <- tribble( + ~personID, ~momID, ~dadID, ~sex, ~famID, + 10011, NA, NA, 0, 1, + 10012, NA, NA, 1, 1, + 10021, NA, NA, 1, 1, + 10022, 10011, 10012, 1, 1, + 10023, 10011, 10012, 0, 1, + 10024, NA, NA, 0, 1, + 10025, NA, NA, 0, 1, + 10026, 10011, 10012, 0, 1, + 10027, 10011, 10012, 1, 1, + 10031, 10023, 10021, 0, 1, + 10032, 10023, 10021, 1, 1, + 10033, 10023, 10021, 1, 1, + 10034, 10023, 10021, 1, 1, + 10035, 10023, 10021, 0, 1, + 10036, 10024, 10022, 1, 1, + 10037, 10024, 10022, 0, 1, + 10038, 10025, 10027, 1, 1, + 10039, 10025, 10027, 0, 1, + 10310, 10025, 10027, 1, 1, + 10311, 10025, 10027, 1, 1, + 10312, 10025, 10027, 0, 1, + 10011, NA, NA, 0, 2, + 10012, NA, NA, 1, 2, + 10021, NA, NA, 0, 2, + 10022, 10011, 10012, 0, 2, + 10023, 10011, 10012, 1, 2, + 10024, 10011, 10012, 1, 2, + 10025, NA, NA, 1, 2, + 10026, 10011, 10012, 0, 2, + 10027, NA, NA, 1, 2, + 10031, 10021, 10023, 1, 2, + 10032, 10021, 10023, 0, 2, + 10033, 10021, 10023, 1, 2, + 10034, 10022, 10025, 0, 2, + 10035, 10022, 10025, 0, 2, + 10036, 10022, 10025, 1, 2, + 100310, 10022, 10025, 1, 2, + 10037, 10026, 10027, 0, 2, + 10038, 10026, 10027, 0, 2, + 10039, 10026, 10027, 0, 2, + 100311, 10026, 10027, 1, 2, + 100312, 10026, 10027, 1, 2 +) %>% mutate (proband = TRUE) + +#pedigree_df <- recodeSex(pedigree_df,code_male = 1, recode_male = "M") +pedigree_df$personID[pedigree_df$famID == 1] <- pedigree_df$personID[pedigree_df$famID == 1]-10000 +pedigree_df$momID[pedigree_df$famID == 1] <- pedigree_df$momID[pedigree_df$famID == 1]-10000 +pedigree_df$dadID[pedigree_df$famID == 1] <- pedigree_df$dadID[pedigree_df$famID == 1]-10000 + + + +p <- ggPedigree( + pedigree_df, + famID = "famID", + personID = "personID", + status_col = "proband", +# debug = TRUE, + config = list( + code_male = 1, + sex_color = F, + apply_default_scales = FALSE, + label_method = "geom_text", + # affected = TRUE, + # unaffected = FALSE, + generation_height = 1, + generation_width = 1, + affected_shape = 4, + spouse_segment_color = "black", + sibling_segment_color = "black", + parent_segment_color = "black", + offspring_segment_color = "black" + ) +) + +p + scale_shape_manual( + values = c(16, 15, 15), + labels = c("Female", "Male", "Unknown") + ) + + guides(colour="none", shape="none") + # guides(colour="none", shape="none") + +#facet_wrap(~famID, scales= "free") + -p diff --git a/vignettes/plots.Rmd b/vignettes/plots.Rmd index 785c8c1d..9d368b36 100644 --- a/vignettes/plots.Rmd +++ b/vignettes/plots.Rmd @@ -134,8 +134,6 @@ ggPedigree( unaffected = FALSE ) ) - - ``` # Multiple families in one graphic @@ -177,21 +175,21 @@ p + library(BGmisc) # helper utilities & example data data("inbreeding") -df <- inbreeding +df <- inbreeding %>% filter( + famID %in% c(5, 7), + ) p <- ggPedigree( - df %>% filter( - famID %in% c(5,7), - ), + df, famID = "famID", personID = "ID", status_col = "proband", - debug = TRUE, +# debug = TRUE, config = list( code_male = 0, sex_color = F, - label_method="geom_text", + # label_method = "geom_text", affected = TRUE, unaffected = FALSE, generation_height = 2, @@ -200,15 +198,110 @@ p <- ggPedigree( spouse_segment_color = "pink", sibling_segment_color = "blue", parent_segment_color = "green", - offspring_segment_color = "black") -)# + facet_wrap(~famID, scales= "free") + offspring_segment_color = "black" + ) +) + +# p$connections%>%filter(personID ==60) %>% nrow() +# p$connections%>%filter(personID ==66) %>% unique() +# p$connections%>%filter(personID ==65) %>% unique() + +# p$connections%>%filter(personID >=61 & +# personID <62 ) %>% unique() + +p + facet_wrap(~famID, scales= "free") #+ scale_color_viridis( + # discrete = TRUE, + # labels = c("TRUE", "FALSE") +# ) + theme_bw(base_size = 14) + guides(colour="none", shape="none") +``` + +# Power paper +```{r} +library(tibble) + +pedigree_df <- tribble( + ~personID, ~momID, ~dadID, ~sex, ~famID, + 10011, NA, NA, 0, 1, + 10012, NA, NA, 1, 1, + 10021, NA, NA, 1, 1, + 10022, 10011, 10012, 1, 1, + 10023, 10011, 10012, 0, 1, + 10024, NA, NA, 0, 1, + 10025, NA, NA, 0, 1, + 10026, 10011, 10012, 0, 1, + 10027, 10011, 10012, 1, 1, + 10031, 10023, 10021, 0, 1, + 10032, 10023, 10021, 1, 1, + 10033, 10023, 10021, 1, 1, + 10034, 10023, 10021, 1, 1, + 10035, 10023, 10021, 0, 1, + 10036, 10024, 10022, 1, 1, + 10037, 10024, 10022, 0, 1, + 10038, 10025, 10027, 1, 1, + 10039, 10025, 10027, 0, 1, + 10310, 10025, 10027, 1, 1, + 10311, 10025, 10027, 1, 1, + 10312, 10025, 10027, 0, 1, + 10011, NA, NA, 0, 2, + 10012, NA, NA, 1, 2, + 10021, NA, NA, 0, 2, + 10022, 10011, 10012, 0, 2, + 10023, 10011, 10012, 1, 2, + 10024, 10011, 10012, 1, 2, + 10025, NA, NA, 1, 2, + 10026, 10011, 10012, 0, 2, + 10027, NA, NA, 1, 2, + 10031, 10021, 10023, 1, 2, + 10032, 10021, 10023, 0, 2, + 10033, 10021, 10023, 1, 2, + 10034, 10022, 10025, 0, 2, + 10035, 10022, 10025, 0, 2, + 10036, 10022, 10025, 1, 2, + 100310, 10022, 10025, 1, 2, + 10037, 10026, 10027, 0, 2, + 10038, 10026, 10027, 0, 2, + 10039, 10026, 10027, 0, 2, + 100311, 10026, 10027, 1, 2, + 100312, 10026, 10027, 1, 2 +) %>% mutate (proband = TRUE) + +#pedigree_df <- recodeSex(pedigree_df,code_male = 1, recode_male = "M") +pedigree_df$personID[pedigree_df$famID == 1] <- pedigree_df$personID[pedigree_df$famID == 1]-10000 +pedigree_df$momID[pedigree_df$famID == 1] <- pedigree_df$momID[pedigree_df$famID == 1]-10000 +pedigree_df$dadID[pedigree_df$famID == 1] <- pedigree_df$dadID[pedigree_df$famID == 1]-10000 + + + +p <- ggPedigree( + pedigree_df, + famID = "famID", + personID = "personID", + status_col = "proband", +# debug = TRUE, + config = list( + code_male = 1, + sex_color = F, + apply_default_scales = FALSE, + label_method = "geom_text", + # affected = TRUE, + # unaffected = FALSE, + generation_height = 1, + generation_width = 1, + affected_shape = 4, + spouse_segment_color = "black", + sibling_segment_color = "black", + parent_segment_color = "black", + offspring_segment_color = "black" + ) +) -#p$connections%>%filter(personID ==60) %>% nrow() -#p$connections%>%filter(personID ==66) %>% unique() -#p$connections%>%filter(personID ==65) %>% unique() +p + scale_shape_manual( + values = c(16, 15, 15), + labels = c("Female", "Male", "Unknown") + ) + + guides(colour="none", shape="none") + # guides(colour="none", shape="none") + +#facet_wrap(~famID, scales= "free") -#p$connections%>%filter(personID >=61 & - # personID <62 ) %>% unique() -p$plot ``` diff --git a/vignettes/plots.html b/vignettes/plots.html index 36f798d3..17502698 100644 --- a/vignettes/plots.html +++ b/vignettes/plots.html @@ -492,8 +492,7 @@

    Changing the layout

    labels = c("Female", "Male", "Unknown") ) #> Scale for colour is already present. -#> Adding another scale for colour, which will replace the -#> existing scale. +#> Adding another scale for colour, which will replace the existing scale.

    @@ -501,26 +500,134 @@

    More complex examples

    library(BGmisc) # helper utilities & example data
     data("inbreeding")
     
    -df <- inbreeding 
    -
    -
    -p <- ggPedigree(
    -  df,
    -  famID = "famID",
    -  personID = "ID",
    -  status_col = "proband",
    -  config = list(
    -    code_male = 0,
    -    sex_color = F,
    -    label_method="geom_text",
    -    affected = TRUE,
    -    unaffected = FALSE,
    -    generation_gap = 2,
    -    affected_shape = 4)
    -) + facet_wrap(~famID, scales= "free")
    -
    -p
    -

    +df <- inbreeding %>% filter( + famID %in% c(5, 7), + ) + + +p <- ggPedigree( + df, + famID = "famID", + personID = "ID", + status_col = "proband", +# debug = TRUE, + config = list( + code_male = 0, + sex_color = F, + # label_method = "geom_text", + affected = TRUE, + unaffected = FALSE, + generation_height = 2, + generation_width = 1, + affected_shape = 4, + spouse_segment_color = "pink", + sibling_segment_color = "blue", + parent_segment_color = "green", + offspring_segment_color = "black" + ) +) + +# p$connections%>%filter(personID ==60) %>% nrow() +# p$connections%>%filter(personID ==66) %>% unique() +# p$connections%>%filter(personID ==65) %>% unique() + +# p$connections%>%filter(personID >=61 & +# personID <62 ) %>% unique() + +p + facet_wrap(~famID, scales= "free") #+ scale_color_viridis(
    +

    +
     #   discrete = TRUE,
    + #   labels = c("TRUE", "FALSE")
    +#  )  + theme_bw(base_size = 14)  +  guides(colour="none", shape="none")
    + +
    +

    Power paper

    +
    library(tibble)
    +
    +pedigree_df <- tribble(
    +  ~personID, ~momID, ~dadID, ~sex, ~famID,
    +  10011,     NA,     NA,     0,    1,
    +  10012,     NA,     NA,     1,    1,
    +  10021,     NA,     NA,     1,    1,
    +  10022,  10011,  10012,     1,    1,
    +  10023,  10011,  10012,     0,    1,
    +  10024,     NA,     NA,     0,    1,
    +  10025,     NA,     NA,     0,    1,
    +  10026,  10011,  10012,     0,    1,
    +  10027,  10011,  10012,     1,    1,
    +  10031,  10023,  10021,     0,    1,
    +  10032,  10023,  10021,     1,    1,
    +  10033,  10023,  10021,     1,    1,
    +  10034,  10023,  10021,     1,    1,
    +  10035,  10023,  10021,     0,    1,
    +  10036,  10024,  10022,     1,    1,
    +  10037,  10024,  10022,     0,    1,
    +  10038,  10025,  10027,     1,    1,
    +  10039,  10025,  10027,     0,    1,
    +  10310, 10025,  10027,     1,    1,
    +  10311, 10025,  10027,     1,    1,
    +  10312, 10025,  10027,     0,    1,
    +  10011,     NA,     NA,     0,    2,
    +  10012,     NA,     NA,     1,    2,
    +  10021,     NA,     NA,     0,    2,
    +  10022,  10011,  10012,     0,    2,
    +  10023,  10011,  10012,     1,    2,
    +  10024,  10011,  10012,     1,    2,
    +  10025,     NA,     NA,     1,    2,
    +  10026,  10011,  10012,     0,    2,
    +  10027,     NA,     NA,     1,    2,
    +  10031,  10021,  10023,     1,    2,
    +  10032,  10021,  10023,     0,    2,
    +  10033,  10021,  10023,     1,    2,
    +  10034,  10022,  10025,     0,    2,
    +  10035,  10022,  10025,     0,    2,
    +  10036,  10022,  10025,     1,    2,
    +  100310, 10022,  10025,     1,    2,
    +  10037,  10026,  10027,     0,    2,
    +  10038,  10026,  10027,     0,    2,
    +  10039,  10026,  10027,     0,    2,
    +  100311, 10026,  10027,     1,    2,
    +  100312, 10026,  10027,     1,    2
    +) %>% mutate (proband = TRUE)
    +
    +#pedigree_df <- recodeSex(pedigree_df,code_male = 1, recode_male = "M")
    +pedigree_df$personID[pedigree_df$famID == 1] <- pedigree_df$personID[pedigree_df$famID == 1]-10000
    +pedigree_df$momID[pedigree_df$famID == 1] <- pedigree_df$momID[pedigree_df$famID == 1]-10000
    +pedigree_df$dadID[pedigree_df$famID == 1] <- pedigree_df$dadID[pedigree_df$famID == 1]-10000
    +
    +
    +
    +p <- ggPedigree(
    +  pedigree_df,
    +  famID = "famID",
    +  personID = "personID",
    +  status_col = "proband",
    +#  debug = TRUE,
    +  config = list(
    +    code_male = 1,
    +    sex_color = F,
    +    apply_default_scales = FALSE,
    +   label_method = "geom_text",
    + #   affected = TRUE,
    + #   unaffected = FALSE,
    +    generation_height = 1,
    +    generation_width = 1,
    +    affected_shape = 4,
    +    spouse_segment_color = "black",
    +    sibling_segment_color = "black",
    +    parent_segment_color = "black",
    +    offspring_segment_color = "black"
    +  )
    +)
    +
    +p + scale_shape_manual(
    +      values = c(16, 15, 15),
    +      labels =  c("Female", "Male", "Unknown")
    +    ) +
    +  guides(colour="none", shape="none")
    +

    +
     # guides(colour="none", shape="none") + 
    +#facet_wrap(~famID, scales= "free") 
    diff --git a/vignettes/plots_morecomplexity.xmd b/vignettes/plots_morecomplexity.xmd index 85216c53..ef32aadc 100644 --- a/vignettes/plots_morecomplexity.xmd +++ b/vignettes/plots_morecomplexity.xmd @@ -127,8 +127,8 @@ ggPedigree(df_repaired, code_male = "M", affected = 1, affected_shape = 4, - point_size = 3, - label_text_size =1, + point_size = .5, + label_text_size =.5, spouse_segment_color = "pink", sibling_segment_color = "blue", parent_segment_color = "green", From 67b18896d2240388ab5965a9368f6f9666f21133 Mon Sep 17 00:00:00 2001 From: Mason Garrison Date: Thu, 15 May 2025 16:52:43 -0400 Subject: [PATCH 17/18] ready for update --- .Rbuildignore | 2 +- DESCRIPTION | 1 + R/ggpedigree.R | 1 + R/processExtras.R | 4 ++-- man/ggPedigree.Rd | 2 ++ 5 files changed, 7 insertions(+), 3 deletions(-) diff --git a/.Rbuildignore b/.Rbuildignore index d8d42c09..19a02a58 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -20,4 +20,4 @@ CITATION.cff$ ^docs$ ^pkgdown$ ^kinship2 - Shortcut - +\.X$ diff --git a/DESCRIPTION b/DESCRIPTION index 496c76b6..e6a0024a 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -20,6 +20,7 @@ Imports: ggrepel, rlang, dplyr, + stringr, utils Suggests: EasyMx, diff --git a/R/ggpedigree.R b/R/ggpedigree.R index 197a2daa..4ab13b99 100644 --- a/R/ggpedigree.R +++ b/R/ggpedigree.R @@ -13,6 +13,7 @@ #' @param status_col Character string specifying the column name for affected status. Defaults to NULL. #' @param debug Logical. If TRUE, prints debugging information. Default: FALSE. #' @param hints Data frame with hints for layout adjustments. Default: NULL. +#' @param ... Additional arguments passed to `ggplot2` functions. #' @param config A list of configuration options for customizing the plot. The list can include: #' \describe{ #' \item{code_male}{Integer or string. Value identifying males in the sex column. (typically 0 or 1) Default: 1.} diff --git a/R/processExtras.R b/R/processExtras.R index abe6ff11..3d485c26 100644 --- a/R/processExtras.R +++ b/R/processExtras.R @@ -181,14 +181,14 @@ processExtras <- function(ped, config = list()) { if (sum(ped$total_blue, na.rm = TRUE) == 0) { parent_winner <- extras |> - dplyr::group_by(.data$coreID) |> + dplyr::group_by("coreID") |> dplyr::slice_min(.data$total_parent_dist_cityblock, n = 1, with_ties = FALSE) |> dplyr::ungroup() |> dplyr::select("coreID", parent_choice = .data$personID) } else { # if there are spouseID == momID or spouseID == dadID, then parent choice needs to be the 2nd closest parent_winner <- extras |> - dplyr::group_by(coreID) |> + dplyr::group_by("coreID") |> dplyr::arrange(.data$total_parent_dist2, .by_group = TRUE) |> dplyr::mutate( rank = dplyr::row_number(), # 1 = closest, 2 = second‑closest … diff --git a/man/ggPedigree.Rd b/man/ggPedigree.Rd index ff5c4ec0..edc38456 100644 --- a/man/ggPedigree.Rd +++ b/man/ggPedigree.Rd @@ -75,6 +75,8 @@ ggped( \item{debug}{Logical. If TRUE, prints debugging information. Default: FALSE.} \item{hints}{Data frame with hints for layout adjustments. Default: NULL.} + +\item{...}{Additional arguments passed to `ggplot2` functions.} } \value{ A `ggplot` object rendering the pedigree diagram. From 0d0c7b581943c4a099f312f23a6fcea56d652777 Mon Sep 17 00:00:00 2001 From: Mason Garrison Date: Thu, 15 May 2025 16:59:42 -0400 Subject: [PATCH 18/18] push --- vignettes/plots.R | 7 +++-- vignettes/plots.Rmd | 7 +++-- vignettes/plots.html | 75 ++++++++++++++++++++++---------------------- 3 files changed, 46 insertions(+), 43 deletions(-) diff --git a/vignettes/plots.R b/vignettes/plots.R index 345f50a1..b633353e 100644 --- a/vignettes/plots.R +++ b/vignettes/plots.R @@ -100,11 +100,12 @@ p + ## ----message=FALSE, warning=FALSE--------------------------------------------- library(BGmisc) # helper utilities & example data + data("inbreeding") -df <- inbreeding %>% filter( - famID %in% c(5, 7), - ) +df <- inbreeding + +#df <- dplyr::filter(df, famID %in% c(5, 7)) p <- ggPedigree( diff --git a/vignettes/plots.Rmd b/vignettes/plots.Rmd index 9d368b36..7363aaa8 100644 --- a/vignettes/plots.Rmd +++ b/vignettes/plots.Rmd @@ -173,11 +173,12 @@ p + ```{r message=FALSE, warning=FALSE} library(BGmisc) # helper utilities & example data + data("inbreeding") -df <- inbreeding %>% filter( - famID %in% c(5, 7), - ) +df <- inbreeding + +#df <- dplyr::filter(df, famID %in% c(5, 7)) p <- ggPedigree( diff --git a/vignettes/plots.html b/vignettes/plots.html index 17502698..202fe542 100644 --- a/vignettes/plots.html +++ b/vignettes/plots.html @@ -498,44 +498,45 @@

    Changing the layout

    More complex examples

    library(BGmisc) # helper utilities & example data
    -data("inbreeding")
    -
    -df <- inbreeding %>% filter(
    -    famID %in% c(5, 7),
    -  )
    -
    +
    +data("inbreeding")
    +
    +df <- inbreeding 
    +
    +#df  <- dplyr::filter(df, famID %in% c(5, 7))
     
    -p <- ggPedigree(
    -  df,
    -  famID = "famID",
    -  personID = "ID",
    -  status_col = "proband",
    -#  debug = TRUE,
    -  config = list(
    -    code_male = 0,
    -    sex_color = F,
    -  #  label_method = "geom_text",
    -    affected = TRUE,
    -    unaffected = FALSE,
    -    generation_height = 2,
    -    generation_width = 1,
    -    affected_shape = 4,
    -    spouse_segment_color = "pink",
    -    sibling_segment_color = "blue",
    -    parent_segment_color = "green",
    -    offspring_segment_color = "black"
    -  )
    -) 
    -
    -# p$connections%>%filter(personID     ==60) %>% nrow()
    -# p$connections%>%filter(personID     ==66) %>% unique()
    -# p$connections%>%filter(personID     ==65) %>% unique()
    -
    -# p$connections%>%filter(personID  >=61 &
    -#       personID  <62 ) %>% unique()
    -
    -p  + facet_wrap(~famID, scales= "free") #+ scale_color_viridis(
    -

    + +p <- ggPedigree( + df, + famID = "famID", + personID = "ID", + status_col = "proband", +# debug = TRUE, + config = list( + code_male = 0, + sex_color = F, + # label_method = "geom_text", + affected = TRUE, + unaffected = FALSE, + generation_height = 2, + generation_width = 1, + affected_shape = 4, + spouse_segment_color = "pink", + sibling_segment_color = "blue", + parent_segment_color = "green", + offspring_segment_color = "black" + ) +) + +# p$connections%>%filter(personID ==60) %>% nrow() +# p$connections%>%filter(personID ==66) %>% unique() +# p$connections%>%filter(personID ==65) %>% unique() + +# p$connections%>%filter(personID >=61 & +# personID <62 ) %>% unique() + +p + facet_wrap(~famID, scales= "free") #+ scale_color_viridis(
    +

     #   discrete = TRUE,
      #   labels = c("TRUE", "FALSE")
     #  )  + theme_bw(base_size = 14)  +  guides(colour="none", shape="none")