diff --git a/.github/workflows/R-CMD-dev_check.yaml b/.github/workflows/R-CMD-dev_check.yaml new file mode 100644 index 0000000..b7d4ce9 --- /dev/null +++ b/.github/workflows/R-CMD-dev_check.yaml @@ -0,0 +1,54 @@ +on: + push: + branches: [dev] + paths: + - '**.Rmd' + - '**.Rd' + - '**.R' + pull_request: + branches: [dev] + +name: Dev Branch + +jobs: + R-CMD-check: + runs-on: ${{ matrix.config.os }} + + name: ${{ matrix.config.os }} (${{ matrix.config.r }}) + + strategy: + fail-fast: false + matrix: + config: + - {os: macOS-latest, r: 'release', rtools: ''} + - {os: windows-latest, r: 'release', rtools: '42'} + - {os: ubuntu-latest, r: 'devel', rtools: '', http-user-agent: 'release'} + - {os: ubuntu-latest, r: 'release', rtools: ''} + - {os: ubuntu-latest, r: 'oldrel-1', rtools: ''} + + env: + GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} + R_KEEP_PKG_SOURCE: yes + + steps: + - uses: actions/checkout@v3 + + - uses: r-lib/actions/setup-pandoc@v2 + + - uses: r-lib/actions/setup-r@v2 + with: + r-version: ${{ matrix.config.r }} + rtools-version: ${{ matrix.config.rtools }} + http-user-agent: ${{ matrix.config.http-user-agent }} + use-public-rspm: true + + - uses: r-lib/actions/setup-r-dependencies@v2 + with: + cache-version: 3 + extra-packages: | + any::rcmdcheck + upgrade: 'TRUE' + needs: check + - uses: r-lib/actions/check-r-package@v2 + with: + upload-snapshots: true diff --git a/.github/workflows/codecov.yaml b/.github/workflows/codecov.yaml new file mode 100644 index 0000000..0ebd87e --- /dev/null +++ b/.github/workflows/codecov.yaml @@ -0,0 +1,36 @@ +# For more configuration details: +# https://docs.codecov.com/docs/codecov-yaml + +# Check if this file is valid by running in bash: +# curl -X POST --data-binary @.codecov.yml https://codecov.io/validate + +# Settings related to codecov job +codecov: + require_ci_to_pass: yes # Only report if CI runs successfully + notify: + wait_for_ci: yes # Wait for other jobs to finish + manual_trigger: true # Notify when manually triggered + +# Settings related to code coverage analysis +coverage: + status: + # Checks how the PR changes overall coverage. + project: + default: + # For each PR, auto compare coverage to previous commit. + target: auto # Desired coverage + base: auto # Compares against the base branch + threshold: 2% # Allows slight drops before failing + # Checks the relative coverage of the new PR code only. + patch: + default: + target: auto + base: auto + threshold: 2% + +# Settings related to the comment that Codecov posts on the PR +comment: + layout: "diff, flags, files" + behavior: default # Uses default commenting behavior + require_changes: false # Always post the comment, even if coverage did not change + hide_project_coverage: false # Always show coverage for both project and patch diff --git a/.github/workflows/test-coverage.yaml b/.github/workflows/test-coverage.yaml new file mode 100644 index 0000000..2883e2c --- /dev/null +++ b/.github/workflows/test-coverage.yaml @@ -0,0 +1,61 @@ +# Workflow derived from https://github.com/r-lib/actions/tree/v2/examples +# Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help +on: + push: + branches: [main, master,dev] + pull_request: + +name: test-coverage.yaml + +permissions: read-all + +jobs: + test-coverage: + runs-on: ubuntu-latest + env: + GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} + + steps: + - uses: actions/checkout@v4 + + - uses: r-lib/actions/setup-r@v2 + with: + use-public-rspm: true + + - uses: r-lib/actions/setup-r-dependencies@v2 + with: + extra-packages: any::covr, any::xml2 + needs: coverage + + - name: Test coverage + run: | + cov <- covr::package_coverage( + quiet = FALSE, + clean = FALSE, + install_path = file.path(normalizePath(Sys.getenv("RUNNER_TEMP"), winslash = "/"), "package") + ) + covr::to_cobertura(cov) + shell: Rscript {0} + + - uses: codecov/codecov-action@v4 + with: + # Fail if error if not on PR, or if on PR and token is given + fail_ci_if_error: ${{ github.event_name != 'pull_request' || secrets.CODECOV_TOKEN }} + file: ./cobertura.xml + plugin: noop + disable_search: true + token: ${{ secrets.CODECOV_TOKEN }} + + - name: Show testthat output + if: always() + run: | + ## -------------------------------------------------------------------- + find '${{ runner.temp }}/package' -name 'testthat.Rout*' -exec cat '{}' \; || true + shell: bash + + - name: Upload test results + if: failure() + uses: actions/upload-artifact@v4 + with: + name: coverage-test-failures + path: ${{ runner.temp }}/package diff --git a/DESCRIPTION b/DESCRIPTION index f348eff..0940ef8 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -2,7 +2,6 @@ Package: discord Type: Package Title: Functions for Discordant Kinship Modeling Version: 1.2.4 -Date: 2025-04-29 Authors@R: c(person("S. Mason", "Garrison", email = "garrissm@wfu.edu", role = c("aut", "cre", "cph"), comment = c(ORCID = "0000-0002-4804-6003")), @@ -29,6 +28,7 @@ Suggests: dplyr, grid, gridExtra, + ggplot2, janitor, kableExtra, knitr, diff --git a/NAMESPACE b/NAMESPACE index c7b8024..08faad3 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,7 +1,9 @@ # Generated by roxygen2: do not edit by hand +export(discord_between_model) export(discord_data) export(discord_regression) +export(discord_within_model) export(kinsim) importFrom(stats,formula) importFrom(stats,lm) diff --git a/NEWS.md b/NEWS.md index c332bf3..b20376e 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,6 +1,11 @@ + + # discord 1.2.4 * Added a new vignette demonstrating ways to visualize discordant kinship data using the `ggplot2` package. * Added a new vignette demonstrating how to conduct a power analysis. +* Vectorizing `discord_data()` to improve performance. +* Adding tests to ensure comparability between optimized and non-optimized versions of `discord_data()`. +* Adding `discord_between_model()` to get the between-family model # discord 1.2.3.1 * More mild improvements to documentation diff --git a/R/func_discord_data.R b/R/func_discord_data.R index bb08cfa..7cc6da0 100644 --- a/R/func_discord_data.R +++ b/R/func_discord_data.R @@ -16,6 +16,8 @@ #' be "both". Other options include "sex", "race", or "none". #' @param coding_method A character string that indicates what kind of #' additional coding schemes should be used. Default is none. Other options include "binary" and "multi". +#' @param fast Logical. If TRUE, uses a faster method for data processing. +#' @param ... Additional arguments to be passed to the function. #' @return A data frame that contains analyzable, paired data for performing #' kinship regressions. #' @@ -41,7 +43,54 @@ discord_data <- function(data, race = "race", pair_identifiers, demographics = "both", - coding_method = "none") { + coding_method = "none", + fast = TRUE, + ...) { + if (fast) { + discord_data_fast( + data = data, + outcome = outcome, + id = id, + sex = sex, + race = race, + pair_identifiers = pair_identifiers, + demographics = demographics, + predictors = predictors, + coding_method = coding_method, + ... + ) + } else { + discord_data_ram_optimized( + data = data, + outcome = outcome, + id = id, + sex = sex, + race = race, + pair_identifiers = pair_identifiers, + demographics = demographics, + predictors = predictors, + coding_method = coding_method, + ... + ) + } +} + +#' @title Discord Data RAM Optimized +#' +#' @description This function restructures data to determine kinship differences. +#' +#' @inheritParams discord_data +#' @keywords internal + +discord_data_ram_optimized <- function(data, + outcome, + predictors, + id = NULL, + sex = "sex", + race = "race", + pair_identifiers, + demographics = "both", + coding_method = "none") { # combine outcome and predictors for manipulating the data variables <- c(outcome, predictors) @@ -88,43 +137,44 @@ discord_data <- function(data, ) } - output <- Reduce(mrg, out) + output <- base::Reduce(mrg, out) } else if (demographics == "race") { mrg <- function(x, y) { merge( x = x, y = y, by = c( - "id", paste0(race, "_1"), - paste0(race, "_2") + "id", + base::paste0(race, "_1"), + base::paste0(race, "_2") ), all.x = TRUE ) } - output <- Reduce(mrg, out) + output <- base::Reduce(mrg, out) } else if (demographics == "sex") { mrg <- function(x, y) { merge( x = x, y = y, by = c( - "id", paste0(sex, "_1"), - paste0(sex, "_2") + "id", base::paste0(sex, "_1"), + base::paste0(sex, "_2") ), all.x = TRUE ) } - output <- Reduce(mrg, out) + output <- base::Reduce(mrg, out) } else if (demographics == "both") { mrg <- function(x, y) { merge( x = x, y = y, by = c( - "id", paste0(sex, "_1"), paste0(sex, "_2"), - paste0(race, "_1"), paste0(race, "_2") + "id", base::paste0(sex, "_1"), base::paste0(sex, "_2"), + base::paste0(race, "_1"), base::paste0(race, "_2") ), all.x = TRUE ) @@ -134,5 +184,125 @@ discord_data <- function(data, } + return(output) +} +.clean_names <- function(df) { + names(df) <- sub(".*\\.", "", names(df)) # If name has "prefix.name", keep only "name" + return(df) +} +#' @title Discord Data Fast +#' +#' @description This function restructures data to determine kinship differences. +#' +#' @inheritParams discord_data +#' @keywords internal + +discord_data_fast <- function(data, + outcome, + predictors, + id = NULL, + sex = "sex", + race = "race", + pair_identifiers, + demographics = "both", + coding_method = "none") { + # combine outcome and predictors for manipulating the data + variables <- c(outcome, predictors) + + #------------------------------------------- + # Step 1: order the data on outcome + #------------------------------------------- + orderedOnOutcome <- check_sibling_order( + data = data, + outcome = outcome, + pair_identifiers = pair_identifiers, + fast = TRUE + ) + + if (!valid_ids(orderedOnOutcome, + id = id + )) { + id <- "rowwise_id" + orderedOnOutcome <- cbind(orderedOnOutcome, rowwise_id = 1:nrow(data)) + } + + #------------------------------------------- + # Step 3: Differencing (using original helper, fast = TRUE) + #------------------------------------------- + + # out <- vector(mode = "list", length = length(variables)) + out <- make_mean_diffs( + data = orderedOnOutcome, + variables = variables, + id = id, + sex = sex, + race = race, + pair_identifiers = pair_identifiers, + demographics = demographics, + coding_method = coding_method, + fast = TRUE + ) + + + + if (demographics == "none") { + mrg <- function(x, y) { + merge( + x = .clean_names(x), + y = .clean_names(y), + by = c("id"), + all.x = TRUE + ) + } + + output <- base::Reduce(mrg, out) + } else if (demographics == "race") { + mrg <- function(x, y) { + merge( + x = .clean_names(x), + y = .clean_names(y), + by = c( + "id", base::paste0(race, "_1"), + base::paste0(race, "_2") + ), + all.x = TRUE + ) + } + + output <- base::Reduce(mrg, out) + } else if (demographics == "sex") { + mrg <- function(x, y) { + merge( + x = .clean_names(x), + y = .clean_names(y), + by = c( + "id", base::paste0(sex, "_1"), + base::paste0(sex, "_2") + ), + all.x = TRUE + ) + } + + output <- base::Reduce(mrg, out) + } else if (demographics == "both") { + mrg <- function(x, y) { + merge( + x = .clean_names(x), + y = .clean_names(y), + by = c( + "id", base::paste0(sex, "_1"), base::paste0(sex, "_2"), + base::paste0(race, "_1"), base::paste0(race, "_2") + ), + all.x = TRUE + ) + } + # remove names of lists that get concatenated by Reduce + # the variable name repeats to look like var.var_1, instead of var_1 + # how do we fix this? it breaks inside Reduce + + output <- base::Reduce(mrg, out) + } + + return(output) } diff --git a/R/func_discord_regression.R b/R/func_discord_regression.R index 4312d46..f4e94d6 100644 --- a/R/func_discord_regression.R +++ b/R/func_discord_regression.R @@ -26,7 +26,8 @@ discord_regression <- function(data, race = "race", pair_identifiers = c("_s1", "_s2"), data_processed = FALSE, - coding_method = "none") { + coding_method = "none", + fast = TRUE) { check_discord_errors(data = data, id = id, sex = sex, race = race, pair_identifiers = pair_identifiers) # if no demographics provided @@ -52,7 +53,8 @@ discord_regression <- function(data, race = race, pair_identifiers = pair_identifiers, demographics = demographics, - coding_method = coding_method + coding_method = coding_method, + fast = fast ) } else { preppedData <- data @@ -70,6 +72,8 @@ discord_regression <- function(data, pred_mean <- NULL } # coding method + + if (coding_method %in% c("binary", "binarymatch")) { race_match <- base::paste0(race, "_binarymatch") sex_match <- base::paste0(sex, "_binarymatch") @@ -101,7 +105,136 @@ discord_regression <- function(data, } - model <- stats::lm(stats::as.formula(paste(realOutcome, preds, sep = " ~ ")), data = preppedData) + formula_ <- stats::as.formula(base::paste(realOutcome, preds, sep = " ~ ")) + + model <- eval(substitute( + stats::lm(formula = F, data = preppedData), + list(F = formula_) + )) + + return(model) +} + +# alias +#' @rdname discord_regression +#' @export + +discord_within_model <- discord_regression + + + +#' Perform a Between-Family Linear Regression within the Discordant Kinship Framework +#' +#' @inheritParams discord_data +#' @inheritParams discord_regression +#' @return Resulting `lm` object from performing the between-family regression. +#' +#' @export +#' +#' @examples +#' +#' discord_between_model( +#' data = data_sample, +#' outcome = "height", +#' predictors = "weight", +#' pair_identifiers = c("_s1", "_s2"), +#' sex = NULL, +#' race = NULL +#' ) +#' +discord_between_model <- function(data, + outcome, + predictors, + demographics = NULL, + id = NULL, + sex = "sex", + race = "race", + pair_identifiers = c("_s1", "_s2"), + data_processed = FALSE, + coding_method = "none", + fast = TRUE) { + check_discord_errors(data = data, id = id, sex = sex, race = race, pair_identifiers = pair_identifiers) + + # if no demographics provided + if (is.null(demographics)) { + if (is.null(sex) & is.null(race)) { + demographics <- "none" + } else if (is.null(sex) & !is.null(race)) { + demographics <- "race" + } else if (!is.null(sex) & is.null(race)) { + demographics <- "sex" + } else if (!is.null(sex) & !is.null(race)) { + demographics <- "both" + } + } + + # If data not already processed, run through discord_data + if (!data_processed) { + preppedData <- discord_data( + data = data, + outcome = outcome, + predictors = predictors, + id = id, + sex = sex, + race = race, + pair_identifiers = pair_identifiers, + demographics = demographics, + coding_method = coding_method, + fast = fast + ) + } else { + preppedData <- data + } + + # Build formula + realOutcome <- base::paste0(outcome, "_mean") + + # predictors provided? + + if (!is.null(predictors)) { + pred_mean <- base::paste0(predictors, "_mean", collapse = " + ") + } else { + pred_mean <- NULL + } + + if (coding_method %in% c("binary", "binarymatch")) { + race_match <- base::paste0(race, "_binarymatch") + sex_match <- base::paste0(sex, "_binarymatch") + } else if (coding_method %in% c("multi", "multimatch")) { + race_match <- base::paste0(race, "_multimatch") + sex_match <- base::paste0(sex, "_multimatch") + } + + coding_method_list <- c("binary", "binarymatch", "multi", "multimatch") + + if (demographics == "none") { + preds <- pred_mean + } else if (demographics == "race") { + demographic_controls <- base::paste0(race, "_1") + if (coding_method %in% coding_method_list) { + demographic_controls <- race_match + } + preds <- paste(pred_mean, demographic_controls, sep = " + ") + } else if (demographics == "sex") { + demographic_controls <- base::paste0(sex, "_1 + ", sex, "_2") + if (coding_method %in% coding_method_list) { + demographic_controls <- sex_match + } + preds <- paste(pred_mean, demographic_controls, sep = " + ") + } else if (demographics == "both") { + demographic_controls <- base::paste0(sex, "_1 + ", race, "_1 + ", sex, "_2") + if (coding_method %in% coding_method_list) { + demographic_controls <- base::paste0(sex_match, " + ", race_match) + } + preds <- base::paste(pred_mean, demographic_controls, sep = " + ") + } + + formula_ <- stats::as.formula(base::paste(realOutcome, preds, sep = " ~ ")) + + model <- eval(substitute( + stats::lm(formula = F, data = preppedData), + list(F = formula_) + )) return(model) } diff --git a/R/helpers_regression.R b/R/helpers_regression.R index e6a193f..b92746e 100644 --- a/R/helpers_regression.R +++ b/R/helpers_regression.R @@ -6,12 +6,37 @@ #' If the two siblings have the same amount of the outcome, it randomly assigns one as having more. # #' @inheritParams discord_data -#' @param row The row number of the data frame +#' @param ... Additional arguments to be passed to the function. #' #' @return A one-row data frame with a new column order indicating which familial member (1, 2, or #' neither) has more of the outcome. #' -check_sibling_order <- function(data, outcome, pair_identifiers, row) { + +check_sibling_order <- function(..., fast = FALSE) { + if (fast == TRUE) { + check_sibling_order_fast(...) + } else { + check_sibling_order_ram_optimized(...) + } +} + +#' @title Check Sibling Order RAM Optimized +#' +#' @description This function determines the order of sibling pairs based on an outcome variable. +#' The function checks which of the two kinship pairs has more of a specified outcome variable. +#' It adds a new column named `order` to the dataset, indicating which sibling (identified as "s1" or "s2") has more of the outcome. +#' If the two siblings have the same amount of the outcome, it randomly assigns one as having more. +#' +#' @inheritParams discord_data +#' @inheritParams check_sibling_order +#' @param row The row number of the data frame +#' +#' +#' @return A one-row data frame with a new column order indicating which familial member (1, 2, or +#' neither) has more of the outcome. +#' @keywords internal + +check_sibling_order_ram_optimized <- function(data, outcome, pair_identifiers, row) { # Select the row of interest from the data frame data <- data[row, ] @@ -40,6 +65,36 @@ check_sibling_order <- function(data, outcome, pair_identifiers, row) { return(data) } + +check_sibling_order_fast <- function(data, outcome, pair_identifiers) { + #------------------------- + # 1. VECTORIZE ORDER ASSIGNMENT + #------------------------- + outcome1 <- data[[paste0(outcome, pair_identifiers[1])]] + outcome2 <- data[[paste0(outcome, pair_identifiers[2])]] + + # Check for missing outcome data + if (any(is.na(outcome1) | is.na(outcome2))) { + stop(paste0("There are missing data, encoded as `NA`, for at least one kinship pair in the '", outcome, "' variable and data cannot be prepped properly.\n Please remove or impute missing data.")) + } + + order <- ifelse(outcome1 > outcome2, "s1", + ifelse(outcome1 < outcome2, "s2", NA) + ) + + # Random tie breaking + ties <- which(is.na(order)) + if (length(ties) > 0) { + tie_assignment <- ifelse(stats::rbinom(length(ties), 1, 0.5) == 1, "s1", "s2") + order[ties] <- tie_assignment + } + + data$order <- order + return(data) +} + + + #' @title Make Mean Differences #' #' @description This function calculates differences and means of a given variable for each kinship pair. The order of subtraction and the variables' names in the output dataframe depend on the order column set by check_sibling_order(). @@ -47,11 +102,19 @@ check_sibling_order <- function(data, outcome, pair_identifiers, row) { #' swapping the order of demographics as per the order column. #' @inheritParams discord_data #' @inheritParams check_sibling_order -#' @param variable outcomes and predictors for manipulating the data #' -make_mean_diffs <- function(data, id, sex, race, demographics, - variable, pair_identifiers, row, - coding_method = "none") { +make_mean_diffs <- function(..., fast = FALSE) { + if (fast) { + make_mean_diffs_fast(...) + } else { + make_mean_diffs_ram_optimized(...) + } +} + + +make_mean_diffs_ram_optimized <- function(data, id, sex, race, demographics, + variable, pair_identifiers, row, + coding_method = "none") { S1 <- base::paste0(variable, pair_identifiers[1]) S2 <- base::paste0(variable, pair_identifiers[2]) sexS1 <- base::paste0(sex, pair_identifiers[1]) @@ -100,53 +163,109 @@ make_mean_diffs <- function(data, id, sex, race, demographics, # check for whether or not race and sex are defined - if (demographics == "race") { - if (data[, "order"] == "s1") { + output <- recode_demographics( + demographics = demographics, + data = data, + raceS1 = raceS1, + raceS2 = raceS2, + race = race, + sexS1 = sexS1, + sexS2 = sexS2, + sex = sex, + coding_method = coding_method, + output = output, + fast = FALSE + ) + + + return(output) +} + + +recode_demographics <- function(demographics, data, raceS1, raceS2, + race, sexS1, sexS2, sex, coding_method, output, fast = FALSE) { + # check for whether or not race and sex are defined + if (fast) { + if (demographics == "race") { output_demographics <- data.frame( race_1 = data[[raceS1]], race_2 = data[[raceS2]] ) - } else if (data[, "order"] == "s2") { - output_demographics <- data.frame( - race_1 = data[[raceS2]], - race_2 = data[[raceS1]] - ) - } - - names(output_demographics) <- paste0(race, c("_1", "_2")) - } else if (demographics == "sex") { - if (data[, "order"] == "s1") { + output_demographics$race_1[data$order == "s2"] <- data[[raceS2]][data$order == "s2"] + output_demographics$race_2[data$order == "s2"] <- data[[raceS1]][data$order == "s2"] + names(output_demographics) <- paste0(race, c("_1", "_2")) + } else if (demographics == "sex") { output_demographics <- data.frame( sex_1 = data[[sexS1]], sex_2 = data[[sexS2]] ) - } else if (data[, "order"] == "s2") { - output_demographics <- data.frame( - sex_1 = data[[sexS2]], - sex_2 = data[[sexS1]] - ) - } + output_demographics$sex_1[data$order == "s2"] <- data[[sexS2]][data$order == "s2"] + output_demographics$sex_2[data$order == "s2"] <- data[[sexS1]][data$order == "s2"] - names(output_demographics) <- paste0(sex, c("_1", "_2")) - } else if (demographics == "both") { - if (data[, "order"] == "s1") { + names(output_demographics) <- paste0(sex, c("_1", "_2")) + } else if (demographics == "both") { output_demographics <- data.frame( sex_1 = data[[sexS1]], sex_2 = data[[sexS2]], race_1 = data[[raceS1]], race_2 = data[[raceS2]] ) - } else if (data[, "order"] == "s2") { - output_demographics <- data.frame( - sex_1 = data[[sexS2]], - sex_2 = data[[sexS1]], - race_1 = data[[raceS2]], - race_2 = data[[raceS1]] - ) + output_demographics$race_1[data$order == "s2"] <- data[[raceS2]][data$order == "s2"] + output_demographics$race_2[data$order == "s2"] <- data[[raceS1]][data$order == "s2"] + output_demographics$sex_1[data$order == "s2"] <- data[[sexS2]][data$order == "s2"] + output_demographics$sex_2[data$order == "s2"] <- data[[sexS1]][data$order == "s2"] + names(output_demographics) <- c(paste0(sex, c("_1", "_2")), paste0(race, c("_1", "_2"))) } + } else { + if (demographics == "race") { + if (data[, "order"] == "s1") { + output_demographics <- data.frame( + race_1 = data[[raceS1]], + race_2 = data[[raceS2]] + ) + } else if (data[, "order"] == "s2") { + output_demographics <- data.frame( + race_1 = data[[raceS2]], + race_2 = data[[raceS1]] + ) + } + + names(output_demographics) <- paste0(race, c("_1", "_2")) + } else if (demographics == "sex") { + if (data[, "order"] == "s1") { + output_demographics <- data.frame( + sex_1 = data[[sexS1]], + sex_2 = data[[sexS2]] + ) + } else if (data[, "order"] == "s2") { + output_demographics <- data.frame( + sex_1 = data[[sexS2]], + sex_2 = data[[sexS1]] + ) + } - names(output_demographics) <- c(paste0(sex, c("_1", "_2")), paste0(race, c("_1", "_2"))) + names(output_demographics) <- paste0(sex, c("_1", "_2")) + } else if (demographics == "both") { + if (data[, "order"] == "s1") { + output_demographics <- data.frame( + sex_1 = data[[sexS1]], + sex_2 = data[[sexS2]], + race_1 = data[[raceS1]], + race_2 = data[[raceS2]] + ) + } else if (data[, "order"] == "s2") { + output_demographics <- data.frame( + sex_1 = data[[sexS2]], + sex_2 = data[[sexS1]], + race_1 = data[[raceS2]], + race_2 = data[[raceS1]] + ) + } + + names(output_demographics) <- c(paste0(sex, c("_1", "_2")), paste0(race, c("_1", "_2"))) + } } + # both methods if (coding_method != "none") { # New logic to handle race and sex as categorical variables if (demographics == "both" || demographics == "race") { @@ -168,14 +287,70 @@ make_mean_diffs <- function(data, id, sex, race, demographics, output_demographics[[paste0(sex, "_multimatch")]] <- ifelse(output_demographics[[sex_1_name]] == output_demographics[[sex_2_name]], as.character(output_demographics[[sex_2_name]]), "mixed") } } + if (exists("output_demographics")) { output <- base::cbind(output, output_demographics) } - return(output) } + + +make_mean_diffs_fast <- function(data, id, sex, race, demographics, + variables = variable, + variable = NULL, + pair_identifiers, + coding_method = "none") { + # S1 <- base::paste0(variable, pair_identifiers[1]) + # S2 <- base::paste0(variable, pair_identifiers[2]) + sexS1 <- base::paste0(sex, pair_identifiers[1]) + sexS2 <- base::paste0(sex, pair_identifiers[2]) + raceS1 <- base::paste0(race, pair_identifiers[1]) + raceS2 <- base::paste0(race, pair_identifiers[2]) + + + diff_list <- list() + for (var in variables) { + var1 <- ifelse(data$order == "s1", + data[[paste0(var, pair_identifiers[1])]], + data[[paste0(var, pair_identifiers[2])]] + ) + var2 <- ifelse(data$order == "s1", + data[[paste0(var, pair_identifiers[2])]], + data[[paste0(var, pair_identifiers[1])]] + ) + + diff <- var1 - var2 + mean_ <- (var1 + var2) / 2 + + tmp <- data.frame( + id = data[[id]], + stats::setNames(list(var1), paste0(var, "_1")), + stats::setNames(list(var2), paste0(var, "_2")), + stats::setNames(list(diff), paste0(var, "_diff")), + stats::setNames(list(mean_), paste0(var, "_mean")) + ) + + # obvious inefficiency + tmp <- recode_demographics( + demographics = demographics, + data = data, + raceS1 = raceS1, + raceS2 = raceS2, + race = race, + sexS1 = sexS1, + sexS2 = sexS2, + sex = sex, + coding_method = coding_method, + output = tmp, + fast = TRUE + ) + diff_list[[var]] <- tmp + } + return(diff_list) +} + #' @title Check Discord Errors #' #' @description This function checks for common errors in the provided data, including the correct specification of identifiers (ID, sex, race) and their existence in the data. diff --git a/man/check_sibling_order.Rd b/man/check_sibling_order.Rd index 84743f0..fe1688a 100644 --- a/man/check_sibling_order.Rd +++ b/man/check_sibling_order.Rd @@ -4,26 +4,20 @@ \alias{check_sibling_order} \title{Check Sibling Order} \usage{ -check_sibling_order(data, outcome, pair_identifiers, row) +check_sibling_order(..., fast = FALSE) } \arguments{ -\item{data}{The data set with kinship pairs} +\item{...}{Additional arguments to be passed to the function.} -\item{outcome}{A character string containing the outcome variable of -interest.} - -\item{pair_identifiers}{A character vector of length two that contains the -variable identifier for each kinship pair} - -\item{row}{The row number of the data frame} +\item{fast}{Logical. If TRUE, uses a faster method for data processing.} } \value{ -A character string signifying which familial member (1, 2, or +A one-row data frame with a new column order indicating which familial member (1, 2, or neither) has more of the outcome. } \description{ This function determines the order of sibling pairs based on an outcome variable. -It function checks which of the two kinship pairs has more of a specified outcome variable. +The function checks which of the two kinship pairs has more of a specified outcome variable. It adds a new column named `order` to the dataset, indicating which sibling (identified as "s1" or "s2") has more of the outcome. If the two siblings have the same amount of the outcome, it randomly assigns one as having more. } diff --git a/man/check_sibling_order_ram_optimized.Rd b/man/check_sibling_order_ram_optimized.Rd new file mode 100644 index 0000000..518ec55 --- /dev/null +++ b/man/check_sibling_order_ram_optimized.Rd @@ -0,0 +1,30 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/helpers_regression.R +\name{check_sibling_order_ram_optimized} +\alias{check_sibling_order_ram_optimized} +\title{Check Sibling Order RAM Optimized} +\usage{ +check_sibling_order_ram_optimized(data, outcome, pair_identifiers, row) +} +\arguments{ +\item{data}{The data set with kinship pairs} + +\item{outcome}{A character string containing the outcome variable of +interest.} + +\item{pair_identifiers}{A character vector of length two that contains the +variable identifier for each kinship pair} + +\item{row}{The row number of the data frame} +} +\value{ +A one-row data frame with a new column order indicating which familial member (1, 2, or + neither) has more of the outcome. +} +\description{ +This function determines the order of sibling pairs based on an outcome variable. +The function checks which of the two kinship pairs has more of a specified outcome variable. +It adds a new column named `order` to the dataset, indicating which sibling (identified as "s1" or "s2") has more of the outcome. +If the two siblings have the same amount of the outcome, it randomly assigns one as having more. +} +\keyword{internal} diff --git a/man/discord_between_model.Rd b/man/discord_between_model.Rd new file mode 100644 index 0000000..2f39cd9 --- /dev/null +++ b/man/discord_between_model.Rd @@ -0,0 +1,68 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/func_discord_regression.R +\name{discord_between_model} +\alias{discord_between_model} +\title{Perform a Between-Family Linear Regression within the Discordant Kinship Framework} +\usage{ +discord_between_model( + data, + outcome, + predictors, + demographics = NULL, + id = NULL, + sex = "sex", + race = "race", + pair_identifiers = c("_s1", "_s2"), + data_processed = FALSE, + coding_method = "none", + fast = TRUE +) +} +\arguments{ +\item{data}{The data set with kinship pairs} + +\item{outcome}{A character string containing the outcome variable of +interest.} + +\item{predictors}{A character vector containing the column names for +predicting the outcome.} + +\item{demographics}{Indicator variable for if the data has the sex and race +demographics. If both are present (default, and recommended), value should +be "both". Other options include "sex", "race", or "none".} + +\item{id}{Default's to NULL. If supplied, must specify the column name +corresponding to unique kinship pair identifiers.} + +\item{sex}{A character string for the sex column name.} + +\item{race}{A character string for the race column name.} + +\item{pair_identifiers}{A character vector of length two that contains the +variable identifier for each kinship pair} + +\item{data_processed}{Logical operator if data are already preprocessed by discord_data , default is FALSE} + +\item{coding_method}{A character string that indicates what kind of +additional coding schemes should be used. Default is none. Other options include "binary" and "multi".} + +\item{fast}{Logical. If TRUE, uses a faster method for data processing.} +} +\value{ +Resulting `lm` object from performing the between-family regression. +} +\description{ +Perform a Between-Family Linear Regression within the Discordant Kinship Framework +} +\examples{ + +discord_between_model( + data = data_sample, + outcome = "height", + predictors = "weight", + pair_identifiers = c("_s1", "_s2"), + sex = NULL, + race = NULL +) + +} diff --git a/man/discord_data.Rd b/man/discord_data.Rd index d157f72..0bf0e67 100644 --- a/man/discord_data.Rd +++ b/man/discord_data.Rd @@ -13,7 +13,9 @@ discord_data( race = "race", pair_identifiers, demographics = "both", - coding_method = "none" + coding_method = "none", + fast = TRUE, + ... ) } \arguments{ @@ -41,6 +43,10 @@ be "both". Other options include "sex", "race", or "none".} \item{coding_method}{A character string that indicates what kind of additional coding schemes should be used. Default is none. Other options include "binary" and "multi".} + +\item{fast}{Logical. If TRUE, uses a faster method for data processing.} + +\item{...}{Additional arguments to be passed to the function.} } \value{ A data frame that contains analyzable, paired data for performing diff --git a/man/discord_data_fast.Rd b/man/discord_data_fast.Rd new file mode 100644 index 0000000..4bc1d86 --- /dev/null +++ b/man/discord_data_fast.Rd @@ -0,0 +1,48 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/func_discord_data.R +\name{discord_data_fast} +\alias{discord_data_fast} +\title{Discord Data Fast} +\usage{ +discord_data_fast( + data, + outcome, + predictors, + id = NULL, + sex = "sex", + race = "race", + pair_identifiers, + demographics = "both", + coding_method = "none" +) +} +\arguments{ +\item{data}{The data set with kinship pairs} + +\item{outcome}{A character string containing the outcome variable of +interest.} + +\item{predictors}{A character vector containing the column names for +predicting the outcome.} + +\item{id}{Default's to NULL. If supplied, must specify the column name +corresponding to unique kinship pair identifiers.} + +\item{sex}{A character string for the sex column name.} + +\item{race}{A character string for the race column name.} + +\item{pair_identifiers}{A character vector of length two that contains the +variable identifier for each kinship pair} + +\item{demographics}{Indicator variable for if the data has the sex and race +demographics. If both are present (default, and recommended), value should +be "both". Other options include "sex", "race", or "none".} + +\item{coding_method}{A character string that indicates what kind of +additional coding schemes should be used. Default is none. Other options include "binary" and "multi".} +} +\description{ +This function restructures data to determine kinship differences. +} +\keyword{internal} diff --git a/man/discord_data_ram_optimized.Rd b/man/discord_data_ram_optimized.Rd new file mode 100644 index 0000000..278f5f3 --- /dev/null +++ b/man/discord_data_ram_optimized.Rd @@ -0,0 +1,48 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/func_discord_data.R +\name{discord_data_ram_optimized} +\alias{discord_data_ram_optimized} +\title{Discord Data RAM Optimized} +\usage{ +discord_data_ram_optimized( + data, + outcome, + predictors, + id = NULL, + sex = "sex", + race = "race", + pair_identifiers, + demographics = "both", + coding_method = "none" +) +} +\arguments{ +\item{data}{The data set with kinship pairs} + +\item{outcome}{A character string containing the outcome variable of +interest.} + +\item{predictors}{A character vector containing the column names for +predicting the outcome.} + +\item{id}{Default's to NULL. If supplied, must specify the column name +corresponding to unique kinship pair identifiers.} + +\item{sex}{A character string for the sex column name.} + +\item{race}{A character string for the race column name.} + +\item{pair_identifiers}{A character vector of length two that contains the +variable identifier for each kinship pair} + +\item{demographics}{Indicator variable for if the data has the sex and race +demographics. If both are present (default, and recommended), value should +be "both". Other options include "sex", "race", or "none".} + +\item{coding_method}{A character string that indicates what kind of +additional coding schemes should be used. Default is none. Other options include "binary" and "multi".} +} +\description{ +This function restructures data to determine kinship differences. +} +\keyword{internal} diff --git a/man/discord_regression.Rd b/man/discord_regression.Rd index 8420f3e..08aea2f 100644 --- a/man/discord_regression.Rd +++ b/man/discord_regression.Rd @@ -2,6 +2,7 @@ % Please edit documentation in R/func_discord_regression.R \name{discord_regression} \alias{discord_regression} +\alias{discord_within_model} \title{Perform a Linear Regression within the Discordant Kinship Framework} \usage{ discord_regression( @@ -14,7 +15,22 @@ discord_regression( race = "race", pair_identifiers = c("_s1", "_s2"), data_processed = FALSE, - coding_method = "none" + coding_method = "none", + fast = TRUE +) + +discord_within_model( + data, + outcome, + predictors, + demographics = NULL, + id = NULL, + sex = "sex", + race = "race", + pair_identifiers = c("_s1", "_s2"), + data_processed = FALSE, + coding_method = "none", + fast = TRUE ) } \arguments{ @@ -44,6 +60,8 @@ variable identifier for each kinship pair} \item{coding_method}{A character string that indicates what kind of additional coding schemes should be used. Default is none. Other options include "binary" and "multi".} + +\item{fast}{Logical. If TRUE, uses a faster method for data processing.} } \value{ Resulting `lm` object from performing the discordant regression. diff --git a/man/discord_regression_legacy.Rd b/man/discord_regression_legacy.Rd index b15493c..3d909f3 100644 --- a/man/discord_regression_legacy.Rd +++ b/man/discord_regression_legacy.Rd @@ -23,6 +23,8 @@ predicting the outcome.} \item{more_args}{Optional string to add additional inputs to formula} \item{additional_formula}{Deprecated} + +\item{...}{Additional arguments to be passed to the function.} } \value{ Resulting `lm` object from performing the discordant regression. diff --git a/man/make_mean_diffs.Rd b/man/make_mean_diffs.Rd index 4b513f2..a6ac8cf 100644 --- a/man/make_mean_diffs.Rd +++ b/man/make_mean_diffs.Rd @@ -4,41 +4,12 @@ \alias{make_mean_diffs} \title{Make Mean Differences} \usage{ -make_mean_diffs( - data, - id, - sex, - race, - demographics, - variable, - pair_identifiers, - row, - coding_method = "none" -) +make_mean_diffs(..., fast = FALSE) } \arguments{ -\item{data}{The data set with kinship pairs} +\item{...}{Additional arguments to be passed to the function.} -\item{id}{Default's to NULL. If supplied, must specify the column name -corresponding to unique kinship pair identifiers.} - -\item{sex}{A character string for the sex column name.} - -\item{race}{A character string for the race column name.} - -\item{demographics}{Indicator variable for if the data has the sex and race -demographics. If both are present (default, and recommended), value should -be "both". Other options include "sex", "race", or "none".} - -\item{variable}{outcomes and predictors for manipulating the data} - -\item{pair_identifiers}{A character vector of length two that contains the -variable identifier for each kinship pair} - -\item{row}{The row number of the data frame} - -\item{coding_method}{A character string that indicates what kind of -additional coding schemes should be used. Default is none. Other options include "binary" and "multi".} +\item{fast}{Logical. If TRUE, uses a faster method for data processing.} } \description{ This function calculates differences and means of a given variable for each kinship pair. The order of subtraction and the variables' names in the output dataframe depend on the order column set by check_sibling_order(). diff --git a/tests/testthat/test-discord_between.R b/tests/testthat/test-discord_between.R new file mode 100644 index 0000000..f724ffd --- /dev/null +++ b/tests/testthat/test-discord_between.R @@ -0,0 +1,172 @@ +get_p_value_between <- function(.results) { + results_df <- summary(.results) + results_df <- as.data.frame(results_df$coefficients) + results_df <- cbind(names = rownames(results_df), results_df) + rownames(results_df) <- NULL + results_df[which(results_df$names == "y2_mean"), "Pr(>|t|)"] +} + +# note that all of these models should be significant because all have between +# variance > 0 + +signif_threshold <- 0.05 + +test_that("monozygotic significant between-model is as expected", { + set.seed(18) + results_fast <- discord_between_model(mz_signif, + outcome = "y1", + predictors = "y2", + id = "id", + sex = NULL, + race = NULL, + pair_identifiers = c("_1", "_2"), + fast = TRUE + ) + + set.seed(18) + results_ram <- discord_between_model(mz_signif, + outcome = "y1", + predictors = "y2", + id = "id", + sex = NULL, + race = NULL, + pair_identifiers = c("_1", "_2"), + fast = FALSE + ) + + expect_equal(results_fast, results_ram, tolerance = 0.005) + + expect_lt(object = get_p_value_between(results_fast), expected = signif_threshold) + expect_lt(object = get_p_value_between(results_ram), expected = signif_threshold) +}) + +test_that("monozygotic nonsignificant between-model is as expected", { + set.seed(18) + results_fast <- discord_between_model(mz_nonsignif, + outcome = "y1", + predictors = "y2", + id = "id", + sex = NULL, + race = NULL, + pair_identifiers = c("_1", "_2"), + fast = TRUE + ) + set.seed(18) + results_ram <- discord_between_model(mz_nonsignif, + outcome = "y1", + predictors = "y2", + id = "id", + sex = NULL, + race = NULL, + pair_identifiers = c("_1", "_2"), + fast = FALSE + ) + expect_equal(results_fast, results_ram, tolerance = 0.005) + expect_lt(object = get_p_value_between(results_fast), expected = signif_threshold) + expect_lt(object = get_p_value_between(results_ram), expected = signif_threshold) +}) + +test_that("dizygotic significant between-model is as expected", { + set.seed(18) + results_fast <- discord_between_model(dz_signif, + outcome = "y1", + predictors = "y2", + id = "id", + sex = NULL, + race = NULL, + pair_identifiers = c("_1", "_2"), + fast = TRUE + ) + set.seed(18) + results_ram <- discord_between_model(dz_signif, + outcome = "y1", + predictors = "y2", + id = "id", + sex = NULL, + race = NULL, + pair_identifiers = c("_1", "_2"), + fast = FALSE + ) + expect_lt(object = get_p_value_between(results_fast), expected = signif_threshold) + expect_lt(object = get_p_value_between(results_ram), expected = signif_threshold) + expect_equal(results_fast, results_ram, tolerance = 0.005) + + }) + +test_that("dizygotic nonsignificant between-model is as expected", { + set.seed(18) + results_fast <- discord_between_model(dz_nonsignif, + outcome = "y1", + predictors = "y2", + id = "id", + sex = NULL, + race = NULL, + pair_identifiers = c("_1", "_2"), + fast = TRUE + ) + set.seed(18) + results_ram <- discord_between_model(dz_nonsignif, + outcome = "y1", + predictors = "y2", + id = "id", + sex = NULL, + race = NULL, + pair_identifiers = c("_1", "_2"), + fast = FALSE + ) + expect_lt(object = get_p_value_between(results_fast), expected = signif_threshold) + expect_lt(object = get_p_value_between(results_ram), expected = signif_threshold) + expect_equal(results_fast, results_ram, tolerance = 0.005) + }) + +test_that("half siblings significant between-model is as expected", { + set.seed(18) + results_fast <- discord_between_model(half_sibs_signif, + outcome = "y1", + predictors = "y2", + id = "id", + sex = NULL, + race = NULL, + pair_identifiers = c("_1", "_2"), + fast = TRUE + ) + set.seed(18) + results_ram <- discord_between_model(half_sibs_signif, + outcome = "y1", + predictors = "y2", + id = "id", + sex = NULL, + race = NULL, + pair_identifiers = c("_1", "_2"), + fast = FALSE + ) + expect_lt(object = get_p_value_between(results_fast), expected = signif_threshold) + expect_lt(object = get_p_value_between(results_ram), expected = signif_threshold) + expect_equal(results_fast, results_ram, tolerance = 0.005) + }) + +test_that("half siblings nonsignificant between-model is as expected", { + set.seed(18) + results_fast <- discord_between_model(half_sibs_nonsignif, + outcome = "y1", + predictors = "y2", + id = "id", + sex = NULL, + race = NULL, + pair_identifiers = c("_1", "_2"), + fast = TRUE + ) + set.seed(18) + results_ram <- discord_between_model(half_sibs_nonsignif, + outcome = "y1", + predictors = "y2", + id = "id", + sex = NULL, + race = NULL, + pair_identifiers = c("_1", "_2"), + fast = FALSE + ) + expect_lt(object = get_p_value_between(results_fast), expected = signif_threshold) + expect_lt(object = get_p_value_between(results_ram), expected = signif_threshold) + expect_equal(results_fast, results_ram, tolerance = 0.005) + }) diff --git a/tests/testthat/test-discord_regression.R b/tests/testthat/test-discord_regression.R index 8dbe01f..e52e24e 100644 --- a/tests/testthat/test-discord_regression.R +++ b/tests/testthat/test-discord_regression.R @@ -10,85 +10,156 @@ signif_threshold <- 0.05 test_that("monozygotic significant is as expected", { set.seed(18) - results <- discord_regression(mz_signif, + results_fast <- discord_regression(mz_signif, outcome = "y1", predictors = "y2", id = "id", sex = NULL, race = NULL, - pair_identifiers = c("_1", "_2") + pair_identifiers = c("_1", "_2"), + fast = TRUE ) + results_ram <- discord_regression(mz_signif, + outcome = "y1", + predictors = "y2", + id = "id", + sex = NULL, + race = NULL, + pair_identifiers = c("_1", "_2"), + fast = FALSE + ) + expect_lt(object = get_p_value(results_fast), expected = signif_threshold) + expect_lt(object = get_p_value(results_ram), expected = signif_threshold) + expect_equal(get_p_value(results_fast), get_p_value(results_ram), tolerance = 0.005) - expect_lt(object = get_p_value(results), expected = signif_threshold) }) test_that("monozygotic nonsignificant is as expected", { set.seed(18) - results <- discord_regression(mz_nonsignif, + results_fast <- discord_regression(mz_nonsignif, outcome = "y1", predictors = "y2", id = "id", sex = NULL, race = NULL, - pair_identifiers = c("_1", "_2") + pair_identifiers = c("_1", "_2"), + fast = TRUE ) - - expect_gt(object = get_p_value(results), expected = signif_threshold) + results_ram <- discord_regression(mz_nonsignif, + outcome = "y1", + predictors = "y2", + id = "id", + sex = NULL, + race = NULL, + pair_identifiers = c("_1", "_2"), + fast = FALSE + ) + expect_gt(object = get_p_value(results_fast), expected = signif_threshold) + expect_gt(object = get_p_value(results_ram), expected = signif_threshold) + expect_equal(get_p_value(results_fast), get_p_value(results_ram), tolerance = 0.005) }) test_that("dizygotic significant is as expected", { set.seed(18) - results <- discord_regression(dz_signif, + results_fast <- discord_regression(dz_signif, outcome = "y1", predictors = "y2", id = "id", sex = NULL, race = NULL, - pair_identifiers = c("_1", "_2") + pair_identifiers = c("_1", "_2"), + fast = TRUE ) - - expect_lt(object = get_p_value(results), expected = signif_threshold) + results_ram <- discord_regression(dz_signif, + outcome = "y1", + predictors = "y2", + id = "id", + sex = NULL, + race = NULL, + pair_identifiers = c("_1", "_2"), + fast = FALSE + ) + expect_lt(object = get_p_value(results_fast), expected = signif_threshold) + expect_lt(object = get_p_value(results_ram), expected = signif_threshold) + expect_equal(get_p_value(results_fast), get_p_value(results_ram), tolerance = 0.005) }) test_that("dizygotic nonsignificant is as expected", { set.seed(18) - results <- discord_regression(dz_nonsignif, + results_fast <- discord_regression(dz_nonsignif, outcome = "y1", predictors = "y2", id = "id", sex = NULL, race = NULL, - pair_identifiers = c("_1", "_2") + pair_identifiers = c("_1", "_2"), + fast = TRUE ) - expect_gt(object = get_p_value(results), expected = signif_threshold) + results_ram <- discord_regression(dz_nonsignif, + outcome = "y1", + predictors = "y2", + id = "id", + sex = NULL, + race = NULL, + pair_identifiers = c("_1", "_2"), + fast = FALSE + ) + + expect_gt(object = get_p_value(results_fast), expected = signif_threshold) + expect_gt(object = get_p_value(results_ram), expected = signif_threshold) + expect_equal(get_p_value(results_fast), get_p_value(results_ram), tolerance = 0.005) }) test_that("half siblings significant is as expected", { set.seed(18) - results <- discord_regression(half_sibs_signif, + results_fast <- discord_regression(half_sibs_signif, outcome = "y1", predictors = "y2", id = "id", sex = NULL, race = NULL, - pair_identifiers = c("_1", "_2") + pair_identifiers = c("_1", "_2"), + fast = TRUE + ) + results_ram <- discord_regression(half_sibs_signif, + outcome = "y1", + predictors = "y2", + id = "id", + sex = NULL, + race = NULL, + pair_identifiers = c("_1", "_2"), + fast = FALSE ) - expect_lt(object = get_p_value(results), expected = signif_threshold) + expect_lt(object = get_p_value(results_fast), expected = signif_threshold) + expect_lt(object = get_p_value(results_ram), expected = signif_threshold) + expect_equal(get_p_value(results_fast), get_p_value(results_ram), tolerance = 0.005) }) test_that("half siblings nonsignificant is as expected", { set.seed(18) - results <- discord_regression(half_sibs_nonsignif, + results_fast <- discord_regression(half_sibs_nonsignif, outcome = "y1", predictors = "y2", id = "id", sex = NULL, race = NULL, - pair_identifiers = c("_1", "_2") + pair_identifiers = c("_1", "_2"), + fast= TRUE + ) + results_ram <- discord_regression(half_sibs_nonsignif, + outcome = "y1", + predictors = "y2", + id = "id", + sex = NULL, + race = NULL, + pair_identifiers = c("_1", "_2"), + fast= FALSE ) - expect_gt(object = get_p_value(results), expected = signif_threshold) + expect_gt(object = get_p_value(results_fast), expected = signif_threshold) + expect_gt(object = get_p_value(results_ram), expected = signif_threshold) + expect_equal(get_p_value(results_fast), get_p_value(results_ram), tolerance = 0.005) }) diff --git a/tests/testthat/test-discord_regression_arguments.R b/tests/testthat/test-discord_regression_arguments.R index a0b84bc..0b7d071 100644 --- a/tests/testthat/test-discord_regression_arguments.R +++ b/tests/testthat/test-discord_regression_arguments.R @@ -30,13 +30,27 @@ test_that("flu_2008 ~ edu_2008 + ses_2008 + race + sex", { )) set.seed(18) - new <- discord_regression(uniqueExtendedIDs, + new_fast <- discord_regression(uniqueExtendedIDs, outcome = "flu_2008", - predictors = c("edu_2008", "ses_2008") + predictors = c("edu_2008", "ses_2008"), + fast = TRUE + ) + new_fast <- summarize_results(new_fast) + + expect_equal(control, new_fast, tolerance = 0.005) + + set.seed(18) + new_ram <- discord_regression(uniqueExtendedIDs, + outcome = "flu_2008", + predictors = c("edu_2008", "ses_2008"), + fast = FALSE ) - new <- summarize_results(new) - expect_equal(control, new, tolerance = 0.005) + new_ram <- summarize_results(new_ram) + + expect_equal(control, new_ram, tolerance = 0.005) + + expect_equal(new_fast, new_ram, tolerance = 0.005) }) @@ -63,15 +77,29 @@ test_that("flu_2008 ~ edu_2008 + ses_2008 + race", { "ses_2008_mean", "race_1BLACK" )) set.seed(18) - new <- discord_regression(uniqueExtendedIDs, + new_fast <- discord_regression(uniqueExtendedIDs, outcome = "flu_2008", id = "extended_id", predictors = c("edu_2008", "ses_2008"), - sex = NULL + sex = NULL, + fast= TRUE + ) + + set.seed(18) + new_ram <- discord_regression(uniqueExtendedIDs, + outcome = "flu_2008", + id = "extended_id", + predictors = c("edu_2008", "ses_2008"), + sex = NULL, + fast= FALSE ) - new <- summarize_results(new) + new_fast <- summarize_results(new_fast) + new_ram <- summarize_results(new_ram) + + expect_equal(control, new_fast, tolerance = 0.005) + expect_equal(control, new_ram, tolerance = 0.005) - expect_equal(control, new, tolerance = 0.005) + expect_equal(new_fast, new_ram, tolerance = 0.005) }) test_that("flu_2008 ~ edu_2008 + ses_2008", { @@ -96,16 +124,30 @@ test_that("flu_2008 ~ edu_2008 + ses_2008", { )) set.seed(18) - new <- discord_regression(uniqueExtendedIDs, + new_fast <- discord_regression(uniqueExtendedIDs, outcome = "flu_2008", id = "extended_id", predictors = c("edu_2008", "ses_2008"), sex = NULL, - race = NULL + race = NULL, + fast = TRUE + ) + set.seed(18) + new_ram <- discord_regression(uniqueExtendedIDs, + outcome = "flu_2008", + id = "extended_id", + predictors = c("edu_2008", "ses_2008"), + sex = NULL, + race = NULL, + fast = FALSE ) - new <- summarize_results(new) - expect_equal(control, new, tolerance = 0.005) + new_fast <- summarize_results(new_fast) + new_ram <- summarize_results(new_ram) + + expect_equal(control, new_fast, tolerance = 0.005) + expect_equal(control, new_ram, tolerance = 0.005) + expect_equal(new_fast, new_ram, tolerance = 0.005) }) @@ -134,11 +176,22 @@ test_that("flu_2008 ~ edu_2008 + ses_2008 + race + sex", { "race_1BLACK", "sex_2FEMALE" )) set.seed(18) - new <- discord_regression(uniqueExtendedIDs, + new_fast <- discord_regression(uniqueExtendedIDs, outcome = "flu_2008", - predictors = "edu_2008" + predictors = "edu_2008", + fast = TRUE + ) + set.seed(18) + new_ram <- discord_regression(uniqueExtendedIDs, + outcome = "flu_2008", + predictors = "edu_2008", + fast = FALSE ) - new <- summarize_results(new) - expect_equal(control, new, tolerance = 0.005) +new_fast <- summarize_results(new_fast) +new_ram <- summarize_results(new_ram) + +expect_equal(control, new_fast, tolerance = 0.005) +expect_equal(control, new_ram, tolerance = 0.005) +expect_equal(new_fast, new_ram, tolerance = 0.005) }) diff --git a/tests/testthat/test-new-legacy.R b/tests/testthat/test-new-legacy.R index 5597ef7..9b4f367 100644 --- a/tests/testthat/test-new-legacy.R +++ b/tests/testthat/test-new-legacy.R @@ -372,6 +372,36 @@ test_that("half-siblings nonsignificant: new & legacy data prep code results are }) +test_that("half-siblings nonsignificant: new & legacy data prep code results are equal", { + set.seed(18) + new_data <- discord_data(half_sibs_nonsignif, + outcome = "y1", + predictors = "y2", + id = "id", + sex = NULL, + race = NULL, + pair_identifiers = c("_1", "_2"), + demographics = "none", + fast = FALSE + ) + rownames(new_data) <- NULL + + set.seed(18) + old_data <- discord_data_legacy( + df = make_double_entered(half_sibs_nonsignif), + outcome = "y1", + predictors = "y2", + id = "id", + sep = "_", + doubleentered = TRUE + ) + rownames(old_data) <- NULL + + expect_equal(new_data, old_data) +}) + + + test_that("discord_data_legacy returns scaled values when scale = TRUE", { set.seed(18) tolerance <- .1 diff --git a/vignettes/Power.Rmd b/vignettes/Power.Rmd index 7fc5910..befc235 100644 --- a/vignettes/Power.Rmd +++ b/vignettes/Power.Rmd @@ -18,7 +18,6 @@ knitr::opts_chunk$set( fig.width = 7, fig.height = 5 ) - ``` @@ -45,6 +44,7 @@ The `kinsim()` function is designed to simulate data for kinship studies. It gen | `r_vector` | Optional: pairwise relatedness at the observation level | ## Output + A data.frame with: - Latent variables: A, C, E for each phenotype and sibling @@ -56,57 +56,53 @@ A data.frame with: # Example Usage - - -# Step 1: Define Simulation Grid +## Step 1: Define Simulation Grid We define a grid of simulation conditions, varying genetic relatedness, ACE variance components, and genetic correlations. - ```{r load-packages, message=FALSE} # Libraries library(NlsyLinks) library(discord) library(utils) +library(tidyverse) library(ggplot2) # Set random seed for reproducibility set.seed(1492) # Disable scientific notation for clarity -options(scipen=999) +options(scipen = 999) conditions <- expand.grid( - total_pairs = c(100,250,500,750,1000), - relatedness = c(1,.5), + total_pairs = c(100, 250, 500, 750, 1000), + relatedness = c(1, .5), cov_a = c(0, 0.25), cov_c = c(0, 0.25), - cov_e = c(0,0.25), + cov_e = c(0, 0.25), ace_a = c(1), ace_c = c(1), ace_e = c(1) ) - - - ``` -# Step 2: Run Simulations +## Step 2: Run Simulations For each condition, we simulate 100 replications of fitting discordant sibling models. We have kept the number of trials low for demonstration purposes, but you can increase `n_trials` for more robust power estimates. ```{r} +set.seed(1492) # Set seed for reproducibility n_trials <- 100 -FAST <- TRUE # Set to FALSE for slower, more detailed analysis +FAST <- FALSE # Set to FALSE for slower, more detailed analysis results_list <- list() name.results <- c("coef_xdiff", "p_xdiff", "r.squared") -for (cond in 1:nrow(conditions)) { +for (cond in seq_along(conditions)) { current <- conditions[cond, ] temp_results <- matrix(NA, nrow = n_trials, ncol = length(name.results)) colnames(temp_results) <- name.results @@ -114,7 +110,7 @@ for (cond in 1:nrow(conditions)) { for (i in 1:n_trials) { trial <- kinsim( r_vector = rep(current$relatedness, each = current$total_pairs), - npg_all = current$total_pairs, + npg_all = current$total_pairs, ace_all = c(current$ace_a, current$ace_c, current$ace_e), cov_a = current$cov_a, cov_c = current$cov_c, @@ -128,42 +124,52 @@ for (cond in 1:nrow(conditions)) { x_s1 = trial$y2_1, x_s2 = trial$y2_2 ) - if(FAST==TRUE){ - # faster + if (FAST == TRUE) { + # faster # double enter the data - extract2 <- rbind( - transform(extract, y_s1 = y_s2, y_s2 = y_s1, - x_s1 = x_s2, x_s2 = x_s1), - extract - ) - extract2$y_diff <- extract2$y_s1 - extract2$y_s2 - extract2$x_diff <- extract2$x_s1 - extract2$x_s2 - extract2$x_bar <- (extract2$x_s1 + extract2$x_s2) / 2 - extract2$y_bar <- (extract2$y_s1 + extract2$y_s2) / 2 - # select pair with ydiff > 0 - extract3 <- extract2[extract2$y_diff>0,] - - - fit <- tryCatch( - lm(y_diff ~ x_bar + y_bar + x_diff, data = extract3), - error = function(e) return(NULL) - ) + extract2 <- rbind( + transform(extract, + y_s1 = y_s2, y_s2 = y_s1, + x_s1 = x_s2, x_s2 = x_s1 + ), + extract + ) + extract2$y_diff <- extract2$y_s1 - extract2$y_s2 + extract2$x_diff <- extract2$x_s1 - extract2$x_s2 + extract2$x_bar <- (extract2$x_s1 + extract2$x_s2) / 2 + extract2$y_bar <- (extract2$y_s1 + extract2$y_s2) / 2 + + # select pair with ydiff > 0 + extract3 <- extract2[extract2$y_diff > 0, ] + + + fit <- tryCatch( + lm(y_diff ~ x_bar + y_bar + x_diff, data = extract3), + error = function(e) { + return(NULL) + } + ) } # slower - if(FAST==FALSE){ - fit <- tryCatch( - discord_regression(data = extract, outcome = "y", predictors = "x", - id = "id", - sex = NULL, - race = NULL), - error = function(e) return(NULL) - ) -} + if (FAST == FALSE) { + fit <- tryCatch( + discord_regression( + data = extract, outcome = "y", predictors = "x", + id = "id", + sex = NULL, + race = NULL, + fast = TRUE + ), + error = function(e) { + return(NULL) + } + ) + } if (!is.null(fit)) { sm <- summary(fit) temp_results[i, "coef_xdiff"] <- coef(sm)["x_diff", "Estimate"] - temp_results[i, "p_xdiff"] <- coef(sm)["x_diff", "Pr(>|t|)"] - temp_results[i, "r.squared"] <- sm$r.squared + temp_results[i, "p_xdiff"] <- coef(sm)["x_diff", "Pr(>|t|)"] + temp_results[i, "r.squared"] <- sm$r.squared } } @@ -177,7 +183,6 @@ for (cond in 1:nrow(conditions)) { Our final step is to summarize the power across all conditions. We calculate the proportion of trials where the p-value for the difference in means (`p_xdiff`) is less than 0.05, and we also report the median R-squared value from the regression models. Note that we've limited the number of trials to 10 for demonstration purposes, but you can (and should) increase this for more robust estimates. ```{r summarize-power} - power_summary <- lapply(results_list, function(res) { data.frame( power_xdiff = mean(res$p_xdiff < 0.05, na.rm = TRUE), @@ -188,8 +193,9 @@ power_summary <- lapply(results_list, function(res) { final_results <- cbind(conditions, do.call(rbind, power_summary)) ``` +## Step 4: Visualize Power -As shown in the table (at the end of the vignette), we have the power estimates for each condition, including the total number of pairs, relatedness level, covariate settings, and the median R-squared value from the regression models. As expected, power increases with the number of pairs and is influenced by the relatedness level and covariate settings. In the plots below, we visualize the power estimates across different conditions, focusing on the impact of covariates and relatedness levels. +We can visualize the power estimates across different conditions. The plots below show the power estimates for each condition, including the total number of pairs, relatedness level, covariate settings, and the median R-squared value from the regression models. As expected, power increases with the number of pairs and is influenced by the relatedness level and covariate settings. In the plots below, we visualize the power estimates across different conditions, focusing on the impact of covariates and relatedness levels. ```{r, echo=FALSE, message=FALSE} @@ -200,20 +206,27 @@ facet_labels <- list( cov_e = c("0" = "No Cov_E", "1" = "With Cov_E") ) # Ensure factors with exact string levels -final_results$cov_a <- factor(final_results$cov_a, levels = c(0, 0.25), labels = c("No Covariate A", "Covariate A (0.25)")) -final_results$cov_c <- factor(final_results$cov_c, levels = c(0, 0.25), labels = c("No Covariate C", "Covariate C (0.25)")) -final_results$cov_e <- factor(final_results$cov_e, levels = c(0, 0.25), labels = c("No Covariate E", "Covariate E (0.25)")) +final_results$cov_a <- factor(final_results$cov_a, levels = c(0, 0.25), + labels = c("No Covariate A", + "Covariate A (0.25)")) +final_results$cov_c <- factor(final_results$cov_c, levels = c(0, 0.25), + labels = c("No Covariate C", + "Covariate C (0.25)")) +final_results$cov_e <- factor(final_results$cov_e, levels = c(0, 0.25), + labels = c("No Covariate E", + "Covariate E (0.25)")) ``` ```{r plot-power, echo=FALSE} - final_results_noc <- final_results[final_results$cov_c == "No Covariate C", ] -p_noc <- ggplot(final_results_noc, aes(x = total_pairs, y = power_xdiff, - fill= factor(relatedness), - color = factor(relatedness))) + +p_noc <- ggplot(final_results_noc, aes( + x = total_pairs, y = power_xdiff, + fill = factor(relatedness), + color = factor(relatedness) +)) + geom_bar(stat = "identity", position = "dodge") + labs( title = "Power Analysis for Discordant Sibling Designs", @@ -222,23 +235,29 @@ p_noc <- ggplot(final_results_noc, aes(x = total_pairs, y = power_xdiff, color = "Relatedness", fill = "Relatedness" ) + - theme_minimal() + # Manual, per-facet annotations with explicit facet-mapped data - geom_text( x = 600, y = 0.55, label = "False Positive Rate", - data = data.frame(cov_a = "No Covariate A", cov_e = "No Covariate E"), - fontface = "italic", size = 3.5, inherit.aes = FALSE) + - geom_text( x = 600, y = 0.85, label = "Genetic Confounding", - data = data.frame(cov_a ="Covariate A (0.25)", cov_e = "No Covariate E"), - fontface = "italic", size = 3.5, inherit.aes = FALSE) + - facet_grid(cov_e ~ cov_a, labeller = labeller(facet_labels)) + theme_minimal() + + geom_text( + x = 600, y = 0.55, label = "False Positive Rate", + data = data.frame(cov_a = "No Covariate A", cov_e = "No Covariate E"), + fontface = "italic", size = 3.5, inherit.aes = FALSE + ) + + geom_text( + x = 600, y = 0.85, label = "Genetic Confounding", + data = data.frame(cov_a = "Covariate A (0.25)", cov_e = "No Covariate E"), + fontface = "italic", size = 3.5, inherit.aes = FALSE + ) + + facet_grid(cov_e ~ cov_a, labeller = labeller(facet_labels)) p_noc final_results_noa <- final_results[final_results$cov_a == "No Covariate A", ] -p_noa <- ggplot(final_results_noa, aes(x = total_pairs, y = power_xdiff, - fill= factor(relatedness), - color = factor(relatedness))) + +p_noa <- ggplot(final_results_noa, aes( + x = total_pairs, y = power_xdiff, + fill = factor(relatedness), + color = factor(relatedness) +)) + geom_bar(stat = "identity", position = "dodge") + labs( title = "Power Analysis for Discordant Sibling Designs", @@ -247,21 +266,30 @@ p_noa <- ggplot(final_results_noa, aes(x = total_pairs, y = power_xdiff, color = "Relatedness", fill = "Relatedness" ) + - theme_minimal() + # Manual, per-facet annotations with explicit facet-mapped data - geom_text( x = 600, y = 0.55, label = "False Positive Rate", - data = data.frame(cov_c = "No Covariate C", cov_e = "No Covariate E"), - fontface = "italic", size = 3.5, inherit.aes = FALSE) + - geom_text( x = 600, y = 0.85, label = "Shared-Environmental Confounding", - data = data.frame(cov_c ="Covariate C (0.25)", cov_e = "No Covariate E"), - fontface = "italic", size = 3.5, inherit.aes = FALSE) + - facet_grid(cov_e ~ cov_c, labeller = labeller(facet_labels)) + theme_minimal() + + geom_text( + x = 600, y = 0.55, label = "False Positive Rate", + data = data.frame(cov_c = "No Covariate C", cov_e = "No Covariate E"), + fontface = "italic", size = 3.5, inherit.aes = FALSE + ) + + geom_text( + x = 600, y = 0.85, label = "Shared-Environmental Confounding", + data = data.frame(cov_c = "Covariate C (0.25)", cov_e = "No Covariate E"), + fontface = "italic", size = 3.5, inherit.aes = FALSE + ) + + facet_grid(cov_e ~ cov_c, labeller = labeller(facet_labels)) p_noa - - ``` +### Power Table + +The table below summarizes the power estimates across different conditions. The `power_xdiff` column indicates the proportion of trials where the p-value for the difference in means (`p_xdiff`) is less than 0.05, and the `median_r2` column reports the median R-squared value from the regression models. ```{r, echo=FALSE, message=FALSE} knitr::kable(final_results) ``` + +## Conclusion +This vignette demonstrates how to conduct a simulation-based power analysis for discordant sibling designs using the `discord` package. By varying genetic relatedness, ACE variance components, and covariate settings, we can assess the power of our designs to detect causal effects. The results highlight the importance of sample size and relatedness level in achieving sufficient power for these analyses. + diff --git a/vignettes/links.Rmd b/vignettes/links.Rmd index 4f306f4..62bbe63 100644 --- a/vignettes/links.Rmd +++ b/vignettes/links.Rmd @@ -43,7 +43,7 @@ We begin by loading the required packages and a built-in dataset from {BGmisc}. library(BGmisc) library(tidyverse) library(discord) -library(stargazer) + data(potter) ``` diff --git a/vignettes/plots.Rmd b/vignettes/plots.Rmd index b579780..b0eaa56 100644 --- a/vignettes/plots.Rmd +++ b/vignettes/plots.Rmd @@ -17,8 +17,6 @@ This vignette walks through the steps needed to recreate a figure from Garrison ## Data Cleaning -## Data Cleaning - This section reuses the data preparation pipeline developed in the regression vignette.\ That vignette demonstrated how to set up data for discordant regression analysis by using discord data processing tools. Those tools facilitate the construction of kinship links, including identifying sibling pairs, merging sibling characteristics, and calculating pair-level variables. @@ -89,7 +87,7 @@ flu_modeling_data <- semi_join(df_link, With the data prepared, we restructure it using `discord_data()`. -```{r} +```{r, warning=FALSE} discord_flu <- discord_data( data = flu_modeling_data, outcome = "flu_total", @@ -106,23 +104,27 @@ discord_flu <- discord_data( Because we are interested in differences between kin, we create a new variable, `ses_diff_group`, that classifies SES differences into three categories: "More Advantage", "Equally Advantage", and "Less Advantage". This variable is later used to group observations in the marginal density plots. ```{r} - discord_flu <- discord_flu %>% mutate( # # Classify Difference Grouping - ses_diff_group = factor(case_when(scale(s00_h40_diff) > 0.33 ~ "More Advantage", - scale(s00_h40_diff) < -0.33 ~ "Less Advantage", - abs(scale(s00_h40_diff)) <= 0.33 ~ "Equally Advantage"), - levels = c("Less Advantage", - "Equally Advantage", - "More Advantage")) + ses_diff_group = factor( + case_when( + scale(s00_h40_diff) > 0.33 ~ "More Advantage", + scale(s00_h40_diff) < -0.33 ~ "Less Advantage", + abs(scale(s00_h40_diff)) <= 0.33 ~ "Equally Advantage" + ), + levels = c( + "Less Advantage", + "Equally Advantage", + "More Advantage" + ) + ) ) # Create a color palette for the shading shading <- c("firebrick4", "firebrick1", "dodgerblue1", "dodgerblue4") max_val <- max(abs(discord_flu$s00_h40_diff), na.rm = TRUE) values <- seq(-max_val, max_val, length = length(shading)) - ``` # Plotting the Data @@ -135,36 +137,43 @@ values <- seq(-max_val, max_val, length = length(shading)) This scatter plot shows individual SES at age 40 against individual flu vaccinations. ```{r individual, echo=TRUE, message=FALSE} - # Individual level plot -individual_plot <- ggplot(flu_modeling_data, aes(x = s00_h40_s1, - y = flu_total_s1, - color = s00_h40_s1 - s00_h40_s2)) + - geom_point(size = 0.8, alpha = 0.8, na.rm = TRUE, - position = position_jitter(w = 0.2, h = 0.2)) + +individual_plot <- ggplot(flu_modeling_data, aes( + x = s00_h40_s1, + y = flu_total_s1, + color = s00_h40_s1 - s00_h40_s2 +)) + + geom_point( + size = 0.8, alpha = 0.8, na.rm = TRUE, + position = position_jitter(width = 0.2, height = 0.2) + ) + geom_smooth(method = "lm", se = FALSE, color = "black") + # added sibling 2 to the plot - geom_point(size = 0.8, alpha = 0.8, na.rm = TRUE, - position = position_jitter(w = 0.2, h = 0.2), - aes( - x = s00_h40_s2, - y = flu_total_s2, - color = s00_h40_s2 - s00_h40_s1)) + + geom_point( + size = 0.8, alpha = 0.8, na.rm = TRUE, + position = position_jitter(w = 0.2, h = 0.2), + aes( + x = s00_h40_s2, + y = flu_total_s2, + color = s00_h40_s2 - s00_h40_s1 + ) + ) + scale_colour_gradientn( name = "Sibling\nDifferences\nin SES", colours = shading, na.value = "#AD78B6", values = scales::rescale(c(-max_val, max_val)) ) + - labs(x = "SES at Age 40", - y = "Flu Vaccination Count") + + labs( + x = "SES at Age 40", + y = "Flu Vaccination Count" + ) + theme_minimal() #+ # theme(legend.position = "none") individual_plot + ggtitle("Individual Level Plot") + theme(plot.title = element_text(hjust = 0.5)) - ``` @@ -174,23 +183,28 @@ This scatter plot shows mean SES at age 40 against mean flu vaccinations, with p ```{r scatter, message=FALSE, include=FALSE, echo=TRUE} # Main scatter plot -main_plot <- ggplot(discord_flu, aes(x = s00_h40_mean, - y = flu_total_mean, colour = s00_h40_diff)) + - geom_point(size = 0.8, alpha = 0.8, na.rm = TRUE, - position = position_jitter(w = 0.2, h = 0.2)) + +main_plot <- ggplot(discord_flu, aes( + x = s00_h40_mean, + y = flu_total_mean, colour = s00_h40_diff +)) + + geom_point( + size = 0.8, alpha = 0.8, na.rm = TRUE, + position = position_jitter(width = 0.2, height = 0.2) + ) + geom_smooth(method = "lm", se = FALSE, color = "black") + scale_colour_gradientn( name = "Sibling\nDifferences\nin SES", colours = shading, na.value = "#AD78B6", - values = scales::rescale(c(min(discord_flu$flu_total_diff, na.rm = TRUE), - mean(discord_flu$flu_total_diff, na.rm = TRUE), - max(discord_flu$flu_total_diff, na.rm = TRUE))) + values = scales::rescale(c( + min(discord_flu$flu_total_diff, na.rm = TRUE), + mean(discord_flu$flu_total_diff, na.rm = TRUE), + max(discord_flu$flu_total_diff, na.rm = TRUE) + )) ) + theme_minimal() + - theme( legend.position = "left")+ + theme(legend.position = "left") + labs(x = "Mean SES at Age 40", y = "Mean Flu Vaccinations (2006–2016)") - ``` ```{r echo=FALSE, message=FALSE} @@ -202,7 +216,6 @@ main_plot We overlay marginal density plots for SES and flu vaccinations, grouped by SES difference category. ```{r plot-raw-data, message=FALSE} - # Marginal X density (SES mean) xdensity <- ggplot(discord_flu, aes(x = s00_h40_mean, group = ses_diff_group, color = ses_diff_group)) + geom_density(adjust = 2, linewidth = 1, fill = NA) + @@ -211,12 +224,13 @@ xdensity <- ggplot(discord_flu, aes(x = s00_h40_mean, group = ses_diff_group, co values = c("firebrick1", "#AD78B6", "dodgerblue1") ) + theme_minimal() + - theme(legend.position = "left", - axis.title.y = element_blank(), - axis.text.y = element_blank(), - legend.title = element_text(size = 8), - legend.text = element_text(size = 6) - ) + + theme( + legend.position = "left", + axis.title.y = element_blank(), + axis.text.y = element_blank(), + legend.title = element_text(size = 8), + legend.text = element_text(size = 6) + ) + labs(x = NULL, y = NULL) # Marginal Y density (Flu mean) @@ -227,9 +241,11 @@ ydensity <- ggplot(discord_flu, aes(x = flu_total_mean, group = ses_diff_group, ) + coord_flip() + theme_minimal() + - theme(legend.position = "none", - axis.title.x = element_blank(), - axis.text.x = element_blank()) + + theme( + legend.position = "none", + axis.title.x = element_blank(), + axis.text.x = element_blank() + ) + labs(x = NULL, y = NULL) ``` @@ -247,28 +263,28 @@ ydensity + theme_bw() + theme( plot.title = element_text(hjust = 0.5, size = 12), axis.title.x = element_text(size = 12) ) + labs(title = "Mean Flu Vaccinations (2006–2016)") - ``` ```{r} # Blank placeholder plot blankPlot <- ggplot() + theme_void() - + # Final layout (true layout you originally had: AB above CC) grid.arrange( arrangeGrob(xdensity, blankPlot, ncol = 2, widths = c(4, 1)), arrangeGrob(main_plot, ydensity, ncol = 2, widths = c(4, 1)), heights = c(1.5, 4), top = textGrob("Sibling Differences in SES and Flu Vaccinations", - gp=gpar(fontsize=20,font=3))) - + gp = gpar(fontsize = 20, font = 3) + ) +) ``` Other ways to plot this data include using `facet_wrap` to create separate panels for each SES difference group. This can help visualize the differences in flu vaccinations across different SES categories. ```{r echo=FALSE, message=FALSE} -main_plot + facet_wrap(~ ses_diff_group, ncol = 1) + +main_plot + facet_wrap(~ses_diff_group, ncol = 1) + theme(legend.position = "none") + labs(title = "Within Family Differences in SES and Flu Vaccinations") ``` @@ -282,35 +298,38 @@ This plot compares differences in SES at age 40 to differences in flu vaccinatio ## Setup: Use discord_data # Main scatter plot -main_plot_within <- ggplot(discord_flu, aes(x = s00_h40_diff, - y = flu_total_diff, colour = s00_h40_diff)) + - geom_point(size = 0.8, alpha = 0.8, na.rm = TRUE, - position = position_jitter(w = 0.2, h = 0.2)) + +main_plot_within <- ggplot(discord_flu, aes( + x = s00_h40_diff, + y = flu_total_diff, colour = s00_h40_diff +)) + + geom_point( + size = 0.8, alpha = 0.8, na.rm = TRUE, + position = position_jitter(w = 0.2, h = 0.2) + ) + geom_smooth(method = "lm", se = FALSE, color = "black") + scale_colour_gradientn( name = "Sibling\nDifferences\nin SES", colours = shading, na.value = "#AD78B6", - values = scales::rescale(c(min(discord_flu$s00_h40_diff, na.rm = TRUE), - mean(discord_flu$s00_h40_diff, na.rm = TRUE), - max(discord_flu$s00_h40_diff, na.rm = TRUE))) + values = scales::rescale(c( + min(discord_flu$s00_h40_diff, na.rm = TRUE), + mean(discord_flu$s00_h40_diff, na.rm = TRUE), + max(discord_flu$s00_h40_diff, na.rm = TRUE) + )) ) + theme_minimal() + - theme( legend.position = "left")+ + theme(legend.position = "left") + labs(x = "Diff SES at Age 40", y = "Diff Flu Vaccinations (2006–2016)") main_plot_within - ``` You can further facet this plot by the difference in SES between kin to see how the relationship varies across different groups. The following code does this and adds a title to the plot. ```{r echo=TRUE, message=FALSE} - -main_plot_within + facet_wrap(~ ses_diff_group, ncol = 1) + - theme(legend.position = "bottom") + +main_plot_within + facet_wrap(~ses_diff_group, ncol = 1) + + theme(legend.position = "bottom") + labs(title = "Within Family Differences in SES and Flu Vaccinations") - ``` ## Conclusion diff --git a/vignettes/regression.Rmd b/vignettes/regression.Rmd index 7d9fc55..5440d48 100644 --- a/vignettes/regression.Rmd +++ b/vignettes/regression.Rmd @@ -35,7 +35,7 @@ The original analysis examined whether socioeconomic status (SES) at age 40 pred ## Data Cleaning -For this example, we will load the following packages. +For this example, we will load the following packages: ```{r discord-setup, message = FALSE} # For easy data manipulation