diff --git a/.github/copilot-instructions.md b/.github/copilot-instructions.md index 06dbd6bab..55e33c72d 100644 --- a/.github/copilot-instructions.md +++ b/.github/copilot-instructions.md @@ -59,7 +59,44 @@ This follows the [Quarto website linking guidelines](https://quarto.org/docs/web This ensures links work correctly across all output formats and during local development. -## Subfile Structure Convention +## Attribution for Adapted Content + +When content is adapted from published sources (textbooks, papers, websites), +**always provide explicit attribution** in the document. +Use `@citekey` Pandoc citation syntax and include a prose note explaining what was adapted. + +Examples of acceptable attribution: + +- "Adapted from @vittinghoff2e [Chapter 10]." +- "The following example is based on @kleinbaum2010logistic [Chapter 8]." +- "This approach follows @hulley1998hers." + +Attribution should appear: + +- At the top of the chapter or section (in the Acknowledgements or Introduction), + *or* +- Adjacent to the specific content being adapted (as a prose sentence or in a `::: notes` div). + +**Always include chapter and/or page numbers** in inline citations where applicable, +so that readers can locate the source material. +Use Pandoc's citation locator syntax: `[@citekey, Chapter 8]` or `[@citekey, p. 194]`. +Include both chapter and page when both are known. +Do **not** invent page numbers. +Only cite a page after opening the source PDF +and confirming the exact page. +If you have not verified a page directly from the source PDF, +cite chapter-level location only. + +Examples of attribution with page numbers +(formatting examples): + +- "Adapted from [@vittinghoff2e, Chapter 10, p. 194]." +- "The following example is based on [@kleinbaum2010logistic, Chapter 8, p. 230]." + +Do **not** reproduce verbatim text from copyrighted sources without clear quotation marks and attribution. +Paraphrase and summarize with a citation instead. 
+ +## Decomposing content into subfiles Quarto subfiles (files under `_subfiles/`) should **not** begin with a section heading. The heading for a subfile's content should be placed in the **parent** file that includes the subfile, @@ -96,7 +133,27 @@ This applies whether the citation refers to one author or multiple authors (e.g. Pandoc renders `@dobson4e` as a noun phrase (e.g., "Dobson and Barnett (2018)"), but grammatically the citation key itself is treated as a singular pronoun/name. -## Attribution for Adapted Content +## DAG Naming for Categorical Variables + +In DAGs, +use node names that represent the full categorical variable, +not a specific level of that variable. + +Example: +use `personality_type`, +not `TypeA`. + +## Evidence and Source Citation Requirements + +For factual claims that are not directly proved in the text, +always include a specific source citation. +Do not leave factual statements uncited. + +Always cite papers and books using +BibTeX entries in `references.bib` +and Quarto/Pandoc citation syntax +(for example `[@MickeyGreenland1989]`), +rather than plaintext author-date references. When adapting any content from another source, always include specific attribution in the chapter text. @@ -250,6 +307,53 @@ Use this pattern: Apply this only where needed, so HTML and RevealJS output stay unchanged. +## Figures and Tables: Use Div Format + +Prefer the **Quarto div format** for new figures and tables +rather than chunk-option `fig-cap`/`tbl-cap`. +The div format makes it easier to write and format multi-sentence captions. +Existing chunk-option captions in older files are acceptable +unless you are already refactoring that content. + +**Correct** (div format): +````qmd +::: {#fig-my-figure} + +```{r} +plot(x, y) +``` + +Caption text here. +This can span multiple lines and include *markdown*. + +::: +```` + +**Correct** (div format for tables): +````qmd +::: {#tbl-my-table} + +```{r} +my_table +``` + +Caption text here. 
+ +::: +```` + +**Incorrect** (chunk option format): +````qmd +```{r} +#| label: fig-my-figure +#| fig-cap: "Caption text here." +plot(x, y) +``` +```` + +Use this for newly added or substantially revised figures and tables in `.qmd` files. + + ## Math Notation This repository uses custom LaTeX macros defined in `latex-macros/macros.qmd` (a git submodule). diff --git a/DESCRIPTION b/DESCRIPTION index a623f0805..ebd8a28ad 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -38,6 +38,7 @@ Imports: Suggests: tidyverse, conflicted, + dagitty, dplyr, eha, ggfortify, diff --git a/_quarto-book.yml b/_quarto-book.yml index 7c1f97cb4..8cbbdce8f 100644 --- a/_quarto-book.yml +++ b/_quarto-book.yml @@ -26,6 +26,7 @@ book: - Linear-models-overview.qmd - logistic-regression.qmd - count-regression.qmd + - predictor-selection.qmd - intro-multilevel-models.qmd - part: time-to-event-models.qmd chapters: diff --git a/_quarto-revealjs.yml b/_quarto-revealjs.yml index 7a9322490..c335bf75a 100644 --- a/_quarto-revealjs.yml +++ b/_quarto-revealjs.yml @@ -22,6 +22,7 @@ project: - "intro-bayes.qmd" - "common-mistakes.qmd" - "notation.qmd" + - "predictor-selection.qmd" - "intro-to-R.qmd" - "exam-formula-sheet.qmd" diff --git a/_quarto-website.yml b/_quarto-website.yml index dbdbef8eb..02ef73b7d 100644 --- a/_quarto-website.yml +++ b/_quarto-website.yml @@ -9,6 +9,7 @@ project: - Linear-models-overview.qmd - logistic-regression.qmd - count-regression.qmd + - predictor-selection.qmd - intro-multilevel-models.qmd - time-to-event-models.qmd - intro-to-survival-analysis.qmd @@ -68,6 +69,8 @@ website: href: logistic-regression.qmd - text: "Count Regression" href: count-regression.qmd + - text: "Predictor Selection" + href: predictor-selection.qmd - text: "Multilevel Models" href: intro-multilevel-models.qmd - text: "---" diff --git a/_sec-pred-sel-dag.qmd b/_sec-pred-sel-dag.qmd new file mode 100644 index 000000000..93f3718cd --- /dev/null +++ b/_sec-pred-sel-dag.qmd @@ -0,0 +1,434 @@ +{{< include 
latex-macros/macros.qmd >}} + +## Directed Acyclic Graphs (DAGs) {#sec-pred-sel-dag} + +A **Directed Acyclic Graph (DAG)** is a graphical model +that encodes assumed causal relationships among variables. + +In a DAG: + +- **Nodes** represent variables. +- **Directed edges** (arrows) represent direct causal effects. +- The graph is **acyclic**: there are no directed cycles +(a variable cannot cause itself, even indirectly). + +DAGs are used in causal inference to: + +- Identify **confounders** that must be adjusted for. +- Identify **mediators** that should generally *not* be adjusted for +when estimating the total causal effect. +- Identify **colliders** that should *not* be adjusted for +(adjusting for a collider opens a non-causal path). + +## Key concepts from DAG theory {#sec-pred-sel-dag-concepts} + +:::{#def-confounder} +#### Confounder + +A **confounder** of the relationship between an exposure $X$ and +outcome $Y$ is a variable $C$ +that is a common cause of both $X$ and $Y$. +By contrast, +a mediator lies on the causal path +from $X$ to $Y$ +and is not a common cause of $X$ and $Y$. + +Adjusting for confounders removes confounding bias. +::: + +--- + +:::{#def-mediator} +#### Mediator + +A **mediator** $M$ is a variable that lies on the causal pathway +from the exposure $X$ to the outcome $Y$: +$$X \to M \to Y$$ + +Adjusting for a mediator blocks the indirect causal pathway +through $M$, +so the estimated coefficient for $X$ reflects only +the *direct* effect of $X$ on $Y$ not through $M$. + +Whether to adjust for mediators depends on the research question: + +- If you want the **total causal effect** of $X$ on $Y$, +do **not** adjust for $M$. +- If you want the **direct effect** of $X$ on $Y$ not through $M$, +adjust for $M$. 
+::: + +--- + +:::{#def-collider} +#### Collider + +A **collider** is a variable $L$ that is caused by two or more +variables on a path: +$$X \to L \leftarrow Y$$ + +Adjusting for a collider opens a non-causal (backdoor) path +between $X$ and $Y$, +introducing **collider bias**. +::: + +## HERS study DAG {#sec-pred-sel-hers-dag} + +{{< include _subfiles/shared/_sec_hers_intro.qmd >}} + +We now examine the assumed causal structure +among key baseline variables, treatment, and recurrent CHD events. + + +@fig-hers-dag shows a DAG for the HERS study, +representing the assumed causal structure among key variables. + +The main goal of the HERS study was to estimate +the causal effect of hormone therapy (HT) +on recurrent CHD event risk. + +::: {#tbl-hers-data-dictionary} + +Data dictionary for key HERS variables used in the DAG and examples. + +| Variable | Meaning | Type | +|---|---|---| +| `HT` | Hormone-therapy assignment (placebo vs estrogen+progestin) | Binary categorical | +| `CHD` | Recurrent coronary heart disease event outcome | Binary categorical | +| `age` | Baseline age (years) | Continuous | +| `smoking` | Baseline smoking status | Binary categorical | +| `diabetes` | Baseline diabetes status | Binary categorical | +| `BMI` | Body mass index | Continuous | +| `LDL` | Low-density lipoprotein cholesterol | Continuous | +| `HDL` | High-density lipoprotein cholesterol | Continuous | +| `statins` | Baseline statin use | Binary categorical | +| `SBP` | Systolic blood pressure | Continuous | + +::: + +::: {#fig-hers-dag} + +```{r} +#| fig-width: 7 +#| fig-height: 5 + +library(dagitty) + +hers_dag <- dagitty( + 'dag { + HT [exposure, pos="0,0"] + CHD [outcome, pos="4,0"] + age [pos="2,2"] + smoking [pos="-1,-2"] + diabetes [pos="1,-1.5"] + BMI [pos="-1,1"] + LDL [pos="2,1"] + HDL [pos="2,-1"] + statins [pos="3,2"] + SBP [pos="3,-2"] + + HT -> CHD + HT -> HDL + HT -> LDL + age -> CHD + smoking -> CHD + smoking -> HDL + BMI -> CHD + BMI -> diabetes + BMI -> SBP + BMI 
-> LDL + diabetes -> CHD + diabetes -> LDL + LDL -> CHD + HDL -> CHD + statins -> LDL + SBP -> CHD + }' +) + +plot(hers_dag) +``` + +Directed Acyclic Graph (DAG) for the HERS study. +`HT` = hormone therapy (exposure); +`CHD` = recurrent coronary heart disease events (outcome); +`HDL` = high-density lipoprotein; +`LDL` = low-density lipoprotein; +`SBP` = systolic blood pressure. + +::: + +In @fig-hers-dag: + +- **`age`** is prognostic for CHD: +older women are at higher risk of CHD events. +Since HT was randomized in HERS, +age is not a confounder of the HT → CHD relationship, +but adjusting for age can improve precision. +- **`LDL`** and **`HDL`** are *mediators*: +hormone therapy affects lipid levels, +which in turn affect CHD risk. +- **`BMI`** is a common cause of diabetes, SBP, and LDL. +- **`statins`** (statin use) affects LDL independently of HT +(statins are medications prescribed to lower LDL). +- This DAG is simplified; a more complete model might include +additional variables such as anti-hypertensive medication use. + +## What variables should we adjust for? {#sec-pred-sel-hers-adjust} + +Using DAG theory, +we can identify the minimum adjustment set +needed to estimate the total causal effect of HT on CHD. + +```{r} +#| label: tbl-hers-adj +adjustmentSets( + hers_dag, + exposure = "HT", + outcome = "CHD", + type = "minimal" +) +``` + +::: notes + +Since HT was randomized in HERS, +there are no backdoor paths from HT to CHD. +The `adjustmentSets()` function confirms that no adjustment is needed +to estimate the total causal effect of HT on CHD. + +The variables `LDL` and `HDL` are mediators on the path +HT → LDL/HDL → CHD, +so they should **not** be included +when estimating the *total* causal effect of HT on CHD. + +Including `LDL` and `HDL` in the model would block +a part of the causal effect of HT, +giving us only the *direct* effect of HT on CHD +(not through lipids). 
+ +::: + +## Analyzing LDL in the HERS study {#sec-pred-sel-hers-ldl} + +::: notes + +The HERS baseline dataset includes LDL and HDL cholesterol, +allowing us to examine the effect of hormone therapy on lipid levels. + +Here we build models to predict LDL cholesterol, +illustrating how DAG thinking guides predictor selection. + +::: + +```{r} +#| label: hers-data-setup + +library(haven) +library(dplyr) + +hers <- read_stata(fs::path_package("rme", "extdata/hersdata.dta")) + +hers_ldl <- + hers |> + filter(!is.na(LDL)) |> + mutate( + HT = factor(HT, levels = c(0, 1), labels = c("Placebo", "HT")), + statins = factor(statins, levels = c(0, 1), labels = c("No", "Yes")), + diabetes = factor(diabetes, levels = c(0, 1), labels = c("No", "Yes")), + smoking = factor(smoking, levels = c(0, 1), labels = c("No", "Yes")) + ) +``` + +::: {#tbl-hers-ldl-full} + +```{r} + +library(gtsummary) + +hers_ldl |> + dplyr::select(LDL, HT, age, BMI, diabetes, smoking, statins) |> + tbl_summary(by = HT) |> + add_overall() |> + bold_labels() +``` + +Summary table for LDL analysis variables in HERS + +::: + +### Unadjusted model + +::: {#tbl-hers-ldl-unadj} + +```{r} + +model_ldl_unadj <- + lm(LDL ~ HT, data = hers_ldl) + +model_ldl_unadj |> + tbl_regression() |> + bold_labels() +``` + +Unadjusted linear regression for LDL in HERS + +::: + +### Adjusted model + +Based on the DAG in @fig-hers-dag, +BMI is a common cause of LDL and other outcomes +(but age does not confound the HT → LDL relationship, +since HT was randomized). + +In a randomized trial, +treatment assignment is independent of pre-treatment covariates, +so confounding adjustment is not required to estimate the causal effect. +However, adjusting for strong predictors of the outcome +can improve precision. 
+ +::: {#tbl-hers-ldl-adj} + +```{r} + +model_ldl_adj <- + lm(LDL ~ HT + age + BMI + diabetes + statins, data = hers_ldl) + +model_ldl_adj |> + tbl_regression() |> + bold_labels() +``` + +Adjusted linear regression for LDL in HERS + +::: + +Note that `statins` is a strong predictor of LDL +(statin medications are prescribed specifically to lower LDL). +Including `statins` substantially reduces residual variability, +improving precision of the HT coefficient. + +## An observational study example: WCGS {#sec-pred-sel-wcgs-dag} + +The HERS study is a randomized trial, +so confounding of the treatment-outcome relationship is not a concern. +To illustrate DAG-guided confounder selection in an **observational study**, +we use the Western Collaborative Group Study (WCGS). + +The WCGS was a prospective cohort study of 3,154 men +employed by California companies, +followed from 1960 to 1969 [@rosenman1975coronary]. +The exposure of primary interest is **Type A behavior pattern** +(a personality trait characterized by competitive drive, +time urgency, and hostility), +hypothesized to increase CHD risk +independently of classic cardiovascular risk factors. + +::: {#tbl-wcgs-data-dictionary} + +Data dictionary for key WCGS variables used in the DAG and examples. 
+ +| Variable | Meaning | Type | +|---|---|---| +| `personality_type` | Personality type (Type A vs Type B behavior pattern) | Binary categorical | +| `CHD` | Coronary heart disease event outcome | Binary categorical | +| `age` | Baseline age (years) | Continuous | +| `smoking` | Baseline smoking status | Binary categorical | +| `chol` | Serum cholesterol | Continuous | +| `sbp` | Systolic blood pressure | Continuous | +| `bmi` | Body mass index | Continuous | + +::: + +::: {#fig-wcgs-dag} + +```{r} +#| fig-width: 7 +#| fig-height: 5 + +wcgs_dag <- dagitty( + 'dag { + personality_type [exposure, pos="0,0"] + CHD [outcome, pos="4,0"] + age [pos="1.4,2"] + bmi [pos="-0.3,1.1"] + smoking [pos="1.1,-1.8"] + chol [pos="2.7,1"] + sbp [pos="2.8,-1.2"] + + personality_type -> CHD + age -> CHD + age -> chol + age -> sbp + age -> bmi + age -> personality_type + personality_type -> smoking + smoking -> CHD + chol -> CHD + sbp -> CHD + bmi -> CHD + bmi -> chol + bmi -> sbp + }' +) + +plot(wcgs_dag) +``` + +Directed Acyclic Graph (DAG) for the WCGS study. +`personality_type` = personality type (exposure; Type A vs Type B); +`CHD` = coronary heart disease events (outcome); +`chol` = serum cholesterol; +`sbp` = systolic blood pressure. + +::: + +In @fig-wcgs-dag, +`age` is a **confounder**: +it has direct causal arrows to both the exposure (`personality_type`) +and the outcome (`CHD`). +We assume `personality_type -> smoking` +rather than `smoking -> personality_type` +as an explicit modeling assumption +for this teaching example: +we treat Type A behavior pattern +as a relatively stable trait +that can influence smoking behavior. +Under that assumption, +`smoking` is a mediator on the path +`personality_type -> smoking -> CHD`, +not a confounder. +BMI is causally affected by age. +Cholesterol and SBP are causally affected +by both age and BMI. +All three have direct effects on CHD. 
+Unlike in HERS, +where HT was randomized +to estimate its causal effect on CHD event risk, +we cannot assume the exposure is unconfounded here. + +```{r} +#| label: wcgs-adj-sets + +wcgs_adj <- adjustmentSets( + wcgs_dag, + exposure = "personality_type", + outcome = "CHD", + type = "minimal" +) +wcgs_adj +``` + +::: notes + +The `adjustmentSets()` function returns the minimal set(s) of variables +that block all backdoor paths from `personality_type` to `CHD`. +Adjusting for any of these minimal sets is sufficient to +estimate the total causal effect of Type A behavior on CHD +(under the assumptions encoded in the DAG). + +::: diff --git a/_sec-pred-sel-details.qmd b/_sec-pred-sel-details.qmd new file mode 100644 index 000000000..d0591eaf5 --- /dev/null +++ b/_sec-pred-sel-details.qmd @@ -0,0 +1,189 @@ +{{< include latex-macros/macros.qmd >}} + +## Collinearity {#sec-pred-sel-collinearity} + +:::{#def-collinearity} +#### Collinearity + +**Collinearity** refers to high correlation between predictors, +sufficient to substantially degrade the precision of +one or more regression coefficient estimates. + +::: + +When two predictors are highly correlated, +each appears unimportant in the model adjusting for the other, +even if both are statistically significant in simpler models. +The joint F-test for both predictors may be highly significant, +while the individual t-tests are not. + +How we handle collinearity depends on the inferential goal: + +- **Prediction**: If including both collinear predictors reduces PE, + include both. + Cross-validation determines whether each contributes. + +- **Evaluating a primary predictor**: If the primary predictor + remains statistically significant after adjusting for a collinear confounder, + the evidence for an independent effect is convincing. + If collinearity with a *required* confounder inflates the primary + predictor's standard error, + we may need to accept reduced precision or + present sensitivity analyses. 
+ +- **Collinearity among confounders**: Does not affect + the precision of the primary predictor's estimate. + Both variables can be retained if needed for confounding control. + +- **Identifying multiple predictors**: Choose among collinear variables + on substantive grounds (causal plausibility, measurement quality, + fewer missing values). + +### Variance inflation factor {#sec-pred-sel-vif} + +The **variance inflation factor (VIF)** for predictor $j$ measures +how much its variance is inflated by correlation with other predictors: + +$$\text{VIF}_j = \frac{1}{1 - R_j^2}$$ + +where $R_j^2$ is the $R^2$ from regressing $x_j$ on all other predictors. + +A VIF $> 10$ (or equivalently, $R_j^2 > 0.9$) indicates +problematic collinearity. + +```{r} +#| label: vif-ldl-hers + +library(car) + +model_full <- lm( + LDL ~ HT + age + BMI + diabetes + smoking + statins + SBP, + data = hers_ldl +) +vif(model_full) +``` + +::: notes + +VIFs close to 1 indicate minimal collinearity. +In the HERS LDL model, +we expect relatively modest collinearity among the predictors, +since they represent distinct pathways (see @fig-hers-dag). + +::: + +## Number of predictors {#sec-pred-sel-epv} + +A widely used rule of thumb specifies a minimum number of +observations (or events) per predictor: + +- **Linear models**: at least 10 observations per predictor. +- **Logistic and Cox models**: at least 10 events per predictor (EPV). + +This guideline exists because with too many parameters +relative to information in the data, +coefficient estimates become imprecise and logistic/Cox models +can behave poorly (e.g., converge to extreme estimates). + +::: {.callout-note} +## The EPV rule is a flag, not a hard limit + +The precision of estimates depends on more than just the EPV: +residual variance, effect size, and correlations among predictors +all matter. +Adding covariates that are weakly correlated with the primary predictor +but explain substantial outcome variance can *improve* precision. 
+Conversely, adding a single collinear covariate can degrade it severely. +::: + +The rule is best used as a **warning flag**: +if the EPV falls below 10, +check whether the standard errors are unusually large, +and whether Wald and likelihood ratio test p-values +give consistent results. +If problems are detected, tighten the backward selection criterion +(e.g., to $p < 0.15$ or $p < 0.10$) +and verify that a more parsimonious model gives consistent results. + +## Alternatives to backward selection {#sec-pred-sel-alts} + +The main alternatives to backward selection are: + +- **Best subsets**: screens all possible subsets of candidate predictors + and selects the model minimizing a summary measure + (adjusted $R^2$, AIC, or a cross-validated PE measure). + Computationally intensive for large sets of predictors. + +- **Forward selection**: starts with an empty model and adds + the predictor providing the greatest improvement at each step. + More prone to missing *negatively confounded* sets of variables + (sets where no member appears important unless all are included). + +- **Stepwise selection**: augments forward selection by + also allowing previously added variables to be removed. + Better than pure forward but still misses some models + that backward selection would find. + +- **Bivariate screening**: evaluates each candidate predictor + individually before multivariable modeling. + Convenient but shares the limitation of forward selection. + +::: {.callout-note} +## Why backward selection is preferred + +Backward selection begins with the full set of candidates, +so it is **less likely to miss negatively confounded sets** of variables — +groups of variables that appear unimportant individually +but are jointly important. +Forward and stepwise selection will only include such sets +if at least one member passes the inclusion criterion on its own. 
+::: + +## Model selection complicates inference {#sec-pred-sel-inference} + +When predictors are selected from the data, +the p-values and confidence intervals computed as if the model +were pre-specified are no longer valid. + +Specific problems include: + +- **Inflated type-I error**: searching over many models + greatly increases the chance of at least one spurious finding. +- **Testimation bias**: selected predictors tend to have + overstated estimated effects, + because we select those that appear large by chance. +- **Biased CIs**: confidence intervals are too narrow + because they do not account for model uncertainty. + +How much these issues matter depends on the inferential goal: + +- **Prediction** (Goal 1): + Cross-validation of a PE measure protects against overfitting + without relying on p-values, + so inference is least affected. + +- **Evaluating a primary predictor** (Goal 2): + The primary predictor is included by default in all candidate models, + so its estimate is relatively unaffected by selection. + Post-selection inference is most problematic + when the primary predictor is of borderline significance. + +- **Identifying multiple predictors** (Goal 3): + Selection most severely complicates inference here, + since p-values for all retained predictors are of direct interest. + Careful pre-specification, use of inclusive models, + and the Allen–Cady procedure (see @sec-pred-sel-allen-cady) + reduce, but do not eliminate, this problem. + +::: {.callout-tip} +## Best practices for reporting + +When selection procedures are used: + +1. Report the **full candidate predictor list**. +2. Report the **selection criterion and procedure** used. +3. Interpret **novel, weak, and borderline significant** findings + with explicit caution. +4. Examine **alternative models** with different predictor subsets + and report consistency of key results. 
+::: diff --git a/_sec-pred-sel-goals.qmd b/_sec-pred-sel-goals.qmd new file mode 100644 index 000000000..43d249390 --- /dev/null +++ b/_sec-pred-sel-goals.qmd @@ -0,0 +1,75 @@ +{{< include latex-macros/macros.qmd >}} + +## Three inferential goals {#sec-pred-sel-goals} + +The appropriate strategy for predictor selection depends on +the inferential goal of the analysis. +@vittinghoff2e [Chapter 10] distinguishes three distinct goals: + +### Goal 1: Prediction {#sec-pred-sel-goal-pred} + +The primary aim is to minimize prediction error for new observations. +The predictors' individual causal interpretations are secondary. + +:::{#exm-pred-sel-goal-pred} +*Example*: @Walter2001 developed a model to identify +older adults at high risk of death following hospitalization, +using backward selection on a development set +and validating predictions in an independent hospital. +::: + +For this goal, we use measures of **prediction error** (PE) +to choose among candidate models. +Overfitting — the tendency of flexible models to fit noise — is +the central concern. +Strategies such as cross-validation and penalized regression +(see @sec-pred-sel-prediction and @sec-pred-sel-lasso) +protect against overfitting. + +### Goal 2: Evaluating a predictor of primary interest {#sec-pred-sel-goal-eval} + +The aim is to obtain a valid, minimally confounded estimate of +the association between a specific exposure and the outcome. + +:::{#exm-pred-sel-goal-eval} +*Example*: @Grodstein2001 estimated the effect of hormone +therapy on CHD risk in the Nurses' Health Study (NHS), +controlling for a wide range of known CHD risk factors +to minimize confounding. +::: + +In observational studies, confounding is the primary concern. +In randomized trials (like HERS), treatment is already unconfounded +by design, but adjusting for strong prognostic covariates can +improve precision. +DAGs (see @sec-pred-sel-dag) are especially useful for this goal. 
+ +### Goal 3: Identifying the important predictors of an outcome {#sec-pred-sel-goal-id} + +The aim is to discover which predictors are independently associated +with the outcome. + +:::{#exm-pred-sel-goal-id} +*Example*: @Orwoll1996 examined independent predictors of +bone mass in the Study of Osteoporotic Fractures (SOF), +including all predictors statistically significant at $p < 0.05$ +in age-adjusted models. +::: + +This goal is the most challenging because inferences about +multiple predictors are of direct interest, +making false-positive findings and unstable selections +particular concerns. +Inclusive models with cautious interpretation are preferred +(see @sec-pred-sel-multiple). + +::: notes + +In practice, these goals often overlap. +For example, +an analysis can estimate the causal effect of treatment (Goal 2) +while also selecting additional prognostic covariates +to improve individualized risk prediction performance (Goal 1) +in the same model-building workflow. + +::: diff --git a/_sec-pred-sel-multiple.qmd b/_sec-pred-sel-multiple.qmd new file mode 100644 index 000000000..c75f688f9 --- /dev/null +++ b/_sec-pred-sel-multiple.qmd @@ -0,0 +1,91 @@ +{{< include latex-macros/macros.qmd >}} + +## Identifying multiple important predictors {#sec-pred-sel-multiple} + +::: notes + +When the goal is to identify *all* important predictors of an outcome +(Goal 3, @sec-pred-sel-goals), +confounding remains a primary concern for each predictor in turn, +not just a single primary predictor of interest. + +::: + +This goal is the most challenging because: + +- Overfitting and false-positive results are a greater concern + (many predictors, many opportunities for spurious findings). +- Interactions among candidate predictors may be of interest, + but systematically testing all possible interactions + can easily produce false positives. +- Collinear predictors may be difficult to disentangle. 
+ +The same strategies recommended for Goal 2 +(face validity, backward selection with liberal criterion) +also apply here. +However, cautious interpretation of novel and borderline findings +becomes even more important. + +## Allen–Cady modified backward selection {#sec-pred-sel-allen-cady} + +A modified backward selection procedure due to Allen and Cady +[@AllenCady1982] +limits false-positive findings while still allowing variable selection. + +The procedure works as follows: + +1. **Specify** a set of variables to be forced into the model: + predictors of primary interest + and variables needed for face validity. +2. **Rank** the remaining candidate variables + in order of hypothesized importance. +3. Start with a model containing all variables in both sets. +4. Delete variables from the ranked set + in order of **ascending importance** (least important first), + stopping as soon as the first variable that meets the retention criterion + is encountered. + +::: notes + +This procedure limits false positives because +there is a single pre-specified deletion sequence, +and selection stops at the first retained variable. +In contrast, conventional backward selection +considers *all* remaining predictors at each step, +producing many more potential opportunities for false findings. + +::: + +## Cautious interpretation {#sec-pred-sel-cautious-interp} + +For Goal 3 analyses, +we do **not** recommend using only predictors significant at $p < 0.05$: +in small samples, even important predictors may fail to meet this threshold, +and a parsimonious model may have substantial residual confounding. + +Instead, the recommended approach is: + +- Use a relatively **inclusive model** + (liberal backward selection criterion: $p < 0.2$). +- Interpret **novel, weak, and borderline significant** associations + cautiously. +- Report the **full list of candidate covariates** considered, + including those ultimately excluded. +- Acknowledge inflated type-I error as a study limitation. 
+ +## Example: risk factors for CHD in HERS {#sec-pred-sel-chd-risk-factors} + +@Vittinghoff2003 identified independent CHD risk factors +in the HERS cohort using a multipredictor Cox model. +The large number of outcome events ($n = 361$) allowed +all previously identified risk factors with $p < 0.2$ +in unadjusted models to be included. + +Among 11 predictors retained as important on both substantive and +statistical grounds were: +ethnicity, exercise habits, diabetes, angina, congestive heart failure, +prior heart attacks, blood pressure, LDL, HDL, Lp(a), and creatinine clearance. + +The model also controlled for age, smoking, alcohol use, and obesity +(not statistically significant after adjustment) +to rule out confounding and ensure face validity. diff --git a/_sec-pred-sel-overview.qmd b/_sec-pred-sel-overview.qmd new file mode 100644 index 000000000..43651429c --- /dev/null +++ b/_sec-pred-sel-overview.qmd @@ -0,0 +1,72 @@ +{{< include latex-macros/macros.qmd >}} + +## Why predictor selection matters {#sec-pred-sel-why} + +In regression modeling, +we often have many potential predictors available, +but including too many predictors can cause problems: + +- **Overfitting**: the model fits noise in the training data and +predicts poorly for new data. +- **Multicollinearity**: highly correlated predictors make +coefficients unstable and hard to interpret. +- **Loss of precision**: including unnecessary predictors increases +standard errors. + +On the other hand, excluding important predictors can lead to: + +- **Confounding bias**: omitting a common cause of the exposure and +outcome distorts the estimated association. +- **Reduced power**: failing to adjust for strong predictors of the +outcome can reduce statistical power. + +## Strategies for predictor selection {#sec-pred-sel-strategies} + +@vittinghoff2e [Chapter 10] describes several general approaches to +predictor selection: + +1. 
**Science-driven (purposeful) selection**: +Use subject-matter knowledge and a causal diagram (DAG) +to identify which variables to include. +This is the preferred approach when the goal is causal inference. + +2. **Testing-based selection**: +Use statistical tests to decide which predictors to include. +Examples include forward selection, backward elimination, +and stepwise selection. +These methods have well-known problems and should be used with caution +(see @sec-pred-sel-stepwise). + +3. **Criterion-based selection**: +Compare models using information criteria such as AIC or BIC. +These criteria balance model fit against complexity +(see @sec-pred-sel-aic-bic). + +4. **Penalized regression**: +Use regularization methods (such as the LASSO) +that shrink coefficients toward zero, +effectively performing selection and estimation simultaneously +(see @sec-pred-sel-lasso). + +## The role of subject-matter knowledge {#sec-pred-sel-subject-matter} + +::: notes + +Regardless of which selection strategy is used, +subject-matter knowledge should guide model building. + +::: + +A regression model can only tell us about *associations* — +whether certain predictors are correlated with the outcome +in our data. +It cannot, on its own, tell us whether those associations reflect +causal effects. + +To draw causal conclusions, we need: + +1. A causal model (such as a DAG) specifying assumed causal relationships. +2. Appropriate adjustment for confounders. +3. Careful avoidance of adjusting for mediators or colliders. + +These concepts are discussed in @sec-pred-sel-dag. 
diff --git a/_sec-pred-sel-prediction.qmd b/_sec-pred-sel-prediction.qmd
new file mode 100644
index 000000000..1db68a355
--- /dev/null
+++ b/_sec-pred-sel-prediction.qmd
@@ -0,0 +1,285 @@
+{{< include latex-macros/macros.qmd >}}
+
+## Bias–variance trade-off and overfitting {#sec-pred-sel-bias-var}
+
+When building a prediction model,
+adding more predictors generally **reduces bias**:
+the model better captures the true relationships in the population.
+But it also **increases variance**:
+more parameters to estimate means more sampling variability,
+and the model can start to fit noise in the training data.
+
+:::{#def-overfitting}
+#### Overfitting
+
+**Overfitting** occurs when a model fits the training data well
+but predicts poorly for new observations.
+It results from including too many predictors
+relative to the effective sample size.
+
+:::
+
+A good prediction model minimizes the sum of squared bias and variance —
+the **bias–variance trade-off**.
+Adding a predictor helps only if
+the reduction in squared bias outweighs the increase in variance.
+
+## Numerical example of overfitting {#sec-pred-sel-overfit-example}
+
+The table below gives a simple numerical example.
+A parsimonious model and an intentionally overfit model
+are both trained on the same development set.
+The overfit model adds many noise predictors
+that have no true relationship with LDL.
+ +::: {#tbl-overfit-example} + +```{r} +#| label: overfit-example + +set.seed(204) +n_noise_predictors <- 20 + +hers_base <- + hers_ldl |> + dplyr::select(LDL, HT, age, statins) |> + tidyr::drop_na() + +n_overfit <- nrow(hers_base) +idx_overfit <- sample(seq_len(n_overfit), size = floor(0.7 * n_overfit)) + +train_overfit <- hers_base[idx_overfit, ] +valid_overfit <- hers_base[-idx_overfit, ] + +for (j in seq_len(n_noise_predictors)) { + zname <- paste0("z", j) + train_overfit[[zname]] <- rnorm(nrow(train_overfit)) + valid_overfit[[zname]] <- rnorm(nrow(valid_overfit)) +} + +model_parsimonious <- lm(LDL ~ HT + age + statins, data = train_overfit) +noise_terms <- paste0("z", seq_len(n_noise_predictors)) +overfit_formula <- reformulate(c("HT", "age", "statins", noise_terms), "LDL") +model_overfit <- lm( + overfit_formula, + data = train_overfit +) + +rmse <- function(y, yhat) { + sqrt(mean((y - yhat)^2)) +} + +tibble::tibble( + Model = c( + "Parsimonious model", + paste0("Overfit model (+", n_noise_predictors, " noise predictors)") + ), + Training_RMSE = c( + rmse( + train_overfit$LDL, + predict(model_parsimonious, newdata = train_overfit) + ), + rmse(train_overfit$LDL, predict(model_overfit, newdata = train_overfit)) + ), + Validation_RMSE = c( + rmse( + valid_overfit$LDL, + predict(model_parsimonious, newdata = valid_overfit) + ), + rmse(valid_overfit$LDL, predict(model_overfit, newdata = valid_overfit)) + ) +) |> + dplyr::mutate( + dplyr::across(c(Training_RMSE, Validation_RMSE), ~ round(.x, 2)) + ) +``` + +Numerical example showing overfitting: +lower training error but worse validation error. + +::: + +In this example, +the overfit model usually has lower training RMSE +but higher validation RMSE, +which is the defining pattern of overfitting. + +## Measures of prediction error {#sec-pred-sel-pe} + +To select a model that minimizes prediction error, +we need a measure of PE that does not simply increase +with the number of predictors. 
+
+For **continuous outcomes**:
+
+- **Adjusted $R^2$** penalizes the number of predictors:
+$$\bar R^2 = 1 - \frac{n-1}{n-p-1}(1 - R^2)$$
+Adding a variable increases $\bar R^2$ only if it reduces
+the residual variance more than the penalty.
+
+- **AIC** and **BIC** impose steeper penalties:
+$$\text{AIC} = -2\hat\ell + 2p, \quad
+  \text{BIC} = -2\hat\ell + p\log n$$
+(see @sec-pred-sel-aic-bic).
+
+For **binary outcomes**,
+common measures include
+**balanced accuracy** and the **C-statistic**.
+
+**Balanced accuracy**:
+$$\text{BA} = \frac{\text{Sensitivity} + \text{Specificity}}{2}$$
+where
+$$\text{Sensitivity} = \Pr(\hat Y = 1 \mid Y = 1), \quad
+\text{Specificity} = \Pr(\hat Y = 0 \mid Y = 0).$$
+If $J$ is **Youden's $J$ statistic**,
+$$J = \text{Sensitivity} + \text{Specificity} - 1,$$
+then
+$$\text{BA} = \frac{J + 1}{2}.$$
+So BA is a linear transformation of $J$,
+and BA and $J$ are equivalent for comparing models.
+
+**C-statistic** (area under the ROC curve, AUC):
+$$
+C = \Pr(\hat{\pi}_i > \hat{\pi}_j \mid Y_i = 1, Y_j = 0)
++ \frac{1}{2}\Pr(\hat{\pi}_i = \hat{\pi}_j \mid Y_i = 1, Y_j = 0)
+$$
+where $\hat{\pi}$ is the predicted event probability.
+An empirical estimator is
+$$
+\hat{C}
+= \frac{1}{n_1 n_0}
+\sum_{i:Y_i=1}
+\sum_{j:Y_j=0}
+\left[
+I(\hat{\pi}_i > \hat{\pi}_j)
++ \frac{1}{2}I(\hat{\pi}_i = \hat{\pi}_j)
+\right].
+$$
+It measures discrimination:
+the model's ability to rank events above non-events.
+
+For **survival outcomes**, the analogous measure is the **C-index**,
+defined as
+$$
+\hat C_{\text{index}}
+= \frac{N_{\text{concordant}} + 0.5\,N_{\text{tied}}}
+{N_{\text{comparable}}},
+$$
+where pairs are comparable when one subject is observed to fail
+before the other is known to fail or be censored,
+and concordance means the subject with earlier failure
+has the higher predicted risk score.
+ +## Optimism and cross-validation {#sec-pred-sel-cv} + +Naive estimates of PE evaluated on the same data used +to fit the model are **optimistic**: +they overstate the model's ability to predict new data. +For example, $R^2$ always increases when a predictor is added, +even if the predictor is pure noise. + +**Cross-validation** provides a less optimistic estimate of PE +by evaluating predictions on observations +not used to estimate the model. + +:::{#def-kfold} +#### $k$-fold cross-validation + +In **$k$-fold cross-validation**: + +1. The data are randomly divided into $k$ equal-sized subsets ("folds"). +2. For each fold $i = 1, \ldots, k$: + - Fit the model using all data *except* fold $i$. + - Compute predicted values for the observations in fold $i$. +3. Compute a summary PE measure across all folds. + +Values of $k = 5$ or $k = 10$ are typical. + +::: + +Using cross-validation to select predictors avoids overfitting +because the predictive performance is evaluated on held-out data +that did not influence the model fit. + +## Cross-validation for LASSO in the HERS data {#sec-pred-sel-cv-hers} + +The LASSO (see @sec-pred-sel-lasso) uses cross-validation +to select the penalty parameter $\lambda$ +that minimizes prediction error. +The `cv.glmnet` function implements 10-fold cross-validation, +fitting the LASSO for a grid of $\lambda$ values and evaluating +mean squared error on held-out folds. +The fitted object provides two key $\lambda$ values: +`lambda.min` (minimizes CV mean squared error) and +`lambda.1se` (largest $\lambda$ within one standard error of the minimum). +See @sec-pred-sel-lasso for the HERS example +with coefficient paths and CV results. + +::: notes + +The two recommended values of $\lambda$ represent different +points on the bias–variance trade-off: + +- `lambda.min` minimizes cross-validated MSE, + yielding a model with good predictive accuracy. 
+- `lambda.1se` uses a larger penalty + (farther from zero, more shrinkage), + resulting in a sparser, more conservative model + that is often preferred for interpretability. + +::: + +## Development and validation sets {#sec-pred-sel-split} + +The most transparent approach to measuring prediction error +is to split the data into a **development set** +(used to fit the model) +and a **validation set** +(used only to evaluate predictions). + +::: notes + +When a single data set must be split, +a common allocation is two-thirds to development +and one-third to validation. +External validation using a completely independent sample +(from a different time period or study site) +provides the most rigorous assessment of generalizability. + +::: + +```{r} +#| label: split-sample-ldl + +set.seed(42) +n_hers <- nrow(hers_ldl) +train_idx <- sample(seq_len(n_hers), size = floor(0.67 * n_hers)) + +hers_train <- hers_ldl[train_idx, ] +hers_valid <- hers_ldl[-train_idx, ] + +# Fit model on training data +model_train <- lm( + LDL ~ HT + age + BMI + diabetes + statins, + data = hers_train +) + +# Evaluate on validation data +pred_valid <- predict(model_train, newdata = hers_valid) +rmse_valid <- sqrt(mean((hers_valid$LDL - pred_valid)^2, na.rm = TRUE)) +rmse_train <- sqrt(mean(residuals(model_train)^2)) + +cat( + "Training RMSE:", round(rmse_train, 2), "\n", + "Validation RMSE:", round(rmse_valid, 2), "\n" +) +``` + +::: notes + +If the validation RMSE is much larger than the training RMSE, +this is a sign of overfitting. +In well-specified models with adequate sample sizes, +the two values are typically close. 
+ +::: diff --git a/_sec-pred-sel-primary.qmd b/_sec-pred-sel-primary.qmd new file mode 100644 index 000000000..e7185b304 --- /dev/null +++ b/_sec-pred-sel-primary.qmd @@ -0,0 +1,440 @@ +{{< include latex-macros/macros.qmd >}} + +## Evaluating a predictor of primary interest {#sec-pred-sel-eval-primary} + +::: notes + +When the goal is to estimate the causal effect of +a specific exposure on an outcome, +the central challenge in observational data is **confounding**. +Relatively inclusive models — those including more potential confounders — +are better at minimizing confounding bias. + +::: + +The following principles guide predictor selection for this goal. + +## Including predictors for face validity {#sec-pred-sel-face-validity} + +Some predictors are such well-established causal antecedents of the outcome +that they should be included regardless of their statistical significance +in the current data. +This ensures the **face validity** of the model: +readers and reviewers can be confident that obvious confounders +have been accounted for. + +For example, age and smoking are established CHD risk factors +and would be included in any model for CHD outcomes, +even if they are not statistically significant in a small sample. + +## Selecting confounders on statistical grounds {#sec-pred-sel-confounders} + +In addition to variables included for face validity, +we may consider other potential confounders +identified in previous studies or on substantive grounds. + +A practical approach is **backward selection with a liberal criterion**: + +1. Start with a full model including all pre-specified candidates. +2. At each step, remove the predictor with the largest p-value, + provided it exceeds a liberal threshold (e.g., $p > 0.2$). +3. Stop when all remaining predictors meet the retention criterion. 
+
+The liberal threshold ($p > 0.2$ rather than $p > 0.05$)
+is important for confounding control:
+in small samples, even important confounders may have large p-values,
+and removing them could introduce bias.
+
+An alternative retention criterion is the
+**10% change-in-estimate rule**
+[@kleinbaum2010logistic, Chapter 8, pp. 194--212;
+@vittinghoff2e, Chapter 10, pp. 257--275;
+@MickeyGreenland1989, p. 125;
+@Greenland1989Modeling, p. 340]:
+
+Retain a candidate confounder $C$ in the model if
+dropping it changes the estimated coefficient for the primary predictor
+by more than 10%.
+
+More precisely,
+let $\hat\beta$ be the estimated coefficient for the exposure
+from the **full model** (including $C$),
+and let $\hat\beta_0$ be the estimate from the **reduced model** (dropping $C$).
+Then $C$ is judged a meaningful confounder if:
+
+$$\left| \frac{\hat\beta - \hat\beta_0}{\hat\beta} \right| > 0.10$$
+
+The threshold is often taken as 10–20%, depending on the context.
+A smaller threshold (e.g., 10%) retains more variables;
+a larger threshold (e.g., 20%) allows more aggressive pruning.
+
+::: {.callout-note}
+
+The 10% change-in-estimate rule focuses on **bias** rather than **statistical significance**.
+A variable can be a meaningful confounder even if it is not statistically significant
+(because of small sample size or measurement error),
+and conversely, a statistically significant variable may cause negligible confounding.
+The p-value criterion and the change-in-estimate rule can disagree,
+and @vittinghoff2e recommends using both as complementary checks.
+
+:::
+
+### Numerical example: 10% rule in WCGS {#sec-pred-sel-10pct-wcgs}
+
+::: notes
+
+The WCGS (Western Collaborative Group Study) is an observational
+prospective cohort of 3,154 men followed for CHD events.
+Unlike HERS (a randomized trial),
+the WCGS exposure — Type A behavior pattern — was not randomly assigned,
+so confounding is a genuine concern.
+ +::: + +In the WCGS study, +the exposure of interest is Type A behavior pattern (`dibpat`), +and the outcome is CHD incidence (`chd69`). +We fit logistic regression models +and apply the 10% change-in-estimate rule +to assess whether age, cholesterol, SBP, and BMI +are meaningful confounders of the Type A → CHD relationship, +while also examining smoking as an additional covariate. + +::: notes + +Adapted from @kleinbaum2010logistic [Chapter 8], +which describes the change-in-estimate procedure for logistic regression models. + +::: + +```{r} +#| label: wcgs-data-setup + +wcgs <- readRDS(fs::path_package("rme", "extdata/wcgs.rds")) +``` + +```{r} +#| label: wcgs-10pct-rule + +# Full model (all candidate confounders) +m_full <- glm( + I(chd69 == "Yes") ~ dibpat + age + chol + sbp + smoke + bmi, + data = wcgs, + family = binomial +) + +# Unadjusted model (Type A only) +m_unadj <- glm( + I(chd69 == "Yes") ~ dibpat, + data = wcgs, + family = binomial +) + +# Estimate of Type A effect from full model +beta_full <- coef(m_full)["dibpatType A"] +beta_unadj <- coef(m_unadj)["dibpatType A"] + +# Models dropping one confounder at a time +confounders <- c("age", "chol", "sbp", "smoke", "bmi") + +results <- lapply(confounders, function(v) { + f <- as.formula( + paste("I(chd69 == \"Yes\") ~ dibpat +", + paste(setdiff(confounders, v), collapse = " + ")) + ) + m_red <- glm(f, data = wcgs, family = binomial) + beta_red <- coef(m_red)["dibpatType A"] + pct_change <- abs((beta_full - beta_red) / beta_full) * 100 + data.frame( + Confounder = v, + Beta_full = round(beta_full, 4), + Beta_reduced = round(beta_red, 4), + Pct_change = round(pct_change, 1), + Meaningful = ifelse(pct_change > 10, "Yes (>10%)", "No (<=10%)") + ) +}) + +results_df <- do.call(rbind, results) +rownames(results_df) <- NULL +``` + +::: {#tbl-wcgs-10pct} + +```{r} +knitr::kable( + results_df, + col.names = c( + "Dropped variable", + "β (full model)", + "β (reduced model)", + "% change", + "Meaningful 
confounder?" + ) +) +``` + +10% change-in-estimate rule applied to the WCGS data. +The Type A coefficient from the full model +is compared with coefficients from models that drop one confounder at a time. +A percentage change exceeding 10% indicates meaningful confounding. + +::: + +In this example, +none of the individual confounders causes a >10% change when dropped, +suggesting they are not individually strong confounders of the +Type A → CHD relationship in WCGS. +Comparing the unadjusted log-OR for Type A +($\approx$ `r round(beta_unadj, 2)`) +with the fully adjusted log-OR +($\approx$ `r round(beta_full, 2)`) +shows that the overall set of covariates provides modest adjustment. +In practice, covariates with established biological relationships to the outcome +(such as age, smoking, and blood pressure for CHD) +are typically retained regardless of the change-in-estimate result, +for reasons of face validity. + +When multiple covariates are candidates for removal, +checking all possible subsets +is often impractical. +A practical alternative +is a backward-elimination procedure +guided jointly by the 10% rule +and precision. + +At each step: + +1. Start from the current model. +2. Fit models that drop one remaining candidate covariate at a time. +3. Keep only candidates with + absolute percent change from the full-model exposure coefficient + no larger than 10%, + where the full-model coefficient + is always from the original + gold-standard model. +4. Among those candidates, + drop the covariate + that gives the largest reduction + in the exposure coefficient's standard error. +5. Repeat until no remaining covariate can be dropped + without either + exceeding the 10% threshold + or increasing the exposure standard error. 
+ +```{r} +#| label: wcgs-10pct-bwe +#| code-fold: false + +beta_full <- coef(m_full)["dibpatType A"] +se_full <- sqrt(vcov(m_full)["dibpatType A", "dibpatType A"]) +exposure_var <- "dibpat" + +current_covars <- confounders +current_se <- se_full +step_num <- 1 +bwe_steps <- list() + +repeat { + candidate_results <- lapply(current_covars, function(v) { + keep_covars <- setdiff(current_covars, v) + reduced_formula <- if (length(keep_covars) == 0) { + as.formula("I(chd69 == \"Yes\") ~ dibpat") + } else { + as.formula( + paste( + "I(chd69 == \"Yes\") ~", + paste(c(exposure_var, keep_covars), collapse = " + ") + ) + ) + } + + m_red <- glm(reduced_formula, data = wcgs, family = binomial) + beta_red <- coef(m_red)["dibpatType A"] + se_red <- sqrt(vcov(m_red)["dibpatType A", "dibpatType A"]) + pct_change <- abs((beta_full - beta_red) / beta_full) * 100 + se_reduction <- current_se - se_red + + data.frame( + Step = step_num, + Candidate_drop = v, + Beta_reduced = beta_red, + Pct_change_from_full = pct_change, + SE_reduced = se_red, + SE_reduction = se_reduction, + Eligible = pct_change <= 10 & se_reduction > 0 + ) + }) + + candidates_df <- do.call(rbind, candidate_results) + eligible_df <- candidates_df[candidates_df$Eligible, , drop = FALSE] + + if (nrow(eligible_df) == 0) { + break + } + + best_idx <- which.max(eligible_df$SE_reduction) + best_drop <- eligible_df[best_idx, , drop = FALSE] + dropped_var <- as.character(best_drop$Candidate_drop) + + bwe_steps[[step_num]] <- data.frame( + Step = step_num, + Dropped = dropped_var, + Pct_change_from_full = round(best_drop$Pct_change_from_full, 1), + SE_before = round(current_se, 4), + SE_after = round(best_drop$SE_reduced, 4), + SE_reduction = round(best_drop$SE_reduction, 4) + ) + + current_covars <- setdiff(current_covars, dropped_var) + current_se <- best_drop$SE_reduced + step_num <- step_num + 1 +} + +bwe_steps_df <- if (length(bwe_steps) == 0) { + data.frame( + Step = NA_integer_, + Dropped = "None", + 
Pct_change_from_full = NA_real_, + SE_before = round(se_full, 4), + SE_after = round(se_full, 4), + SE_reduction = 0 + ) +} else { + do.call(rbind, bwe_steps) +} + +retained_covars <- current_covars +``` + +::: {#tbl-wcgs-10pct-bwe} + +```{r} +knitr::kable( + bwe_steps_df, + col.names = c( + "Step", + "Dropped covariate", + "% change from full model", + "SE before drop", + "SE after drop", + "SE reduction" + ) +) +``` + +Backward elimination under the 10% change-in-estimate rule, +with standard-error reduction +used to choose among eligible drops. +The reference for the percent-change criterion +is always the Type A coefficient in the original full model. + +::: + +For further reading on model selection strategies in epidemiologic analyses, +see @vittinghoff2e [Chapter 10, pp. 257--275], +@kleinbaum2010logistic [Chapters 6--8], +and @me4 [Chapter 21, section "Model Selection"]. + +::: {.callout-note} +## Collinearity between confounders is not a problem + +Unlike collinearity between a predictor of primary interest +and other covariates (which inflates its standard error), +collinearity *among* adjustment variables does not affect +the precision of the primary predictor's estimate. +Thus, two collinear confounders can both be retained +if both are judged necessary on substantive grounds. +::: + +## Example: backward selection for CHD confounders {#sec-pred-sel-backward-hers} + +In the HERS data, we can use backward selection +to identify which baseline covariates confound +the effect of hormone therapy on LDL. +Since HT was randomized in HERS, +adjustment is needed for precision, not confounding; +but this illustrates the procedure. 
+ +```{r} +#| label: backward-sel-ldl-hers + +full_model <- lm( + LDL ~ HT + age + BMI + diabetes + smoking + statins + SBP, + data = hers_ldl +) + +# Backward selection using AIC +back_model <- MASS::stepAIC(full_model, direction = "backward", trace = FALSE) +summary(back_model) +``` + +::: notes + +The backward selection procedure retains the predictors that +most improve model fit according to AIC. +In this setting, since HT was randomized, +we interpret the selected set as prognostic covariates +that improve precision of the HT estimate, +not as confounders. + +::: + +## Interactions with the primary predictor {#sec-pred-sel-interactions} + +A useful check on the validity of the selected model is to test +for **interactions** between the primary predictor and +key covariates. +Particularly for novel findings, +showing that the association is consistent across subgroups +strengthens credibility. + +::: notes + +If a substantial and biologically plausible interaction is found, +the analysis must account for it. +However, exploratory interaction tests are susceptible to +false-positive findings, +and unexpected interactions should be interpreted cautiously. + +::: + +## Randomized experiments {#sec-pred-sel-rct} + +In randomized trials, participants are allocated to treatment randomly, +so the treatment indicator is by design uncorrelated +with pre-treatment covariates in expectation. +Confounding adjustment is therefore **not required** for causal inference. + +However, there are several reasons to include covariates +in trial analyses: + +1. **Improving precision** (continuous outcomes). + Adjusting for strong predictors of the outcome + reduces residual variance and tightens the treatment effect estimate. + Because treatment is uncorrelated with the covariates by randomization, + variance inflation is negligible. + +2. **De-attenuation** (binary and survival outcomes). 
+ In logistic and Cox models, + omitting important predictors attenuates the treatment effect estimate. + Adjusting for them moves the estimate closer to the true subject-specific effect. + +3. **Accounting for stratified designs**. + In stratified or cluster-randomized trials, + including stratification variables in the model + yields correct standard errors and p-values. + +4. **Correcting for chance imbalances**. + In small trials, randomization may leave residual imbalance + in important prognostic covariates. + Adjusted analyses demonstrate that the main inference + is not materially affected by any such imbalance. + +::: {.callout-note} +## Pre-specification is important in trials + +Adjusted analyses in clinical trials should ideally be +**pre-specified in the study protocol**, +to prevent post-hoc selection of covariates +that minimize the treatment effect p-value. +::: diff --git a/_subfiles/shared/_sec_hers_conclusions.qmd b/_subfiles/shared/_sec_hers_conclusions.qmd new file mode 100644 index 000000000..804f242a5 --- /dev/null +++ b/_subfiles/shared/_sec_hers_conclusions.qmd @@ -0,0 +1,32 @@ +Despite observational studies suggesting up to 35–80% fewer recurrent +CHD events among hormone users than non-users, +[@HulleyStephen1998RToE], +HERS found **no significant difference** in CHD event rates between +the hormone therapy (HT) and placebo groups +(relative hazard 0.99; 95% CI: 0.80–1.22) +[@HulleyStephen1998RToE]. + +The HERS investigators noted a statistically significant time trend: +more CHD events occurred in the HT group in year 1, +while fewer occurred in years 4 and 5, +suggesting that HT may have early harmful effects that are later reversed. + +A key finding was that HT significantly altered lipid profiles: + +- LDL cholesterol: 11% lower in the HT group [@HulleyStephen1998RToE]. +- HDL cholesterol: 10% higher in the HT group [@HulleyStephen1998RToE]. + +Yet these favorable lipid changes did not translate into +fewer CHD events overall. 
+One explanation consistent with the DAG in @fig-hers-dag +is that the effects of HT on lipids (a mediator pathway) +were offset by other, harmful pathways — +possibly thrombotic — that are not captured in the lipid mediators. + +::: notes + +@HulleyStephen1998RToE also noted that HT increased the rate of +venous thromboembolic events (relative hazard 2.89; 95% CI: 1.50–5.58) +and gallbladder disease (relative hazard 1.38; 95% CI: 1.00–1.92). + +::: diff --git a/_subfiles/shared/_sec_hers_intro.qmd b/_subfiles/shared/_sec_hers_intro.qmd index cff857e51..6194b059b 100644 --- a/_subfiles/shared/_sec_hers_intro.qmd +++ b/_subfiles/shared/_sec_hers_intro.qmd @@ -2,3 +2,13 @@ The "heart and estrogen/progestin study" (HERS) was a clinical trial of hormone therapy for prevention of recurrent heart attacks and death among 2,763 post-menopausal women with existing coronary heart disease (CHD) [@HulleyStephen1998RToE]. + +The trial was conducted at 20 US clinical centers. +Participants were randomized to receive either conjugated equine estrogens +(0.625 mg/day) plus medroxyprogesterone acetate (2.5 mg/day) +or a matching placebo [@HulleyStephen1998RToE]. +Women were followed for an average of 4.1 years +[@HulleyStephen1998RToE]. + +The primary outcome was nonfatal myocardial infarction or CHD death +[@HulleyStephen1998RToE]. 
diff --git a/inst/WORDLIST b/inst/WORDLIST index b2e644f28..ce592c75f 100644 --- a/inst/WORDLIST +++ b/inst/WORDLIST @@ -1,63 +1,93 @@ Aalen affine Acknowledgements -Biostat BMI +Biostat CHD CLT +Cady +Confounder +DAGs +De devn Ef EDA +EPV Epi Ftest GOF GitHub +Grodstein +HDL HSE Hua Kaplan Kleinbaum Kullback +LDL Leibler +Lp MathJax +Multicollinearity Multipredictor +NHS NoDerivatives NonCommercial ORCID +Orwoll +Osteoporotic resid RevealJS Rocke +Rosenman SBP +SOF Satterthwaite Sjoberg Stochasticity +TSS +Testimation Therneau TinyTeX -TSS UC +VIF +VIFs +Vittinghoff vxi WCGS Welch Zhou +al +anticonservative +arg biomarkers bmatrix bmt callout cdot cdots +collinear +confounder +creatinine +dL da ddots +de demstats det df dfrac diag dists -dL eps +equine +estrogens +et exm exponentiating fp frac +generalizability geq ggfortify ggplot @@ -68,29 +98,40 @@ idempotent ij ind infty +interpretability invertible ji ldots +leftarrow leq +lipoprotein logrank mX -matr mathcal +matr +medroxyprogesterone mn multimodality multipredictor +multivariable ndist neq np +overfit overset pec pmatrix pre +pred probabilistically +progestin qmd qquad +sel siid sj +statin +statins stókhos subfiles submodule @@ -99,7 +140,10 @@ subtypes surv survminer th +thromboembolic +thrombotic tp +unconfounded ungrouped varepsilon vdots diff --git a/latex-macros b/latex-macros index a0b66f56a..339013406 160000 --- a/latex-macros +++ b/latex-macros @@ -1 +1 @@ -Subproject commit a0b66f56a01f9ffc65625071fe40b526acb5a268 +Subproject commit 339013406d162cb2c6a41059d5829a40ac03933d diff --git a/predictor-selection.qmd b/predictor-selection.qmd new file mode 100644 index 000000000..b0d23db95 --- /dev/null +++ b/predictor-selection.qmd @@ -0,0 +1,359 @@ +--- +title: "Predictor Selection" +subtitle: "Choosing which variables to include in a regression model" +format: + html: default + revealjs: + output-file: predictor-selection-slides.html + pdf: + output-file: predictor-selection-handout.pdf +--- + +{{< 
include shared-config.qmd >}} + +# Acknowledgements {.unnumbered} + +This content is adapted from: + +- [@vittinghoff2e, Chapter 10] +- [@heinze2018variable] + +# Introduction {#sec-pred-sel-intro} + +{{< include _sec-pred-sel-overview.qmd >}} + +# Inferential Goals {#sec-pred-sel-goals-chapter} + +{{< include _sec-pred-sel-goals.qmd >}} + +# Causal Thinking and DAGs {#sec-pred-sel-causal} + +{{< include _sec-pred-sel-dag.qmd >}} + +# Prediction {#sec-pred-sel-prediction} + +{{< include _sec-pred-sel-prediction.qmd >}} + +# Evaluating a Predictor of Primary Interest {#sec-pred-sel-eval} + +{{< include _sec-pred-sel-primary.qmd >}} + +# Identifying Multiple Important Predictors {#sec-pred-sel-multiple-chapter} + +{{< include _sec-pred-sel-multiple.qmd >}} + +# Testing-based selection {#sec-pred-sel-stepwise} + +::: notes + +Testing-based selection methods use statistical tests +(such as F-tests or likelihood ratio tests) +to decide which predictors to include in a model. + +::: + +## Stepwise regression + +::: {.callout-warning} +## Caution about stepwise selection + +Stepwise regression has well-known problems: + +- It tends to select too many variables (overfitting). +- P-values and confidence intervals are biased after selection. +- It ignores model uncertainty. +- Results can be unstable across different samples. + +Consider using subject-matter knowledge (DAGs), +cross-validation, or penalized methods (such as LASSO) instead. +See @heinze2018variable for a thorough review. +::: + +The basic idea of stepwise selection: + +- **Forward selection**: Start with no predictors; +add the predictor that most improves model fit at each step. +- **Backward elimination**: Start with all predictors; +remove the predictor that least worsens model fit at each step. +- **Bidirectional stepwise**: Combine forward and backward steps. 
+ +## Example: stepwise selection for LDL + +```{r} +#| label: stepwise-ldl + +# Full model with all candidate predictors +full_model_ldl <- lm( + LDL ~ HT + age + BMI + diabetes + smoking + statins + SBP, + data = hers_ldl +) + +# Backward elimination using AIC +step_model_ldl <- step( + full_model_ldl, + direction = "backward", + trace = 0 +) + +summary(step_model_ldl) +``` + +# Criterion-based selection {#sec-pred-sel-aic-bic} + +## AIC and BIC + +When comparing non-nested models, +we can use information criteria: + +- **AIC** (Akaike Information Criterion): +$$\text{AIC} = -2\,\hat\ell + 2p$$ +- **BIC** (Bayesian Information Criterion): +$$\text{BIC} = -2\,\hat\ell + p\log n$$ + +where $\hat\ell$ is the maximized log-likelihood, +$p$ is the number of model parameters, +and $n$ is the sample size. + +Lower values of AIC or BIC indicate a better model. +BIC applies a larger penalty for model complexity, +especially for large $n$. + +## Comparing models for LDL + +::: {#tbl-model-comparison} + +```{r} + +library(tibble) + +models_ldl <- list( + "HT only" = lm(LDL ~ HT, data = hers_ldl), + "HT + age" = lm(LDL ~ HT + age, data = hers_ldl), + "HT + statins" = lm(LDL ~ HT + statins, data = hers_ldl), + "HT + age + BMI + diabetes + statins" = lm( + LDL ~ HT + age + BMI + diabetes + statins, + data = hers_ldl + ), + "Full model" = lm( + LDL ~ HT + age + BMI + diabetes + smoking + statins + SBP, + data = hers_ldl + ) +) + +tibble( + Model = names(models_ldl), + AIC = sapply(models_ldl, AIC), + BIC = sapply(models_ldl, BIC), + Adj_R2 = sapply( + models_ldl, + function(m) summary(m)$adj.r.squared + ) +) +``` + +Comparison of LDL models by AIC, BIC, and adjusted $R^2$. + +::: + +# Penalized regression {#sec-pred-sel-lasso} + +::: notes + +When there are many candidate predictors, +penalized regression methods simultaneously +estimate model coefficients and perform variable selection +by shrinking small coefficients toward zero. 
+ +::: + +## The LASSO + +The **LASSO** (Least Absolute Shrinkage and Selection Operator) +adds an $\ell_1$ penalty to the log-likelihood: + +$$\hat\beta_\lambda = \arg\max_\beta \left\{ +\hat\ell(\beta) - \lambda \sum_{j=1}^p |\beta_j| +\right\}$$ + +The tuning parameter $\lambda$ controls the strength of the penalty: + +- $\lambda = 0$: no penalty (ordinary least squares). +- $\lambda \to \infty$: all coefficients shrink to zero. + +For intermediate values of $\lambda$, +some coefficients are exactly zero (excluded), +while others are shrunk but non-zero. + +The optimal $\lambda$ is usually chosen by cross-validation. + +## Example: LASSO for LDL + +::: {#fig-lasso-ldl} + +```{r} + +library(glmnet) +library(ggfortify) + +hers_ldl_complete <- + hers_ldl |> + dplyr::select(LDL, HT, age, BMI, diabetes, smoking, statins, SBP) |> + tidyr::drop_na() + +y_ldl <- hers_ldl_complete$LDL +x_ldl <- + hers_ldl_complete |> + dplyr::select(-LDL) |> + dplyr::mutate( + HT = as.integer(HT == "HT"), + statins = as.integer(statins == "Yes"), + diabetes = as.integer(diabetes == "Yes"), + smoking = as.integer(smoking == "Yes") + ) |> + as.matrix() + +lasso_fit <- glmnet(x_ldl, y_ldl) +autoplot(lasso_fit, xvar = "lambda") + + theme_bw() +``` + +LASSO coefficient paths for LDL predictors in HERS. +Each line traces one predictor's coefficient as $\lambda$ increases. +Coefficients shrink toward zero as the penalty grows; +some reach exactly zero (variables excluded from the model). + +::: + + +--- + +::: {#fig-lasso-cv} + +```{r} + +cv_lasso <- cv.glmnet(x_ldl, y_ldl) +plot(cv_lasso) +``` + +Cross-validated LASSO for LDL: mean squared error vs. $\log(\lambda)$. +The left dashed line marks `lambda.min` (minimum CV error); +the right dashed line marks `lambda.1se` (largest $\lambda$ within 1 SE of the minimum).
+ +::: + +--- + +::: {#tbl-lasso-coefs} + +```{r} + +data.frame( + lambda.min = as.vector(coef(cv_lasso, s = "lambda.min")), + lambda.1se = as.vector(coef(cv_lasso, s = "lambda.1se")), + row.names = rownames(coef(cv_lasso)) +) +``` + +LASSO coefficients at two values of the regularization parameter. +`lambda.min` minimizes cross-validation error; +`lambda.1se` uses a more conservative (larger) $\lambda$, +within one standard error of the minimum, +producing a sparser model. + +::: + +::: notes + +The LASSO identifies `statins` and `HT` as the strongest predictors +of LDL cholesterol in the HERS data, +consistent with the biological knowledge encoded in the DAG +in @fig-hers-dag. + +::: + +# Some Details {#sec-pred-sel-details} + +{{< include _sec-pred-sel-details.qmd >}} + +# Summary {#sec-pred-sel-summary} + +Predictor selection is the process of choosing appropriate variables +for inclusion in a multivariable regression model. +The right approach depends on the **inferential goal**: + +**For prediction** (Goal 1): + +- Pre-specify well-motivated candidate predictors. +- Use cross-validation of a target PE measure + to select among models without overfitting. +- Shrinkage methods (LASSO, ridge) are effective + when there are many candidates. + +**For evaluating a predictor of primary interest** (Goal 2): + +- Use a DAG to identify confounders, mediators, and colliders. +- Include all well-established confounders for face validity. +- Use backward selection with a liberal criterion ($p < 0.2$) + to remove apparent non-confounders. +- In randomized trials, covariates are not required for confounding control + but improve precision (and de-attenuate estimates in logistic/Cox models). +- Pre-specify adjusted analyses in trial protocols. + +**For identifying multiple important predictors** (Goal 3): + +- Apply the same inclusive strategy as Goal 2. +- Consider the Allen–Cady modified backward selection procedure + to limit false-positive findings. 
+- Interpret novel, weak, and borderline significant associations + cautiously; report all candidate predictors considered. + +**Across all goals**: + +- Collinearity between a primary predictor and a confounder + can inflate standard errors; between adjustment variables, it does not. +- The EPV rule (10 events per predictor) flags potential problems + but is not a hard limit. +- Model selection complicates inference: + nominal p-values and confidence intervals are anticonservative + when predictors have been selected from the data. + +# Learning Objectives {#sec-pred-sel-objectives .unnumbered} + +After completing this chapter, students should be able to: + +1. **Identify the inferential goal** of a predictor selection problem + (prediction, evaluating a primary predictor, + or identifying multiple important predictors) + and choose an appropriate strategy. + +2. **Use a DAG** to identify confounders, mediators, and colliders, + and determine which covariates to include or exclude. + +3. **Apply backward selection with a liberal criterion** + to select confounders in observational studies. + +4. **Use AIC and BIC** to compare alternative models. + +5. **Apply and interpret the LASSO**, + including cross-validated selection of $\lambda$. + +6. **Explain why stepwise selection is problematic** + and describe its known pitfalls. + +7. **Describe the bias–variance trade-off** and explain + why cross-validation provides a less optimistic estimate of PE + than naive within-sample measures. + +8. **Identify and handle collinearity**, + including using the VIF and understanding its implications + for different inferential goals. + +9. **Articulate how model selection complicates inference** + and describe best practices for reporting in each inferential context. 
+ + + +# References {.unnumbered} + +::: {#refs} +::: diff --git a/references.bib b/references.bib index 61a087ee2..d51cde7d5 100644 --- a/references.bib +++ b/references.bib @@ -31,6 +31,57 @@ @book{vittinghoff2e url={https://doi.org/10.1007/978-1-4614-1353-0} } +@book{AllenCady1982, + title={Analyzing experimental data by regression}, + author={Allen, David M. and Cady, Frederick B.}, + year={1982}, + publisher={Lifetime Learning Publications}, + address={Belmont, CA} +} + +@article{Walter2001, + title={Development and validation of a prognostic index for 1-year mortality in older adults after hospitalization}, + author={Walter, Louise C. and Brand, Richard J. and Counsell, Steven R. and Palmer, Robert M. and Landefeld, C. Seth and Fortinsky, Richard H. and Covinsky, Kenneth E.}, + journal={JAMA}, + year={2001}, + volume={285}, + number={23}, + pages={2987--2994}, + doi={10.1001/jama.285.23.2987} +} + +@article{Grodstein2001, + title={Postmenopausal hormone use and secondary prevention of coronary events in the Nurses' Health Study: a prospective, observational study}, + author={Grodstein, Francine and Manson, JoAnn E. and Stampfer, Meir J. and Colditz, Graham A. and Willett, Walter C. and Rosner, Bernard and Speizer, Frank E. and Hennekens, Charles H.}, + journal={Annals of Internal Medicine}, + year={2001}, + volume={135}, + number={1}, + pages={1--8}, + doi={10.7326/0003-4819-135-1-200107030-00007} +} + +@article{Orwoll1996, + title={Axial bone mass in older women}, + author={Orwoll, Eric S. and Bauer, Douglas C. and Vogt, Thomas M. and Fox, Kathleen M. and Study of Osteoporotic Fractures Research Group}, + journal={Annals of Internal Medicine}, + year={1996}, + volume={124}, + number={3}, + pages={187--196} +} + +@article{Vittinghoff2003, + title={Risk factors and secondary prevention in women with heart disease: the Heart and Estrogen/progestin Replacement Study}, + author={Vittinghoff, Eric and Shlipak, Michael G. and Varosy, Peter D.
and McDermott, Mary and Robbins, John A. and Harris, Tamara B. and Newman, Anne B. and Bauer, Douglas C.}, + journal={Annals of Internal Medicine}, + year={2003}, + volume={139}, + number={4}, + pages={274--281}, + doi={10.7326/0003-4819-139-4-200308190-00006} +} + @book{klein2003survival, title={Survival analysis: techniques for censored and truncated data}, author={Klein, John P and Moeschberger, Melvin L}, @@ -205,6 +256,36 @@ @book{dunn2018generalized url={https://link.springer.com/book/10.1007/978-1-4419-0118-7} } +@article{MickeyGreenland1989, + author = {Mickey, Ruth M. and Greenland, Sander}, + title = {The impact of confounder selection criteria on effect estimation}, + journal = {American Journal of Epidemiology}, + volume = {129}, + number = {1}, + pages = {125--137}, + year = {1989}, + month = jan, + issn = {0002-9262}, + doi = {10.1093/oxfordjournals.aje.a115101}, + url = {https://doi.org/10.1093/oxfordjournals.aje.a115101}, + eprint = {https://academic.oup.com/aje/article-pdf/129/1/125/127199/129-1-125.pdf} +} + +@article{Greenland1989Modeling, + author = {Greenland, Sander}, + title = {Modeling and variable selection in epidemiologic analysis}, + journal = {American Journal of Public Health}, + volume = {79}, + number = {3}, + pages = {340--349}, + year = {1989}, + month = mar, + doi = {10.2105/AJPH.79.3.340}, + pmid = {2916724}, + pmcid = {PMC1349563}, + url = {https://doi.org/10.2105/AJPH.79.3.340} +} + @book{rpkgs, title={{R} Packages}, author={Wickham, Hadley and Bryan, Jennifer},