diff --git a/.github/copilot-instructions.md b/.github/copilot-instructions.md index 06dbd6bab..55e33c72d 100644 --- a/.github/copilot-instructions.md +++ b/.github/copilot-instructions.md @@ -59,7 +59,44 @@ This follows the [Quarto website linking guidelines](https://quarto.org/docs/web This ensures links work correctly across all output formats and during local development. -## Subfile Structure Convention +## Attribution for Adapted Content + +When content is adapted from published sources (textbooks, papers, websites), +**always provide explicit attribution** in the document. +Use `@citekey` Pandoc citation syntax and include a prose note explaining what was adapted. + +Examples of acceptable attribution: + +- "Adapted from @vittinghoff2e [Chapter 10]." +- "The following example is based on @kleinbaum2010logistic [Chapter 8]." +- "This approach follows @hulley1998hers." + +Attribution should appear: + +- At the top of the chapter or section (in the Acknowledgements or Introduction), + *or* +- Adjacent to the specific content being adapted (as a prose sentence or in a `::: notes` div). + +**Always include chapter and/or page numbers** in inline citations where applicable, +so that readers can locate the source material. +Use Pandoc's citation locator syntax: `[@citekey, Chapter 8]` or `[@citekey, p. 194]`. +Include both chapter and page when both are known. +Do **not** invent page numbers. +Only cite a page after opening the source PDF +and confirming the exact page. +If you have not verified a page directly from the source PDF, +cite chapter-level location only. + +Examples of attribution with page numbers +(formatting examples): + +- "Adapted from [@vittinghoff2e, Chapter 10, p. 194]." +- "The following example is based on [@kleinbaum2010logistic, Chapter 8, p. 230]." + +Do **not** reproduce verbatim text from copyrighted sources without clear quotation marks and attribution. +Paraphrase and summarize with a citation instead. 
+ +## Decomposing content into subfiles Quarto subfiles (files under `_subfiles/`) should **not** begin with a section heading. The heading for a subfile's content should be placed in the **parent** file that includes the subfile, @@ -96,7 +133,27 @@ This applies whether the citation refers to one author or multiple authors (e.g. Pandoc renders `@dobson4e` as a noun phrase (e.g., "Dobson and Barnett (2018)"), but grammatically the citation key itself is treated as a singular pronoun/name. -## Attribution for Adapted Content +## DAG Naming for Categorical Variables + +In DAGs, +use node names that represent the full categorical variable, +not a specific level of that variable. + +Example: +use `personality_type`, +not `TypeA`. + +## Evidence and Source Citation Requirements + +For factual claims that are not directly proved in the text, +always include a specific source citation. +Do not leave factual statements uncited. + +Always cite papers and books using +BibTeX entries in `references.bib` +and Quarto/Pandoc citation syntax +(for example `[@MickeyGreenland1989]`), +rather than plaintext author-date references. When adapting any content from another source, always include specific attribution in the chapter text. @@ -250,6 +307,53 @@ Use this pattern: Apply this only where needed, so HTML and RevealJS output stay unchanged. +## Figures and Tables: Use Div Format + +Prefer the **Quarto div format** for new figures and tables +rather than chunk-option `fig-cap`/`tbl-cap`. +The div format makes it easier to write and format multi-sentence captions. +Existing chunk-option captions in older files are acceptable +unless you are already refactoring that content. + +**Correct** (div format): +````qmd +::: {#fig-my-figure} + +```{r} +plot(x, y) +``` + +Caption text here. +This can span multiple lines and include *markdown*. + +::: +```` + +**Correct** (div format for tables): +````qmd +::: {#tbl-my-table} + +```{r} +my_table +``` + +Caption text here. 
+ +::: +```` + +**Incorrect** (chunk option format): +````qmd +```{r} +#| label: fig-my-figure +#| fig-cap: "Caption text here." +plot(x, y) +``` +```` + +Use this for newly added or substantially revised figures and tables in `.qmd` files. + + ## Math Notation This repository uses custom LaTeX macros defined in `latex-macros/macros.qmd` (a git submodule). diff --git a/DESCRIPTION b/DESCRIPTION index a623f0805..ebd8a28ad 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -38,6 +38,7 @@ Imports: Suggests: tidyverse, conflicted, + dagitty, dplyr, eha, ggfortify, diff --git a/_quarto-book.yml b/_quarto-book.yml index 7c1f97cb4..8cbbdce8f 100644 --- a/_quarto-book.yml +++ b/_quarto-book.yml @@ -26,6 +26,7 @@ book: - Linear-models-overview.qmd - logistic-regression.qmd - count-regression.qmd + - predictor-selection.qmd - intro-multilevel-models.qmd - part: time-to-event-models.qmd chapters: diff --git a/_quarto-revealjs.yml b/_quarto-revealjs.yml index 7a9322490..c335bf75a 100644 --- a/_quarto-revealjs.yml +++ b/_quarto-revealjs.yml @@ -22,6 +22,7 @@ project: - "intro-bayes.qmd" - "common-mistakes.qmd" - "notation.qmd" + - "predictor-selection.qmd" - "intro-to-R.qmd" - "exam-formula-sheet.qmd" diff --git a/_quarto-website.yml b/_quarto-website.yml index dbdbef8eb..02ef73b7d 100644 --- a/_quarto-website.yml +++ b/_quarto-website.yml @@ -9,6 +9,7 @@ project: - Linear-models-overview.qmd - logistic-regression.qmd - count-regression.qmd + - predictor-selection.qmd - intro-multilevel-models.qmd - time-to-event-models.qmd - intro-to-survival-analysis.qmd @@ -68,6 +69,8 @@ website: href: logistic-regression.qmd - text: "Count Regression" href: count-regression.qmd + - text: "Predictor Selection" + href: predictor-selection.qmd - text: "Multilevel Models" href: intro-multilevel-models.qmd - text: "---" diff --git a/_sec-pred-sel-dag.qmd b/_sec-pred-sel-dag.qmd new file mode 100644 index 000000000..93f3718cd --- /dev/null +++ b/_sec-pred-sel-dag.qmd @@ -0,0 +1,434 @@ +{{< include 
latex-macros/macros.qmd >}} + +## Directed Acyclic Graphs (DAGs) {#sec-pred-sel-dag} + +A **Directed Acyclic Graph (DAG)** is a graphical model +that encodes assumed causal relationships among variables. + +In a DAG: + +- **Nodes** represent variables. +- **Directed edges** (arrows) represent direct causal effects. +- The graph is **acyclic**: there are no directed cycles +(a variable cannot cause itself, even indirectly). + +DAGs are used in causal inference to: + +- Identify **confounders** that must be adjusted for. +- Identify **mediators** that should generally *not* be adjusted for +when estimating the total causal effect. +- Identify **colliders** that should *not* be adjusted for +(adjusting for a collider opens a non-causal path). + +## Key concepts from DAG theory {#sec-pred-sel-dag-concepts} + +:::{#def-confounder} +#### Confounder + +A **confounder** of the relationship between an exposure $X$ and +outcome $Y$ is a variable $C$ +that is a common cause of both $X$ and $Y$. +By contrast, +a mediator lies on the causal path +from $X$ to $Y$ +and is not a common cause of $X$ and $Y$. + +Adjusting for confounders removes confounding bias. +::: + +--- + +:::{#def-mediator} +#### Mediator + +A **mediator** $M$ is a variable that lies on the causal pathway +from the exposure $X$ to the outcome $Y$: +$$X \to M \to Y$$ + +Adjusting for a mediator blocks the indirect causal pathway +through $M$, +so the estimated coefficient for $X$ reflects only +the *direct* effect of $X$ on $Y$ not through $M$. + +Whether to adjust for mediators depends on the research question: + +- If you want the **total causal effect** of $X$ on $Y$, +do **not** adjust for $M$. +- If you want the **direct effect** of $X$ on $Y$ not through $M$, +adjust for $M$. 
+::: + +--- + +:::{#def-collider} +#### Collider + +A **collider** is a variable $L$ that is caused by two or more +variables on a path: +$$X \to L \leftarrow Y$$ + +Adjusting for a collider opens a non-causal (backdoor) path +between $X$ and $Y$, +introducing **collider bias**. +::: + +## HERS study DAG {#sec-pred-sel-hers-dag} + +{{< include _subfiles/shared/_sec_hers_intro.qmd >}} + +We now examine the assumed causal structure +among key baseline variables, treatment, and recurrent CHD events. + + +@fig-hers-dag shows a DAG for the HERS study, +representing the assumed causal structure among key variables. + +The main goal of the HERS study was to estimate +the causal effect of hormone therapy (HT) +on recurrent CHD event risk. + +::: {#tbl-hers-data-dictionary} + +Data dictionary for key HERS variables used in the DAG and examples. + +| Variable | Meaning | Type | +|---|---|---| +| `HT` | Hormone-therapy assignment (placebo vs estrogen+progestin) | Binary categorical | +| `CHD` | Recurrent coronary heart disease event outcome | Binary categorical | +| `age` | Baseline age (years) | Continuous | +| `smoking` | Baseline smoking status | Binary categorical | +| `diabetes` | Baseline diabetes status | Binary categorical | +| `BMI` | Body mass index | Continuous | +| `LDL` | Low-density lipoprotein cholesterol | Continuous | +| `HDL` | High-density lipoprotein cholesterol | Continuous | +| `statins` | Baseline statin use | Binary categorical | +| `SBP` | Systolic blood pressure | Continuous | + +::: + +::: {#fig-hers-dag} + +```{r} +#| fig-width: 7 +#| fig-height: 5 + +library(dagitty) + +hers_dag <- dagitty( + 'dag { + HT [exposure, pos="0,0"] + CHD [outcome, pos="4,0"] + age [pos="2,2"] + smoking [pos="-1,-2"] + diabetes [pos="1,-1.5"] + BMI [pos="-1,1"] + LDL [pos="2,1"] + HDL [pos="2,-1"] + statins [pos="3,2"] + SBP [pos="3,-2"] + + HT -> CHD + HT -> HDL + HT -> LDL + age -> CHD + smoking -> CHD + smoking -> HDL + BMI -> CHD + BMI -> diabetes + BMI -> SBP + BMI 
-> LDL + diabetes -> CHD + diabetes -> LDL + LDL -> CHD + HDL -> CHD + statins -> LDL + SBP -> CHD + }' +) + +plot(hers_dag) +``` + +Directed Acyclic Graph (DAG) for the HERS study. +`HT` = hormone therapy (exposure); +`CHD` = recurrent coronary heart disease events (outcome); +`HDL` = high-density lipoprotein; +`LDL` = low-density lipoprotein; +`SBP` = systolic blood pressure. + +::: + +In @fig-hers-dag: + +- **`age`** is prognostic for CHD: +older women are at higher risk of CHD events. +Since HT was randomized in HERS, +age is not a confounder of the HT → CHD relationship, +but adjusting for age can improve precision. +- **`LDL`** and **`HDL`** are *mediators*: +hormone therapy affects lipid levels, +which in turn affect CHD risk. +- **`BMI`** is a common cause of diabetes, SBP, and LDL. +- **`statins`** (statin use) affects LDL independently of HT +(statins are medications prescribed to lower LDL). +- This DAG is simplified; a more complete model might include +additional variables such as anti-hypertensive medication use. + +## What variables should we adjust for? {#sec-pred-sel-hers-adjust} + +Using DAG theory, +we can identify the minimum adjustment set +needed to estimate the total causal effect of HT on CHD. + +```{r} +#| label: tbl-hers-adj +adjustmentSets( + hers_dag, + exposure = "HT", + outcome = "CHD", + type = "minimal" +) +``` + +::: notes + +Since HT was randomized in HERS, +there are no backdoor paths from HT to CHD. +The `adjustmentSets()` function confirms that no adjustment is needed +to estimate the total causal effect of HT on CHD. + +The variables `LDL` and `HDL` are mediators on the path +HT → LDL/HDL → CHD, +so they should **not** be included +when estimating the *total* causal effect of HT on CHD. + +Including `LDL` and `HDL` in the model would block +a part of the causal effect of HT, +giving us only the *direct* effect of HT on CHD +(not through lipids). 
+ +::: + +## Analyzing LDL in the HERS study {#sec-pred-sel-hers-ldl} + +::: notes + +The HERS baseline dataset includes LDL and HDL cholesterol, +allowing us to examine the effect of hormone therapy on lipid levels. + +Here we build models to predict LDL cholesterol, +illustrating how DAG thinking guides predictor selection. + +::: + +```{r} +#| label: hers-data-setup + +library(haven) +library(dplyr) + +hers <- read_stata(fs::path_package("rme", "extdata/hersdata.dta")) + +hers_ldl <- + hers |> + filter(!is.na(LDL)) |> + mutate( + HT = factor(HT, levels = c(0, 1), labels = c("Placebo", "HT")), + statins = factor(statins, levels = c(0, 1), labels = c("No", "Yes")), + diabetes = factor(diabetes, levels = c(0, 1), labels = c("No", "Yes")), + smoking = factor(smoking, levels = c(0, 1), labels = c("No", "Yes")) + ) +``` + +::: {#tbl-hers-ldl-full} + +```{r} + +library(gtsummary) + +hers_ldl |> + dplyr::select(LDL, HT, age, BMI, diabetes, smoking, statins) |> + tbl_summary(by = HT) |> + add_overall() |> + bold_labels() +``` + +Summary table for LDL analysis variables in HERS + +::: + +### Unadjusted model + +::: {#tbl-hers-ldl-unadj} + +```{r} + +model_ldl_unadj <- + lm(LDL ~ HT, data = hers_ldl) + +model_ldl_unadj |> + tbl_regression() |> + bold_labels() +``` + +Unadjusted linear regression for LDL in HERS + +::: + +### Adjusted model + +Based on the DAG in @fig-hers-dag, +BMI is a common cause of LDL and other outcomes +(but age does not confound the HT → LDL relationship, +since HT was randomized). + +In a randomized trial, +treatment assignment is independent of pre-treatment covariates, +so confounding adjustment is not required to estimate the causal effect. +However, adjusting for strong predictors of the outcome +can improve precision. 
+ +::: {#tbl-hers-ldl-adj} + +```{r} + +model_ldl_adj <- + lm(LDL ~ HT + age + BMI + diabetes + statins, data = hers_ldl) + +model_ldl_adj |> + tbl_regression() |> + bold_labels() +``` + +Adjusted linear regression for LDL in HERS + +::: + +Note that `statins` is a strong predictor of LDL +(statin medications are prescribed specifically to lower LDL). +Including `statins` substantially reduces residual variability, +improving precision of the HT coefficient. + +## An observational study example: WCGS {#sec-pred-sel-wcgs-dag} + +The HERS study is a randomized trial, +so confounding of the treatment-outcome relationship is not a concern. +To illustrate DAG-guided confounder selection in an **observational study**, +we use the Western Collaborative Group Study (WCGS). + +The WCGS was a prospective cohort study of 3,154 men +employed by California companies, +followed from 1960 to 1969 [@rosenman1975coronary]. +The exposure of primary interest is **Type A behavior pattern** +(a personality trait characterized by competitive drive, +time urgency, and hostility), +hypothesized to increase CHD risk +independently of classic cardiovascular risk factors. + +::: {#tbl-wcgs-data-dictionary} + +Data dictionary for key WCGS variables used in the DAG and examples. 
+ +| Variable | Meaning | Type | +|---|---|---| +| `personality_type` | Personality type (Type A vs Type B behavior pattern) | Binary categorical | +| `CHD` | Coronary heart disease event outcome | Binary categorical | +| `age` | Baseline age (years) | Continuous | +| `smoking` | Baseline smoking status | Binary categorical | +| `chol` | Serum cholesterol | Continuous | +| `sbp` | Systolic blood pressure | Continuous | +| `bmi` | Body mass index | Continuous | + +::: + +::: {#fig-wcgs-dag} + +```{r} +#| fig-width: 7 +#| fig-height: 5 + +wcgs_dag <- dagitty( + 'dag { + personality_type [exposure, pos="0,0"] + CHD [outcome, pos="4,0"] + age [pos="1.4,2"] + bmi [pos="-0.3,1.1"] + smoking [pos="1.1,-1.8"] + chol [pos="2.7,1"] + sbp [pos="2.8,-1.2"] + + personality_type -> CHD + age -> CHD + age -> chol + age -> sbp + age -> bmi + age -> personality_type + personality_type -> smoking + smoking -> CHD + chol -> CHD + sbp -> CHD + bmi -> CHD + bmi -> chol + bmi -> sbp + }' +) + +plot(wcgs_dag) +``` + +Directed Acyclic Graph (DAG) for the WCGS study. +`personality_type` = personality type (exposure; Type A vs Type B); +`CHD` = coronary heart disease events (outcome); +`chol` = serum cholesterol; +`sbp` = systolic blood pressure. + +::: + +In @fig-wcgs-dag, +`age` is a **confounder**: +it has direct causal arrows to both the exposure (`personality_type`) +and the outcome (`CHD`). +We assume `personality_type -> smoking` +rather than `smoking -> personality_type` +as an explicit modeling assumption +for this teaching example: +we treat Type A behavior pattern +as a relatively stable trait +that can influence smoking behavior. +Under that assumption, +`smoking` is a mediator on the path +`personality_type -> smoking -> CHD`, +not a confounder. +BMI is causally affected by age. +Cholesterol and SBP are causally affected +by both age and BMI. +All three have direct effects on CHD. 
+Unlike in HERS, +where HT was randomized +to estimate its causal effect on CHD event risk, +we cannot assume the exposure is unconfounded here. + +```{r} +#| label: wcgs-adj-sets + +wcgs_adj <- adjustmentSets( + wcgs_dag, + exposure = "personality_type", + outcome = "CHD", + type = "minimal" +) +wcgs_adj +``` + +::: notes + +The `adjustmentSets()` function returns the minimal set(s) of variables +that block all backdoor paths from `personality_type` to `CHD`. +Adjusting for any of these minimal sets is sufficient to +estimate the total causal effect of Type A behavior on CHD +(under the assumptions encoded in the DAG). + +::: diff --git a/_sec-pred-sel-details.qmd b/_sec-pred-sel-details.qmd new file mode 100644 index 000000000..d0591eaf5 --- /dev/null +++ b/_sec-pred-sel-details.qmd @@ -0,0 +1,189 @@ +{{< include latex-macros/macros.qmd >}} + +## Collinearity {#sec-pred-sel-collinearity} + +:::{#def-collinearity} +#### Collinearity + +**Collinearity** refers to high correlation between predictors, +sufficient to substantially degrade the precision of +one or more regression coefficient estimates. + +::: + +When two predictors are highly correlated, +each appears unimportant in the model adjusting for the other, +even if both are statistically significant in simpler models. +The joint F-test for both predictors may be highly significant, +while the individual t-tests are not. + +How we handle collinearity depends on the inferential goal: + +- **Prediction**: If including both collinear predictors reduces PE, + include both. + Cross-validation determines whether each contributes. + +- **Evaluating a primary predictor**: If the primary predictor + remains statistically significant after adjusting for a collinear confounder, + the evidence for an independent effect is convincing. + If collinearity with a *required* confounder inflates the primary + predictor's standard error, + we may need to accept reduced precision or + present sensitivity analyses. 
+ +- **Collinearity among confounders**: Does not affect + the precision of the primary predictor's estimate. + Both variables can be retained if needed for confounding control. + +- **Identifying multiple predictors**: Choose among collinear variables + on substantive grounds (causal plausibility, measurement quality, + fewer missing values). + +### Variance inflation factor {#sec-pred-sel-vif} + +The **variance inflation factor (VIF)** for predictor $j$ measures +how much its variance is inflated by correlation with other predictors: + +$$\text{VIF}_j = \frac{1}{1 - R_j^2}$$ + +where $R_j^2$ is the $R^2$ from regressing $x_j$ on all other predictors. + +A VIF $> 10$ (or equivalently, $R_j^2 > 0.9$) indicates +problematic collinearity. + +```{r} +#| label: vif-ldl-hers + +library(car) + +model_full <- lm( + LDL ~ HT + age + BMI + diabetes + smoking + statins + SBP, + data = hers_ldl +) +vif(model_full) +``` + +::: notes + +VIFs close to 1 indicate minimal collinearity. +In the HERS LDL model, +we expect relatively modest collinearity among the predictors, +since they represent distinct pathways (see @fig-hers-dag). + +::: + +## Number of predictors {#sec-pred-sel-epv} + +A widely used rule of thumb specifies a minimum number of +observations (or events) per predictor: + +- **Linear models**: at least 10 observations per predictor. +- **Logistic and Cox models**: at least 10 events per predictor (EPV). + +This guideline exists because with too many parameters +relative to information in the data, +coefficient estimates become imprecise and logistic/Cox models +can behave poorly (e.g., converge to extreme estimates). + +::: {.callout-note} +## The EPV rule is a flag, not a hard limit + +The precision of estimates depends on more than just the EPV: +residual variance, effect size, and correlations among predictors +all matter. +Adding covariates that are weakly correlated with the primary predictor +but explain substantial outcome variance can *improve* precision. 
+Conversely, adding a single collinear covariate can degrade it severely. +::: + +The rule is best used as a **warning flag**: +if the EPV falls below 10, +check whether the standard errors are unusually large, +and whether Wald and likelihood ratio test p-values +give consistent results. +If problems are detected, tighten the backward selection criterion +(e.g., to $p < 0.15$ or $p < 0.10$) +and verify that a more parsimonious model gives consistent results. + +## Alternatives to backward selection {#sec-pred-sel-alts} + +The main alternatives to backward selection are: + +- **Best subsets**: screens all possible subsets of candidate predictors + and selects the model minimizing a summary measure + (adjusted $R^2$, AIC, or a cross-validated PE measure). + Computationally intensive for large sets of predictors. + +- **Forward selection**: starts with an empty model and adds + the predictor providing the greatest improvement at each step. + More prone to missing *negatively confounded* sets of variables + (sets where no member appears important unless all are included). + +- **Stepwise selection**: augments forward selection by + also allowing previously added variables to be removed. + Better than pure forward but still misses some models + that backward selection would find. + +- **Bivariate screening**: evaluates each candidate predictor + individually before multivariable modeling. + Convenient but shares the limitation of forward selection. + +::: {.callout-note} +## Why backward selection is preferred + +Backward selection begins with the full set of candidates, +so it is **less likely to miss negatively confounded sets** of variables — +groups of variables that appear unimportant individually +but are jointly important. +Forward and stepwise selection will only include such sets +if at least one member passes the inclusion criterion on its own. 
+::: + +## Model selection complicates inference {#sec-pred-sel-inference} + +When predictors are selected from the data, +the p-values and confidence intervals computed as if the model +were pre-specified are no longer valid. + +Specific problems include: + +- **Inflated type-I error**: searching over many models + greatly increases the chance of at least one spurious finding. +- **Testimation bias**: selected predictors tend to have + overstated estimated effects, + because we select those that appear large by chance. +- **Biased CIs**: confidence intervals are too narrow + because they do not account for model uncertainty. + +How much these issues matter depends on the inferential goal: + +- **Prediction** (Goal 1): + Cross-validation of a PE measure protects against overfitting + without relying on p-values, + so inference is least affected. + +- **Evaluating a primary predictor** (Goal 2): + The primary predictor is included by default in all candidate models, + so its estimate is relatively unaffected by selection. + Post-selection inference is most problematic + when the primary predictor is of borderline significance. + +- **Identifying multiple predictors** (Goal 3): + Selection most severely complicates inference here, + since p-values for all retained predictors are of direct interest. + Careful pre-specification, use of inclusive models, + and the Allen–Cady procedure (see @sec-pred-sel-allen-cady) + reduce, but do not eliminate, this problem. + +::: {.callout-tip} +## Best practices for reporting + +When selection procedures are used: + +1. Report the **full candidate predictor list**. +2. Report the **selection criterion and procedure** used. +3. Interpret **novel, weak, and borderline significant** findings + with explicit caution. +4. Examine **alternative models** with different predictor subsets + and report consistency of key results. 
+::: diff --git a/_sec-pred-sel-goals.qmd b/_sec-pred-sel-goals.qmd new file mode 100644 index 000000000..43d249390 --- /dev/null +++ b/_sec-pred-sel-goals.qmd @@ -0,0 +1,75 @@ +{{< include latex-macros/macros.qmd >}} + +## Three inferential goals {#sec-pred-sel-goals} + +The appropriate strategy for predictor selection depends on +the inferential goal of the analysis. +@vittinghoff2e [Chapter 10] distinguishes three distinct goals: + +### Goal 1: Prediction {#sec-pred-sel-goal-pred} + +The primary aim is to minimize prediction error for new observations. +The predictors' individual causal interpretations are secondary. + +:::{#exm-pred-sel-goal-pred} +*Example*: @Walter2001 developed a model to identify +older adults at high risk of death following hospitalization, +using backward selection on a development set +and validating predictions in an independent hospital. +::: + +For this goal, we use measures of **prediction error** (PE) +to choose among candidate models. +Overfitting — the tendency of flexible models to fit noise — is +the central concern. +Strategies such as cross-validation and penalized regression +(see @sec-pred-sel-prediction and @sec-pred-sel-lasso) +protect against overfitting. + +### Goal 2: Evaluating a predictor of primary interest {#sec-pred-sel-goal-eval} + +The aim is to obtain a valid, minimally confounded estimate of +the association between a specific exposure and the outcome. + +:::{#exm-pred-sel-goal-eval} +*Example*: @Grodstein2001 estimated the effect of hormone +therapy on CHD risk in the Nurses' Health Study (NHS), +controlling for a wide range of known CHD risk factors +to minimize confounding. +::: + +In observational studies, confounding is the primary concern. +In randomized trials (like HERS), treatment is already unconfounded +by design, but adjusting for strong prognostic covariates can +improve precision. +DAGs (see @sec-pred-sel-dag) are especially useful for this goal. 
+ +### Goal 3: Identifying the important predictors of an outcome {#sec-pred-sel-goal-id} + +The aim is to discover which predictors are independently associated +with the outcome. + +:::{#exm-pred-sel-goal-id} +*Example*: @Orwoll1996 examined independent predictors of +bone mass in the Study of Osteoporotic Fractures (SOF), +including all predictors statistically significant at $p < 0.05$ +in age-adjusted models. +::: + +This goal is the most challenging because inferences about +multiple predictors are of direct interest, +making false-positive findings and unstable selections +particular concerns. +Inclusive models with cautious interpretation are preferred +(see @sec-pred-sel-multiple). + +::: notes + +In practice, these goals often overlap. +For example, +an analysis can estimate the causal effect of treatment (Goal 2) +while also selecting additional prognostic covariates +to improve individualized risk prediction performance (Goal 1) +in the same model-building workflow. + +::: diff --git a/_sec-pred-sel-multiple.qmd b/_sec-pred-sel-multiple.qmd new file mode 100644 index 000000000..c75f688f9 --- /dev/null +++ b/_sec-pred-sel-multiple.qmd @@ -0,0 +1,91 @@ +{{< include latex-macros/macros.qmd >}} + +## Identifying multiple important predictors {#sec-pred-sel-multiple} + +::: notes + +When the goal is to identify *all* important predictors of an outcome +(Goal 3, @sec-pred-sel-goals), +confounding remains a primary concern for each predictor in turn, +not just a single primary predictor of interest. + +::: + +This goal is the most challenging because: + +- Overfitting and false-positive results are a greater concern + (many predictors, many opportunities for spurious findings). +- Interactions among candidate predictors may be of interest, + but systematically testing all possible interactions + can easily produce false positives. +- Collinear predictors may be difficult to disentangle. 
+ +The same strategies recommended for Goal 2 +(face validity, backward selection with liberal criterion) +also apply here. +However, cautious interpretation of novel and borderline findings +becomes even more important. + +## Allen–Cady modified backward selection {#sec-pred-sel-allen-cady} + +A modified backward selection procedure due to Allen and Cady +[@AllenCady1982] +limits false-positive findings while still allowing variable selection. + +The procedure works as follows: + +1. **Specify** a set of variables to be forced into the model: + predictors of primary interest + and variables needed for face validity. +2. **Rank** the remaining candidate variables + in order of hypothesized importance. +3. Start with a model containing all variables in both sets. +4. Delete variables from the ranked set + in order of **ascending importance** (least important first), + stopping as soon as the first variable that meets the retention criterion + is encountered. + +::: notes + +This procedure limits false positives because +there is a single pre-specified deletion sequence, +and selection stops at the first retained variable. +In contrast, conventional backward selection +considers *all* remaining predictors at each step, +producing many more potential opportunities for false findings. + +::: + +## Cautious interpretation {#sec-pred-sel-cautious-interp} + +For Goal 3 analyses, +we do **not** recommend using only predictors significant at $p < 0.05$: +in small samples, even important predictors may fail to meet this threshold, +and a parsimonious model may have substantial residual confounding. + +Instead, the recommended approach is: + +- Use a relatively **inclusive model** + (liberal backward selection criterion: $p < 0.2$). +- Interpret **novel, weak, and borderline significant** associations + cautiously. +- Report the **full list of candidate covariates** considered, + including those ultimately excluded. +- Acknowledge inflated type-I error as a study limitation. 
+ +## Example: risk factors for CHD in HERS {#sec-pred-sel-chd-risk-factors} + +@Vittinghoff2003 identified independent CHD risk factors +in the HERS cohort using a multipredictor Cox model. +The large number of outcome events ($n = 361$) allowed +all previously identified risk factors with $p < 0.2$ +in unadjusted models to be included. + +Among 11 predictors retained as important on both substantive and +statistical grounds were: +ethnicity, exercise habits, diabetes, angina, congestive heart failure, +prior heart attacks, blood pressure, LDL, HDL, Lp(a), and creatinine clearance. + +The model also controlled for age, smoking, alcohol use, and obesity +(not statistically significant after adjustment) +to rule out confounding and ensure face validity. diff --git a/_sec-pred-sel-overview.qmd b/_sec-pred-sel-overview.qmd new file mode 100644 index 000000000..43651429c --- /dev/null +++ b/_sec-pred-sel-overview.qmd @@ -0,0 +1,72 @@ +{{< include latex-macros/macros.qmd >}} + +## Why predictor selection matters {#sec-pred-sel-why} + +In regression modeling, +we often have many potential predictors available, +but including too many predictors can cause problems: + +- **Overfitting**: the model fits noise in the training data and +predicts poorly for new data. +- **Multicollinearity**: highly correlated predictors make +coefficients unstable and hard to interpret. +- **Loss of precision**: including unnecessary predictors increases +standard errors. + +On the other hand, excluding important predictors can lead to: + +- **Confounding bias**: omitting a common cause of the exposure and +outcome distorts the estimated association. +- **Reduced power**: failing to adjust for strong predictors of the +outcome can reduce statistical power. + +## Strategies for predictor selection {#sec-pred-sel-strategies} + +@vittinghoff2e [Chapter 10] describes several general approaches to +predictor selection: + +1. 
**Science-driven (purposeful) selection**: +Use subject-matter knowledge and a causal diagram (DAG) +to identify which variables to include. +This is the preferred approach when the goal is causal inference. + +2. **Testing-based selection**: +Use statistical tests to decide which predictors to include. +Examples include forward selection, backward elimination, +and stepwise selection. +These methods have well-known problems and should be used with caution +(see @sec-pred-sel-stepwise). + +3. **Criterion-based selection**: +Compare models using information criteria such as AIC or BIC. +These criteria balance model fit against complexity +(see @sec-pred-sel-aic-bic). + +4. **Penalized regression**: +Use regularization methods (such as the LASSO) +that shrink coefficients toward zero, +effectively performing selection and estimation simultaneously +(see @sec-pred-sel-lasso). + +## The role of subject-matter knowledge {#sec-pred-sel-subject-matter} + +::: notes + +Regardless of which selection strategy is used, +subject-matter knowledge should guide model building. + +::: + +A regression model can only tell us about *associations* — +whether certain predictors are correlated with the outcome +in our data. +It cannot, on its own, tell us whether those associations reflect +causal effects. + +To draw causal conclusions, we need: + +1. A causal model (such as a DAG) specifying assumed causal relationships. +2. Appropriate adjustment for confounders. +3. Careful avoidance of adjusting for mediators or colliders. + +These concepts are discussed in @sec-pred-sel-dag. 
diff --git a/_sec-pred-sel-prediction.qmd b/_sec-pred-sel-prediction.qmd
new file mode 100644
index 000000000..1db68a355
--- /dev/null
+++ b/_sec-pred-sel-prediction.qmd
@@ -0,0 +1,285 @@
+{{< include latex-macros/macros.qmd >}}
+
+## Bias–variance trade-off and overfitting {#sec-pred-sel-bias-var}
+
+When building a prediction model,
+adding more predictors generally **reduces bias**:
+the model better captures the true relationships in the population.
+But it also **increases variance**:
+more parameters to estimate means more sampling variability,
+and the model can start to fit noise in the training data.
+
+:::{#def-overfitting}
+#### Overfitting
+
+**Overfitting** occurs when a model fits the training data well
+but predicts poorly for new observations.
+It results from including too many predictors
+relative to the effective sample size.
+
+:::
+
+A good prediction model minimizes the sum of squared bias and variance —
+the **bias–variance trade-off**.
+Adding a predictor helps only if
+the reduction in squared bias outweighs the increase in variance.
+
+## Numerical example of overfitting {#sec-pred-sel-overfit-example}
+
+The table below gives a simple numerical example.
+A parsimonious model and an intentionally overfit model
+are both trained on the same development set.
+The overfit model adds many noise predictors
+that have no true relationship with LDL.
+ +::: {#tbl-overfit-example} + +```{r} +#| label: overfit-example + +set.seed(204) +n_noise_predictors <- 20 + +hers_base <- + hers_ldl |> + dplyr::select(LDL, HT, age, statins) |> + tidyr::drop_na() + +n_overfit <- nrow(hers_base) +idx_overfit <- sample(seq_len(n_overfit), size = floor(0.7 * n_overfit)) + +train_overfit <- hers_base[idx_overfit, ] +valid_overfit <- hers_base[-idx_overfit, ] + +for (j in seq_len(n_noise_predictors)) { + zname <- paste0("z", j) + train_overfit[[zname]] <- rnorm(nrow(train_overfit)) + valid_overfit[[zname]] <- rnorm(nrow(valid_overfit)) +} + +model_parsimonious <- lm(LDL ~ HT + age + statins, data = train_overfit) +noise_terms <- paste0("z", seq_len(n_noise_predictors)) +overfit_formula <- reformulate(c("HT", "age", "statins", noise_terms), "LDL") +model_overfit <- lm( + overfit_formula, + data = train_overfit +) + +rmse <- function(y, yhat) { + sqrt(mean((y - yhat)^2)) +} + +tibble::tibble( + Model = c( + "Parsimonious model", + paste0("Overfit model (+", n_noise_predictors, " noise predictors)") + ), + Training_RMSE = c( + rmse( + train_overfit$LDL, + predict(model_parsimonious, newdata = train_overfit) + ), + rmse(train_overfit$LDL, predict(model_overfit, newdata = train_overfit)) + ), + Validation_RMSE = c( + rmse( + valid_overfit$LDL, + predict(model_parsimonious, newdata = valid_overfit) + ), + rmse(valid_overfit$LDL, predict(model_overfit, newdata = valid_overfit)) + ) +) |> + dplyr::mutate( + dplyr::across(c(Training_RMSE, Validation_RMSE), ~ round(.x, 2)) + ) +``` + +Numerical example showing overfitting: +lower training error but worse validation error. + +::: + +In this example, +the overfit model usually has lower training RMSE +but higher validation RMSE, +which is the defining pattern of overfitting. + +## Measures of prediction error {#sec-pred-sel-pe} + +To select a model that minimizes prediction error, +we need a measure of PE that does not simply increase +with the number of predictors. 
+
+For **continuous outcomes**:
+
+- **Adjusted $R^2$** penalizes the number of predictors:
+$$\bar R^2 = 1 - \frac{n-1}{n-p-1}(1 - R^2)$$
+Adding a variable increases $\bar R^2$ only if it reduces
+the residual variance more than the penalty.
+
+- **AIC** and **BIC** impose steeper penalties:
+$$\text{AIC} = -2\hat\ell + 2p, \quad
+  \text{BIC} = -2\hat\ell + p\log n$$
+(see @sec-pred-sel-aic-bic).
+
+For **binary outcomes**,
+common measures include
+**balanced accuracy** and the **C-statistic**.
+
+**Balanced accuracy**:
+$$\text{BA} = \frac{\text{Sensitivity} + \text{Specificity}}{2}$$
+where
+$$\text{Sensitivity} = \Pr(\hat Y = 1 \mid Y = 1), \quad
+\text{Specificity} = \Pr(\hat Y = 0 \mid Y = 0).$$
+If $J$ is **Youden's $J$ statistic**,
+$$J = \text{Sensitivity} + \text{Specificity} - 1,$$
+then
+$$\text{BA} = \frac{J + 1}{2}.$$
+So BA is a linear transformation of $J$,
+and BA and $J$ are equivalent for comparing models.
+
+**C-statistic** (area under the ROC curve, AUC):
+$$
+C = \Pr(\hat{\pi}_i > \hat{\pi}_j \mid Y_i = 1, Y_j = 0)
++ \frac{1}{2}\Pr(\hat{\pi}_i = \hat{\pi}_j \mid Y_i = 1, Y_j = 0)
+$$
+where $\hat{\pi}$ is the predicted event probability.
+An empirical estimator is
+$$
+\hat{C}
+= \frac{1}{n_1 n_0}
+\sum_{i:Y_i=1}
+\sum_{j:Y_j=0}
+\left[
+I(\hat{\pi}_i > \hat{\pi}_j)
++ \frac{1}{2}I(\hat{\pi}_i = \hat{\pi}_j)
+\right].
+$$
+It measures discrimination:
+the model's ability to rank events above non-events.
+
+For **survival outcomes**, the analogous measure is the **C-index**,
+defined as
+$$
+\hat C_{\text{index}}
+= \frac{N_{\text{concordant}} + 0.5\,N_{\text{tied}}}
+{N_{\text{comparable}}},
+$$
+where pairs are comparable when one subject is observed to fail
+before the other is known to fail or be censored,
+and concordance means the subject with earlier failure
+has the higher predicted risk score.
+ +## Optimism and cross-validation {#sec-pred-sel-cv} + +Naive estimates of PE evaluated on the same data used +to fit the model are **optimistic**: +they overstate the model's ability to predict new data. +For example, $R^2$ always increases when a predictor is added, +even if the predictor is pure noise. + +**Cross-validation** provides a less optimistic estimate of PE +by evaluating predictions on observations +not used to estimate the model. + +:::{#def-kfold} +#### $k$-fold cross-validation + +In **$k$-fold cross-validation**: + +1. The data are randomly divided into $k$ equal-sized subsets ("folds"). +2. For each fold $i = 1, \ldots, k$: + - Fit the model using all data *except* fold $i$. + - Compute predicted values for the observations in fold $i$. +3. Compute a summary PE measure across all folds. + +Values of $k = 5$ or $k = 10$ are typical. + +::: + +Using cross-validation to select predictors avoids overfitting +because the predictive performance is evaluated on held-out data +that did not influence the model fit. + +## Cross-validation for LASSO in the HERS data {#sec-pred-sel-cv-hers} + +The LASSO (see @sec-pred-sel-lasso) uses cross-validation +to select the penalty parameter $\lambda$ +that minimizes prediction error. +The `cv.glmnet` function implements 10-fold cross-validation, +fitting the LASSO for a grid of $\lambda$ values and evaluating +mean squared error on held-out folds. +The fitted object provides two key $\lambda$ values: +`lambda.min` (minimizes CV mean squared error) and +`lambda.1se` (largest $\lambda$ within one standard error of the minimum). +See @sec-pred-sel-lasso for the HERS example +with coefficient paths and CV results. + +::: notes + +The two recommended values of $\lambda$ represent different +points on the bias–variance trade-off: + +- `lambda.min` minimizes cross-validated MSE, + yielding a model with good predictive accuracy. 
+- `lambda.1se` uses a larger penalty + (farther from zero, more shrinkage), + resulting in a sparser, more conservative model + that is often preferred for interpretability. + +::: + +## Development and validation sets {#sec-pred-sel-split} + +The most transparent approach to measuring prediction error +is to split the data into a **development set** +(used to fit the model) +and a **validation set** +(used only to evaluate predictions). + +::: notes + +When a single data set must be split, +a common allocation is two-thirds to development +and one-third to validation. +External validation using a completely independent sample +(from a different time period or study site) +provides the most rigorous assessment of generalizability. + +::: + +```{r} +#| label: split-sample-ldl + +set.seed(42) +n_hers <- nrow(hers_ldl) +train_idx <- sample(seq_len(n_hers), size = floor(0.67 * n_hers)) + +hers_train <- hers_ldl[train_idx, ] +hers_valid <- hers_ldl[-train_idx, ] + +# Fit model on training data +model_train <- lm( + LDL ~ HT + age + BMI + diabetes + statins, + data = hers_train +) + +# Evaluate on validation data +pred_valid <- predict(model_train, newdata = hers_valid) +rmse_valid <- sqrt(mean((hers_valid$LDL - pred_valid)^2, na.rm = TRUE)) +rmse_train <- sqrt(mean(residuals(model_train)^2)) + +cat( + "Training RMSE:", round(rmse_train, 2), "\n", + "Validation RMSE:", round(rmse_valid, 2), "\n" +) +``` + +::: notes + +If the validation RMSE is much larger than the training RMSE, +this is a sign of overfitting. +In well-specified models with adequate sample sizes, +the two values are typically close. 
+ +::: diff --git a/_sec-pred-sel-primary.qmd b/_sec-pred-sel-primary.qmd new file mode 100644 index 000000000..e7185b304 --- /dev/null +++ b/_sec-pred-sel-primary.qmd @@ -0,0 +1,440 @@ +{{< include latex-macros/macros.qmd >}} + +## Evaluating a predictor of primary interest {#sec-pred-sel-eval-primary} + +::: notes + +When the goal is to estimate the causal effect of +a specific exposure on an outcome, +the central challenge in observational data is **confounding**. +Relatively inclusive models — those including more potential confounders — +are better at minimizing confounding bias. + +::: + +The following principles guide predictor selection for this goal. + +## Including predictors for face validity {#sec-pred-sel-face-validity} + +Some predictors are such well-established causal antecedents of the outcome +that they should be included regardless of their statistical significance +in the current data. +This ensures the **face validity** of the model: +readers and reviewers can be confident that obvious confounders +have been accounted for. + +For example, age and smoking are established CHD risk factors +and would be included in any model for CHD outcomes, +even if they are not statistically significant in a small sample. + +## Selecting confounders on statistical grounds {#sec-pred-sel-confounders} + +In addition to variables included for face validity, +we may consider other potential confounders +identified in previous studies or on substantive grounds. + +A practical approach is **backward selection with a liberal criterion**: + +1. Start with a full model including all pre-specified candidates. +2. At each step, remove the predictor with the largest p-value, + provided it exceeds a liberal threshold (e.g., $p > 0.2$). +3. Stop when all remaining predictors meet the retention criterion. 
+
+The liberal threshold ($p > 0.2$ rather than $p > 0.05$)
+is important for confounding control:
+in small samples, even important confounders may have large p-values,
+and removing them could introduce bias.
+
+An alternative retention criterion is the
+**10% change-in-estimate rule**
+[@kleinbaum2010logistic, Chapter 8, pp. 194--212;
+@vittinghoff2e, Chapter 10, pp. 257--275;
+@MickeyGreenland1989, p. 125;
+@Greenland1989Modeling, p. 340]:
+
+Retain a candidate confounder $C$ in the model if
+dropping it changes the estimated coefficient for the primary predictor
+by more than 10%.
+
+More precisely,
+let $\hat\beta$ be the estimated coefficient for the exposure
+from the **full model** (including $C$),
+and let $\hat\beta_0$ be the estimate from the **reduced model** (dropping $C$).
+Then $C$ is judged a meaningful confounder if:
+
+$$\left| \frac{\hat\beta - \hat\beta_0}{\hat\beta} \right| > 0.10$$
+
+The threshold is often taken as 10–20%, depending on the context.
+A smaller threshold (e.g., 10%) retains more variables;
+a larger threshold (e.g., 20%) allows more aggressive pruning.
+
+::: {.callout-note}
+
+The 10% change-in-estimate rule focuses on **bias** rather than **statistical significance**.
+A variable can be a meaningful confounder even if it is not statistically significant
+(because of small sample size or measurement error),
+and conversely, a statistically significant variable may cause negligible confounding.
+The p-value criterion and the change-in-estimate rule can disagree,
+and @vittinghoff2e recommends using both as complementary checks.
+
+:::
+
+### Numerical example: 10% rule in WCGS {#sec-pred-sel-10pct-wcgs}
+
+::: notes
+
+The WCGS (Western Collaborative Group Study) is an observational
+prospective cohort of 3,154 men followed for CHD events.
+Unlike HERS (a randomized trial),
+the WCGS exposure — Type A behavior pattern — was not randomly assigned,
+so confounding is a genuine concern.
+ +::: + +In the WCGS study, +the exposure of interest is Type A behavior pattern (`dibpat`), +and the outcome is CHD incidence (`chd69`). +We fit logistic regression models +and apply the 10% change-in-estimate rule +to assess whether age, cholesterol, SBP, and BMI +are meaningful confounders of the Type A → CHD relationship, +while also examining smoking as an additional covariate. + +::: notes + +Adapted from @kleinbaum2010logistic [Chapter 8], +which describes the change-in-estimate procedure for logistic regression models. + +::: + +```{r} +#| label: wcgs-data-setup + +wcgs <- readRDS(fs::path_package("rme", "extdata/wcgs.rds")) +``` + +```{r} +#| label: wcgs-10pct-rule + +# Full model (all candidate confounders) +m_full <- glm( + I(chd69 == "Yes") ~ dibpat + age + chol + sbp + smoke + bmi, + data = wcgs, + family = binomial +) + +# Unadjusted model (Type A only) +m_unadj <- glm( + I(chd69 == "Yes") ~ dibpat, + data = wcgs, + family = binomial +) + +# Estimate of Type A effect from full model +beta_full <- coef(m_full)["dibpatType A"] +beta_unadj <- coef(m_unadj)["dibpatType A"] + +# Models dropping one confounder at a time +confounders <- c("age", "chol", "sbp", "smoke", "bmi") + +results <- lapply(confounders, function(v) { + f <- as.formula( + paste("I(chd69 == \"Yes\") ~ dibpat +", + paste(setdiff(confounders, v), collapse = " + ")) + ) + m_red <- glm(f, data = wcgs, family = binomial) + beta_red <- coef(m_red)["dibpatType A"] + pct_change <- abs((beta_full - beta_red) / beta_full) * 100 + data.frame( + Confounder = v, + Beta_full = round(beta_full, 4), + Beta_reduced = round(beta_red, 4), + Pct_change = round(pct_change, 1), + Meaningful = ifelse(pct_change > 10, "Yes (>10%)", "No (<=10%)") + ) +}) + +results_df <- do.call(rbind, results) +rownames(results_df) <- NULL +``` + +::: {#tbl-wcgs-10pct} + +```{r} +knitr::kable( + results_df, + col.names = c( + "Dropped variable", + "β (full model)", + "β (reduced model)", + "% change", + "Meaningful 
confounder?" + ) +) +``` + +10% change-in-estimate rule applied to the WCGS data. +The Type A coefficient from the full model +is compared with coefficients from models that drop one confounder at a time. +A percentage change exceeding 10% indicates meaningful confounding. + +::: + +In this example, +none of the individual confounders causes a >10% change when dropped, +suggesting they are not individually strong confounders of the +Type A → CHD relationship in WCGS. +Comparing the unadjusted log-OR for Type A +($\approx$ `r round(beta_unadj, 2)`) +with the fully adjusted log-OR +($\approx$ `r round(beta_full, 2)`) +shows that the overall set of covariates provides modest adjustment. +In practice, covariates with established biological relationships to the outcome +(such as age, smoking, and blood pressure for CHD) +are typically retained regardless of the change-in-estimate result, +for reasons of face validity. + +When multiple covariates are candidates for removal, +checking all possible subsets +is often impractical. +A practical alternative +is a backward-elimination procedure +guided jointly by the 10% rule +and precision. + +At each step: + +1. Start from the current model. +2. Fit models that drop one remaining candidate covariate at a time. +3. Keep only candidates with + absolute percent change from the full-model exposure coefficient + no larger than 10%, + where the full-model coefficient + is always from the original + gold-standard model. +4. Among those candidates, + drop the covariate + that gives the largest reduction + in the exposure coefficient's standard error. +5. Repeat until no remaining covariate can be dropped + without either + exceeding the 10% threshold + or increasing the exposure standard error. 
+ +```{r} +#| label: wcgs-10pct-bwe +#| code-fold: false + +beta_full <- coef(m_full)["dibpatType A"] +se_full <- sqrt(vcov(m_full)["dibpatType A", "dibpatType A"]) +exposure_var <- "dibpat" + +current_covars <- confounders +current_se <- se_full +step_num <- 1 +bwe_steps <- list() + +repeat { + candidate_results <- lapply(current_covars, function(v) { + keep_covars <- setdiff(current_covars, v) + reduced_formula <- if (length(keep_covars) == 0) { + as.formula("I(chd69 == \"Yes\") ~ dibpat") + } else { + as.formula( + paste( + "I(chd69 == \"Yes\") ~", + paste(c(exposure_var, keep_covars), collapse = " + ") + ) + ) + } + + m_red <- glm(reduced_formula, data = wcgs, family = binomial) + beta_red <- coef(m_red)["dibpatType A"] + se_red <- sqrt(vcov(m_red)["dibpatType A", "dibpatType A"]) + pct_change <- abs((beta_full - beta_red) / beta_full) * 100 + se_reduction <- current_se - se_red + + data.frame( + Step = step_num, + Candidate_drop = v, + Beta_reduced = beta_red, + Pct_change_from_full = pct_change, + SE_reduced = se_red, + SE_reduction = se_reduction, + Eligible = pct_change <= 10 & se_reduction > 0 + ) + }) + + candidates_df <- do.call(rbind, candidate_results) + eligible_df <- candidates_df[candidates_df$Eligible, , drop = FALSE] + + if (nrow(eligible_df) == 0) { + break + } + + best_idx <- which.max(eligible_df$SE_reduction) + best_drop <- eligible_df[best_idx, , drop = FALSE] + dropped_var <- as.character(best_drop$Candidate_drop) + + bwe_steps[[step_num]] <- data.frame( + Step = step_num, + Dropped = dropped_var, + Pct_change_from_full = round(best_drop$Pct_change_from_full, 1), + SE_before = round(current_se, 4), + SE_after = round(best_drop$SE_reduced, 4), + SE_reduction = round(best_drop$SE_reduction, 4) + ) + + current_covars <- setdiff(current_covars, dropped_var) + current_se <- best_drop$SE_reduced + step_num <- step_num + 1 +} + +bwe_steps_df <- if (length(bwe_steps) == 0) { + data.frame( + Step = NA_integer_, + Dropped = "None", + 
Pct_change_from_full = NA_real_, + SE_before = round(se_full, 4), + SE_after = round(se_full, 4), + SE_reduction = 0 + ) +} else { + do.call(rbind, bwe_steps) +} + +retained_covars <- current_covars +``` + +::: {#tbl-wcgs-10pct-bwe} + +```{r} +knitr::kable( + bwe_steps_df, + col.names = c( + "Step", + "Dropped covariate", + "% change from full model", + "SE before drop", + "SE after drop", + "SE reduction" + ) +) +``` + +Backward elimination under the 10% change-in-estimate rule, +with standard-error reduction +used to choose among eligible drops. +The reference for the percent-change criterion +is always the Type A coefficient in the original full model. + +::: + +For further reading on model selection strategies in epidemiologic analyses, +see @vittinghoff2e [Chapter 10, pp. 257--275], +@kleinbaum2010logistic [Chapters 6--8], +and @me4 [Chapter 21, section "Model Selection"]. + +::: {.callout-note} +## Collinearity between confounders is not a problem + +Unlike collinearity between a predictor of primary interest +and other covariates (which inflates its standard error), +collinearity *among* adjustment variables does not affect +the precision of the primary predictor's estimate. +Thus, two collinear confounders can both be retained +if both are judged necessary on substantive grounds. +::: + +## Example: backward selection for CHD confounders {#sec-pred-sel-backward-hers} + +In the HERS data, we can use backward selection +to identify which baseline covariates confound +the effect of hormone therapy on LDL. +Since HT was randomized in HERS, +adjustment is needed for precision, not confounding; +but this illustrates the procedure. 
+ +```{r} +#| label: backward-sel-ldl-hers + +full_model <- lm( + LDL ~ HT + age + BMI + diabetes + smoking + statins + SBP, + data = hers_ldl +) + +# Backward selection using AIC +back_model <- MASS::stepAIC(full_model, direction = "backward", trace = FALSE) +summary(back_model) +``` + +::: notes + +The backward selection procedure retains the predictors that +most improve model fit according to AIC. +In this setting, since HT was randomized, +we interpret the selected set as prognostic covariates +that improve precision of the HT estimate, +not as confounders. + +::: + +## Interactions with the primary predictor {#sec-pred-sel-interactions} + +A useful check on the validity of the selected model is to test +for **interactions** between the primary predictor and +key covariates. +Particularly for novel findings, +showing that the association is consistent across subgroups +strengthens credibility. + +::: notes + +If a substantial and biologically plausible interaction is found, +the analysis must account for it. +However, exploratory interaction tests are susceptible to +false-positive findings, +and unexpected interactions should be interpreted cautiously. + +::: + +## Randomized experiments {#sec-pred-sel-rct} + +In randomized trials, participants are allocated to treatment randomly, +so the treatment indicator is by design uncorrelated +with pre-treatment covariates in expectation. +Confounding adjustment is therefore **not required** for causal inference. + +However, there are several reasons to include covariates +in trial analyses: + +1. **Improving precision** (continuous outcomes). + Adjusting for strong predictors of the outcome + reduces residual variance and tightens the treatment effect estimate. + Because treatment is uncorrelated with the covariates by randomization, + variance inflation is negligible. + +2. **De-attenuation** (binary and survival outcomes). 
+ In logistic and Cox models, + omitting important predictors attenuates the treatment effect estimate. + Adjusting for them moves the estimate closer to the true subject-specific effect. + +3. **Accounting for stratified designs**. + In stratified or cluster-randomized trials, + including stratification variables in the model + yields correct standard errors and p-values. + +4. **Correcting for chance imbalances**. + In small trials, randomization may leave residual imbalance + in important prognostic covariates. + Adjusted analyses demonstrate that the main inference + is not materially affected by any such imbalance. + +::: {.callout-note} +## Pre-specification is important in trials + +Adjusted analyses in clinical trials should ideally be +**pre-specified in the study protocol**, +to prevent post-hoc selection of covariates +that minimize the treatment effect p-value. +::: diff --git a/_subfiles/shared/_sec_hers_conclusions.qmd b/_subfiles/shared/_sec_hers_conclusions.qmd new file mode 100644 index 000000000..804f242a5 --- /dev/null +++ b/_subfiles/shared/_sec_hers_conclusions.qmd @@ -0,0 +1,32 @@ +Despite observational studies suggesting up to 35–80% fewer recurrent +CHD events among hormone users than non-users, +[@HulleyStephen1998RToE], +HERS found **no significant difference** in CHD event rates between +the hormone therapy (HT) and placebo groups +(relative hazard 0.99; 95% CI: 0.80–1.22) +[@HulleyStephen1998RToE]. + +The HERS investigators noted a statistically significant time trend: +more CHD events occurred in the HT group in year 1, +while fewer occurred in years 4 and 5, +suggesting that HT may have early harmful effects that are later reversed. + +A key finding was that HT significantly altered lipid profiles: + +- LDL cholesterol: 11% lower in the HT group [@HulleyStephen1998RToE]. +- HDL cholesterol: 10% higher in the HT group [@HulleyStephen1998RToE]. + +Yet these favorable lipid changes did not translate into +fewer CHD events overall. 
+One explanation consistent with the DAG in @fig-hers-dag +is that the effects of HT on lipids (a mediator pathway) +were offset by other, harmful pathways — +possibly thrombotic — that are not captured in the lipid mediators. + +::: notes + +@HulleyStephen1998RToE also noted that HT increased the rate of +venous thromboembolic events (relative hazard 2.89; 95% CI: 1.50–5.58) +and gallbladder disease (relative hazard 1.38; 95% CI: 1.00–1.92). + +::: diff --git a/_subfiles/shared/_sec_hers_intro.qmd b/_subfiles/shared/_sec_hers_intro.qmd index cff857e51..6194b059b 100644 --- a/_subfiles/shared/_sec_hers_intro.qmd +++ b/_subfiles/shared/_sec_hers_intro.qmd @@ -2,3 +2,13 @@ The "heart and estrogen/progestin study" (HERS) was a clinical trial of hormone therapy for prevention of recurrent heart attacks and death among 2,763 post-menopausal women with existing coronary heart disease (CHD) [@HulleyStephen1998RToE]. + +The trial was conducted at 20 US clinical centers. +Participants were randomized to receive either conjugated equine estrogens +(0.625 mg/day) plus medroxyprogesterone acetate (2.5 mg/day) +or a matching placebo [@HulleyStephen1998RToE]. +Women were followed for an average of 4.1 years +[@HulleyStephen1998RToE]. + +The primary outcome was nonfatal myocardial infarction or CHD death +[@HulleyStephen1998RToE]. 
diff --git a/inst/WORDLIST b/inst/WORDLIST index b2e644f28..ce592c75f 100644 --- a/inst/WORDLIST +++ b/inst/WORDLIST @@ -1,63 +1,93 @@ Aalen affine Acknowledgements -Biostat BMI +Biostat CHD CLT +Cady +Confounder +DAGs +De devn Ef EDA +EPV Epi Ftest GOF GitHub +Grodstein +HDL HSE Hua Kaplan Kleinbaum Kullback +LDL Leibler +Lp MathJax +Multicollinearity Multipredictor +NHS NoDerivatives NonCommercial ORCID +Orwoll +Osteoporotic resid RevealJS Rocke +Rosenman SBP +SOF Satterthwaite Sjoberg Stochasticity +TSS +Testimation Therneau TinyTeX -TSS UC +VIF +VIFs +Vittinghoff vxi WCGS Welch Zhou +al +anticonservative +arg biomarkers bmatrix bmt callout cdot cdots +collinear +confounder +creatinine +dL da ddots +de demstats det df dfrac diag dists -dL eps +equine +estrogens +et exm exponentiating fp frac +generalizability geq ggfortify ggplot @@ -68,29 +98,40 @@ idempotent ij ind infty +interpretability invertible ji ldots +leftarrow leq +lipoprotein logrank mX -matr mathcal +matr +medroxyprogesterone mn multimodality multipredictor +multivariable ndist neq np +overfit overset pec pmatrix pre +pred probabilistically +progestin qmd qquad +sel siid sj +statin +statins stókhos subfiles submodule @@ -99,7 +140,10 @@ subtypes surv survminer th +thromboembolic +thrombotic tp +unconfounded ungrouped varepsilon vdots diff --git a/latex-macros b/latex-macros index a0b66f56a..339013406 160000 --- a/latex-macros +++ b/latex-macros @@ -1 +1 @@ -Subproject commit a0b66f56a01f9ffc65625071fe40b526acb5a268 +Subproject commit 339013406d162cb2c6a41059d5829a40ac03933d diff --git a/predictor-selection.qmd b/predictor-selection.qmd new file mode 100644 index 000000000..b0d23db95 --- /dev/null +++ b/predictor-selection.qmd @@ -0,0 +1,359 @@ +--- +title: "Predictor Selection" +subtitle: "Choosing which variables to include in a regression model" +format: + html: default + revealjs: + output-file: predictor-selection-slides.html + pdf: + output-file: predictor-selection-handout.pdf +--- + +{{< 
include shared-config.qmd >}} + +# Acknowledgements {.unnumbered} + +This content is adapted from: + +- [@vittinghoff2e, Chapter 10] +- [@heinze2018variable] + +# Introduction {#sec-pred-sel-intro} + +{{< include _sec-pred-sel-overview.qmd >}} + +# Inferential Goals {#sec-pred-sel-goals-chapter} + +{{< include _sec-pred-sel-goals.qmd >}} + +# Causal Thinking and DAGs {#sec-pred-sel-causal} + +{{< include _sec-pred-sel-dag.qmd >}} + +# Prediction {#sec-pred-sel-prediction} + +{{< include _sec-pred-sel-prediction.qmd >}} + +# Evaluating a Predictor of Primary Interest {#sec-pred-sel-eval} + +{{< include _sec-pred-sel-primary.qmd >}} + +# Identifying Multiple Important Predictors {#sec-pred-sel-multiple-chapter} + +{{< include _sec-pred-sel-multiple.qmd >}} + +# Testing-based selection {#sec-pred-sel-stepwise} + +::: notes + +Testing-based selection methods use statistical tests +(such as F-tests or likelihood ratio tests) +to decide which predictors to include in a model. + +::: + +## Stepwise regression + +::: {.callout-warning} +## Caution about stepwise selection + +Stepwise regression has well-known problems: + +- It tends to select too many variables (overfitting). +- P-values and confidence intervals are biased after selection. +- It ignores model uncertainty. +- Results can be unstable across different samples. + +Consider using subject-matter knowledge (DAGs), +cross-validation, or penalized methods (such as LASSO) instead. +See @heinze2018variable for a thorough review. +::: + +The basic idea of stepwise selection: + +- **Forward selection**: Start with no predictors; +add the predictor that most improves model fit at each step. +- **Backward elimination**: Start with all predictors; +remove the predictor that least worsens model fit at each step. +- **Bidirectional stepwise**: Combine forward and backward steps. 
+ +## Example: stepwise selection for LDL + +```{r} +#| label: stepwise-ldl + +# Full model with all candidate predictors +full_model_ldl <- lm( + LDL ~ HT + age + BMI + diabetes + smoking + statins + SBP, + data = hers_ldl +) + +# Backward elimination using AIC +step_model_ldl <- step( + full_model_ldl, + direction = "backward", + trace = 0 +) + +summary(step_model_ldl) +``` + +# Criterion-based selection {#sec-pred-sel-aic-bic} + +## AIC and BIC + +When comparing non-nested models, +we can use information criteria: + +- **AIC** (Akaike Information Criterion): +$$\text{AIC} = -2\,\hat\ell + 2p$$ +- **BIC** (Bayesian Information Criterion): +$$\text{BIC} = -2\,\hat\ell + p\log n$$ + +where $\hat\ell$ is the maximized log-likelihood, +$p$ is the number of model parameters, +and $n$ is the sample size. + +Lower values of AIC or BIC indicate a better model. +BIC applies a larger penalty for model complexity, +especially for large $n$. + +## Comparing models for LDL + +::: {#tbl-model-comparison} + +```{r} + +library(tibble) + +models_ldl <- list( + "HT only" = lm(LDL ~ HT, data = hers_ldl), + "HT + age" = lm(LDL ~ HT + age, data = hers_ldl), + "HT + statins" = lm(LDL ~ HT + statins, data = hers_ldl), + "HT + age + BMI + diabetes + statins" = lm( + LDL ~ HT + age + BMI + diabetes + statins, + data = hers_ldl + ), + "Full model" = lm( + LDL ~ HT + age + BMI + diabetes + smoking + statins + SBP, + data = hers_ldl + ) +) + +tibble( + Model = names(models_ldl), + AIC = sapply(models_ldl, AIC), + BIC = sapply(models_ldl, BIC), + Adj_R2 = sapply( + models_ldl, + function(m) summary(m)$adj.r.squared + ) +) +``` + +Comparison of LDL models by AIC, BIC, and adjusted $R^2$. + +::: + +# Penalized regression {#sec-pred-sel-lasso} + +::: notes + +When there are many candidate predictors, +penalized regression methods simultaneously +estimate model coefficients and perform variable selection +by shrinking small coefficients toward zero. 
+ +::: + +## The LASSO + +The **LASSO** (Least Absolute Shrinkage and Selection Operator) +adds an $\ell_1$ penalty to the log-likelihood: + +$$\hat\beta_\lambda = \arg\max_\beta \left\{ +\hat\ell(\beta) - \lambda \sum_{j=1}^p |\beta_j| +\right\}$$ + +The tuning parameter $\lambda$ controls the strength of the penalty: + +- $\lambda = 0$: no penalty (ordinary least squares). +- $\lambda \to \infty$: all coefficients shrink to zero. + +For intermediate values of $\lambda$, +some coefficients are exactly zero (excluded), +while others are shrunk but non-zero. + +The optimal $\lambda$ is usually chosen by cross-validation. + +## Example: LASSO for LDL + +::: {#fig-lasso-ldl} + +```{r} + +library(glmnet) +library(ggfortify) + +hers_ldl_complete <- + hers_ldl |> + dplyr::select(LDL, HT, age, BMI, diabetes, smoking, statins, SBP) |> + tidyr::drop_na() + +y_ldl <- hers_ldl_complete$LDL +x_ldl <- + hers_ldl_complete |> + dplyr::select(-LDL) |> + dplyr::mutate( + HT = as.integer(HT == "HT"), + statins = as.integer(statins == "Yes"), + diabetes = as.integer(diabetes == "Yes"), + smoking = as.integer(smoking == "Yes") + ) |> + as.matrix() + +lasso_fit <- glmnet(x_ldl, y_ldl) +autoplot(lasso_fit, xvar = "lambda") + + theme_bw() +``` + +LASSO coefficient paths for LDL predictors in HERS. +Each line traces one predictor's coefficient as $\lambda$ increases. +Coefficients shrink toward zero as the penalty grows; +some reach exactly zero (variables excluded from the model). + +::: + + +--- + +::: {#fig-lasso-cv} + +```{r} + +cv_lasso <- cv.glmnet(x_ldl, y_ldl) +plot(cv_lasso) +``` + +Cross-validated LASSO for LDL: mean squared error vs. $\log(\lambda)$. +The left dashed line marks `lambda.min` (minimum CV error); +the right dashed line marks `lambda.1se` (largest $\lambda$ within 1 SE of the minimum).
+ +::: + +--- + +::: {#tbl-lasso-coefs} + +```{r} + +data.frame( + lambda.min = as.vector(coef(cv_lasso, s = "lambda.min")), + lambda.1se = as.vector(coef(cv_lasso, s = "lambda.1se")), + row.names = rownames(coef(cv_lasso)) +) +``` + +LASSO coefficients at two values of the regularization parameter. +`lambda.min` minimizes cross-validation error; +`lambda.1se` uses a more conservative (larger) $\lambda$, +within one standard error of the minimum, +producing a sparser model. + +::: + +::: notes + +The LASSO identifies `statins` and `HT` as the strongest predictors +of LDL cholesterol in the HERS data, +consistent with the biological knowledge encoded in the DAG +in @fig-hers-dag. + +::: + +# Some Details {#sec-pred-sel-details} + +{{< include _sec-pred-sel-details.qmd >}} + +# Summary {#sec-pred-sel-summary} + +Predictor selection is the process of choosing appropriate variables +for inclusion in a multivariable regression model. +The right approach depends on the **inferential goal**: + +**For prediction** (Goal 1): + +- Pre-specify well-motivated candidate predictors. +- Use cross-validation of a target PE measure + to select among models without overfitting. +- Shrinkage methods (LASSO, ridge) are effective + when there are many candidates. + +**For evaluating a predictor of primary interest** (Goal 2): + +- Use a DAG to identify confounders, mediators, and colliders. +- Include all well-established confounders for face validity. +- Use backward selection with a liberal criterion ($p < 0.2$) + to remove apparent non-confounders. +- In randomized trials, covariates are not required for confounding control + but improve precision (and de-attenuate estimates in logistic/Cox models). +- Pre-specify adjusted analyses in trial protocols. + +**For identifying multiple important predictors** (Goal 3): + +- Apply the same inclusive strategy as Goal 2. +- Consider the Allen–Cady modified backward selection procedure + to limit false-positive findings. 
+- Interpret novel, weak, and borderline significant associations + cautiously; report all candidate predictors considered. + +**Across all goals**: + +- Collinearity between a primary predictor and a confounder + can inflate standard errors; between adjustment variables, it does not. +- The EPV rule (10 events per predictor) flags potential problems + but is not a hard limit. +- Model selection complicates inference: + nominal p-values and confidence intervals are anticonservative + when predictors have been selected from the data. + +# Learning Objectives {#sec-pred-sel-objectives .unnumbered} + +After completing this chapter, students should be able to: + +1. **Identify the inferential goal** of a predictor selection problem + (prediction, evaluating a primary predictor, + or identifying multiple important predictors) + and choose an appropriate strategy. + +2. **Use a DAG** to identify confounders, mediators, and colliders, + and determine which covariates to include or exclude. + +3. **Apply backward selection with a liberal criterion** + to select confounders in observational studies. + +4. **Use AIC and BIC** to compare alternative models. + +5. **Apply and interpret the LASSO**, + including cross-validated selection of $\lambda$. + +6. **Explain why stepwise selection is problematic** + and describe its known pitfalls. + +7. **Describe the bias–variance trade-off** and explain + why cross-validation provides a less optimistic estimate of PE + than naive within-sample measures. + +8. **Identify and handle collinearity**, + including using the VIF and understanding its implications + for different inferential goals. + +9. **Articulate how model selection complicates inference** + and describe best practices for reporting in each inferential context. 
+ + + +# References {.unnumbered} + +::: {#refs} +::: diff --git a/references.bib b/references.bib index 61a087ee2..d51cde7d5 100644 --- a/references.bib +++ b/references.bib @@ -31,6 +31,57 @@ @book{vittinghoff2e url={https://doi.org/10.1007/978-1-4614-1353-0} } +@book{AllenCady1982, + title={Analyzing experimental data by regression}, + author={Allen, David M. and Cady, Frederick B.}, + year={1982}, + publisher={Lifetime Learning Publications}, + address={Belmont, CA} +} + +@article{Walter2001, + title={Development and validation of a prognostic index for 1-year mortality in older adults after hospitalization}, + author={Walter, Louise C. and Brand, Richard J. and Counsell, Steven R. and Palmer, Robert M. and Landefeld, C. Seth and Fortinsky, Richard H. and Covinsky, Kenneth E.}, + journal={JAMA}, + year={2001}, + volume={285}, + number={23}, + pages={2987--2994}, + doi={10.1001/jama.285.23.2987} +} + +@article{Grodstein2001, + title={Postmenopausal hormone use and secondary prevention of coronary events in the Nurses' Health Study: a prospective, observational study}, + author={Grodstein, Francine and Manson, JoAnn E. and Stampfer, Meir J. and Colditz, Graham A. and Willett, Walter C. and Rosner, Bernard and Speizer, Frank E. and Hennekens, Charles H.}, + journal={Annals of Internal Medicine}, + year={2001}, + volume={135}, + number={1}, + pages={1--8}, + doi={10.7326/0003-4819-135-1-200107030-00007} +} + +@article{Orwoll1996, + title={Axial bone mass in older women}, + author={Orwoll, Eric S. and Bauer, Douglas C. and Vogt, Thomas M. and Fox, Kathleen M. and Study of Osteoporotic Fractures Research Group}, + journal={Annals of Internal Medicine}, + year={1996}, + volume={124}, + number={3}, + pages={187--196} +} + +@article{Vittinghoff2003, + title={Risk factors and secondary prevention in women with heart disease: the Heart and Estrogen/progestin Replacement Study}, + author={Vittinghoff, Eric and Shlipak, Michael G. and Varosy, Peter D.
and McDermott, Mary and Robbins, John A. and Harris, Tamara B. and Newman, Anne B. and Bauer, Douglas C.}, + journal={Annals of Internal Medicine}, + year={2003}, + volume={139}, + number={4}, + pages={274--281}, + doi={10.7326/0003-4819-139-4-200308190-00006} +} + @book{klein2003survival, title={Survival analysis: techniques for censored and truncated data}, author={Klein, John P and Moeschberger, Melvin L}, @@ -205,6 +256,36 @@ @book{dunn2018generalized url={https://link.springer.com/book/10.1007/978-1-4419-0118-7} } +@article{MickeyGreenland1989, + author = {Mickey, Ruth M. and Greenland, Sander}, + title = {The impact of confounder selection criteria on effect estimation}, + journal = {American Journal of Epidemiology}, + volume = {129}, + number = {1}, + pages = {125--137}, + year = {1989}, + month = jan, + issn = {0002-9262}, + doi = {10.1093/oxfordjournals.aje.a115101}, + url = {https://doi.org/10.1093/oxfordjournals.aje.a115101}, + eprint = {https://academic.oup.com/aje/article-pdf/129/1/125/127199/129-1-125.pdf} +} + +@article{Greenland1989Modeling, + author = {Greenland, Sander}, + title = {Modeling and variable selection in epidemiologic analysis}, + journal = {American Journal of Public Health}, + volume = {79}, + number = {3}, + pages = {340--349}, + year = {1989}, + month = mar, + doi = {10.2105/AJPH.79.3.340}, + pmid = {2916724}, + pmcid = {PMC1349563}, + url = {https://doi.org/10.2105/AJPH.79.3.340} +} + @book{rpkgs, title={{R} Packages}, author={Wickham, Hadley and Bryan, Jennifer},