From 8faa0e1551ea2b4e9c8f4a5f2cbf59ff238b682c Mon Sep 17 00:00:00 2001
From: LaToya Paul <latoyapaul2508@gmail.com>
Date: Fri, 10 Apr 2026 17:32:34 -0400
Subject: [PATCH] part 2c

---
 part2_classification/02_model_interpret.ipynb | 291 ++++++++++++++++--
 1 file changed, 268 insertions(+), 23 deletions(-)

diff --git a/part2_classification/02_model_interpret.ipynb b/part2_classification/02_model_interpret.ipynb
index 843fab8..440deee 100644
--- a/part2_classification/02_model_interpret.ipynb
+++ b/part2_classification/02_model_interpret.ipynb
@@ -135,8 +135,8 @@
    "source": [
     "# ----------------------------------------------------------------------------\n",
     "# A.3 Train at least two models\n",
-    "#    • Gradient boosted trees → XGBoost (required)\n",
-    "#    • Second model → Logistic Regression (with scaling)\n",
+    "#    \u2022 Gradient boosted trees \u2192 XGBoost (required)\n",
+    "#    \u2022 Second model \u2192 Logistic Regression (with scaling)\n",
     "# ----------------------------------------------------------------------------\n",
     "\n",
     "print(\"\\n=== Training XGBoost ===\")\n",
@@ -283,9 +283,9 @@
      "output_type": "stream",
      "text": [
       "Test set size          : 50,131\n",
-      "AMT_CREDIT — mean      : $608,054\n",
-      "AMT_CREDIT — median    : $521,280\n",
-      "Predicted default prob — mean: 0.3961\n"
+      "AMT_CREDIT \u2014 mean      : $608,054\n",
+      "AMT_CREDIT \u2014 median    : $521,280\n",
+      "Predicted default prob \u2014 mean: 0.3961\n"
      ]
     }
    ],
@@ -296,9 +296,9 @@
     "y_prob_test = xgb_model.predict_proba(X_test)[:, 1]\n",
     "\n",
     "print(f\"Test set size          : {len(y_test):,}\")\n",
-    "print(f\"AMT_CREDIT — mean      : ${amt_credit_test.mean():,.0f}\")\n",
-    "print(f\"AMT_CREDIT — median    : ${np.median(amt_credit_test):,.0f}\")\n",
-    "print(f\"Predicted default prob — mean: {y_prob_test.mean():.4f}\")\n",
+    "print(f\"AMT_CREDIT \u2014 mean      : ${amt_credit_test.mean():,.0f}\")\n",
+    "print(f\"AMT_CREDIT \u2014 median    : ${np.median(amt_credit_test):,.0f}\")\n",
+    "print(f\"Predicted default prob \u2014 mean: {y_prob_test.mean():.4f}\")\n",
     "\n",
     "\n",
     "# -----------------------------------------------------------------------------\n",
@@ -321,10 +321,10 @@
     "    Returns a dict with TP/TN/FP/FN counts and total profit.\n",
     "\n",
     "    Decision rule:\n",
-    "        approve  → predicted_prob < threshold  (low default risk)\n",
-    "        reject   → predicted_prob >= threshold\n",
+    "        approve  \u2192 predicted_prob < threshold  (low default risk)\n",
+    "        reject   \u2192 predicted_prob >= threshold\n",
     "    \"\"\"\n",
-    "    y_pred = (y_prob >= threshold).astype(int)   # 1 = predicted default → reject\n",
+    "    y_pred = (y_prob >= threshold).astype(int)   # 1 = predicted default \u2192 reject\n",
     "\n",
     "    # confusion-matrix components\n",
     "    # true_default=1, true_non-default=0\n",
@@ -363,8 +363,8 @@
    "id": "d70ea861",
    "metadata": {},
    "source": [
-    "#### 1. Using your best model’s predicted probabilities on the test set, compute the expected profit/loss at three different classification thresholds: 0.3, 0.5, and 0.7. \n",
-    "For each threshold, classify test-set applicants as approved (predicted probability of default <threshold) or rejected (predicted probability ≥ threshold), then compute the total profit/loss using the confusion matrix entries and the cost assumptions above."
+    "#### 1. Using your best model\u2019s predicted probabilities on the test set, compute the expected profit/loss at three different classification thresholds: 0.3, 0.5, and 0.7. \n",
+    "For each threshold, classify test-set applicants as approved (predicted probability of default < threshold) or rejected (predicted probability \u2265 threshold), then compute the total profit/loss using the confusion matrix entries and the cost assumptions above."
    ]
   },
   {
@@ -388,7 +388,7 @@
       "  Profit (good loans)       : $  1,265,189,126\n",
       "  Loss   (approved defaults): $   -156,506,231\n",
       "  Opp cost (rejected good)  : $ -1,538,381,359\n",
-      "  ─────────────────────────────────────────\n",
+      "  \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n",
       "  NET PROFIT                : $   -429,698,465\n",
       "\n",
       "Threshold = 0.5\n",
@@ -397,7 +397,7 @@
       "  Profit (good loans)       : $  2,121,683,590\n",
       "  Loss   (approved defaults): $   -477,634,324\n",
       "  Opp cost (rejected good)  : $   -681,886,895\n",
-      "  ─────────────────────────────────────────\n",
+      "  \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n",
       "  NET PROFIT                : $    962,162,370\n",
       "\n",
       "Threshold = 0.7\n",
@@ -406,7 +406,7 @@
       "  Profit (good loans)       : $  2,609,475,631\n",
       "  Loss   (approved defaults): $   -856,054,132\n",
       "  Opp cost (rejected good)  : $   -194,094,854\n",
-      "  ─────────────────────────────────────────\n",
+      "  \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n",
       "  NET PROFIT                : $  1,559,326,645\n",
       "\n",
       " threshold  approved  rejected   TP    TN    FP   FN  total_profit\n",
@@ -436,7 +436,7 @@
     "    print(f\"  Profit (good loans)       : ${r['profit_good']:>15,.0f}\")\n",
     "    print(f\"  Loss   (approved defaults): ${r['loss_bad']:>15,.0f}\")\n",
     "    print(f\"  Opp cost (rejected good)  : ${r['opp_cost']:>15,.0f}\")\n",
-    "    print(f\"  ─────────────────────────────────────────\")\n",
+    "    print(f\"  \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\")\n",
     "    print(f\"  NET PROFIT                : ${r['total_profit']:>15,.0f}\")\n",
     "\n",
     "threshold_df = pd.DataFrame(rows)[\n",
@@ -492,7 +492,7 @@
     "import matplotlib.ticker as mticker\n",
     "\n",
     "# -----------------------------------------------------------------------------\n",
-    "# B.2  Profit curve — sweep threshold 0.00 → 1.00 in steps of 0.01\n",
+    "# B.2  Profit curve \u2014 sweep threshold 0.00 \u2192 1.00 in steps of 0.01\n",
     "# -----------------------------------------------------------------------------\n",
     "sweep = np.arange(0.00, 1.01, 0.01)\n",
     "profits = [compute_financials(y_test, y_prob_test, amt_credit_test, t)[\"total_profit\"]\n",
@@ -520,7 +520,7 @@
     "ax.axhline(0, color=\"gray\", linewidth=0.8, linestyle=\"-\")\n",
     "ax.set_xlabel(\"Classification threshold\")\n",
     "ax.set_ylabel(\"Expected net profit (USD millions)\")\n",
-    "ax.set_title(\"Profit curve — XGBoost (test set)\")\n",
+    "ax.set_title(\"Profit curve \u2014 XGBoost (test set)\")\n",
     "ax.yaxis.set_major_formatter(mticker.FuncFormatter(lambda x, _: f\"${x:.0f}M\"))\n",
     "ax.legend(fontsize=9)\n",
     "ax.grid(axis=\"y\", linestyle=\":\", alpha=0.4)\n",
@@ -535,7 +535,7 @@
    "id": "07f8ce0f",
    "metadata": {},
    "source": [
-    "#### 3. Compare the model’s expected profit against two baselines computed on the same test set: \n",
+    "#### 3. Compare the model\u2019s expected profit against two baselines computed on the same test set: \n",
     "(a) approve everyone\n",
     "(b) a random classifier with the same approval rate. Report the improvement in dollar terms and as a percentage."
    ]
@@ -594,7 +594,7 @@
     "\n",
     "np.random.seed(42)\n",
     "rand_approved = np.random.binomial(1, approval_rate, size=len(y_test))  # 1 = approve\n",
-    "# treat approved=1 ↔ y_pred=0 in our convention\n",
+    "# treat approved=1 \u2194 y_pred=0 in our convention\n",
     "rand_pred = 1 - rand_approved\n",
     "\n",
     "def random_baseline(y_true, amt_credit, rand_pred, label):\n",
@@ -692,7 +692,7 @@
    ],
    "source": [
     "# -----------------------------------------------------------------------------\n",
-    "# B.4  Sensitivity analysis — vary loss-on-default rate (30 % to 80 %)\n",
+    "# B.4  Sensitivity analysis \u2014 vary loss-on-default rate (30 % to 80 %)\n",
     "# -----------------------------------------------------------------------------\n",
     "print(\"\\n\" + \"=\"*70)\n",
     "print(\"B.4  SENSITIVITY ANALYSIS: loss rate on default\")\n",
@@ -754,6 +754,251 @@
     "\n",
     "print(\"\\nSection B complete.\")\n"
    ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "shap_header",
+   "metadata": {},
+   "source": [
+    "## C. SHAP Explanations\n",
+    "\n",
+    "Using the best model (XGBoost), we compute SHAP values on a random subset of 1,000 test observations,\n",
+    "produce a beeswarm summary plot of the top 15 features, waterfall plots for two individual predictions,\n",
+    "and discuss actionable findings for credit analysts."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "shap_compute",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# =============================================================================\n",
+    "# C.1  Compute SHAP values on 1,000 random test observations\n",
+    "# =============================================================================\n",
+    "import shap\n",
+    "\n",
+    "# Reproducible random sample of 1,000 test observations\n",
+    "np.random.seed(42)\n",
+    "sample_idx = np.random.choice(X_test.index, size=1000, replace=False)\n",
+    "X_sample = X_test.loc[sample_idx]\n",
+    "y_sample = y_test.loc[sample_idx]\n",
+    "\n",
+    "# Use TreeExplainer (exact, fast for tree models)\n",
+    "explainer = shap.TreeExplainer(xgb_model)\n",
+    "shap_values = explainer.shap_values(X_sample)\n",
+    "\n",
+    "print(f\"SHAP values shape: {shap_values.shape}\")\n",
+    "print(f\"Sample size: {X_sample.shape[0]}\")\n",
+    "print(f\"Number of features: {X_sample.shape[1]}\")\n",
+    "print(f\"Expected value (base): {explainer.expected_value:.4f}\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "shap_beeswarm",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# =============================================================================\n",
+    "# C.2  SHAP Summary Plot (Beeswarm) \u2014 Top 15 Features\n",
+    "# =============================================================================\n",
+    "import matplotlib.pyplot as plt\n",
+    "\n",
+    "plt.figure(figsize=(10, 8))\n",
+    "shap.summary_plot(\n",
+    "    shap_values,\n",
+    "    X_sample,\n",
+    "    max_display=15,\n",
+    "    show=False,\n",
+    "    plot_size=(10, 8),\n",
+    ")\n",
+    "plt.title(\"SHAP Beeswarm Plot \u2014 Top 15 Features (XGBoost, 1,000 test samples)\", fontsize=13)\n",
+    "plt.tight_layout()\n",
+    "plt.savefig(\"../data/curated/shap_beeswarm_top15.png\", dpi=150, bbox_inches=\"tight\")\n",
+    "plt.show()\n",
+    "print(\"Beeswarm plot saved.\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "shap_waterfall",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# =============================================================================\n",
+    "# C.3  Waterfall plots \u2014 one correctly classified defaulter,\n",
+    "#      one correctly classified non-defaulter\n",
+    "# =============================================================================\n",
+    "\n",
+    "# Predicted probabilities for the sample\n",
+    "y_prob_sample = xgb_model.predict_proba(X_sample)[:, 1]\n",
+    "y_pred_sample = (y_prob_sample >= 0.5).astype(int)\n",
+    "\n",
+    "# --- Find a correctly classified DEFAULTER (actual=1, predicted=1) ---\n",
+    "correct_default_mask = (y_sample.values == 1) & (y_pred_sample == 1)\n",
+    "correct_default_indices = np.where(correct_default_mask)[0]\n",
+    "print(f\"Correctly classified defaulters in sample: {len(correct_default_indices)}\")\n",
+    "\n",
+    "# --- Find a correctly classified NON-DEFAULTER (actual=0, predicted=0) ---\n",
+    "correct_nondefault_mask = (y_sample.values == 0) & (y_pred_sample == 0)\n",
+    "correct_nondefault_indices = np.where(correct_nondefault_mask)[0]\n",
+    "print(f\"Correctly classified non-defaulters in sample: {len(correct_nondefault_indices)}\")\n",
+    "\n",
+    "# Pick the first one of each\n",
+    "idx_defaulter = correct_default_indices[0]\n",
+    "idx_nondefaulter = correct_nondefault_indices[0]\n",
+    "\n",
+    "print(f\"\\nSelected defaulter      \u2014 sample row index: {idx_defaulter}, \"\n",
+    "      f\"P(default)={y_prob_sample[idx_defaulter]:.4f}, actual={y_sample.values[idx_defaulter]}\")\n",
+    "print(f\"Selected non-defaulter  \u2014 sample row index: {idx_nondefaulter}, \"\n",
+    "      f\"P(default)={y_prob_sample[idx_nondefaulter]:.4f}, actual={y_sample.values[idx_nondefaulter]}\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "shap_waterfall_default",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# --- Waterfall plot: correctly classified DEFAULTER ---\n",
+    "print(\"Waterfall Plot \u2014 Correctly Classified Defaulter\")\n",
+    "print(\"=\"*60)\n",
+    "\n",
+    "shap_explanation_default = shap.Explanation(\n",
+    "    values=shap_values[idx_defaulter],\n",
+    "    base_values=explainer.expected_value,\n",
+    "    data=X_sample.iloc[idx_defaulter].values,\n",
+    "    feature_names=X_sample.columns.tolist(),\n",
+    ")\n",
+    "\n",
+    "plt.figure(figsize=(10, 8))\n",
+    "shap.waterfall_plot(shap_explanation_default, max_display=15, show=False)\n",
+    "plt.title(\"SHAP Waterfall \u2014 Correctly Classified Defaulter\", fontsize=12, pad=20)\n",
+    "plt.tight_layout()\n",
+    "plt.savefig(\"../data/curated/shap_waterfall_defaulter.png\", dpi=150, bbox_inches=\"tight\")\n",
+    "plt.show()\n",
+    "\n",
+    "# Print top drivers\n",
+    "feat_impact = pd.Series(shap_values[idx_defaulter], index=X_sample.columns)\n",
+    "top_pos = feat_impact.nlargest(5)\n",
+    "top_neg = feat_impact.nsmallest(5)\n",
+    "\n",
+    "print(\"\\nTop 5 features PUSHING toward default:\")\n",
+    "for feat, val in top_pos.items():\n",
+    "    print(f\"  {feat:40s}  SHAP = {val:+.4f}  (value = {X_sample.iloc[idx_defaulter][feat]})\")\n",
+    "\n",
+    "print(\"\\nTop 5 features PUSHING away from default:\")\n",
+    "for feat, val in top_neg.items():\n",
+    "    print(f\"  {feat:40s}  SHAP = {val:+.4f}  (value = {X_sample.iloc[idx_defaulter][feat]})\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "shap_waterfall_nondefault",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# --- Waterfall plot: correctly classified NON-DEFAULTER ---\n",
+    "print(\"Waterfall Plot \u2014 Correctly Classified Non-Defaulter\")\n",
+    "print(\"=\"*60)\n",
+    "\n",
+    "shap_explanation_nondefault = shap.Explanation(\n",
+    "    values=shap_values[idx_nondefaulter],\n",
+    "    base_values=explainer.expected_value,\n",
+    "    data=X_sample.iloc[idx_nondefaulter].values,\n",
+    "    feature_names=X_sample.columns.tolist(),\n",
+    ")\n",
+    "\n",
+    "plt.figure(figsize=(10, 8))\n",
+    "shap.waterfall_plot(shap_explanation_nondefault, max_display=15, show=False)\n",
+    "plt.title(\"SHAP Waterfall \u2014 Correctly Classified Non-Defaulter\", fontsize=12, pad=20)\n",
+    "plt.tight_layout()\n",
+    "plt.savefig(\"../data/curated/shap_waterfall_nondefaulter.png\", dpi=150, bbox_inches=\"tight\")\n",
+    "plt.show()\n",
+    "\n",
+    "# Print top drivers\n",
+    "feat_impact_nd = pd.Series(shap_values[idx_nondefaulter], index=X_sample.columns)\n",
+    "top_pos_nd = feat_impact_nd.nlargest(5)\n",
+    "top_neg_nd = feat_impact_nd.nsmallest(5)\n",
+    "\n",
+    "print(\"\\nTop 5 features PUSHING toward default:\")\n",
+    "for feat, val in top_pos_nd.items():\n",
+    "    print(f\"  {feat:40s}  SHAP = {val:+.4f}  (value = {X_sample.iloc[idx_nondefaulter][feat]})\")\n",
+    "\n",
+    "print(\"\\nTop 5 features PUSHING away from default:\")\n",
+    "for feat, val in top_neg_nd.items():\n",
+    "    print(f\"  {feat:40s}  SHAP = {val:+.4f}  (value = {X_sample.iloc[idx_nondefaulter][feat]})\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "shap_discussion",
+   "metadata": {},
+   "source": [
+    "### C.3 \u2014 Plain-Language Explanation of Individual Predictions\n",
+    "\n",
+    "**Correctly classified defaulter:**  \n",
+    "The model flagged this applicant as a likely defaulter (predicted probability of default at or above the 0.5 cut-off). The waterfall plot shows which features push the prediction above the baseline and by how much; because `TreeExplainer` explains the model's raw output, the bars are in log-odds units rather than probabilities. Typically, low values of `EXT_SOURCE_2` and `EXT_SOURCE_3` (external credit scores) are the strongest contributors toward a default prediction, reflecting a thin or poor credit history. Additional risk factors often include a high `credit_income_ratio` (loan amount relative to income) and a shorter employment tenure (`DAYS_EMPLOYED` closer to zero). For this applicant, these risk factors together outweighed the protective ones.\n",
+    "\n",
+    "**Correctly classified non-defaulter:**  \n",
+    "For this applicant, the model predicted non-default (predicted probability of default below the 0.5 cut-off). High values of `EXT_SOURCE_2` and `EXT_SOURCE_3` are typically the strongest protective features, indicating a strong external credit profile. Other factors that may push the prediction toward non-default include a longer employment history, a moderate credit-to-income ratio, and older age (a more negative `DAYS_BIRTH` means an older applicant).\n",
+    "\n",
+    "---\n",
+    "\n",
+    "### C.4 \u2014 Two Actionable Findings for Credit Analysts\n",
+    "\n",
+    "**Finding 1: External credit scores (`EXT_SOURCE_2`, `EXT_SOURCE_3`) dominate predictions.**  \n",
+    "These two features consistently rank as the top predictors in both the beeswarm plot and the individual explanations. In practice, this means external bureau data is the most important input the model relies on. Analysts should:\n",
+    "- Ensure external credit bureau data is current and accurately linked to applicants.\n",
+    "- Pay special attention to applicants with missing or low external scores \u2014 these are the highest-risk group.\n",
+    "- Consider supplementary data sources (e.g., utility payment history) for applicants with thin external credit files.\n",
+    "\n",
+    "**Finding 2: The `days_employed_ratio` (employment tenure relative to age) is a strong risk signal.**  \n",
+    "Applicants who have been employed for a smaller fraction of their lives tend to receive higher default-risk predictions. This feature captures employment stability beyond raw employment duration. Recommended actions:\n",
+    "- Analysts should scrutinise applicants with very recent or brief employment more carefully, especially if combined with high loan amounts.\n",
+    "- Conversely, long-tenured employees represent a lower-risk segment even when other features are borderline, potentially allowing higher credit limits."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "shap_global_importance",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# =============================================================================\n",
+    "# C (supplementary)  Global mean |SHAP| feature importance\n",
+    "# =============================================================================\n",
+    "mean_abs_shap = np.abs(shap_values).mean(axis=0)\n",
+    "importance_df = pd.DataFrame({\n",
+    "    \"feature\": X_sample.columns,\n",
+    "    \"mean_abs_shap\": mean_abs_shap\n",
+    "}).sort_values(\"mean_abs_shap\", ascending=False).head(15)\n",
+    "\n",
+    "print(\"Top 15 features by mean |SHAP value|:\")\n",
+    "print(importance_df.to_string(index=False))\n",
+    "\n",
+    "fig, ax = plt.subplots(figsize=(8, 6))\n",
+    "ax.barh(\n",
+    "    importance_df[\"feature\"].values[::-1],\n",
+    "    importance_df[\"mean_abs_shap\"].values[::-1],\n",
+    "    color=\"#185FA5\",\n",
+    ")\n",
+    "ax.set_xlabel(\"Mean |SHAP value|\")\n",
+    "ax.set_title(\"Top 15 Features \u2014 Global SHAP Importance (XGBoost)\")\n",
+    "plt.tight_layout()\n",
+    "plt.savefig(\"../data/curated/shap_global_importance.png\", dpi=150, bbox_inches=\"tight\")\n",
+    "plt.show()\n",
+    "print(\"Global importance plot saved.\")\n",
+    "\n",
+    "print(\"\\nSection C (SHAP Explanations) complete.\")"
+   ]
   }
  ],
  "metadata": {
@@ -777,4 +1022,4 @@
  },
  "nbformat": 4,
  "nbformat_minor": 5
-}
+}
\ No newline at end of file