From a1a3db154fdf476bcdebbb65f4957a19d5a023aa Mon Sep 17 00:00:00 2001
From: Anatolii <anatolii@nullrun.io>
Date: Tue, 23 Jun 2026 21:31:01 +0400
Subject: [PATCH] feat(exceptions): introduce Layer-1 structured exception
 hierarchy
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Public SDK exceptions now inherit from NullRunError and carry four
actionable fields (error_code, user_action, retryable, docs_url)
plus an optional chained cause. Two deliberate exceptions stay outside
the new base: BreakerTransportError (loss of the audit pipeline) and
the BaseException-rooted WorkflowKilledException. Users get a stable,
grep-able error code (NR-A001, NR-B002, NR-R001, ...) and a short
imperative next-step hint instead of a free-form message string.

New specialized classes (back-compat subclasses of existing
user-facing classes, so existing except clauses keep matching):

  * NullRunConfigError       — config/initialization failures
  * NullRunAuthError         — invalid/missing API key (subclass of
                               NullRunAuthenticationError)
  * NullRunBackendError      — gateway 5xx (subclass of
                               NullRunTransportError, retryable=True)
  * NullRunBudgetError       — budget exhausted (subclass of
                               NullRunBlockedException)
  * NullRunToolBlockedError  — tool blocked by policy (subclass of
                               NullRunBlockedException)

Existing except handlers keep working: every new class except
NullRunConfigError (a fresh branch directly under NullRunError) is a
subclass of an existing one, so e.g. 'except NullRunBlockedException'
still catches NullRunBudgetError and NullRunToolBlockedError.

Tests: tests/test_exception_hierarchy.py pins the hierarchy shape
(class roots), the structured fields on every public class, and the
five back-compat invariants (subclass matching for the user-facing
exception trees, BaseException isolation for WorkflowKilledInterrupt).

Verified locally: pytest 880 passed / 13 skipped, ruff check src/
clean, mypy src/ clean.
---
 src/nullrun/__init__.py           |  27 ++-
 src/nullrun/breaker/exceptions.py | 380 ++++++++++++++++++++++++++++--
 src/nullrun/decorators.py         | 108 +++++++--
 src/nullrun/runtime.py            |  63 ++++-
 src/nullrun/transport.py          |  34 ++-
 tests/test_exception_hierarchy.py | 258 ++++++++++++++++++++
 6 files changed, 816 insertions(+), 54 deletions(-)
 create mode 100644 tests/test_exception_hierarchy.py

diff --git a/src/nullrun/__init__.py b/src/nullrun/__init__.py
index 54b8cfa..cc54da6 100644
--- a/src/nullrun/__init__.py
+++ b/src/nullrun/__init__.py
@@ -95,12 +95,27 @@ def my_agent():
     # caught at startup rather than producing silent allow-all decisions.
     resolved_key = api_key or os.getenv("NULLRUN_API_KEY")
     if not resolved_key:
+        # Layer 1: raise the legacy type (``NullRunAuthenticationError``)
+        # so user code with ``except NullRunAuthenticationError:`` still
+        # catches this case, but stamp the structured ``error_code`` /
+        # ``user_action`` so a Layer-2 on_error hook (or a
+        # ``except NullRunError:`` clause) can branch on the catalog
+        # value ``NR-C001`` ("configuration: no api_key") without
+        # parsing the message.
         from nullrun.breaker.exceptions import NullRunAuthenticationError
 
         raise NullRunAuthenticationError(
             "nullrun.init() requires an api_key. Pass api_key='nr_live_...' "
             "explicitly or set the NULLRUN_API_KEY environment variable. "
-            "(Silent no-op fallback was removed in 0.3.0 — see CHANGELOG.)"
+            "(Silent no-op fallback was removed in 0.3.0 — see CHANGELOG.)",
+            error_code="NR-C001",
+            user_action=(
+                "Get an API key at https://app.nullrun.io/settings/api-keys, "
+                "then either pass api_key='nr_live_...' to nullrun.init() or "
+                "set the NULLRUN_API_KEY environment variable. The SDK cannot "
+                "operate without credentials — the silent no-op fallback was "
+                "removed in 0.3.0 because it bypassed every backend gate."
+            ),
         )
 
     # Imported lazily so we don't pull the runtime into the namespace
@@ -146,6 +161,7 @@ def my_agent():
     # unconditional — we always have a remote LLM traffic source if
     # auto-instrumentation libraries are installed.
     from nullrun.instrumentation.auto import auto_instrument
+
     auto_instrument(runtime)
 
     # Start the coverage reporter so the backend gets a coverage_report
@@ -179,7 +195,6 @@ def my_agent():
     "get_trace_id": ("nullrun.context", "get_trace_id"),
     "get_span_id": ("nullrun.context", "get_span_id"),
     "get_agent_id": ("nullrun.context", "get_agent_id"),
-
     # Instrumentation
     "NullRunCallback": ("nullrun.instrumentation", "NullRunCallback"),
     # NOTE (Sprint 1.2 / B11-B12): `patch_openai` and `unpatch_openai`
@@ -191,14 +206,12 @@ def my_agent():
     # a worse failure mode than a clean `ImportError` from
     # `from nullrun import patch_openai` failing because the symbol
     # is no longer in the lazy table.
-
     # Toolbox — framework-specific wrappers (Phase 1 Commit 6).
     # The previous `instrument()` helper lived at
     # `nullrun.instrumentation.langgraph.instrument`; it is now
     # `nullrun.toolbox.langgraph.wrapper`. Reachable as
     # `from nullrun import wrapper` for one-line import.
     "wrapper": ("nullrun.toolbox.langgraph", "wrapper"),
-
     # Span / trace context (Phase 2 Commit 3).
     # `tracing.py` is the structured replacement for the loose `_trace_id`
     # / `_span_id` contextvars in `nullrun.context`. `SpanContext` is a
@@ -211,10 +224,8 @@ def my_agent():
     "create_child_span": ("nullrun.tracing", "create_child_span"),
     "set_span": ("nullrun.tracing", "set_span"),
     "reset_span": ("nullrun.tracing", "reset_span"),
-
     # Decorators
     "sensitive": ("nullrun.decorators", "sensitive"),
-
     # Actions (Phase 3)
     "ActionHandler": ("nullrun.actions", "ActionHandler"),
     "ActionType": ("nullrun.actions", "ActionType"),
@@ -223,7 +234,6 @@ def my_agent():
     "handle_action": ("nullrun.actions", "handle_action"),
     "register_action_handler": ("nullrun.actions", "register_action_handler"),
     "get_action_handler": ("nullrun.actions", "get_action_handler"),
-
     # Exceptions (Phase 3)
     "NullRunBlockedException": ("nullrun.breaker.exceptions", "NullRunBlockedException"),
     "NullRunAuthenticationError": ("nullrun.breaker.exceptions", "NullRunAuthenticationError"),
@@ -265,13 +275,12 @@ def __dir__() -> list[str]:
 __all__ = [
     # Version (single value, always public)
     "__version__",
-
     # Phase 3.4: the curated public surface — six symbols.
     # Everything else stays importable as `from nullrun import X` for
     # backward compatibility, but does NOT appear in `dir(nullrun)`
     # until the user actually accesses it.
     "init",
-    "protect",         # gate decorator
+    "protect",  # gate decorator
     "track_llm",
     "track_tool",
     "track_event",
diff --git a/src/nullrun/breaker/exceptions.py b/src/nullrun/breaker/exceptions.py
index a0335a7..6f6c72b 100644
--- a/src/nullrun/breaker/exceptions.py
+++ b/src/nullrun/breaker/exceptions.py
@@ -4,9 +4,125 @@
 
 class BreakerError(Exception):
     """Base exception for Breaker SDK."""
+
     pass
 
 
+# ---------------------------------------------------------------------------
+# Structured error base (Layer 1 of the "give the user a chance" design)
+# ---------------------------------------------------------------------------
+# Pre-Layer-1: every SDK exception was a plain ``Exception`` with a free-form
+# ``message``. Users got the same string for "you forgot api_key" and
+# "backend is on fire" — no machine-readable code, no next-step hint, no
+# retryable flag. Cookbook examples had to grep the message for keywords.
+#
+# Post-Layer-1: every public SDK exception inherits from ``NullRunError``
+# and carries four structured fields:
+#
+#   * ``error_code``   — stable, grep-able identifier (e.g. ``"NR-A001"``).
+#                        Documented in ``docs/errors/<code>.md`` and
+#                        available to telemetry / Sentry / dashboards.
+#   * ``user_action``  — short, imperative sentence telling the user what
+#                        to do next ("Set NULLRUN_API_KEY env var",
+#                        "Verify API key at https://app.nullrun.io/...",
+#                        "Retry in 30s, backend is down"). Empty when
+#                        there is no actionable step.
+#   * ``retryable``    — ``True`` when a retry after a backoff is the
+#                        correct response (5xx, network blip, transient
+#                        auth). ``False`` for config / permission /
+#                        budget-exhausted — retrying without changing
+#                        something will just hit the same wall.
+#   * ``docs_url``     — link to the per-code docs page. Always set; falls
+#                        back to ``https://docs.nullrun.io/errors`` when
+#                        the per-code page does not exist yet.
+#
+# Existing ``except`` clauses keep working: these existing public
+# classes (``NullRunAuthenticationError``, ``NullRunBlockedException``,
+# ``NullRunTransportError``, ``WorkflowPausedException``) inherit from
+# ``NullRunError`` now, so ``except NullRunError:`` catches them all —
+# but the narrower clauses keep matching too. ``WorkflowKilledException``
+# stays on ``BaseException`` and is deliberately NOT caught by it.
+#
+# New specialized classes (``NullRunConfigError``, ``NullRunAuthError``,
+# ``NullRunBackendError``, ``NullRunBudgetError``, ``NullRunToolBlockedError``)
+# are added below. They are subclasses of the existing user-facing
+# classes where it makes sense (e.g. ``NullRunBudgetError`` is a subclass
+# of ``NullRunBlockedException``) so existing handlers still match.
+class NullRunError(BreakerError):
+    """Structured base for every user-facing SDK exception.
+
+    Carries the four fields that make an exception actionable
+    (``error_code``, ``user_action``, ``retryable``, ``docs_url``)
+    plus the optional ``cause`` (chained original exception). Every
+    subclass populates at least ``error_code``; ``user_action`` is
+    empty only when there is genuinely nothing to suggest (e.g. an
+    internal sanity check).
+    """
+
+    #: Default error code when a subclass does not override it.
+    #: Real codes look like ``"NR-A001"`` (``NR-``, a letter, three
+    #: digits) and are documented in ``docs/errors/<code>.md``.
+    error_code: str = "NR-0000"
+
+    #: Short imperative next-step hint shown in tracebacks and
+    #: surfaced by the cookbook example. Empty string means "no
+    #: actionable step beyond what the message says".
+    user_action: str = ""
+
+    #: ``True`` only when a retry after a backoff is the correct
+    #: response (5xx, network blip, transient auth). Default is
+    #: ``False`` because the common case is "user must change
+    #: something before retrying makes sense".
+    retryable: bool = False
+
+    #: Per-code docs page. Fallback to the index when the per-code
+    #: page does not exist yet — the docs site is responsible for
+    #: the 404 page, not the SDK.
+    docs_url: str = "https://docs.nullrun.io/errors"
+
+    def __init__(
+        self,
+        message: str,
+        *,
+        error_code: str | None = None,
+        user_action: str | None = None,
+        retryable: bool | None = None,
+        docs_url: str | None = None,
+        cause: BaseException | None = None,
+    ) -> None:
+        # Apply per-instance overrides, falling back to the class
+        # attribute. We intentionally do NOT mutate the class attribute
+        # — each instance must own its own fields so a subclass
+        # override (e.g. ``NullRunBackendError.retryable = True``)
+        # does not leak across other subclasses.
+        if error_code is not None:
+            self.error_code = error_code
+        if user_action is not None:
+            self.user_action = user_action
+        if retryable is not None:
+            self.retryable = retryable
+        if docs_url is not None:
+            self.docs_url = docs_url
+        # ``cause`` is the chained original exception, mirroring
+        # ``raise X from e``. We store it on the instance so the
+        # cookbook ``except`` handlers and the on_error hook
+        # (Layer 2) can introspect it without parsing ``__cause__``.
+        if cause is not None:
+            self.cause = cause
+            # Mirror Python's `raise ... from` behaviour so the
+            # traceback shows the chain ("The above exception was the
+            # direct cause of the following exception"). The guard
+            # only skips an already-set ``__cause__``; a later
+            # `raise ... from e` at the call site still sets it
+            # itself, after this constructor has returned.
+            if getattr(self, "__cause__", None) is None:
+                self.__cause__ = cause
+        super().__init__(message)
+
+
+# ---------------------------------------------------------------------------
+# Transport / network failures
+# ---------------------------------------------------------------------------
 class TransportErrorSource(str, Enum):
     """Where a transport failure originated.
 
@@ -19,13 +135,14 @@ class TransportErrorSource(str, Enum):
     `execute` / `check` return dicts when the transport layer
     degrades to a fallback instead of raising.
     """
+
     NETWORK_ERROR = "NETWORK_ERROR"  # httpx.ConnectError, timeout, DNS
     GATEWAY_ERROR = "GATEWAY_ERROR"  # 5xx from the gateway
     BREAKER_OPEN = "BREAKER_OPEN"  # circuit breaker tripped
     AUTH_ERROR = "AUTH_ERROR"  # 401 / 403 from the gateway
 
 
-class NullRunTransportError(BreakerError):
+class NullRunTransportError(NullRunError):
     """Raised by transport layer when the policy engine is unreachable.
 
     The exception carries a `source` (TransportErrorSource) and the
@@ -37,7 +154,19 @@ class NullRunTransportError(BreakerError):
     returning a synthetic `allow` / `block` response — that hid
     the policy-engine outage from operators and was the root cause
     of bug #1 / #2 fixed in ADR-008.
+
+    Inherits from :class:`NullRunError` (Layer 1) so every transport
+    failure carries an ``error_code`` and ``user_action`` — see
+    :class:`NullRunBackendError` for the most common 5xx case.
     """
+
+    error_code = "NR-B001"  # default; subclasses override
+    user_action = (
+        "Check connectivity to the NullRun backend. If the backend is "
+        "up, retry the request — transport errors are usually transient."
+    )
+    retryable = True
+
     def __init__(
         self,
         message: str,
@@ -48,9 +177,72 @@ def __init__(
         self.source = source
         self.endpoint = endpoint
         self.details = details
+        # Map the transport-source classification to a per-source
+        # ``error_code`` when the caller does not override it via
+        # ``**details``. NETWORK_ERROR / GATEWAY_ERROR are the two
+        # common paths; AUTH_ERROR and BREAKER_OPEN get their own
+        # codes too (``NR-A003`` / ``NR-B005``). Any source missing
+        # from the table falls back to the default ``NR-B001``.
+        _CODE_BY_SOURCE = {
+            TransportErrorSource.NETWORK_ERROR: "NR-B001",
+            TransportErrorSource.GATEWAY_ERROR: "NR-B002",
+            TransportErrorSource.AUTH_ERROR: "NR-A003",
+            TransportErrorSource.BREAKER_OPEN: "NR-B005",
+        }
+        # Precedence: explicit ``error_code=`` in details wins, then
+        # the class's own ``error_code`` (which subclasses like
+        # ``RateLimitError`` override to opt out of the source
+        # mapping — 429 is not a gateway error), then the source
+        # mapping (which only applies when the class still uses the
+        # parent's ``"NR-B001"`` default).
+        _PARENT_DEFAULT_CODE = "NR-B001"
+        if type(self).error_code != _PARENT_DEFAULT_CODE:
+            # Subclass overrode the default — honor it.
+            code = details.pop("error_code", None) or type(self).error_code
+        else:
+            code = details.pop("error_code", None) or _CODE_BY_SOURCE.get(
+                source, _PARENT_DEFAULT_CODE
+            )
+        # Only forward the structured fields the base class accepts —
+        # arbitrary ``**details`` like ``status_code`` must NOT leak
+        # into ``NullRunError.__init__`` (which has a fixed kwarg
+        # signature). Non-structured details stay on ``self.details``
+        # for the message string and for inspection.
         super().__init__(
-            f"Transport error on {endpoint}: {message} "
-            f"(source={source.value}, details={details})"
+            f"Transport error on {endpoint}: {message} (source={source.value}, details={details})",
+            error_code=code,
+        )
+
+
+class NullRunBackendError(NullRunTransportError):
+    """5xx from the NullRun backend. Retryable.
+
+    Subclass of :class:`NullRunTransportError` so existing
+    ``except NullRunTransportError:`` handlers keep matching.
+    Adds a specific ``error_code`` and a retry hint.
+    """
+
+    error_code = "NR-B002"
+    user_action = (
+        "The NullRun backend returned a server error. This is usually "
+        "transient — retry after a few seconds. If it persists for more "
+        "than a minute, check https://status.nullrun.io or contact support."
+    )
+    retryable = True
+
+    def __init__(
+        self,
+        message: str,
+        endpoint: str,
+        status_code: int | None = None,
+        **details: Any,
+    ) -> None:
+        details.setdefault("status_code", status_code)
+        super().__init__(
+            message,
+            source=TransportErrorSource.GATEWAY_ERROR,
+            endpoint=endpoint,
+            **details,
         )
 
 
@@ -70,6 +262,14 @@ class RateLimitError(NullRunTransportError):
             when the response did not include one.
         body: Parsed JSON body (gateway's ``error`` / ``message``).
     """
+
+    error_code = "NR-R001"
+    user_action = (
+        "The NullRun backend rate-limited this API key. Wait "
+        "``retry_after`` seconds (or upgrade the plan) before retrying."
+    )
+    retryable = True
+
     def __init__(
         self,
         message: str,
@@ -103,9 +303,15 @@ class BreakerTransportError(BreakerError):
     - Transport buffer full and circuit breaker triggered
     - Network connectivity issues preventing delivery
 
-    Applications should implement retry logic or alerting mechanism when this exception
-    is raised, as budget protection may be compromised.
+    Applications should implement retry logic or alerting mechanism when this
+    exception is raised, as budget protection may be compromised.
+
+    NOTE: NOT inheriting from ``NullRunError`` because this exception
+    signals a loss of the audit pipeline itself, not a structured
+    SDK error. Surface to the operator; do not treat like a regular
+    NullRun failure.
     """
+
     def __init__(
         self,
         message: str,
@@ -124,23 +330,79 @@ def __init__(
 
 class InsecureTransportError(BreakerTransportError):
     """Raised when SDK is configured with insecure HTTP (non-localhost)."""
+
     pass
 
 
-class NullRunAuthenticationError(BreakerError):
+# ---------------------------------------------------------------------------
+# Configuration / authentication
+# ---------------------------------------------------------------------------
+class NullRunConfigError(NullRunError):
+    """Raised when the SDK is misconfigured: missing api_key, bad
+    key format, workflow not registered, etc.
+
+    These are NEVER retryable — retrying with the same configuration
+    will hit the same wall. The fix is always outside the loop.
+    """
+
+    error_code = "NR-C000"  # subclasses override
+    user_action = (
+        "Review your NullRun configuration. The SDK cannot recover "
+        "from configuration errors on its own — see the error_code "
+        "link in the exception for the specific fix."
+    )
+    retryable = False
+
+
+class NullRunAuthenticationError(NullRunError):
     """
     Raised when authentication fails and safe mode is required.
 
     This exception indicates that the SDK could not authenticate with
     the NullRun backend and will not operate in unprotected mode.
     Applications should handle this exception and provide valid credentials.
+
+    Inherits from :class:`NullRunError` (Layer 1) so callers can do
+    ``except NullRunError`` to catch every user-facing SDK failure
+    with structured fields. Existing ``except NullRunAuthenticationError``
+    clauses keep matching.
     """
-    def __init__(self, message: str):
+
+    error_code = "NR-A001"  # default; ``NullRunAuthError`` overrides per status
+    user_action = (
+        "The NullRun backend rejected the request. Verify the API "
+        "key at https://app.nullrun.io/settings/api-keys and ensure "
+        "it has not been revoked."
+    )
+    retryable = False
+
+    def __init__(self, message: str, **kwargs: Any) -> None:
+        # Preserve the historical ``self.message`` attribute — some
+        # user code reads ``exc.message`` instead of ``str(exc)``.
         self.message = message
-        super().__init__(message)
+        super().__init__(message, **kwargs)
+
+
+class NullRunAuthError(NullRunAuthenticationError):
+    """401 from the backend — key was rejected.
 
+    Subclass of :class:`NullRunAuthenticationError` so existing
+    ``except NullRunAuthenticationError`` clauses keep matching.
+    """
+
+    error_code = "NR-A003"
+    user_action = (
+        "The API key was rejected by the NullRun backend (401). "
+        "Verify the key at https://app.nullrun.io/settings/api-keys "
+        "and rotate it if it has been revoked."
+    )
+    retryable = False
 
-class NullRunBlockedException(BreakerError):
+
+# ---------------------------------------------------------------------------
+# Block decisions (budget, loop, rate, tool-block)
+# ---------------------------------------------------------------------------
+class NullRunBlockedException(NullRunError):
     """
     Raised when NullRun circuit breaker trips.
 
@@ -154,6 +416,11 @@ class NullRunBlockedException(BreakerError):
     - Retry storm (>5 retries)
     - Rate limit exceeded
 
+    Subclasses (:class:`NullRunBudgetError`, :class:`NullRunToolBlockedError`)
+    carry the specific ``error_code`` and ``user_action`` for each
+    block reason. ``except NullRunBlockedException`` continues to
+    match all of them — back-compat.
+
     Attributes:
         workflow_id: Workflow that was blocked (may be a sentinel like
             "<unknown>" when the block fires outside a workflow context,
@@ -165,10 +432,19 @@ class NullRunBlockedException(BreakerError):
             Surfaced as a first-class attribute (not just `details`) so
             cookbook examples and audit pipelines can read
             `exc.tool_name` without indexing into `**details`.
-            `None` when the block is workflow-scoped rather than
+            ``None`` when the block is workflow-scoped rather than
             tool-scoped.
         details: Free-form structured payload forwarded by the caller.
     """
+
+    error_code = "NR-X001"  # generic block; subclasses override
+    user_action = (
+        "NullRun blocked this call. The body did not run. See the "
+        "error_code link in the exception for the specific reason "
+        "and the fix."
+    )
+    retryable = False
+
     def __init__(
         self,
         workflow_id: str,
@@ -183,12 +459,58 @@ def __init__(
         self.tool_name = tool_name
         self.details = details
         tool_suffix = f", tool={tool_name}" if tool_name else ""
+        # ``error_code`` / ``user_action`` / ``retryable`` can be overridden
+        # by the caller via ``details`` — useful when the same call
+        # site raises for multiple block reasons and wants the
+        # catalog value to be exact (e.g. loop vs. retry storm).
+        error_code = details.pop("error_code", None) or self.error_code
+        user_action = details.pop("user_action", None) or self.user_action
+        retryable = details.pop("retryable", None)
+        if retryable is None:
+            retryable = self.retryable
         super().__init__(
             f"Workflow {workflow_id} blocked: {reason} "
-            f"(action={action}{tool_suffix}, details={details})"
+            f"(action={action}{tool_suffix}, details={details})",
+            error_code=error_code,
+            user_action=user_action,
+            retryable=retryable,
         )
 
 
+class NullRunBudgetError(NullRunBlockedException):
+    """Budget exhausted — every cost-bearing call will be rejected.
+
+    Subclass of :class:`NullRunBlockedException` so the existing
+    ``except NullRunBlockedException:`` pattern keeps matching.
+    """
+
+    error_code = "NR-B004"
+    user_action = (
+        "Workflow budget is exhausted. Increase the budget at "
+        "https://app.nullrun.io/billing or wait for the next billing "
+        "cycle. Until then, every @protect call will be rejected."
+    )
+    retryable = False
+
+
+class NullRunToolBlockedError(NullRunBlockedException):
+    """The tool is in the workflow's block list.
+
+    Subclass of :class:`NullRunBlockedException` so the existing
+    ``except NullRunBlockedException:`` pattern keeps matching.
+    Carries ``tool_name`` (set by the raise site) so the user knows
+    which tool is the offender.
+    """
+
+    error_code = "NR-T001"
+    user_action = (
+        "This tool is in the workflow's block list. Remove it from the "
+        "block list at https://app.nullrun.io/policies/<workflow> or "
+        "use a different tool."
+    )
+    retryable = False
+
+
 # NOTE (Sprint 2.2): the following six exception classes were removed
 # in 0.4.0 because they had no callers in the SDK or in any
 # test. They were zombie public surface — defined but never raised.
@@ -203,14 +525,26 @@ def __init__(
 #   - RateLimitExceededException
 
 
-class WorkflowPausedException(BreakerError):
+class WorkflowPausedException(NullRunError):
     """
     Raised when workflow is paused by NullRun.
 
     This allows the workflow to be resumed later after
     human approval or automatic cooldown.
+
+    Inherits from :class:`NullRunError` (Layer 1) so it carries
+    ``error_code`` (``NR-W003``) and a ``user_action`` hint pointing
+    at the workflow page on the dashboard.
     """
 
+    error_code = "NR-W003"
+    user_action = (
+        "The workflow is paused. Resume it at "
+        "https://app.nullrun.io/workflows/<workflow_id> or wait for "
+        "the cooldown to expire."
+    )
+    retryable = False
+
     def __init__(self, workflow_id: str, reason: str, resume_after: float | None = None) -> None:
         self.workflow_id = workflow_id
         self.reason = reason
@@ -235,7 +569,7 @@ class WorkflowKilledException(BaseException):
     be removed in a future major release; migrate new code to
     :class:`WorkflowKilledInterrupt` and update existing
     ``except WorkflowKilledException`` clauses to
-    ``except WorkflowKilledInterrupt``, or, if recovery is impossible,
+    ``except WorkflowKilledInterrupt``, or, if recovery is impossible,
     let the exception propagate to the top of the loop.
 
     This class is **not** an ``Exception`` subclass — kill is a
@@ -243,10 +577,28 @@ class WorkflowKilledException(BaseException):
     ``except Exception`` clauses. Only ``except BaseException`` or the
     explicit ``except WorkflowKilledInterrupt`` reliably stops the work.
     See ``docs/kill-contract.md`` §6 for the full rationale.
+
+    NOTE: NOT inheriting from :class:`NullRunError` because
+    ``NullRunError`` is an ``Exception`` subclass — and the kill
+    contract deliberately excludes ``except Exception`` from catching
+    this signal. The structured fields are instead declared
+    directly on this class, under the same names ``NullRunError``
+    uses, so the kill path still exposes ``error_code`` /
+    ``user_action`` without breaking the BaseException contract.
     """
 
+    error_code = "NR-W002"
+    user_action = (
+        "The workflow was killed. The body did not run and the kill "
+        "is non-recoverable from inside the agent loop. Inspect the "
+        "reason and, if appropriate, resume the workflow at "
+        "https://app.nullrun.io/workflows/<workflow_id>."
+    )
+    retryable = False
+
     def __init__(self, workflow_id: str, reason: str) -> None:
         import warnings as _w
+
         _w.warn(
             "WorkflowKilledException is deprecated. Catch "
             "WorkflowKilledInterrupt (BaseException) instead. The class "
@@ -279,7 +631,7 @@ class WorkflowKilledInterrupt(WorkflowKilledException):
         silently bypass the kill.
       * ``except BaseException`` catches it, like the stdlib interrupts.
 
-    See ``docs/kill-contract.md`` §6 for the full rationale, including
+    See ``docs/kill-contract.md`` §6 for the full rationale, including
     the four-level coverage model and the decision tree for users.
 
     Fields:
diff --git a/src/nullrun/decorators.py b/src/nullrun/decorators.py
index 0c3a54e..4700904 100644
--- a/src/nullrun/decorators.py
+++ b/src/nullrun/decorators.py
@@ -72,19 +72,41 @@ def researcher(q):
 # ``_safe_kwargs`` would have shipped them in the audit log.
 # Matching is case-insensitive (see ``_safe_kwargs`` which calls
 # ``.lower()`` on the key).
-SENSITIVE_ARG_KEYS = frozenset({
-    # Credentials / secrets
-    "password", "passwd", "pwd",
-    "token", "secret", "api_key", "apikey",
-    "key", "auth", "authorization", "bearer",
-    "session", "session_id", "cookie",
-    "access_token", "refresh_token", "id_token",
-    "private_key", "secret_key",
-    # PII
-    "email", "phone", "ssn",
-    "credit_card", "credit_card_number", "cvv", "cvc", "pin",
-    "otp", "mfa",
-})
+SENSITIVE_ARG_KEYS = frozenset(
+    {
+        # Credentials / secrets
+        "password",
+        "passwd",
+        "pwd",
+        "token",
+        "secret",
+        "api_key",
+        "apikey",
+        "key",
+        "auth",
+        "authorization",
+        "bearer",
+        "session",
+        "session_id",
+        "cookie",
+        "access_token",
+        "refresh_token",
+        "id_token",
+        "private_key",
+        "secret_key",
+        # PII
+        "email",
+        "phone",
+        "ssn",
+        "credit_card",
+        "credit_card_number",
+        "cvv",
+        "cvc",
+        "pin",
+        "otp",
+        "mfa",
+    }
+)
 
 
 def _safe_repr(value: object, max_len: int = 50) -> str:
@@ -128,8 +150,7 @@ def _safe_repr(value: object, max_len: int = 50) -> str:
 def _safe_kwargs(kwargs: dict[str, Any]) -> dict[str, Any]:
     """Mask sensitive kwargs (case-insensitive)."""
     return {
-        k: "***" if k.lower() in SENSITIVE_ARG_KEYS else _safe_repr(v)
-        for k, v in kwargs.items()
+        k: "***" if k.lower() in SENSITIVE_ARG_KEYS else _safe_repr(v) for k, v in kwargs.items()
     }
 
 
@@ -172,7 +193,7 @@ def _safe_args(fn: Callable[..., Any], args: tuple[Any, ...]) -> list[Any]:
         else:
             masked.append(_safe_repr(value))
     # Trailing *args have no name — best-effort safe repr.
-    for value in args[len(bound_params):]:
+    for value in args[len(bound_params) :]:
         masked.append(_safe_repr(value))
     return masked
 
@@ -498,9 +519,15 @@ def sync_wrapper(*args: Any, **kwargs: Any) -> Any:
             # exception type so callers that distinguish hard vs
             # soft blocks keep that signal.
             if isinstance(exc, (WorkflowKilledInterrupt, WorkflowPausedException)):
+                # Layer 1: stamp the kill/pause error_code so the user
+                # can tell WHY the body did not run — ``NR-W002``
+                # (killed) vs ``NR-W003`` (paused). user_action is not
+                # overridden; the original is chained via ``from exc``.
+                _code = "NR-W002" if isinstance(exc, WorkflowKilledInterrupt) else "NR-W003"
                 raise NullRunBlockedException(
                     workflow_id=exc.workflow_id,
                     reason=exc.reason,
+                    error_code=_code,
                 ) from exc
             raise
         finally:
@@ -614,10 +641,29 @@ def _enforce_sensitive_tool(
                 f"{exc.source} on /{exc.endpoint}. NULLRUN_SENSITIVE_FAIL_OPEN=1 — body will run."
             )
             return
+        # Layer 1: stamp the source-specific error code so the
+        # caller can distinguish "backend is down" from "we tripped
+        # the local circuit breaker". Both are retryable in the
+        # sense that the body will run when the policy engine
+        # recovers, but the body still MUST NOT run now (fail-CLOSED).
+        _code = {
+            TransportErrorSource.NETWORK_ERROR: "NR-B001",
+            TransportErrorSource.GATEWAY_ERROR: "NR-B002",
+            TransportErrorSource.AUTH_ERROR: "NR-A003",
+            TransportErrorSource.BREAKER_OPEN: "NR-B005",
+        }.get(exc.source, "NR-B001")
         raise NullRunBlockedException(
             workflow_id=workflow_id,
             reason=f"policy engine unavailable: {exc.source}",
             tool_name=fn.__name__,
+            error_code=_code,
+            user_action=(
+                f"The NullRun policy engine is unreachable "
+                f"({exc.source.value}). The body of @sensitive "
+                f"'{fn.__name__}' did NOT run (fail-CLOSED). "
+                f"Set NULLRUN_SENSITIVE_FAIL_OPEN=1 to opt out for "
+                f"tests / staging — production should leave it off."
+            ),
         ) from exc
     except Exception as exc:  # noqa: BLE001
         # Any other exception is a transport / network / backend
@@ -634,6 +680,14 @@ def _enforce_sensitive_tool(
             workflow_id=workflow_id,
             reason=f"policy engine unavailable: {exc}",
             tool_name=fn.__name__,
+            error_code="NR-B001",
+            user_action=(
+                f"The NullRun policy engine raised an unexpected "
+                f"exception during the @sensitive pre-check of "
+                f"'{fn.__name__}'. The body did NOT run. Check the "
+                f"chained exception (raise ... from exc) for the "
+                f"root cause."
+            ),
         ) from exc
 
     # Defense in depth (ADR-008 Rule 1 + Rule 2): if `runtime.execute`
@@ -645,7 +699,8 @@ def _enforce_sensitive_tool(
         decision_source = result.get("decision_source", "")
         if isinstance(decision_source, str) and (
             decision_source.startswith("FALLBACK_")
-            or decision_source in {
+            or decision_source
+            in {
                 TransportErrorSource.NETWORK_ERROR,
                 TransportErrorSource.GATEWAY_ERROR,
                 TransportErrorSource.BREAKER_OPEN,
@@ -658,10 +713,29 @@ def _enforce_sensitive_tool(
                     f"{decision_source}; NULLRUN_SENSITIVE_FAIL_OPEN=1 — body will run."
                 )
                 return
+            # Layer 1: stamp the source-specific code on the
+            # fallback block so cookbook code can distinguish
+            # between "the policy engine said block" (NR-T001 etc.)
+            # and "we blocked because the policy engine never
+            # answered" (NR-B001/B002).
+            _code = {
+                "NETWORK_ERROR": "NR-B001",
+                "GATEWAY_ERROR": "NR-B002",
+                "AUTH_ERROR": "NR-A003",
+                "BREAKER_OPEN": "NR-B005",
+            }.get(decision_source, "NR-B001")
             raise NullRunBlockedException(
                 workflow_id=workflow_id,
                 reason=f"policy engine unavailable: {decision_source}",
                 tool_name=fn.__name__,
+                error_code=_code,
+                user_action=(
+                    f"The NullRun policy engine returned a fallback "
+                    f"({decision_source}) for @sensitive '{fn.__name__}'. "
+                    f"The body did NOT run. Retry once the policy engine "
+                    f"is back — or set NULLRUN_SENSITIVE_FAIL_OPEN=1 for "
+                    f"tests / staging."
+                ),
             )
 
     # Real `decision=block` from the gateway is already converted to
diff --git a/src/nullrun/runtime.py b/src/nullrun/runtime.py
index e763c6f..7c5bef8 100644
--- a/src/nullrun/runtime.py
+++ b/src/nullrun/runtime.py
@@ -109,6 +109,7 @@ def _prune(self, tool_name: str, before: float) -> None:
         while self._calls[tool_name] and self._calls[tool_name][0] < before:
             self._calls[tool_name].popleft()
 
+
 class RateTracker:
     """
     In-memory rate tracking using deque with timestamps.
@@ -159,6 +160,7 @@ def _prune(self, before: float) -> None:
         while self._calls and self._calls[0] < before:
             self._calls.popleft()
 
+
 @dataclass
 class LocalDecision:
     """Decision from local check (no network round-trip)."""
@@ -167,6 +169,7 @@ class LocalDecision:
     reason: str = None
     suggestion: str = None
 
+
 logger = logging.getLogger(__name__)
 
 # Phase 0.3.1: sentinel used when a gate fires outside a
@@ -176,6 +179,7 @@ class LocalDecision:
 # collision hazard). Wire compat: still a string.
 UNKNOWN_WORKFLOW_ID: str = "__nullrun_unknown__"
 
+
 @dataclass
 class Policy:
     """
@@ -255,6 +259,7 @@ def from_dict(cls, data: dict[str, Any]) -> "Policy":
             retry_detection_enabled=data.get("retry_detection_enabled", True),
         )
 
+
 class NullRunRuntime:
     """
     Central runtime for NullRun SDK.
@@ -634,7 +639,17 @@ def _authenticate(self) -> None:
         a secret key rotation. The SDK stores this and uses it for signing.
         """
         if not self.api_key:
-            raise BreakerError("API key required for cloud mode")
+            from nullrun.breaker.exceptions import NullRunConfigError
+
+            raise NullRunConfigError(
+                "API key required for cloud mode",
+                error_code="NR-C001",
+                user_action=(
+                    "Set NULLRUN_API_KEY env var or pass api_key='nr_live_...' "
+                    "to nullrun.init(). The SDK cannot operate without "
+                    "credentials — the no-op local mode was removed in 0.3.0."
+                ),
+            )
 
         logger.debug(f"Authenticating with API at {self.api_url}/auth/verify")
         try:
@@ -651,7 +666,15 @@ def _authenticate(self) -> None:
                 if not org_id:
                     raise NullRunAuthenticationError(
                         "Auth response missing organization_id - server may be outdated or compromised. "
-                        "Refusing to operate with legacy identity."
+                        "Refusing to operate with legacy identity.",
+                        error_code="NR-A002",
+                        user_action=(
+                            "The NullRun backend returned a 200 but the response "
+                            "is missing organization_id. This usually means the "
+                            "backend is on an older version than the SDK expects — "
+                            "update the backend, or downgrade the SDK to a "
+                            "version compatible with the deployed backend."
+                        ),
                     )
                 self.organization_id = org_id
 
@@ -703,13 +726,23 @@ def _authenticate(self) -> None:
                 # Auth failed - raise exception instead of silent fallback
                 raise NullRunAuthenticationError(
                     f"Auth failed with status {response.status_code}. "
-                    f"API key may be invalid or expired. Not operating in unsafe mode."
+                    f"API key may be invalid or expired. Not operating in unsafe mode.",
+                    error_code=("NR-A003" if response.status_code == 401 else "NR-A001"),
                 )
         except httpx.RequestError as e:
             # Network error - raise exception, do not fall back silently
             raise NullRunAuthenticationError(
                 f"Auth request failed: {e}. Cannot establish secure connection to NullRun. "
-                f"Refusing to operate in unprotected mode."
+                f"Refusing to operate in unprotected mode.",
+                error_code="NR-B001",
+                user_action=(
+                    "Could not reach the NullRun backend at "
+                    f"{self.api_url}. Check network connectivity and the "
+                    "configured api_url. This is a transport failure (not "
+                    "an auth failure) — the API key may be valid, the "
+                    "backend is just unreachable."
+                ),
+                cause=e,
             ) from e
 
     def _fetch_policy(self) -> None:
@@ -763,9 +796,7 @@ def _fetch_policy(self) -> None:
         fail_open = os.environ.get("NULLRUN_POLICY_FAIL_OPEN", "").strip() == "1"
 
         if not self.organization_id:
-            self._policy = (
-                Policy.default_local() if fail_open else Policy.strict_local()
-            )
+            self._policy = Policy.default_local() if fail_open else Policy.strict_local()
             logger.warning(
                 "No organization_id; policy fetch skipped. fail-OPEN=%s "
                 "(NULLRUN_POLICY_FAIL_OPEN=1 to restore permissive fallback).",
@@ -821,9 +852,7 @@ def _fetch_policy(self) -> None:
                     self.organization_id,
                 )
         except Exception as e:
-            logger.warning(
-                "Failed to fetch policy for org=%s: %s", self.organization_id, e
-            )
+            logger.warning("Failed to fetch policy for org=%s: %s", self.organization_id, e)
 
         # Audit F-R2-02: fail-CLOSED. Order of precedence:
         #   1. last known-good cached policy (if any)
@@ -1705,7 +1734,13 @@ def get_org_status(self, org_id: str | None = None) -> dict[str, Any]:
         resolved = org_id or self.organization_id
         if not resolved:
             raise NullRunAuthenticationError(
-                "get_org_status requires org_id (or a runtime bound to one)"
+                "get_org_status requires org_id (or a runtime bound to one)",
+                error_code="NR-C003",
+                user_action=(
+                    "Call nullrun.init() first, or pass org_id=<uuid> "
+                    "explicitly. The runtime is not bound to an organization "
+                    "yet — auth() must complete before this method can be used."
+                ),
             )
         response = self._transport._client.get(
             f"{self.api_url}/api/v1/orgs/{resolved}/status",
@@ -2120,9 +2155,11 @@ def track_event(
             event["_fingerprint"] = _fingerprint_for_event_dict(event)
         return self.track(event)
 
+
 # Module-level convenience functions
 _runtime: NullRunRuntime | None = None
 
+
 def get_runtime() -> NullRunRuntime:
     """Get or create the global runtime instance."""
     global _runtime
@@ -2130,6 +2167,7 @@ def get_runtime() -> NullRunRuntime:
         _runtime = NullRunRuntime.get_instance()
     return _runtime
 
+
 def track(event: dict[str, Any]) -> dict[str, Any]:
     """
     Module-level track function.
@@ -2144,11 +2182,13 @@ def track(event: dict[str, Any]) -> dict[str, Any]:
     """
     return get_runtime().track(event)
 
+
 # Phase 3.4: explicit alias for `track()` -- same call signature, friendlier
 # name for users who reach for `track_event` first. Both names share the
 # same callable object, so `nullrun.track is nullrun.track_event` is True.
 track_event = track
 
+
 def track_llm(
     input_tokens: int,
     output_tokens: int = 0,
@@ -2170,6 +2210,7 @@ def track_llm(
     """
     return get_runtime().track_llm(input_tokens, output_tokens, **kwargs)
 
+
 def track_tool(
     tool_name: str,
     duration_ms: int | None = None,
diff --git a/src/nullrun/transport.py b/src/nullrun/transport.py
index 32772b5..af17171 100644
--- a/src/nullrun/transport.py
+++ b/src/nullrun/transport.py
@@ -54,6 +54,7 @@
 # HMAC Request Signing (Task 11)
 # =============================================================================
 
+
 def generate_hmac_signature(
     api_key: str,
     secret_key: str,
@@ -89,6 +90,7 @@ def generate_hmac_signature(
 
     return signature
 
+
 def verify_hmac_signature(
     api_key: str,
     secret_key: str,
@@ -133,10 +135,12 @@ def verify_hmac_signature(
     # Constant-time comparison to prevent timing attacks
     return hmac.compare_digest(expected, signature)
 
+
 # =============================================================================
 # Policy Cache for CACHED fallback mode
 # =============================================================================
 
+
 class CachedDecision:
     """Represents a cached execute decision."""
 
@@ -157,6 +161,7 @@ def __init__(
     def is_expired(self) -> bool:
         return time.monotonic() - self.cached_at > self.ttl_seconds
 
+
 class PolicyCache:
     """
     LRU cache for execute decisions. Used in CACHED fallback mode.
@@ -224,6 +229,7 @@ def get_stats(self) -> dict:
     def __len__(self) -> int:
         return len(self._cache)
 
+
 def _signed_request_body(payload: dict[str, Any]) -> bytes:
     """Serialise a JSON payload to the canonical bytes the HMAC
     signature is computed over.
@@ -241,6 +247,7 @@ def _signed_request_body(payload: dict[str, Any]) -> bytes:
     """
     return json.dumps(payload, separators=(",", ":")).encode("utf-8")
 
+
 # =============================================================================
 # Retry with exponential backoff + jitter
 # =============================================================================
@@ -249,6 +256,7 @@ def _signed_request_body(payload: dict[str, Any]) -> bytes:
 Retry with exponential backoff + jitter + Retry-After header support
 """
 
+
 def _retry_with_backoff(
     func: Callable[[], Any],
     max_retries: int = 3,
@@ -277,16 +285,29 @@ def _retry_with_backoff(
 
             if hasattr(result, "status_code"):
                 if result.status_code == 401:
-                    raise NullRunAuthenticationError("Invalid API key")
+                    from nullrun.breaker.exceptions import NullRunAuthError
+
+                    raise NullRunAuthError(
+                        "Invalid API key",
+                        error_code="NR-A003",
+                        user_action=(
+                            "The NullRun backend rejected the API key (401). "
+                            "Verify it at https://app.nullrun.io/settings/api-keys "
+                            "and rotate if it was revoked. The key may also be "
+                            "for a different environment (prod vs. staging) — "
+                            "check the API_URL vs. where the key was issued."
+                        ),
+                    )
                 if result.status_code >= 500 and on_transport_error == "raise":
                     # Round 3 (Phase 0.4.0): 5xx is a classified
                     # GATEWAY_ERROR. Don't retry -- this is a server
                     # bug, not a network blip. Only raise when the
                     # caller has opted into the typed-error contract
                     # via on_transport_error="raise".
-                    raise NullRunTransportError(
+                    from nullrun.breaker.exceptions import NullRunBackendError
+
+                    raise NullRunBackendError(
                         f"Gateway returned {result.status_code}",
-                        source=TransportErrorSource.GATEWAY_ERROR,
                         endpoint="execute",
                         status_code=result.status_code,
                     )
@@ -353,10 +374,12 @@ def _retry_with_backoff(
 
     raise BreakerTransportError(f"Request failed after {max_retries + 1} attempts") from last_exc
 
+
 # =============================================================================
 # Fallback Modes (Phase 1 - SDK Resilience)
 # =============================================================================
 
+
 class FallbackMode:
     """
     SDK behavior when Gateway is unavailable.
@@ -372,6 +395,7 @@ class FallbackMode:
     # Use cached decision if Gateway unavailable
     CACHED = "cached"
 
+
 class DecisionSource:
     """
     Where the decision originated - for provenance tracking.
@@ -382,6 +406,7 @@ class DecisionSource:
     FALLBACK = "fallback"
     LOCAL = "local"
 
+
 @dataclass
 class FlushConfig:
     """Configuration for transport flush behavior."""
@@ -393,6 +418,7 @@ class FlushConfig:
     max_buffer_size: int = 1000  # Max events before dropping oldest
     max_failed_flush: int = 10  # Circuit breaker: stop trying after this many failures
 
+
 @dataclass
 class ExecuteConfig:
     """Configuration for execute (strict mode) behavior."""
@@ -408,6 +434,7 @@ class ExecuteConfig:
     # Cache max size
     cache_max_size: int = 10000
 
+
 class Transport:
     """
     HTTP transport with batching support.
@@ -1643,6 +1670,7 @@ async def _refetch_credentials(self) -> None:
         except Exception as e:
             logger.error(f"Error refetching credentials: {e}")
 
+
 # Audit F-R2-13 (2026-06-22): the module-level ``_parse_error_envelope``
 # helper below is documented as "canonical" but is NOT called from any
 # live wire path — every endpoint does its own ad-hoc
diff --git a/tests/test_exception_hierarchy.py b/tests/test_exception_hierarchy.py
new file mode 100644
index 0000000..d5d1103
--- /dev/null
+++ b/tests/test_exception_hierarchy.py
@@ -0,0 +1,258 @@
+"""Unit tests for the Layer-1 structured exception hierarchy.
+
+Every public SDK exception class should:
+  1. Inherit from ``NullRunError`` so a single ``except NullRunError``
+     clause catches them all (with structured fields).
+  2. Carry a stable ``error_code`` (e.g. ``"NR-A001"``) so users can
+     grep / log / document per-code behaviour.
+  3. Carry a ``user_action`` string telling the user what to do next.
+  4. Set ``retryable`` correctly — ``True`` only for transient
+     failures, ``False`` for configuration / permission / budget.
+  5. Have a ``docs_url`` for the per-code docs page.
+
+Back-compat invariants (do not break in Layer 1):
+  A. ``except NullRunAuthenticationError`` still catches
+     ``NullRunAuthError`` (subclass).
+  B. ``except NullRunBlockedException`` still catches
+     ``NullRunBudgetError`` and ``NullRunToolBlockedError``.
+  C. ``except NullRunTransportError`` still catches
+     ``NullRunBackendError`` and ``RateLimitError``.
+  D. ``except WorkflowKilledException`` still catches
+     ``WorkflowKilledInterrupt`` (BaseException inheritance).
+  E. ``except Exception`` does NOT catch ``WorkflowKilledInterrupt``.
+
+The tests below are the safety net for the above — a future
+refactor that breaks one of them is a regression even if no other
+test fails.
+"""
+
+import pytest
+
+from nullrun.breaker.exceptions import (
+    # Base
+    BreakerError,
+    NullRunAuthenticationError,
+    NullRunAuthError,
+    NullRunBackendError,
+    # Block
+    NullRunBlockedException,
+    NullRunBudgetError,
+    # Config / auth
+    NullRunConfigError,
+    NullRunError,
+    NullRunToolBlockedError,
+    # Transport
+    NullRunTransportError,
+    RateLimitError,
+    TransportErrorSource,
+    WorkflowKilledException,
+    WorkflowKilledInterrupt,
+    # Workflow state
+    WorkflowPausedException,
+)
+
+
+# ---------------------------------------------------------------------------
+# 1. Base class — every public exception must inherit from NullRunError
+# ---------------------------------------------------------------------------
+class TestHierarchyRoots:
+    def test_all_exceptions_inherit_from_nullrun_error(self):
+        for cls in (
+            NullRunAuthenticationError,
+            NullRunAuthError,
+            NullRunConfigError,
+            NullRunTransportError,
+            NullRunBackendError,
+            RateLimitError,
+            NullRunBlockedException,
+            NullRunBudgetError,
+            NullRunToolBlockedError,
+            WorkflowPausedException,
+        ):
+            assert issubclass(cls, NullRunError), (
+                f"{cls.__name__} must inherit from NullRunError so users "
+                f"can do `except NullRunError:` to catch every structured "
+                f"SDK failure."
+            )
+
+    def test_killed_interrupt_does_not_inherit_from_exception(self):
+        # WorkflowKilledInterrupt is a BaseException subclass by design
+        # (per docs/kill-contract.md). It MUST NOT inherit from
+        # NullRunError (which is an Exception subclass), so that
+        # `except Exception` does not catch the kill signal.
+        assert not issubclass(WorkflowKilledInterrupt, Exception)
+        assert not issubclass(WorkflowKilledInterrupt, NullRunError)
+        # But it MUST inherit from WorkflowKilledException (legacy
+        # back-compat shim) so old `except WorkflowKilledException`
+        # clauses still match.
+        assert issubclass(WorkflowKilledInterrupt, WorkflowKilledException)
+
+
+# ---------------------------------------------------------------------------
+# 2. Structured fields — error_code, user_action, retryable, docs_url
+# ---------------------------------------------------------------------------
+class TestStructuredFields:
+    def test_default_fields_present_on_base(self):
+        exc = NullRunError("oops")
+        assert exc.error_code == "NR-0000"
+        assert exc.user_action == ""
+        assert exc.retryable is False
+        assert exc.docs_url == "https://docs.nullrun.io/errors"
+
+    def test_per_instance_overrides(self):
+        exc = NullRunError(
+            "boom",
+            error_code="NR-X999",
+            user_action="do X",
+            retryable=True,
+            docs_url="https://docs/x",
+        )
+        assert exc.error_code == "NR-X999"
+        assert exc.user_action == "do X"
+        assert exc.retryable is True
+        assert exc.docs_url == "https://docs/x"
+
+    def test_subclass_class_attribute_inheritance(self):
+        # NullRunBackendError is a real class with a real
+        # ``error_code`` / ``user_action`` / ``retryable`` triple.
+        exc = NullRunBackendError("5xx", endpoint="/api/v1/check")
+        assert exc.error_code == "NR-B002"
+        assert "NullRun backend" in exc.user_action
+        assert exc.retryable is True
+
+    def test_cause_chains_via_from(self):
+        original = RuntimeError("underlying")
+        try:
+            raise NullRunError("wrapper", cause=original) from original
+        except NullRunError as exc:
+            assert exc.cause is original
+            assert exc.__cause__ is original
+
+
+# ---------------------------------------------------------------------------
+# 3. Back-compat — every existing except clause must still match
+# ---------------------------------------------------------------------------
+class TestBackCompat:
+    def test_auth_error_caught_by_authentication_error(self):
+        with pytest.raises(NullRunAuthenticationError):
+            raise NullRunAuthError("key rejected")
+
+    def test_budget_error_caught_by_blocked_exception(self):
+        with pytest.raises(NullRunBlockedException):
+            raise NullRunBudgetError(workflow_id="wf-1", reason="budget exhausted")
+
+    def test_tool_blocked_error_caught_by_blocked_exception(self):
+        with pytest.raises(NullRunBlockedException):
+            raise NullRunToolBlockedError(
+                workflow_id="wf-1", reason="blocked", tool_name="send_email"
+            )
+
+    def test_backend_error_caught_by_transport_error(self):
+        with pytest.raises(NullRunTransportError):
+            raise NullRunBackendError("5xx", endpoint="/api/v1/check", status_code=503)
+
+    def test_killed_interrupt_caught_by_killed_exception(self):
+        # Back-compat shim — legacy `except WorkflowKilledException`
+        # must still match the new interrupt subclass.
+        with pytest.raises(WorkflowKilledException):
+            raise WorkflowKilledInterrupt("wf-1", reason="killed via API")
+
+    def test_killed_interrupt_not_caught_by_exception(self):
+        # The whole point of BaseException inheritance: kill must
+        # not be swallowable by `except Exception`.
+        with pytest.raises(BaseException) as exc_info:
+            raise WorkflowKilledInterrupt("wf-1", reason="killed")
+        assert isinstance(exc_info.value, WorkflowKilledInterrupt)
+        assert not isinstance(exc_info.value, Exception)
+
+
+# ---------------------------------------------------------------------------
+# 4. Specific error codes — the catalog
+# ---------------------------------------------------------------------------
+class TestErrorCodeCatalog:
+    """Spot-checks for the most common error codes. If a future
+    refactor accidentally renames a code, this test fails loudly
+    with a `git grep`-friendly message."""
+
+    def test_no_api_key_is_NR_C001(self):
+        with pytest.raises(NullRunConfigError) as info:
+            raise NullRunConfigError("no api_key", error_code="NR-C001")
+        assert info.value.error_code == "NR-C001"
+
+    def test_api_key_rejected_is_NR_A003(self):
+        with pytest.raises(NullRunAuthError) as info:
+            raise NullRunAuthError("key rejected")
+        assert info.value.error_code == "NR-A003"
+
+    def test_backend_5xx_is_NR_B002(self):
+        with pytest.raises(NullRunBackendError) as info:
+            raise NullRunBackendError("5xx", endpoint="/api/v1/check")
+        assert info.value.error_code == "NR-B002"
+        assert info.value.retryable is True
+
+    def test_budget_exhausted_is_NR_B004(self):
+        with pytest.raises(NullRunBudgetError) as info:
+            raise NullRunBudgetError("wf-1", reason="budget exhausted")
+        assert info.value.error_code == "NR-B004"
+        assert info.value.retryable is False
+
+    def test_tool_blocked_is_NR_T001(self):
+        with pytest.raises(NullRunToolBlockedError) as info:
+            raise NullRunToolBlockedError("wf-1", reason="blocked", tool_name="send_email")
+        assert info.value.error_code == "NR-T001"
+        assert info.value.tool_name == "send_email"
+
+    def test_killed_is_NR_W002(self):
+        with pytest.raises(WorkflowKilledInterrupt) as info:
+            raise WorkflowKilledInterrupt("wf-1", reason="killed")
+        # Not a NullRunError (BaseException-based), but still carries a code.
+        assert info.value.error_code == "NR-W002"
+
+    def test_paused_is_NR_W003(self):
+        with pytest.raises(WorkflowPausedException) as info:
+            raise WorkflowPausedException("wf-1", reason="paused")
+        assert info.value.error_code == "NR-W003"
+
+    def test_rate_limit_is_NR_R001(self):
+        with pytest.raises(RateLimitError) as info:
+            raise RateLimitError(
+                "429",
+                source=TransportErrorSource.GATEWAY_ERROR,
+                endpoint="/api/v1/check",
+            )
+        assert info.value.error_code == "NR-R001"
+        assert info.value.retryable is True
+
+
+# ---------------------------------------------------------------------------
+# 5. Transport-error → code mapping
+# ---------------------------------------------------------------------------
+class TestTransportCodeMapping:
+    """The transport layer classifies failures by ``TransportErrorSource``;
+    each source maps to a stable ``error_code`` so cookbook code and
+    Sentry rules can branch on it without parsing the message."""
+
+    def test_network_error_maps_to_NR_B001(self):
+        exc = NullRunTransportError(
+            "timeout",
+            source=TransportErrorSource.NETWORK_ERROR,
+            endpoint="/api/v1/check",
+        )
+        assert exc.error_code == "NR-B001"
+        assert exc.retryable is True
+
+    def test_gateway_error_maps_to_NR_B002(self):
+        exc = NullRunTransportError(
+            "5xx",
+            source=TransportErrorSource.GATEWAY_ERROR,
+            endpoint="/api/v1/check",
+        )
+        assert exc.error_code == "NR-B002"
+
+    def test_auth_error_maps_to_NR_A003(self):
+        exc = NullRunTransportError(
+            "401",
+            source=TransportErrorSource.AUTH_ERROR,
+            endpoint="/api/v1/check",
+        )
+        assert exc.error_code == "NR-A003"