ContentOps/contentops/cli/commands/lifecycle.py at main · KustoKing/ContentOps · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
# SPDX-FileCopyrightText: 2026 KustoKing / SecM8
# SPDX-License-Identifier: Apache-2.0

"""Lifecycle commands: disable / lock / unlock / retry-failed / lifecycle promote.

Grouped because ``disable``, ``lock``, ``unlock`` all share
``_find_yaml_for_id`` and the same ID-on-disk resolution semantics.
``retry-failed`` and ``lifecycle promote`` are status-mutation siblings.
"""

from __future__ import annotations

import os
import re
import sys
from datetime import date
from pathlib import Path

import click

from contentops.audit import write_records
from contentops.cli.handler_factories import register_default_handlers
from contentops.cli.commands._shared import _load_all
from contentops.cli.commands.apply import _build_audit_record
from contentops.core.asset import Asset
from contentops.core.discovery import discover_assets, load_asset
from contentops.core.handler import LoadedAsset
from contentops.core.registry import default_registry
from contentops.core.result import ActionResult, PlanAction


# Matches a column-0 ``status: <token>`` line (multiline mode). The value must
# be a single whitespace-free token followed only by spaces/tabs, so indented
# (nested) ``status:`` keys and values with trailing comments do not match.
_STATUS_LINE_RE = re.compile(r"(?m)^status:[ \t]*\S+[ \t]*$")
# Matches a column-0 ``localCustomization: <token>`` line *including* its
# trailing newline; a flag sitting on an unterminated final line does not match.
_LOCK_TOPLEVEL_RE = re.compile(r"(?m)^localCustomization:[ \t]*\S+[ \t]*\n")


def _find_yaml_for_id(detections_path: Path, rule_id: str) -> Path:
    """Resolve a rule_id to exactly one YAML on disk; click-exits otherwise.

    Walks every asset discovered under ``detections_path`` and compares its
    envelope id with ``rule_id``. Files that fail to load are skipped so one
    broken YAML cannot block the lookup, but they are remembered: when no
    match is found they are listed on stderr, because the rule being looked
    for may live in one of them.

    Exits the process with status 1 (after printing to stderr) when the id
    matches no file or more than one file.
    """
    matches: list[Path] = []
    unloadable: list[Path] = []
    for p in discover_assets(detections_path):
        try:
            la = load_asset(p)
        except Exception:
            # Best-effort scan: keep going, but remember what was skipped.
            unloadable.append(p)
            continue
        if la.envelope.id == rule_id:
            matches.append(p)
    if not matches:
        click.echo(
            f"error: no rule with id={rule_id!r} found under {detections_path}",
            err=True,
        )
        if unloadable:
            # Without this hint a malformed target file is indistinguishable
            # from a mistyped id.
            click.echo(
                f"note: {len(unloadable)} file(s) could not be loaded and were "
                f"skipped; the rule may be defined in one of them:",
                err=True,
            )
            for skipped in unloadable:
                click.echo(f"  {skipped}", err=True)
        sys.exit(1)
    if len(matches) > 1:
        click.echo(
            f"error: rule id={rule_id!r} matches {len(matches)} files (data integrity issue):",
            err=True,
        )
        for m in matches:
            click.echo(f"  {m}", err=True)
        sys.exit(1)
    return matches[0]


def _disable_one(target: Path, rule_id: str, reason: str | None) -> bool:
    """Apply the status -> deprecated mutation to a single YAML.

    Returns True if the file changed, False if the rule was already
    deprecated (no-op). Raises ``click.ClickException`` if the YAML
    has no top-level ``status:`` line (refuse to mutate ambiguous
    files). Centralises the YAML-rewrite so the single-rule and
    cohort paths share one implementation.

    When ``reason`` is given it is appended as a double-quoted
    ``disableReason`` value on a single line; otherwise a dated comment
    marker is appended instead.
    """
    # Function-scope import: only needed to quote the free-text reason.
    import json

    text = target.read_text(encoding="utf-8")
    status_match = _STATUS_LINE_RE.search(text)
    if status_match is None:
        raise click.ClickException(
            f"cannot find a top-level `status:` line in {target}; "
            f"refusing to mutate"
        )
    # Compare the value only: the regex tolerates any amount of spaces/tabs
    # after the colon, so an exact "status: deprecated" string match would
    # miss e.g. "status:  deprecated" and re-disable the rule.
    current_value = status_match.group(0).partition(":")[2].strip()
    if current_value == "deprecated":
        click.echo(f"warn: {rule_id} already deprecated ({target}); no changes made")
        return False
    new_text = _STATUS_LINE_RE.sub("status: deprecated", text, count=1)
    if reason:
        # A JSON string literal is a valid YAML double-quoted scalar, and it
        # escapes backslashes, quotes, newlines and control characters.
        # Escaping only '"' would leave the YAML unparseable for reasons such
        # as 'noisy on C:\temp' or multi-line text. Output for plain reasons
        # is unchanged.
        addition = f"disableReason: {json.dumps(reason, ensure_ascii=False)}\n"
    else:
        addition = f"# disabled by contentops disable on {date.today().isoformat()}\n"
    if not new_text.endswith("\n"):
        new_text += "\n"
    new_text += addition
    target.write_text(new_text, encoding="utf-8")
    click.echo(f"disabled {rule_id}: status -> deprecated ({target})")
    return True


def _find_yamls_by_pattern(
    detections_path: Path, pattern: str,
) -> list[tuple[str, Path]]:
    """Collect ``(envelope_id, path)`` pairs whose envelope id matches a glob.

    Every asset discovered under ``detections_path`` is loaded; files that
    fail to load are skipped. The id is tested against ``pattern`` with the
    case-sensitive ``fnmatch.fnmatchcase``. The result is ordered by
    envelope id so listings are deterministic.
    """
    from fnmatch import fnmatchcase

    hits: list[tuple[str, Path]] = []
    for yaml_path in discover_assets(detections_path):
        try:
            loaded = load_asset(yaml_path)
        except Exception:
            continue
        envelope_id = loaded.envelope.id
        if not fnmatchcase(envelope_id, pattern):
            continue
        hits.append((envelope_id, yaml_path))
    # Stable sort on the id alone keeps discovery order for equal ids.
    return sorted(hits, key=lambda pair: pair[0])


def _find_yamls_by_cohort(
    detections_path: Path, cohort: str,
) -> list[tuple[str, Path]]:
    """Collect ``(envelope_id, path)`` pairs whose ``metadata.cohort``
    is exactly ``cohort``.

    Assets that fail to load, or whose envelope carries no metadata, are
    skipped. The result is ordered by envelope id.
    """
    hits: list[tuple[str, Path]] = []
    for yaml_path in discover_assets(detections_path):
        try:
            loaded = load_asset(yaml_path)
        except Exception:
            continue
        meta = loaded.envelope.metadata
        if meta is None:
            continue
        if meta.cohort == cohort:
            hits.append((loaded.envelope.id, yaml_path))
    # Stable sort on the id alone keeps discovery order for equal ids.
    return sorted(hits, key=lambda pair: pair[0])


def _select_one_of(
    *, rule_id: str | None, pattern: str | None, cohort: str | None,
) -> None:
    """Enforce that exactly one of the three selectors is provided.

    Shared between ``disable`` and ``enable`` so the UsageError text
    stays in sync.
    """
    given = [name for name, val in (
        ("rule_id", rule_id), ("--pattern", pattern), ("--cohort", cohort),
    ) if val]
    if len(given) > 1:
        raise click.UsageError(
            "``rule_id``, ``--pattern``, and ``--cohort`` are mutually exclusive "
            f"(got: {', '.join(given)})."
        )
    if not given:
        raise click.UsageError(
            "exactly one of positional ``rule_id``, ``--pattern``, or ``--cohort`` is required."
        )


@click.command("disable")
@click.argument("rule_id", required=False)
@click.option(
    "--pattern", "pattern", default=None,
    help=(
        "fnmatch-style glob (e.g. 'aad-*' or 'o365-*-anomaly'). "
        "Disables every rule whose envelope id matches. Mutually "
        "exclusive with the positional ``rule_id`` and ``--cohort``. "
        "Requires ``--yes`` to actually mutate the YAMLs; without "
        "``--yes`` the command lists matches and exits (dry-run by default)."
    ),
)
@click.option(
    "--cohort", "cohort", default=None,
    help=(
        "Cohort name matched exactly against ``metadata.cohort`` "
        "(parallel to ``portfolio --cohort``). Mutually exclusive "
        "with the positional ``rule_id`` and ``--pattern``. Requires "
        "``--yes`` to mutate."
    ),
)
@click.option(
    "--yes", "yes", is_flag=True, default=False,
    help="Required alongside ``--pattern``/``--cohort`` to actually disable the cohort.",
)
@click.option("--reason", default=None, help="Free-text reason recorded in each YAML.")
@click.option(
    "--path", "detections_path",
    type=click.Path(exists=True, path_type=Path),
    default=Path("detections"),
    help="Root detections directory.",
)
def disable_cmd(
    rule_id: str | None,
    pattern: str | None,
    cohort: str | None,
    yes: bool,
    reason: str | None,
    detections_path: Path,
) -> None:
    """Emergency-disable a detection rule (or cohort) by setting
    ``status: deprecated``.

    Single-rule (existing behaviour):

        contentops disable my-rule-id

    Cohort by glob (closes G18 -- bulk disable):

        contentops disable --pattern 'o365-*'        # dry-run: list
        contentops disable --pattern 'o365-*' --yes  # actually disable

    Cohort by metadata.cohort (closes G18 -- bulk disable):

        contentops disable --cohort o365             # dry-run: list
        contentops disable --cohort o365 --yes       # actually disable

    The glob is matched against the envelope ``id`` (case-sensitive
    ``fnmatch.fnmatchcase``); ``--cohort`` is exact-matched against the
    ``metadata.cohort`` field. Dry-run output is deterministically
    sorted so reviewer diffs are stable. Does NOT git-commit -- the
    caller (workflow or human) inspects the diff.
    """
    _select_one_of(rule_id=rule_id, pattern=pattern, cohort=cohort)

    # Single-rule mode: resolve the id to its one YAML and mutate it.
    # ``--yes`` is not consulted here.
    if rule_id:
        _disable_one(_find_yaml_for_id(detections_path, rule_id), rule_id, reason)
        return

    # Bulk mode: after validation exactly one of --pattern / --cohort is set.
    if pattern:
        selector_label = f"--pattern {pattern!r}"
        matches = _find_yamls_by_pattern(detections_path, pattern)
    else:
        selector_label = f"--cohort {cohort!r}"
        matches = _find_yamls_by_cohort(detections_path, cohort)

    if not matches:
        click.echo(
            f"error: no rules matched {selector_label} under {detections_path}",
            err=True,
        )
        sys.exit(1)

    # Dry-run by default: list what would be touched and stop.
    if not yes:
        click.echo(f"{selector_label} would disable {len(matches)} rule(s):")
        for envelope_id, yaml_path in matches:
            click.echo(f"  {envelope_id}  ({yaml_path})")
        click.echo("Pass --yes to proceed.")
        return

    # Mutate each match; every rule is either changed or skipped as
    # already deprecated.
    changed = 0
    for envelope_id, yaml_path in matches:
        if _disable_one(yaml_path, envelope_id, reason):
            changed += 1
    skipped = len(matches) - changed
    summary = (
        f"\nCohort disable complete: {changed} rule(s) deprecated, "
        f"{skipped} already deprecated."
    )
    click.echo(summary)


# ---------------------------------------------------------------------------
# `contentops enable` — inverse of disable (closes G18 round-trip)
# ---------------------------------------------------------------------------


# Match the exact markers `disable` writes so enable can strip them
# cleanly. Manual edits with different shapes are left untouched.
# Both patterns are anchored at column 0 and consume the trailing newline,
# so substituting "" removes the whole line.
# ``disableReason: "<anything on one line>"``
_DISABLE_REASON_LINE_RE = re.compile(r'(?m)^disableReason:[ \t]*".*"[ \t]*\n')
# ``# disabled by contentops disable on YYYY-MM-DD``
_DISABLE_COMMENT_LINE_RE = re.compile(
    r"(?m)^# disabled by contentops disable on \d{4}-\d{2}-\d{2}[ \t]*\n"
)


# Statuses accepted by ``enable --to`` (used as the ``click.Choice`` values).
_ENABLE_TARGETS = ("experimental", "production", "test")


# Shape of the ``enableReason`` line written below; used to replace an
# earlier one instead of appending a duplicate top-level key.
_ENABLE_REASON_LINE_RE = re.compile(r'(?m)^enableReason:[ \t]*".*"[ \t]*\n')


def _enable_one(
    target: Path, rule_id: str, *, to_status: str, reason: str | None,
) -> bool:
    """Flip ``status: deprecated`` back to ``to_status`` on a single YAML.

    Returns True if the file changed. Already-active rules (anything
    other than ``deprecated``) are warned-and-skipped — mirrors disable's
    "already deprecated → skip" UX. Raises ``click.ClickException`` if
    the YAML has no top-level ``status:`` line.

    Side effects:
      * Strips the exact ``disableReason: "..."`` line or
        ``# disabled by contentops disable on YYYY-MM-DD`` comment that
        ``disable`` writes. Manual additions with other shapes survive.
      * Appends an enable marker so the audit trail is symmetric:
        ``enableReason: "..."`` when ``--reason`` is set, otherwise
        ``# re-enabled by contentops enable on YYYY-MM-DD`` comment.
      * When a new ``enableReason`` is written, any earlier
        ``enableReason: "..."`` line of that shape is removed first so
        repeated disable/enable cycles never leave duplicate keys.
    """
    # Function-scope import: only needed to quote the free-text reason.
    import json

    text = target.read_text(encoding="utf-8")
    status_match = _STATUS_LINE_RE.search(text)
    if status_match is None:
        raise click.ClickException(
            f"cannot find a top-level `status:` line in {target}; "
            f"refusing to mutate"
        )
    current = status_match.group(0).strip()
    # Compare the value only: the regex allows arbitrary spaces/tabs after
    # the colon, so "status:  deprecated" must still count as deprecated.
    if current.partition(":")[2].strip() != "deprecated":
        click.echo(
            f"warn: {rule_id} is not deprecated (current: {current!r}); "
            f"no changes made"
        )
        return False
    new_text = _STATUS_LINE_RE.sub(f"status: {to_status}", text, count=1)
    new_text = _DISABLE_REASON_LINE_RE.sub("", new_text, count=1)
    new_text = _DISABLE_COMMENT_LINE_RE.sub("", new_text, count=1)
    if not new_text.endswith("\n"):
        new_text += "\n"
    if reason:
        # Drop stale enableReason lines before appending the new one; a
        # second top-level ``enableReason`` would be a duplicate mapping key.
        new_text = _ENABLE_REASON_LINE_RE.sub("", new_text)
        # A JSON string literal is a valid YAML double-quoted scalar and
        # escapes backslashes, quotes, newlines and control characters;
        # output for plain reasons is unchanged.
        new_text += f"enableReason: {json.dumps(reason, ensure_ascii=False)}\n"
    else:
        new_text += (
            f"# re-enabled by contentops enable on {date.today().isoformat()}\n"
        )
    target.write_text(new_text, encoding="utf-8")
    click.echo(f"enabled {rule_id}: status -> {to_status} ({target})")
    return True


@click.command("enable")
@click.argument("rule_id", required=False)
@click.option(
    "--pattern", "pattern", default=None,
    help=(
        "fnmatch-style glob matched against envelope id. Enables every "
        "matching deprecated rule. Mutually exclusive with the positional "
        "``rule_id`` and ``--cohort``. Requires ``--yes`` to mutate."
    ),
)
@click.option(
    "--cohort", "cohort", default=None,
    help=(
        "Cohort name matched exactly against ``metadata.cohort``. "
        "Mutually exclusive with ``rule_id`` and ``--pattern``. "
        "Requires ``--yes`` to mutate."
    ),
)
@click.option(
    "--to", "to_status",
    type=click.Choice(_ENABLE_TARGETS),
    default="experimental",
    show_default=True,
    help=(
        "Status to restore to. Default ``experimental`` forces "
        "re-promotion through ``lifecycle promote``'s gates. Override "
        "to ``production`` for maintenance-window restores when you "
        "explicitly want to skip the gates."
    ),
)
@click.option(
    "--yes", "yes", is_flag=True, default=False,
    help="Required alongside ``--pattern``/``--cohort`` to actually re-enable the cohort.",
)
@click.option(
    "--reason", default=None,
    help="Free-text reason recorded in each YAML as ``enableReason``.",
)
@click.option(
    "--path", "detections_path",
    type=click.Path(exists=True, path_type=Path),
    default=Path("detections"),
    help="Root detections directory.",
)
def enable_cmd(
    rule_id: str | None,
    pattern: str | None,
    cohort: str | None,
    to_status: str,
    yes: bool,
    reason: str | None,
    detections_path: Path,
) -> None:
    """Inverse of ``contentops disable`` -- flip ``status: deprecated``
    back to ``--to`` (default ``experimental``).

    Single-rule:

        contentops enable my-rule-id                # -> experimental
        contentops enable my-rule-id --to production

    Cohort by glob:

        contentops enable --pattern 'o365-*'        # dry-run
        contentops enable --pattern 'o365-*' --yes  # restore the cohort

    Cohort by metadata.cohort:

        contentops enable --cohort o365             # dry-run
        contentops enable --cohort o365 --yes       # restore the cohort

    Skips rules that aren't currently deprecated (warn-and-skip,
    matches ``disable``'s already-deprecated UX). Strips the exact
    marker that ``disable`` writes and appends an enable marker so the
    YAML carries a symmetric audit trail. Does NOT git-commit -- the
    caller inspects the diff.
    """
    _select_one_of(rule_id=rule_id, pattern=pattern, cohort=cohort)

    # Single-rule mode: resolve the id to its one YAML and restore it.
    # ``--yes`` is not consulted here.
    if rule_id:
        yaml_file = _find_yaml_for_id(detections_path, rule_id)
        _enable_one(yaml_file, rule_id, to_status=to_status, reason=reason)
        return

    # Bulk mode: after validation exactly one of --pattern / --cohort is set.
    if pattern:
        selector_label = f"--pattern {pattern!r}"
        matches = _find_yamls_by_pattern(detections_path, pattern)
    else:
        selector_label = f"--cohort {cohort!r}"
        matches = _find_yamls_by_cohort(detections_path, cohort)

    if not matches:
        click.echo(
            f"error: no rules matched {selector_label} under {detections_path}",
            err=True,
        )
        sys.exit(1)

    # Dry-run by default. "up to" because non-deprecated matches are skipped
    # when the change is actually applied.
    if not yes:
        heading = (
            f"{selector_label} would re-enable up to {len(matches)} rule(s) "
            f"-> status: {to_status}:"
        )
        click.echo(heading)
        for envelope_id, yaml_path in matches:
            click.echo(f"  {envelope_id}  ({yaml_path})")
        click.echo("Pass --yes to proceed.")
        return

    # Restore each match; every rule is either changed or skipped.
    changed = 0
    for envelope_id, yaml_path in matches:
        if _enable_one(yaml_path, envelope_id, to_status=to_status, reason=reason):
            changed += 1
    skipped = len(matches) - changed
    summary = (
        f"\nCohort enable complete: {changed} rule(s) restored to "
        f"{to_status}, {skipped} skipped (not deprecated)."
    )
    click.echo(summary)


@click.command("lock")
@click.argument("rule_id")
@click.option(
    "--path", "detections_path",
    type=click.Path(exists=True, path_type=Path),
    default=Path("detections"),
    help="Root detections directory.",
)
def lock_cmd(rule_id: str, detections_path: Path) -> None:
    """Pin a rule as locally customised - apply will skip it without --force-overwrite.

    Sets a top-level ``localCustomization: true`` flag in the envelope.
    This is the customisation-protection pattern from Sentinel-as-Code
    Wave 2: an analyst hand-tunes a rule (threshold, KQL filter) and
    we don't want a future bulk apply to flatten the change.
    """
    target = _find_yaml_for_id(detections_path, rule_id)
    text = target.read_text(encoding="utf-8")
    # _LOCK_TOPLEVEL_RE only matches a flag line that ends in "\n". Terminate
    # the text first so a flag sitting on an unterminated last line is still
    # recognised; otherwise a second ``localCustomization`` key would be
    # appended below it.
    if not text.endswith("\n"):
        text += "\n"
    if _LOCK_TOPLEVEL_RE.search(text):
        # Normalise whatever value is present (e.g. ``false``) to ``true``.
        new_text = _LOCK_TOPLEVEL_RE.sub("localCustomization: true\n", text, count=1)
        if new_text == text:
            click.echo(f"already locked: {rule_id} ({target})")
            return
    else:
        new_text = text + "localCustomization: true\n"
    target.write_text(new_text, encoding="utf-8")
    click.echo(f"locked: {rule_id} ({target})")


@click.command("unlock")
@click.argument("rule_id")
@click.option(
    "--path", "detections_path",
    type=click.Path(exists=True, path_type=Path),
    default=Path("detections"),
    help="Root detections directory.",
)
def unlock_cmd(rule_id: str, detections_path: Path) -> None:
    """Inverse of `contentops lock` - remove the localCustomization flag."""
    target = _find_yaml_for_id(detections_path, rule_id)
    text = target.read_text(encoding="utf-8")
    # _LOCK_TOPLEVEL_RE only matches a flag line that ends in "\n". Terminate
    # the text first so a flag on an unterminated last line is found instead
    # of being reported as "not locked".
    if not text.endswith("\n"):
        text += "\n"
    if not _LOCK_TOPLEVEL_RE.search(text):
        click.echo(f"not locked: {rule_id} ({target})")
        return
    # Remove the first top-level flag line, whatever its value.
    new_text = _LOCK_TOPLEVEL_RE.sub("", text, count=1)
    target.write_text(new_text, encoding="utf-8")
    click.echo(f"unlocked: {rule_id} ({target})")


@click.command("retry-failed")
@click.option(
    "--path", "detections_path",
    type=click.Path(exists=True, path_type=Path),
    default=Path("detections"),
    help="Root detections directory.",
)
@click.option(
    "--audit-dir",
    type=click.Path(path_type=Path),
    default=Path("audit"),
    help="Audit JSONL directory (default: audit/).",
)
@click.option(
    "--since",
    "since_spec",
    default=None,
    help="Restrict to audit records within this time window. Either a "
         "duration (e.g. '1h', '30m', '7d') or an ISO 8601 timestamp. "
         "Mutually exclusive with --run-id. Without --since/--run-id, "
         "only the latest audit file is read (current behaviour).",
)
@click.option(
    "--run-id",
    "run_id",
    default=None,
    help="Restrict to audit records whose workflow_run matches RUN_ID "
         "(the GITHUB_RUN_ID stamped at apply time). Mutually exclusive "
         "with --since.",
)
@click.option("--dry-run", is_flag=True, help="Print which assets would be retried; no API calls.")
@click.option(
    "--role",
    type=click.Choice(["prod", "integration", "dev", "test"]),
    default=None,
    help="Target the Sentinel workspace with this role (sets "
         "PIPELINE_WORKSPACE_NAME). Mutex with --workspace. "
         "Single-workspace tenants pick implicitly when both omitted. "
         "Required on multi-workspace tenants -- without it, "
         "register_default_handlers raises an ambiguity error before "
         "the first retry can run.",
)
@click.option(
    "--workspace", "workspace_name",
    default=None,
    help="Target the Sentinel workspace with this exact ``workspaceName`` "
         "(must match config/tenant.yml). Mutex with --role.",
)
def retry_failed_cmd(
    detections_path: Path, audit_dir: Path,
    since_spec: str | None, run_id: str | None,
    dry_run: bool,
    role: str | None, workspace_name: str | None,
) -> None:
    """Re-apply only the assets a previous apply marked as failed.

    Default scope is the most recent ``audit/*.jsonl`` file. Pass
    ``--since`` or ``--run-id`` (mutually exclusive) to widen or narrow.

    On a multi-workspace tenant, pass ``--role`` or ``--workspace``
    so handler registration knows which workspace to target.
    Single-workspace tenants pick implicitly.

    \b
    Use --since when:
      * The latest audit file is a *successful* later run that
        masks the partial failure two runs ago.
      * You want to retry everything that failed in a recovery
        window (e.g. last 4 hours after a Graph outage).
    Use --run-id when:
      * A workflow needs to retry exactly one run-id without
        touching anything else.

    Examples:

    \b
        contentops retry-failed                          # latest file
        contentops retry-failed --since 4h               # last 4 hours
        contentops retry-failed --since 2026-05-07T08:00Z
        contentops retry-failed --run-id 9123456789
        contentops retry-failed --role integration       # multi-workspace
    """
    # Usage error (exit 2): the two scope selectors cannot be combined.
    if since_spec is not None and run_id is not None:
        click.echo(
            "error: --since and --run-id are mutually exclusive.",
            err=True,
        )
        sys.exit(2)

    # Resolve --role / --workspace BEFORE register_default_handlers
    # so multi-workspace tenants get a clean Click error (exit 2)
    # rather than the bare RuntimeError ``_active_workspace`` would
    # raise. Cross-phase review-2 Seam C.
    from contentops.cli.commands._shared import _resolve_single_workspace_or_exit
    _resolve_single_workspace_or_exit(role, workspace_name)

    # A missing or empty audit directory is not an error: report and return
    # normally (exit 0).
    if not audit_dir.exists():
        click.echo(f"no audit directory at {audit_dir} — nothing to retry")
        return
    # Sorted by file name; the last entry is treated as the "latest" file.
    files = sorted(audit_dir.glob("*.jsonl"))
    if not files:
        click.echo(f"no audit files under {audit_dir} — nothing to retry")
        return

    from contentops.audit_filter import (
        AuditFilterError, collect_failed_pairs, parse_since,
    )

    # Decide which files + which predicate to use.
    # ``predicate`` is a (kind, value) tuple handed to collect_failed_pairs;
    # ``scope_label`` is only used in the messages printed below.
    if since_spec is not None:
        try:
            since_dt = parse_since(since_spec)
        except AuditFilterError as exc:
            # An unparseable --since value is a usage error (exit 2).
            click.echo(f"error: {exc}", err=True)
            sys.exit(2)
        scope = files
        predicate = ("since", since_dt)
        scope_label = f"--since={since_spec} ({since_dt.isoformat()})"
    elif run_id is not None:
        scope = files
        predicate = ("run_id", run_id)
        scope_label = f"--run-id={run_id}"
    else:
        # Default: only the last file in name order, no record filter.
        scope = [files[-1]]
        predicate = ("none", None)
        scope_label = f"latest file ({files[-1].name})"

    # Used below with set difference and membership tests against
    # (asset value, id) tuples.
    failed = collect_failed_pairs(scope, predicate)

    if not failed:
        click.echo(f"no failed records in scope ({scope_label}) — nothing to retry")
        return

    click.echo(f"scope: {scope_label} → {len(failed)} failed (asset, id) pair(s)")

    register_default_handlers()
    loaded = _load_all(detections_path)
    # Failed pairs that no longer have a YAML on disk are reported, not retried.
    target_set = {(la.envelope.asset.value, la.envelope.id) for la in loaded}
    missing = sorted(failed - target_set)
    if missing:
        click.echo("[warn] failed records with no matching local YAML (skipped):")
        for asset_value, rule_id in missing:
            click.echo(f"  {asset_value}  {rule_id}")

    # Keep only the locally loaded assets whose (asset, id) pair failed.
    selected = [la for la in loaded
                if (la.envelope.asset.value, la.envelope.id) in failed]
    if not selected:
        click.echo("nothing to retry — all failed records lack a local YAML.")
        return

    click.echo(f"retrying {len(selected)} failed asset(s) ({scope_label}):")
    for la in selected:
        click.echo(f"  - {la.envelope.asset.value}  {la.envelope.id}  ({la.path})")

    if dry_run:
        click.echo("[dry-run] no API calls made.")
        return

    results: list[ActionResult] = []
    audit_pairs: list[tuple[LoadedAsset, ActionResult]] = []
    try:
        for la in selected:
            # The handler lookup sits outside the inner try: a failure here
            # propagates and aborts the run (the registry is still closed by
            # the ``finally`` below).
            handler = default_registry.get(la.envelope.asset)
            try:
                handler.validate(la)
                result = handler.apply(la, dry_run=False)
            except Exception as exc:
                # One asset failing (validate or apply) must not stop the
                # others; record it as an error result and continue.
                result = ActionResult(
                    asset_id=la.envelope.id, asset_kind=la.envelope.asset.value,
                    action=PlanAction.NOOP, status="error-apply", detail=str(exc),
                )
            results.append(result)
            audit_pairs.append((la, result))
    finally:
        # Always close the registry's handlers, even if the loop raised.
        default_registry.close_all()

    click.echo(f"\nRetry summary ({len(results)} assets):")
    for r in results:
        click.echo(r.as_row())

    if audit_pairs:
        # Stamp the active workspace (read from PIPELINE_WORKSPACE_NAME; None
        # when unset) into each record so multi-workspace retries are
        # attributable.
        active_ws = os.environ.get("PIPELINE_WORKSPACE_NAME")
        records = [
            _build_audit_record(la, r, workspace=active_ws)
            for la, r in audit_pairs
        ]
        # NOTE(review): records are written relative to the current working
        # directory, not to --audit-dir — confirm this is intended when a
        # custom --audit-dir is passed.
        path = write_records(Path.cwd(), records)
        click.echo(f"[audit] wrote {len(records)} records to {path}")

    # Exit 1 when at least one retried asset is still failing.
    failed_now = [r for r in results if r.is_failure]
    if failed_now:
        click.echo(f"\n{len(failed_now)} asset(s) still failing.", err=True)
        sys.exit(1)


# Parent click group named ``lifecycle``; it has no behaviour of its own.
# Subcommands attach to it via ``@lifecycle_group.command(...)``.
@click.group("lifecycle")
def lifecycle_group() -> None:
    """Status promotion gates (F8). Reduced gate set in this PR."""


@lifecycle_group.command("promote")
@click.argument("rule_id", required=False)
@click.option(
    "--rules", "rules_csv", default=None,
    help=(
        "Bulk-promote a comma-separated list of rule IDs. Mutually "
        "exclusive with the positional rule_id argument and --cohort. "
        "Each rule is gated independently."
    ),
)
@click.option(
    "--cohort", "cohort", default=None,
    help=(
        "Bulk-promote every envelope whose metadata.cohort matches "
        "this value. Mutually exclusive with the positional rule_id "
        "argument and --rules. Each rule is gated independently."
    ),
)
@click.option(
    "--continue-on-failure", "continue_on_failure",
    is_flag=True, default=False,
    help=(
        "Bulk mode: exit 0 even if some rules failed their gates "
        "(failures are still printed in the summary). Default exits "
        "1 if any rule failed promotion. Has no effect in single-rule "
        "mode (a single failed rule always exits 1)."
    ),
)
@click.option(
    "--path", "detections_path",
    type=click.Path(exists=True, path_type=Path),
    default=Path("detections"),
)
@click.option(
    "--max-validation-age-days", type=int, default=30,
    help="metadata.lastValidatedAt must be no older than this. Default 30.",
)
@click.option(
    "--workspace-id", "workspace_id",
    envvar="PIPELINE_WORKSPACE_ID",
    default=None,
    help="LA workspace ID for the fp_rate_threshold gate (env: "
         "PIPELINE_WORKSPACE_ID). When unset OR --no-workspace-query "
         "is on, the gate stays deferred.",
)
@click.option(
    "--telemetry-since", "telemetry_since_days",
    type=int, default=30,
    help="Telemetry lookback window in days for the fp_rate_threshold "
         "gate (default 30).",
)
@click.option(
    "--no-workspace-query", "no_workspace_query",
    is_flag=True, default=False,
    help="Skip the workspace query even when --workspace-id is set. "
         "Useful for offline dry-runs or when the operator already "
         "has out-of-band FP-rate evidence. The fp_rate_threshold "
         "gate is reported as deferred.",
)
@click.option(
    "--force", is_flag=True, default=False,
    help="Promote even when gates fail (record reviewer approval "
         "elsewhere - PR comment, audit-trail message).",
)
@click.option("--dry-run", is_flag=True, default=False,
              help="Print the gate report without mutating the YAML.")
def lifecycle_promote_cmd(
    rule_id: str | None, rules_csv: str | None, cohort: str | None,
    continue_on_failure: bool,
    detections_path: Path,
    max_validation_age_days: int,
    workspace_id: str | None, telemetry_since_days: int,
    no_workspace_query: bool,
    force: bool, dry_run: bool,
) -> None:
    """Promote RULE_ID (or --rules / --cohort) from `experimental` to `production`.

    \b
    Selector modes (mutually exclusive):
      * Positional RULE_ID -- single rule, detailed output (original behaviour).
      * --rules a,b,c -- comma-separated rule ID list, summary table output.
      * --cohort foo -- every envelope with metadata.cohort == "foo",
        summary table output.

    \b
    Gates (applied per-rule):
      * status_is_experimental - current status must be experimental.
      * recent_validation - metadata.lastValidatedAt within
        --max-validation-age-days (default 30).
      * live_test_pass - DEFERRED (F2). Always passes (skipped).
      * fp_rate_threshold - live when --workspace-id is set and
        --no-workspace-query is unset. Compares closed_fp_30d /
        incidents_30d against config/lifecycle.yml's
        fp_rate_threshold (default 0.5). Fail-closed on workspace
        errors (use --force or --no-workspace-query to bypass).
    """
    # Maintainer notes (kept out of the docstring, which appears to double
    # as the CLI help text -- see the \b paragraph markers):
    #
    # Flow:
    #   1. Validate that exactly one selector was supplied.
    #   2. Resolve --rules / --cohort into a list of rule IDs (bulk mode).
    #   3. Load the lifecycle config and, unless --no-workspace-query is
    #      set, try to obtain a workspace ID + token for the fp_rate gate.
    #   4. Bulk mode -> promote_many() + summary table;
    #      single-rule mode -> promote() + per-gate report.
    #
    # Exit codes used below:
    #   2 -- selector usage errors (none / several / empty --rules).
    #   1 -- cohort matched nothing, LifecycleError, a refused single rule,
    #        or any bulk failure without --continue-on-failure / --force.
    from contentops.lifecycle import (
        LifecycleError, load_lifecycle_config, promote, promote_many,
    )

    # Validate selectors: exactly one mode. A selector counts as "given"
    # only when it is non-empty, so an empty string (e.g. an unset CI
    # variable expanded into `--rules ""`) is treated as absent.
    selectors_given = [
        name for name, val in (
            ("rule_id", rule_id),
            ("--rules", rules_csv),
            ("--cohort", cohort),
        ) if val
    ]
    if len(selectors_given) == 0:
        click.echo(
            "error: exactly one selector required -- positional rule_id, "
            "--rules, or --cohort.",
            err=True,
        )
        sys.exit(2)
    if len(selectors_given) > 1:
        click.echo(
            f"error: positional rule_id, --rules, and --cohort are "
            f"mutually exclusive ({', '.join(selectors_given)} given).",
            err=True,
        )
        sys.exit(2)

    # Resolve bulk selectors to a rule_id list.
    #
    # The branches use the same truthiness test as the validation above.
    # Testing `is not None` here would let an empty `--rules ""` hijack a
    # run whose real selector (rule_id or --cohort) already passed
    # validation, and abort it with a misleading "--rules was empty" error.
    bulk_rule_ids: list[str] | None = None
    if rules_csv:
        stripped_ids = (part.strip() for part in rules_csv.split(","))
        # dict.fromkeys de-duplicates while preserving first-seen order.
        # A repeated ID would otherwise be gated twice, and the second pass
        # would fail status_is_experimental once the first pass promoted it.
        bulk_rule_ids = list(
            dict.fromkeys(rid for rid in stripped_ids if rid)
        )
        if not bulk_rule_ids:
            # Reached for inputs such as `--rules ","` or `--rules " "`.
            click.echo(
                "error: --rules was empty after splitting on commas.",
                err=True,
            )
            sys.exit(2)
    elif cohort:
        matches = _find_yamls_by_cohort(detections_path, cohort)
        if not matches:
            click.echo(
                f"error: no envelopes with metadata.cohort={cohort!r} "
                f"under {detections_path}",
                err=True,
            )
            sys.exit(1)
        # Each match is a pair whose first item is the envelope ID; the
        # second item is not needed here.
        bulk_rule_ids = [env_id for env_id, _ in matches]
        click.echo(
            f"cohort {cohort!r} matched {len(bulk_rule_ids)} envelope(s).",
            err=True,
        )

    config, info = load_lifecycle_config()
    if info:
        click.echo(f"info: {info}", err=True)

    # Workspace access for the fp_rate_threshold gate is best-effort: any
    # failure below leaves both values as None, which keeps the gate
    # deferred instead of aborting the whole command.
    effective_workspace_id: str | None = None
    token: str | None = None
    if not no_workspace_query:
        from contentops.utils.auth import get_credential
        from contentops.workspace_kql import (
            LA_SCOPE, WorkspaceKqlError, resolve_workspace_id,
        )
        try:
            cred = get_credential()
            if not workspace_id:
                # No explicit ID (flag or env var): try to auto-derive it.
                workspace_id = resolve_workspace_id(
                    role="prod", credential=cred,
                )
            if workspace_id:
                token = cred.get_token(LA_SCOPE).token
                # Only set once the token was obtained, so the pair is
                # always either both populated or both None.
                effective_workspace_id = workspace_id
        except WorkspaceKqlError as exc:
            click.echo(
                f"info: fp_rate gate stays deferred "
                f"(workspace auto-derive failed: {exc}).",
                err=True,
            )
        except Exception as exc:
            # Deliberately broad: credential backends can raise many
            # unrelated exception types, and none of them should stop a
            # promotion run that can proceed with the gate deferred.
            click.echo(
                f"info: fp_rate gate stays deferred "
                f"(credential/token acquisition failed: {exc}). "
                "Pass --no-workspace-query to silence this notice.",
                err=True,
            )

    # ------------------------------------------------------------------
    # Bulk mode: iterate via promote_many and emit a summary table.
    # Single-rule mode falls through to the detailed report below.
    # ------------------------------------------------------------------
    if bulk_rule_ids is not None:
        reports = promote_many(
            bulk_rule_ids,
            detections_root=detections_path,
            max_validation_age_days=max_validation_age_days,
            force=force, dry_run=dry_run,
            workspace_id=effective_workspace_id,
            token=token,
            fp_rate_threshold=config.fp_rate_threshold,
            telemetry_since_days=telemetry_since_days,
        )
        promoted = sum(1 for r in reports if r.promoted)
        passed_no_write = sum(
            1 for r in reports if r.all_passed() and not r.promoted
        )
        failed = sum(1 for r in reports if not r.all_passed())

        click.echo("")
        click.echo(
            f"## Bulk promote — {len(reports)} rule(s)  "
            f"({'dry-run' if dry_run else 'live'}, force={force})"
        )
        click.echo("")
        click.echo(
            f"{'rule_id':<50} {'status':<14} {'outcome':<16} detail"
        )
        click.echo("-" * 100)
        for r in reports:
            # Outcome precedence: an actual write wins, then "gates pass
            # but nothing written", then the --force variants, then refusal.
            if r.promoted:
                outcome = "PROMOTED"
            elif r.all_passed() and dry_run:
                outcome = "would-promote"
            elif r.all_passed():
                outcome = "no-change"
            elif force:
                outcome = "FORCED" if not dry_run else "would-force"
            else:
                outcome = "REFUSED"
            # Deferred gates are skipped, not failed, so they are excluded.
            failed_gates = [g.name for g in r.gates if not g.passed and not g.deferred]
            detail = ", ".join(failed_gates) if failed_gates else "all gates pass"
            click.echo(
                f"{r.rule_id:<50} {r.current_status:<14} {outcome:<16} {detail}"
            )
        click.echo("")
        click.echo(
            f"Summary: {promoted} promoted, {passed_no_write} no-change, "
            f"{failed} failed/refused."
        )

        # Gate failures make the run fail unless the operator opted out
        # via --continue-on-failure or overrode the gates with --force.
        if failed > 0 and not continue_on_failure and not force:
            sys.exit(1)
        return

    # ------------------------------------------------------------------
    # Single-rule mode (original behaviour, unchanged).
    # ------------------------------------------------------------------
    try:
        report = promote(
            rule_id, detections_root=detections_path,
            max_validation_age_days=max_validation_age_days,
            force=force, dry_run=dry_run,
            workspace_id=effective_workspace_id,
            token=token,
            fp_rate_threshold=config.fp_rate_threshold,
            telemetry_since_days=telemetry_since_days,
        )
    except LifecycleError as exc:
        click.echo(f"error: {exc}", err=True)
        sys.exit(1)

    click.echo(f"## {rule_id}  (status: {report.current_status})")
    click.echo("")
    for g in report.gates:
        # Deferred takes priority over pass/fail when choosing the tag.
        if g.deferred:
            tag = "[skip]"
        elif g.passed:
            tag = "[pass]"
        else:
            tag = "[FAIL]"
        click.echo(f"  {tag} {g.name}: {g.detail}")
    click.echo("")
    if report.promoted:
        click.echo(f"PROMOTED: {report.path} now has status: production")
    elif report.all_passed():
        if dry_run:
            click.echo("[dry-run] all gates pass; would write status: production")
        else:
            click.echo("(no change — already promoted? See status line.)")
    else:
        if force:
            click.echo("FORCED PROMOTION: gates failed but --force was set.")
            if dry_run:
                click.echo("[dry-run] no write performed.")
        else:
            click.echo(
                "REFUSED: gates failed. Re-run with --force after recording "
                "explicit reviewer approval, or fix the gates and re-run."
            )
            sys.exit(1)