From ca23d4f5f43e17cc06d22dd12bf3eeac6b2aaab3 Mon Sep 17 00:00:00 2001 From: Dylan Huang Date: Mon, 15 Dec 2025 18:19:22 -0800 Subject: [PATCH 1/5] save --- eval_protocol/cli.py | 15 + eval_protocol/cli_commands/export_docs.py | 466 ++++++++++++++++++++++ 2 files changed, 481 insertions(+) create mode 100644 eval_protocol/cli_commands/export_docs.py diff --git a/eval_protocol/cli.py b/eval_protocol/cli.py index e8125390..2a360c29 100644 --- a/eval_protocol/cli.py +++ b/eval_protocol/cli.py @@ -494,6 +494,17 @@ def parse_args(args=None): # help="Run an evaluation using a Hydra configuration. All arguments after 'run' are passed to Hydra.", # ) + # Hidden command: export-docs (for generating CLI reference documentation) + export_docs_parser = subparsers.add_parser( + "export-docs", + help=argparse.SUPPRESS, # Hidden from help output + ) + export_docs_parser.add_argument( + "--output-dir", + default="./docs/cli-reference", + help="Directory to write markdown files to (default: ./docs/cli-reference)", + ) + # Use parse_known_args to allow Hydra to handle its own arguments return parser.parse_known_args(args) @@ -623,6 +634,10 @@ def _extract_flag_value(argv_list, flag_name): from .cli_commands.local_test import local_test_command return local_test_command(args) + elif args.command == "export-docs": + from .cli_commands.export_docs import export_docs_command + + return export_docs_command(args) # elif args.command == "run": # # For the 'run' command, Hydra takes over argument parsing. # diff --git a/eval_protocol/cli_commands/export_docs.py b/eval_protocol/cli_commands/export_docs.py new file mode 100644 index 00000000..8ad7ade5 --- /dev/null +++ b/eval_protocol/cli_commands/export_docs.py @@ -0,0 +1,466 @@ +""" +Export CLI reference documentation as markdown files. + +This module provides functionality to introspect the argparse-based CLI +and generate markdown documentation for each command. 
+""" + +import argparse +import logging +import os +from pathlib import Path +from typing import Dict, List, Optional, Tuple + +logger = logging.getLogger(__name__) + + +def _get_parser_info(parser: argparse.ArgumentParser) -> Dict: + """Extract information from an ArgumentParser.""" + info = { + "prog": parser.prog, + "description": parser.description or "", + "epilog": parser.epilog or "", + "arguments": [], + "subparsers": {}, + } + + # Extract arguments + for action in parser._actions: + if isinstance(action, argparse._SubParsersAction): + # Handle subparsers + for name, subparser in action.choices.items(): + info["subparsers"][name] = _get_parser_info(subparser) + elif isinstance(action, argparse._HelpAction): + # Skip help action, it's always present + continue + else: + arg_info = { + "option_strings": action.option_strings, + "dest": action.dest, + "help": action.help or "", + "default": action.default, + "required": getattr(action, "required", False), + "type": getattr(action, "type", None), + "choices": getattr(action, "choices", None), + "nargs": getattr(action, "nargs", None), + "metavar": getattr(action, "metavar", None), + } + # Check if help is suppressed + if action.help != argparse.SUPPRESS: + info["arguments"].append(arg_info) + + return info + + +def _format_argument_row(arg: Dict) -> str: + """Format a single argument as a markdown table row.""" + # Build the flag/argument name + if arg["option_strings"]: + name = ", ".join(f"`{opt}`" for opt in arg["option_strings"]) + else: + name = f"`{arg['dest']}`" + + # Build type info + type_str = "" + if arg["type"]: + type_str = getattr(arg["type"], "__name__", str(arg["type"])) + if arg["choices"]: + type_str = f"choices: {arg['choices']}" + + # Format default value + default = arg["default"] + if default is None: + default_str = "-" + elif default == argparse.SUPPRESS: + default_str = "-" + elif isinstance(default, bool): + default_str = str(default).lower() + else: + default_str = f"`{default}`" + + 
# Help text (escape pipe characters for markdown tables) + help_text = (arg["help"] or "-").replace("|", "\\|") + + # Required indicator + required = "Yes" if arg["required"] else "No" + + return f"| {name} | {type_str} | {default_str} | {required} | {help_text} |" + + +def _generate_command_markdown( + name: str, + info: Dict, + parent_command: str = "", + level: int = 1, +) -> str: + """Generate markdown documentation for a single command.""" + lines = [] + + # Command title + full_command = f"{parent_command} {name}".strip() if parent_command else name + heading = "#" * min(level, 4) + lines.append(f"{heading} `{full_command}`") + lines.append("") + + # Description + if info["description"]: + lines.append(info["description"]) + lines.append("") + + # Arguments table + if info["arguments"]: + lines.append("**Options:**") + lines.append("") + lines.append("| Option | Type | Default | Required | Description |") + lines.append("|--------|------|---------|----------|-------------|") + for arg in info["arguments"]: + lines.append(_format_argument_row(arg)) + lines.append("") + + # Epilog + if info["epilog"]: + lines.append(info["epilog"]) + lines.append("") + + return "\n".join(lines) + + +def _generate_subcommand_docs( + subparsers: Dict, + parent_command: str, + level: int, +) -> List[Tuple[str, str]]: + """Generate markdown docs for all subcommands, returns list of (filename, content).""" + docs = [] + + for name, info in subparsers.items(): + full_command = f"{parent_command} {name}".strip() + + # Generate this command's doc + content_lines = [] + content_lines.append(f"# `{full_command}`") + content_lines.append("") + + if info["description"]: + content_lines.append(info["description"]) + content_lines.append("") + + # Arguments table + if info["arguments"]: + content_lines.append("## Options") + content_lines.append("") + content_lines.append("| Option | Type | Default | Required | Description |") + 
content_lines.append("|--------|------|---------|----------|-------------|") + for arg in info["arguments"]: + content_lines.append(_format_argument_row(arg)) + content_lines.append("") + + # Handle nested subparsers + if info["subparsers"]: + content_lines.append("## Subcommands") + content_lines.append("") + for subname in info["subparsers"].keys(): + sub_full = f"{full_command} {subname}" + content_lines.append(f"- [`{sub_full}`]({name}-{subname}.md)") + content_lines.append("") + + # Recursively generate docs for nested subcommands + nested_docs = _generate_subcommand_docs( + info["subparsers"], + full_command, + level + 1, + ) + for nested_filename, nested_content in nested_docs: + docs.append((f"{name}-{nested_filename}", nested_content)) + + if info["epilog"]: + content_lines.append(info["epilog"]) + content_lines.append("") + + filename = name.replace(" ", "-") + ".md" + docs.append((filename, "\n".join(content_lines))) + + return docs + + +def generate_cli_docs(parser: argparse.ArgumentParser, output_dir: str) -> int: + """ + Generate markdown documentation from an ArgumentParser. + + Args: + parser: The root ArgumentParser instance. + output_dir: Directory to write markdown files to. + + Returns: + 0 on success, 1 on failure. 
+ """ + output_path = Path(output_dir) + output_path.mkdir(parents=True, exist_ok=True) + + # Extract parser info + info = _get_parser_info(parser) + + # Generate index/overview page + index_lines = [] + index_lines.append("# CLI Reference") + index_lines.append("") + index_lines.append(f"**{info['prog']}** - {info['description']}") + index_lines.append("") + + # Global options + if info["arguments"]: + index_lines.append("## Global Options") + index_lines.append("") + index_lines.append("| Option | Type | Default | Required | Description |") + index_lines.append("|--------|------|---------|----------|-------------|") + for arg in info["arguments"]: + index_lines.append(_format_argument_row(arg)) + index_lines.append("") + + # Commands section + if info["subparsers"]: + index_lines.append("## Commands") + index_lines.append("") + for name, subinfo in info["subparsers"].items(): + description = subinfo["description"] or "" + # Truncate long descriptions for the index + if len(description) > 100: + description = description[:97] + "..." + index_lines.append(f"- [`{name}`]({name}.md) - {description}") + index_lines.append("") + + # Write index file + index_path = output_path / "index.md" + index_path.write_text("\n".join(index_lines), encoding="utf-8") + logger.info(f"Generated: {index_path}") + + # Generate individual command docs + if info["subparsers"]: + docs = _generate_subcommand_docs(info["subparsers"], info["prog"], 1) + for filename, content in docs: + file_path = output_path / filename + file_path.write_text(content, encoding="utf-8") + logger.info(f"Generated: {file_path}") + + logger.info(f"CLI documentation exported to: {output_path}") + return 0 + + +def export_docs_command(args: argparse.Namespace) -> int: + """ + Export CLI documentation to markdown files. + + This command introspects the CLI parser and generates markdown documentation. 
+ """ + # Import here to avoid circular imports + from eval_protocol.cli import parse_args + + # Create a fresh parser by calling parse_args with empty args + # We need to access the parser directly + parser = argparse.ArgumentParser(description="eval-protocol: Tools for evaluation and reward modeling") + parser.add_argument("--verbose", "-v", action="store_true", help="Enable verbose logging") + parser.add_argument( + "--profile", + help="Fireworks profile to use (reads ~/.fireworks/profiles//auth.ini and settings.ini)", + ) + parser.add_argument( + "--server", + help="Fireworks API server hostname or URL (e.g., dev.api.fireworks.ai or https://dev.api.fireworks.ai)", + ) + + subparsers = parser.add_subparsers(dest="command", help="Command to run") + + # Logs command + logs_parser = subparsers.add_parser("logs", help="Serve logs with file watching and real-time updates") + logs_parser.add_argument("--port", type=int, default=8000, help="Port to bind to (default: 8000)") + logs_parser.add_argument("--debug", action="store_true", help="Enable debug mode") + logs_parser.add_argument("--disable-elasticsearch-setup", action="store_true", help="Disable Elasticsearch setup") + logs_parser.add_argument( + "--use-env-elasticsearch-config", + action="store_true", + help="Use env vars for Elasticsearch config (requires ELASTICSEARCH_URL, ELASTICSEARCH_API_KEY, ELASTICSEARCH_INDEX_NAME)", + ) + logs_parser.add_argument( + "--use-fireworks", + action="store_true", + help="Force Fireworks tracing backend for logs UI (overrides env auto-detection)", + ) + logs_parser.add_argument( + "--use-elasticsearch", + action="store_true", + help="Force Elasticsearch backend for logs UI (overrides env auto-detection)", + ) + + # Upload command + upload_parser = subparsers.add_parser( + "upload", + help="Scan for evaluation tests, select, and upload as Fireworks evaluators", + ) + upload_parser.add_argument( + "--path", + default=".", + help="Path to search for evaluation tests (default: 
current directory)", + ) + upload_parser.add_argument( + "--entry", + help="Entrypoint of evaluation test to upload (module:function or path::function). For multiple, separate by commas.", + ) + upload_parser.add_argument( + "--id", + help="Evaluator ID to use (if multiple selections, a numeric suffix is appended)", + ) + upload_parser.add_argument( + "--display-name", + help="Display name for evaluator (defaults to ID)", + ) + upload_parser.add_argument( + "--description", + help="Description for evaluator", + ) + upload_parser.add_argument( + "--force", + action="store_true", + help="Overwrite existing evaluator with the same ID", + ) + upload_parser.add_argument( + "--yes", + "-y", + action="store_true", + help="Non-interactive: upload all discovered evaluation tests", + ) + upload_parser.add_argument( + "--env-file", + help="Path to .env file containing secrets to upload (default: .env in current directory)", + ) + + # Create command group + create_parser = subparsers.add_parser( + "create", + help="Resource creation commands", + ) + create_subparsers = create_parser.add_subparsers(dest="create_command") + rft_parser = create_subparsers.add_parser( + "rft", + help="Create a Reinforcement Fine-tuning Job on Fireworks", + ) + rft_parser.add_argument( + "--evaluator", + help="Evaluator ID or fully-qualified resource (accounts/{acct}/evaluators/{id}); if omitted, derive from local tests", + ) + rft_parser.add_argument( + "--dataset", + help="Use existing dataset (ID or resource 'accounts/{acct}/datasets/{id}') to skip local materialization", + ) + rft_parser.add_argument( + "--dataset-jsonl", + help="Path to JSONL to upload as a new Fireworks dataset", + ) + rft_parser.add_argument( + "--dataset-builder", + help="Explicit dataset builder spec (module::function or path::function)", + ) + rft_parser.add_argument( + "--dataset-display-name", + help="Display name for dataset on Fireworks (defaults to dataset id)", + ) + rft_parser.add_argument("--base-model", 
help="Base model resource id") + rft_parser.add_argument("--warm-start-from", help="Addon model to warm start from") + rft_parser.add_argument("--output-model", help="Output model id (defaults from evaluator)") + rft_parser.add_argument("--epochs", type=int, default=1, help="Number of training epochs") + rft_parser.add_argument("--batch-size", type=int, default=128000, help="Training batch size") + rft_parser.add_argument("--learning-rate", type=float, default=3e-5, help="Learning rate") + rft_parser.add_argument("--max-context-length", type=int, default=65536, help="Maximum context length") + rft_parser.add_argument("--lora-rank", type=int, default=16, help="LoRA rank") + rft_parser.add_argument("--gradient-accumulation-steps", type=int, help="Number of gradient accumulation steps") + rft_parser.add_argument("--learning-rate-warmup-steps", type=int, help="Number of LR warmup steps") + rft_parser.add_argument("--accelerator-count", type=int, help="Number of accelerators") + rft_parser.add_argument("--region", help="Fireworks region enum value") + rft_parser.add_argument("--display-name", help="RFT job display name") + rft_parser.add_argument("--evaluation-dataset", help="Optional separate eval dataset id") + rft_parser.add_argument( + "--eval-auto-carveout", + dest="eval_auto_carveout", + action="store_true", + default=True, + help="Enable auto carveout for evaluation (default: true)", + ) + rft_parser.add_argument( + "--no-eval-auto-carveout", + dest="eval_auto_carveout", + action="store_false", + help="Disable auto carveout for evaluation", + ) + rft_parser.add_argument("--chunk-size", type=int, default=100, help="Data chunk size for rollout batching") + rft_parser.add_argument("--temperature", type=float, help="Sampling temperature") + rft_parser.add_argument("--top-p", type=float, help="Top-p sampling parameter") + rft_parser.add_argument("--top-k", type=int, help="Top-k sampling parameter") + rft_parser.add_argument("--max-output-tokens", type=int, 
default=32768, help="Maximum output tokens") + rft_parser.add_argument("--response-candidates-count", type=int, default=8, help="Number of response candidates") + rft_parser.add_argument("--extra-body", help="JSON string for extra inference params") + rft_parser.add_argument( + "--mcp-server", + help="The MCP server resource name to use for the reinforcement fine-tuning job.", + ) + rft_parser.add_argument("--wandb-enabled", action="store_true", help="Enable Weights & Biases logging") + rft_parser.add_argument("--wandb-project", help="Weights & Biases project name") + rft_parser.add_argument("--wandb-entity", help="Weights & Biases entity") + rft_parser.add_argument("--wandb-run-id", help="Weights & Biases run ID") + rft_parser.add_argument("--wandb-api-key", help="Weights & Biases API key") + rft_parser.add_argument("--job-id", help="Specify an explicit RFT job id") + rft_parser.add_argument("--yes", "-y", action="store_true", help="Non-interactive mode") + rft_parser.add_argument("--dry-run", action="store_true", help="Print planned REST calls without sending") + rft_parser.add_argument("--force", action="store_true", help="Overwrite existing evaluator with the same ID") + rft_parser.add_argument( + "--skip-validation", + action="store_true", + help="Skip local dataset and evaluator validation before creating the RFT job", + ) + rft_parser.add_argument( + "--ignore-docker", + action="store_true", + help="Ignore Dockerfile even if present; run pytest on host during evaluator validation", + ) + rft_parser.add_argument( + "--docker-build-extra", + default="", + help="Extra flags to pass to 'docker build' when validating evaluator (quoted string)", + ) + rft_parser.add_argument( + "--docker-run-extra", + default="", + help="Extra flags to pass to 'docker run' when validating evaluator (quoted string)", + ) + + # Local test command + local_test_parser = subparsers.add_parser( + "local-test", + help="Select an evaluation test and run it locally. 
If a Dockerfile exists, build and run via Docker; otherwise run on host.", + ) + local_test_parser.add_argument( + "--entry", + help="Entrypoint to run (path::function or path). If not provided, a selector will be shown (unless --yes).", + ) + local_test_parser.add_argument( + "--ignore-docker", + action="store_true", + help="Ignore Dockerfile even if present; run pytest on host", + ) + local_test_parser.add_argument( + "--yes", + "-y", + action="store_true", + help="Non-interactive: if multiple tests exist and no --entry, fails with guidance", + ) + local_test_parser.add_argument( + "--docker-build-extra", + default="", + help="Extra flags to pass to 'docker build' (quoted string)", + ) + local_test_parser.add_argument( + "--docker-run-extra", + default="", + help="Extra flags to pass to 'docker run' (quoted string)", + ) + + output_dir = args.output_dir + return generate_cli_docs(parser, output_dir) From 3ff6f8cdeafe6683e012ca50f3b479c980b0dbf1 Mon Sep 17 00:00:00 2001 From: Dylan Huang Date: Mon, 15 Dec 2025 21:54:22 -0800 Subject: [PATCH 2/5] update --- eval_protocol/cli.py | 22 +- eval_protocol/cli_commands/export_docs.py | 392 ++++------------------ 2 files changed, 79 insertions(+), 335 deletions(-) diff --git a/eval_protocol/cli.py b/eval_protocol/cli.py index 2a360c29..59c0392c 100644 --- a/eval_protocol/cli.py +++ b/eval_protocol/cli.py @@ -32,9 +32,14 @@ preview_command = None # type: ignore[assignment] -def parse_args(args=None): - """Parse command line arguments""" +def build_parser() -> argparse.ArgumentParser: + """Build and return the argument parser for the CLI.""" parser = argparse.ArgumentParser(description="eval-protocol: Tools for evaluation and reward modeling") + return _configure_parser(parser) + + +def _configure_parser(parser: argparse.ArgumentParser) -> argparse.ArgumentParser: + """Configure all arguments and subparsers on the given parser.""" parser.add_argument("--verbose", "-v", action="store_true", help="Enable verbose logging") 
parser.add_argument( "--profile", @@ -500,11 +505,18 @@ def parse_args(args=None): help=argparse.SUPPRESS, # Hidden from help output ) export_docs_parser.add_argument( - "--output-dir", - default="./docs/cli-reference", - help="Directory to write markdown files to (default: ./docs/cli-reference)", + "--output", + "-o", + default="./docs/cli-reference.md", + help="Output markdown file path (default: ./docs/cli-reference.md)", ) + return parser + + +def parse_args(args=None): + """Parse command line arguments.""" + parser = build_parser() # Use parse_known_args to allow Hydra to handle its own arguments return parser.parse_known_args(args) diff --git a/eval_protocol/cli_commands/export_docs.py b/eval_protocol/cli_commands/export_docs.py index 8ad7ade5..5b39742c 100644 --- a/eval_protocol/cli_commands/export_docs.py +++ b/eval_protocol/cli_commands/export_docs.py @@ -7,9 +7,8 @@ import argparse import logging -import os from pathlib import Path -from typing import Dict, List, Optional, Tuple +from typing import Dict, List logger = logging.getLogger(__name__) @@ -87,380 +86,113 @@ def _format_argument_row(arg: Dict) -> str: return f"| {name} | {type_str} | {default_str} | {required} | {help_text} |" -def _generate_command_markdown( +def _generate_command_section( name: str, info: Dict, - parent_command: str = "", - level: int = 1, -) -> str: - """Generate markdown documentation for a single command.""" + parent_command: str, + heading_level: int = 2, +) -> List[str]: + """Generate markdown section for a single command.""" lines = [] + full_command = f"{parent_command} {name}".strip() + heading = "#" * heading_level - # Command title - full_command = f"{parent_command} {name}".strip() if parent_command else name - heading = "#" * min(level, 4) lines.append(f"{heading} `{full_command}`") lines.append("") - # Description if info["description"]: lines.append(info["description"]) lines.append("") # Arguments table if info["arguments"]: - lines.append("**Options:**") - 
lines.append("") lines.append("| Option | Type | Default | Required | Description |") lines.append("|--------|------|---------|----------|-------------|") for arg in info["arguments"]: lines.append(_format_argument_row(arg)) lines.append("") - # Epilog + # Handle nested subparsers recursively + if info["subparsers"]: + for subname, subinfo in info["subparsers"].items(): + lines.extend( + _generate_command_section( + subname, + subinfo, + full_command, + heading_level + 1, + ) + ) + if info["epilog"]: lines.append(info["epilog"]) lines.append("") - return "\n".join(lines) - - -def _generate_subcommand_docs( - subparsers: Dict, - parent_command: str, - level: int, -) -> List[Tuple[str, str]]: - """Generate markdown docs for all subcommands, returns list of (filename, content).""" - docs = [] - - for name, info in subparsers.items(): - full_command = f"{parent_command} {name}".strip() - - # Generate this command's doc - content_lines = [] - content_lines.append(f"# `{full_command}`") - content_lines.append("") - - if info["description"]: - content_lines.append(info["description"]) - content_lines.append("") - - # Arguments table - if info["arguments"]: - content_lines.append("## Options") - content_lines.append("") - content_lines.append("| Option | Type | Default | Required | Description |") - content_lines.append("|--------|------|---------|----------|-------------|") - for arg in info["arguments"]: - content_lines.append(_format_argument_row(arg)) - content_lines.append("") - - # Handle nested subparsers - if info["subparsers"]: - content_lines.append("## Subcommands") - content_lines.append("") - for subname in info["subparsers"].keys(): - sub_full = f"{full_command} {subname}" - content_lines.append(f"- [`{sub_full}`]({name}-{subname}.md)") - content_lines.append("") - - # Recursively generate docs for nested subcommands - nested_docs = _generate_subcommand_docs( - info["subparsers"], - full_command, - level + 1, - ) - for nested_filename, nested_content in 
nested_docs: - docs.append((f"{name}-{nested_filename}", nested_content)) - - if info["epilog"]: - content_lines.append(info["epilog"]) - content_lines.append("") - - filename = name.replace(" ", "-") + ".md" - docs.append((filename, "\n".join(content_lines))) + return lines - return docs - -def generate_cli_docs(parser: argparse.ArgumentParser, output_dir: str) -> int: +def generate_cli_docs(parser: argparse.ArgumentParser, output_path: str) -> int: """ - Generate markdown documentation from an ArgumentParser. + Generate markdown documentation from an ArgumentParser to a single file. Args: - parser: The root ArgumentParser instance. - output_dir: Directory to write markdown files to. + parser: The root ArgumentParser instance. + output_path: Path to write the markdown file to. Returns: - 0 on success, 1 on failure. + 0 on success, 1 on failure. """ - output_path = Path(output_dir) - output_path.mkdir(parents=True, exist_ok=True) - # Extract parser info info = _get_parser_info(parser) - # Generate index/overview page - index_lines = [] - index_lines.append("# CLI Reference") - index_lines.append("") - index_lines.append(f"**{info['prog']}** - {info['description']}") - index_lines.append("") + # Filter out hidden commands (like export-docs itself) + visible_subparsers = { + name: subinfo + for name, subinfo in info["subparsers"].items() + if name != "export-docs" # Don't document the hidden command + } + + # Generate single page + lines = [] + lines.append("# CLI Reference") + lines.append("") + lines.append(f"**{info['prog']}** - {info['description']}") + lines.append("") # Global options if info["arguments"]: - index_lines.append("## Global Options") - index_lines.append("") - index_lines.append("| Option | Type | Default | Required | Description |") - index_lines.append("|--------|------|---------|----------|-------------|") + lines.append("## Global Options") + lines.append("") + lines.append("| Option | Type | Default | Required | Description |") + 
lines.append("|--------|------|---------|----------|-------------|") for arg in info["arguments"]: - index_lines.append(_format_argument_row(arg)) - index_lines.append("") + lines.append(_format_argument_row(arg)) + lines.append("") # Commands section - if info["subparsers"]: - index_lines.append("## Commands") - index_lines.append("") - for name, subinfo in info["subparsers"].items(): - description = subinfo["description"] or "" - # Truncate long descriptions for the index - if len(description) > 100: - description = description[:97] + "..." - index_lines.append(f"- [`{name}`]({name}.md) - {description}") - index_lines.append("") - - # Write index file - index_path = output_path / "index.md" - index_path.write_text("\n".join(index_lines), encoding="utf-8") - logger.info(f"Generated: {index_path}") - - # Generate individual command docs - if info["subparsers"]: - docs = _generate_subcommand_docs(info["subparsers"], info["prog"], 1) - for filename, content in docs: - file_path = output_path / filename - file_path.write_text(content, encoding="utf-8") - logger.info(f"Generated: {file_path}") + if visible_subparsers: + lines.append("## Commands") + lines.append("") + for name, subinfo in visible_subparsers.items(): + lines.extend(_generate_command_section(name, subinfo, info["prog"], heading_level=3)) + + # Write single file + out = Path(output_path) + out.parent.mkdir(parents=True, exist_ok=True) + out.write_text("\n".join(lines), encoding="utf-8") + logger.info(f"Generated: {out}") - logger.info(f"CLI documentation exported to: {output_path}") return 0 def export_docs_command(args: argparse.Namespace) -> int: """ - Export CLI documentation to markdown files. + Export CLI documentation to a single markdown file. This command introspects the CLI parser and generates markdown documentation. 
""" - # Import here to avoid circular imports - from eval_protocol.cli import parse_args - - # Create a fresh parser by calling parse_args with empty args - # We need to access the parser directly - parser = argparse.ArgumentParser(description="eval-protocol: Tools for evaluation and reward modeling") - parser.add_argument("--verbose", "-v", action="store_true", help="Enable verbose logging") - parser.add_argument( - "--profile", - help="Fireworks profile to use (reads ~/.fireworks/profiles//auth.ini and settings.ini)", - ) - parser.add_argument( - "--server", - help="Fireworks API server hostname or URL (e.g., dev.api.fireworks.ai or https://dev.api.fireworks.ai)", - ) - - subparsers = parser.add_subparsers(dest="command", help="Command to run") - - # Logs command - logs_parser = subparsers.add_parser("logs", help="Serve logs with file watching and real-time updates") - logs_parser.add_argument("--port", type=int, default=8000, help="Port to bind to (default: 8000)") - logs_parser.add_argument("--debug", action="store_true", help="Enable debug mode") - logs_parser.add_argument("--disable-elasticsearch-setup", action="store_true", help="Disable Elasticsearch setup") - logs_parser.add_argument( - "--use-env-elasticsearch-config", - action="store_true", - help="Use env vars for Elasticsearch config (requires ELASTICSEARCH_URL, ELASTICSEARCH_API_KEY, ELASTICSEARCH_INDEX_NAME)", - ) - logs_parser.add_argument( - "--use-fireworks", - action="store_true", - help="Force Fireworks tracing backend for logs UI (overrides env auto-detection)", - ) - logs_parser.add_argument( - "--use-elasticsearch", - action="store_true", - help="Force Elasticsearch backend for logs UI (overrides env auto-detection)", - ) - - # Upload command - upload_parser = subparsers.add_parser( - "upload", - help="Scan for evaluation tests, select, and upload as Fireworks evaluators", - ) - upload_parser.add_argument( - "--path", - default=".", - help="Path to search for evaluation tests (default: 
current directory)", - ) - upload_parser.add_argument( - "--entry", - help="Entrypoint of evaluation test to upload (module:function or path::function). For multiple, separate by commas.", - ) - upload_parser.add_argument( - "--id", - help="Evaluator ID to use (if multiple selections, a numeric suffix is appended)", - ) - upload_parser.add_argument( - "--display-name", - help="Display name for evaluator (defaults to ID)", - ) - upload_parser.add_argument( - "--description", - help="Description for evaluator", - ) - upload_parser.add_argument( - "--force", - action="store_true", - help="Overwrite existing evaluator with the same ID", - ) - upload_parser.add_argument( - "--yes", - "-y", - action="store_true", - help="Non-interactive: upload all discovered evaluation tests", - ) - upload_parser.add_argument( - "--env-file", - help="Path to .env file containing secrets to upload (default: .env in current directory)", - ) - - # Create command group - create_parser = subparsers.add_parser( - "create", - help="Resource creation commands", - ) - create_subparsers = create_parser.add_subparsers(dest="create_command") - rft_parser = create_subparsers.add_parser( - "rft", - help="Create a Reinforcement Fine-tuning Job on Fireworks", - ) - rft_parser.add_argument( - "--evaluator", - help="Evaluator ID or fully-qualified resource (accounts/{acct}/evaluators/{id}); if omitted, derive from local tests", - ) - rft_parser.add_argument( - "--dataset", - help="Use existing dataset (ID or resource 'accounts/{acct}/datasets/{id}') to skip local materialization", - ) - rft_parser.add_argument( - "--dataset-jsonl", - help="Path to JSONL to upload as a new Fireworks dataset", - ) - rft_parser.add_argument( - "--dataset-builder", - help="Explicit dataset builder spec (module::function or path::function)", - ) - rft_parser.add_argument( - "--dataset-display-name", - help="Display name for dataset on Fireworks (defaults to dataset id)", - ) - rft_parser.add_argument("--base-model", 
help="Base model resource id") - rft_parser.add_argument("--warm-start-from", help="Addon model to warm start from") - rft_parser.add_argument("--output-model", help="Output model id (defaults from evaluator)") - rft_parser.add_argument("--epochs", type=int, default=1, help="Number of training epochs") - rft_parser.add_argument("--batch-size", type=int, default=128000, help="Training batch size") - rft_parser.add_argument("--learning-rate", type=float, default=3e-5, help="Learning rate") - rft_parser.add_argument("--max-context-length", type=int, default=65536, help="Maximum context length") - rft_parser.add_argument("--lora-rank", type=int, default=16, help="LoRA rank") - rft_parser.add_argument("--gradient-accumulation-steps", type=int, help="Number of gradient accumulation steps") - rft_parser.add_argument("--learning-rate-warmup-steps", type=int, help="Number of LR warmup steps") - rft_parser.add_argument("--accelerator-count", type=int, help="Number of accelerators") - rft_parser.add_argument("--region", help="Fireworks region enum value") - rft_parser.add_argument("--display-name", help="RFT job display name") - rft_parser.add_argument("--evaluation-dataset", help="Optional separate eval dataset id") - rft_parser.add_argument( - "--eval-auto-carveout", - dest="eval_auto_carveout", - action="store_true", - default=True, - help="Enable auto carveout for evaluation (default: true)", - ) - rft_parser.add_argument( - "--no-eval-auto-carveout", - dest="eval_auto_carveout", - action="store_false", - help="Disable auto carveout for evaluation", - ) - rft_parser.add_argument("--chunk-size", type=int, default=100, help="Data chunk size for rollout batching") - rft_parser.add_argument("--temperature", type=float, help="Sampling temperature") - rft_parser.add_argument("--top-p", type=float, help="Top-p sampling parameter") - rft_parser.add_argument("--top-k", type=int, help="Top-k sampling parameter") - rft_parser.add_argument("--max-output-tokens", type=int, 
default=32768, help="Maximum output tokens") - rft_parser.add_argument("--response-candidates-count", type=int, default=8, help="Number of response candidates") - rft_parser.add_argument("--extra-body", help="JSON string for extra inference params") - rft_parser.add_argument( - "--mcp-server", - help="The MCP server resource name to use for the reinforcement fine-tuning job.", - ) - rft_parser.add_argument("--wandb-enabled", action="store_true", help="Enable Weights & Biases logging") - rft_parser.add_argument("--wandb-project", help="Weights & Biases project name") - rft_parser.add_argument("--wandb-entity", help="Weights & Biases entity") - rft_parser.add_argument("--wandb-run-id", help="Weights & Biases run ID") - rft_parser.add_argument("--wandb-api-key", help="Weights & Biases API key") - rft_parser.add_argument("--job-id", help="Specify an explicit RFT job id") - rft_parser.add_argument("--yes", "-y", action="store_true", help="Non-interactive mode") - rft_parser.add_argument("--dry-run", action="store_true", help="Print planned REST calls without sending") - rft_parser.add_argument("--force", action="store_true", help="Overwrite existing evaluator with the same ID") - rft_parser.add_argument( - "--skip-validation", - action="store_true", - help="Skip local dataset and evaluator validation before creating the RFT job", - ) - rft_parser.add_argument( - "--ignore-docker", - action="store_true", - help="Ignore Dockerfile even if present; run pytest on host during evaluator validation", - ) - rft_parser.add_argument( - "--docker-build-extra", - default="", - help="Extra flags to pass to 'docker build' when validating evaluator (quoted string)", - ) - rft_parser.add_argument( - "--docker-run-extra", - default="", - help="Extra flags to pass to 'docker run' when validating evaluator (quoted string)", - ) - - # Local test command - local_test_parser = subparsers.add_parser( - "local-test", - help="Select an evaluation test and run it locally. 
If a Dockerfile exists, build and run via Docker; otherwise run on host.", - ) - local_test_parser.add_argument( - "--entry", - help="Entrypoint to run (path::function or path). If not provided, a selector will be shown (unless --yes).", - ) - local_test_parser.add_argument( - "--ignore-docker", - action="store_true", - help="Ignore Dockerfile even if present; run pytest on host", - ) - local_test_parser.add_argument( - "--yes", - "-y", - action="store_true", - help="Non-interactive: if multiple tests exist and no --entry, fails with guidance", - ) - local_test_parser.add_argument( - "--docker-build-extra", - default="", - help="Extra flags to pass to 'docker build' (quoted string)", - ) - local_test_parser.add_argument( - "--docker-run-extra", - default="", - help="Extra flags to pass to 'docker run' (quoted string)", - ) - - output_dir = args.output_dir - return generate_cli_docs(parser, output_dir) + # Import the parser builder from cli.py to get the actual parser + from eval_protocol.cli import build_parser + + parser = build_parser() + return generate_cli_docs(parser, args.output) From 5e194935f73f51b171405e6d7158b8992be124e3 Mon Sep 17 00:00:00 2001 From: Dylan Huang Date: Mon, 15 Dec 2025 22:02:12 -0800 Subject: [PATCH 3/5] Enhance CLI documentation generation by updating subparser help extraction. Introduce a method to hide suppressed commands from help output and ensure accurate help text is included for subparsers. 
--- docs/cli-reference.md | 106 ++++++++++++++++++++++ eval_protocol/cli.py | 22 ++++- eval_protocol/cli_commands/export_docs.py | 20 +++- 3 files changed, 139 insertions(+), 9 deletions(-) create mode 100644 docs/cli-reference.md diff --git a/docs/cli-reference.md b/docs/cli-reference.md new file mode 100644 index 00000000..73de1862 --- /dev/null +++ b/docs/cli-reference.md @@ -0,0 +1,106 @@ +# CLI Reference + +**ep** - eval-protocol: Tools for evaluation and reward modeling + +## Global Options + +| Option | Type | Default | Required | Description | +|--------|------|---------|----------|-------------| +| `--verbose`, `-v` | | false | No | Enable verbose logging | +| `--profile` | | - | No | Fireworks profile to use (reads ~/.fireworks/profiles//auth.ini and settings.ini) | +| `--server` | | - | No | Fireworks API server hostname or URL (e.g., dev.api.fireworks.ai or https://dev.api.fireworks.ai) | + +## Commands + +### `ep logs` + +Serve logs with file watching and real-time updates + +| Option | Type | Default | Required | Description | +|--------|------|---------|----------|-------------| +| `--port` | int | `8000` | No | Port to bind to (default: 8000) | +| `--debug` | | false | No | Enable debug mode | +| `--disable-elasticsearch-setup` | | false | No | Disable Elasticsearch setup | +| `--use-env-elasticsearch-config` | | false | No | Use env vars for Elasticsearch config (requires ELASTICSEARCH_URL, ELASTICSEARCH_API_KEY, ELASTICSEARCH_INDEX_NAME) | +| `--use-fireworks` | | false | No | Force Fireworks tracing backend for logs UI (overrides env auto-detection) | +| `--use-elasticsearch` | | false | No | Force Elasticsearch backend for logs UI (overrides env auto-detection) | + +### `ep upload` + +Scan for evaluation tests, select, and upload as Fireworks evaluators + +| Option | Type | Default | Required | Description | +|--------|------|---------|----------|-------------| +| `--path` | | `.` | No | Path to search for evaluation tests (default: current 
directory) | +| `--entry` | | - | No | Entrypoint of evaluation test to upload (module:function or path::function). For multiple, separate by commas. | +| `--id` | | - | No | Evaluator ID to use (if multiple selections, a numeric suffix is appended) | +| `--display-name` | | - | No | Display name for evaluator (defaults to ID) | +| `--description` | | - | No | Description for evaluator | +| `--force` | | false | No | Overwrite existing evaluator with the same ID | +| `--yes`, `-y` | | false | No | Non-interactive: upload all discovered evaluation tests | +| `--env-file` | | - | No | Path to .env file containing secrets to upload (default: .env in current directory) | + +### `ep create` + +Resource creation commands + +#### `ep create rft` + +Create a Reinforcement Fine-tuning Job on Fireworks + +| Option | Type | Default | Required | Description | +|--------|------|---------|----------|-------------| +| `--evaluator` | | - | No | Evaluator ID or fully-qualified resource (accounts/{acct}/evaluators/{id}); if omitted, derive from local tests | +| `--dataset` | | - | No | Use existing dataset (ID or resource 'accounts/{acct}/datasets/{id}') to skip local materialization | +| `--dataset-jsonl` | | - | No | Path to JSONL to upload as a new Fireworks dataset | +| `--dataset-builder` | | - | No | Explicit dataset builder spec (module::function or path::function) | +| `--dataset-display-name` | | - | No | Display name for dataset on Fireworks (defaults to dataset id) | +| `--base-model` | | - | No | Base model resource id | +| `--warm-start-from` | | - | No | Addon model to warm start from | +| `--output-model` | | - | No | Output model id (defaults from evaluator) | +| `--epochs` | int | `1` | No | - | +| `--batch-size` | int | `128000` | No | - | +| `--learning-rate` | float | `3e-05` | No | - | +| `--max-context-length` | int | `65536` | No | - | +| `--lora-rank` | int | `16` | No | - | +| `--gradient-accumulation-steps` | int | - | No | Number of gradient accumulation 
steps | +| `--learning-rate-warmup-steps` | int | - | No | Number of LR warmup steps | +| `--accelerator-count` | int | - | No | - | +| `--region` | | - | No | Fireworks region enum value | +| `--display-name` | | - | No | RFT job display name | +| `--evaluation-dataset` | | - | No | Optional separate eval dataset id | +| `--eval-auto-carveout` | | true | No | - | +| `--no-eval-auto-carveout` | | true | No | - | +| `--chunk-size` | int | `100` | No | Data chunk size for rollout batching | +| `--temperature` | float | - | No | - | +| `--top-p` | float | - | No | - | +| `--top-k` | int | - | No | - | +| `--max-output-tokens` | int | `32768` | No | - | +| `--response-candidates-count` | int | `8` | No | - | +| `--extra-body` | | - | No | JSON string for extra inference params | +| `--mcp-server` | | - | No | The MCP server resource name to use for the reinforcement fine-tuning job. | +| `--wandb-enabled` | | false | No | - | +| `--wandb-project` | | - | No | - | +| `--wandb-entity` | | - | No | - | +| `--wandb-run-id` | | - | No | - | +| `--wandb-api-key` | | - | No | - | +| `--job-id` | | - | No | Specify an explicit RFT job id | +| `--yes`, `-y` | | false | No | Non-interactive mode | +| `--dry-run` | | false | No | Print planned REST calls without sending | +| `--force` | | false | No | Overwrite existing evaluator with the same ID | +| `--skip-validation` | | false | No | Skip local dataset and evaluator validation before creating the RFT job | +| `--ignore-docker` | | false | No | Ignore Dockerfile even if present; run pytest on host during evaluator validation | +| `--docker-build-extra` | | `` | No | Extra flags to pass to 'docker build' when validating evaluator (quoted string, e.g. "--no-cache --pull --progress=plain") | +| `--docker-run-extra` | | `` | No | Extra flags to pass to 'docker run' when validating evaluator (quoted string, e.g. "--env-file .env --memory=8g") | + +### `ep local-test` + +Select an evaluation test and run it locally. 
If a Dockerfile exists, build and run via Docker; otherwise run on host. + +| Option | Type | Default | Required | Description | +|--------|------|---------|----------|-------------| +| `--entry` | | - | No | Entrypoint to run (path::function or path). If not provided, a selector will be shown (unless --yes). | +| `--ignore-docker` | | false | No | Ignore Dockerfile even if present; run pytest on host | +| `--yes`, `-y` | | false | No | Non-interactive: if multiple tests exist and no --entry, fails with guidance | +| `--docker-build-extra` | | `` | No | Extra flags to pass to 'docker build' (quoted string, e.g. "--no-cache --pull --progress=plain") | +| `--docker-run-extra` | | `` | No | Extra flags to pass to 'docker run' (quoted string, e.g. "--env-file .env --memory=8g") | diff --git a/eval_protocol/cli.py b/eval_protocol/cli.py index 59c0392c..3bb455d7 100644 --- a/eval_protocol/cli.py +++ b/eval_protocol/cli.py @@ -500,10 +500,7 @@ def _configure_parser(parser: argparse.ArgumentParser) -> argparse.ArgumentParse # ) # Hidden command: export-docs (for generating CLI reference documentation) - export_docs_parser = subparsers.add_parser( - "export-docs", - help=argparse.SUPPRESS, # Hidden from help output - ) + export_docs_parser = subparsers.add_parser("export-docs", help=argparse.SUPPRESS) export_docs_parser.add_argument( "--output", "-o", @@ -511,9 +508,26 @@ def _configure_parser(parser: argparse.ArgumentParser) -> argparse.ArgumentParse help="Output markdown file path (default: ./docs/cli-reference.md)", ) + # Update metavar to only show visible commands (exclude those with SUPPRESS) + _hide_suppressed_subparsers(parser) + return parser +def _hide_suppressed_subparsers(parser: argparse.ArgumentParser) -> None: + """Update subparsers to exclude commands with help=SUPPRESS from help output.""" + for action in parser._actions: + if isinstance(action, argparse._SubParsersAction): + # Filter _choices_actions to only visible commands + choices_actions = 
getattr(action, "_choices_actions", []) + visible_actions = [a for a in choices_actions if a.help != argparse.SUPPRESS] + action._choices_actions = visible_actions + # Update metavar to match + visible_names = [a.dest for a in visible_actions] + if visible_names: + action.metavar = "{" + ",".join(visible_names) + "}" + + def parse_args(args=None): """Parse command line arguments.""" parser = build_parser() diff --git a/eval_protocol/cli_commands/export_docs.py b/eval_protocol/cli_commands/export_docs.py index 5b39742c..feb083b0 100644 --- a/eval_protocol/cli_commands/export_docs.py +++ b/eval_protocol/cli_commands/export_docs.py @@ -13,11 +13,12 @@ logger = logging.getLogger(__name__) -def _get_parser_info(parser: argparse.ArgumentParser) -> Dict: +def _get_parser_info(parser: argparse.ArgumentParser, subparser_help: str = "") -> Dict: """Extract information from an ArgumentParser.""" info = { "prog": parser.prog, "description": parser.description or "", + "help": subparser_help, # The help text from add_parser() "epilog": parser.epilog or "", "arguments": [], "subparsers": {}, @@ -26,9 +27,16 @@ def _get_parser_info(parser: argparse.ArgumentParser) -> Dict: # Extract arguments for action in parser._actions: if isinstance(action, argparse._SubParsersAction): - # Handle subparsers + # Handle subparsers - also extract the help text for each for name, subparser in action.choices.items(): - info["subparsers"][name] = _get_parser_info(subparser) + # Get the help text from the subparser action's _parser_class + subparser_help_text = "" + if hasattr(action, "_choices_actions"): + for choice_action in action._choices_actions: + if choice_action.dest == name: + subparser_help_text = choice_action.help or "" + break + info["subparsers"][name] = _get_parser_info(subparser, subparser_help_text) elif isinstance(action, argparse._HelpAction): # Skip help action, it's always present continue @@ -100,8 +108,10 @@ def _generate_command_section( lines.append(f"{heading} 
`{full_command}`") lines.append("") - if info["description"]: - lines.append(info["description"]) + # Use help text (from add_parser) or description (from ArgumentParser) + description = info.get("help") or info.get("description") or "" + if description and description != argparse.SUPPRESS: + lines.append(description) lines.append("") # Arguments table From a1d4cd5ed52c23e88125ae6b23e564ccb2496d28 Mon Sep 17 00:00:00 2001 From: Dylan Huang Date: Mon, 15 Dec 2025 22:03:25 -0800 Subject: [PATCH 4/5] remove generated cli-reference --- docs/cli-reference.md | 106 ------------------------------------------ 1 file changed, 106 deletions(-) delete mode 100644 docs/cli-reference.md diff --git a/docs/cli-reference.md b/docs/cli-reference.md deleted file mode 100644 index 73de1862..00000000 --- a/docs/cli-reference.md +++ /dev/null @@ -1,106 +0,0 @@ -# CLI Reference - -**ep** - eval-protocol: Tools for evaluation and reward modeling - -## Global Options - -| Option | Type | Default | Required | Description | -|--------|------|---------|----------|-------------| -| `--verbose`, `-v` | | false | No | Enable verbose logging | -| `--profile` | | - | No | Fireworks profile to use (reads ~/.fireworks/profiles//auth.ini and settings.ini) | -| `--server` | | - | No | Fireworks API server hostname or URL (e.g., dev.api.fireworks.ai or https://dev.api.fireworks.ai) | - -## Commands - -### `ep logs` - -Serve logs with file watching and real-time updates - -| Option | Type | Default | Required | Description | -|--------|------|---------|----------|-------------| -| `--port` | int | `8000` | No | Port to bind to (default: 8000) | -| `--debug` | | false | No | Enable debug mode | -| `--disable-elasticsearch-setup` | | false | No | Disable Elasticsearch setup | -| `--use-env-elasticsearch-config` | | false | No | Use env vars for Elasticsearch config (requires ELASTICSEARCH_URL, ELASTICSEARCH_API_KEY, ELASTICSEARCH_INDEX_NAME) | -| `--use-fireworks` | | false | No | Force Fireworks tracing 
backend for logs UI (overrides env auto-detection) | -| `--use-elasticsearch` | | false | No | Force Elasticsearch backend for logs UI (overrides env auto-detection) | - -### `ep upload` - -Scan for evaluation tests, select, and upload as Fireworks evaluators - -| Option | Type | Default | Required | Description | -|--------|------|---------|----------|-------------| -| `--path` | | `.` | No | Path to search for evaluation tests (default: current directory) | -| `--entry` | | - | No | Entrypoint of evaluation test to upload (module:function or path::function). For multiple, separate by commas. | -| `--id` | | - | No | Evaluator ID to use (if multiple selections, a numeric suffix is appended) | -| `--display-name` | | - | No | Display name for evaluator (defaults to ID) | -| `--description` | | - | No | Description for evaluator | -| `--force` | | false | No | Overwrite existing evaluator with the same ID | -| `--yes`, `-y` | | false | No | Non-interactive: upload all discovered evaluation tests | -| `--env-file` | | - | No | Path to .env file containing secrets to upload (default: .env in current directory) | - -### `ep create` - -Resource creation commands - -#### `ep create rft` - -Create a Reinforcement Fine-tuning Job on Fireworks - -| Option | Type | Default | Required | Description | -|--------|------|---------|----------|-------------| -| `--evaluator` | | - | No | Evaluator ID or fully-qualified resource (accounts/{acct}/evaluators/{id}); if omitted, derive from local tests | -| `--dataset` | | - | No | Use existing dataset (ID or resource 'accounts/{acct}/datasets/{id}') to skip local materialization | -| `--dataset-jsonl` | | - | No | Path to JSONL to upload as a new Fireworks dataset | -| `--dataset-builder` | | - | No | Explicit dataset builder spec (module::function or path::function) | -| `--dataset-display-name` | | - | No | Display name for dataset on Fireworks (defaults to dataset id) | -| `--base-model` | | - | No | Base model resource id | -| 
`--warm-start-from` | | - | No | Addon model to warm start from | -| `--output-model` | | - | No | Output model id (defaults from evaluator) | -| `--epochs` | int | `1` | No | - | -| `--batch-size` | int | `128000` | No | - | -| `--learning-rate` | float | `3e-05` | No | - | -| `--max-context-length` | int | `65536` | No | - | -| `--lora-rank` | int | `16` | No | - | -| `--gradient-accumulation-steps` | int | - | No | Number of gradient accumulation steps | -| `--learning-rate-warmup-steps` | int | - | No | Number of LR warmup steps | -| `--accelerator-count` | int | - | No | - | -| `--region` | | - | No | Fireworks region enum value | -| `--display-name` | | - | No | RFT job display name | -| `--evaluation-dataset` | | - | No | Optional separate eval dataset id | -| `--eval-auto-carveout` | | true | No | - | -| `--no-eval-auto-carveout` | | true | No | - | -| `--chunk-size` | int | `100` | No | Data chunk size for rollout batching | -| `--temperature` | float | - | No | - | -| `--top-p` | float | - | No | - | -| `--top-k` | int | - | No | - | -| `--max-output-tokens` | int | `32768` | No | - | -| `--response-candidates-count` | int | `8` | No | - | -| `--extra-body` | | - | No | JSON string for extra inference params | -| `--mcp-server` | | - | No | The MCP server resource name to use for the reinforcement fine-tuning job. 
| -| `--wandb-enabled` | | false | No | - | -| `--wandb-project` | | - | No | - | -| `--wandb-entity` | | - | No | - | -| `--wandb-run-id` | | - | No | - | -| `--wandb-api-key` | | - | No | - | -| `--job-id` | | - | No | Specify an explicit RFT job id | -| `--yes`, `-y` | | false | No | Non-interactive mode | -| `--dry-run` | | false | No | Print planned REST calls without sending | -| `--force` | | false | No | Overwrite existing evaluator with the same ID | -| `--skip-validation` | | false | No | Skip local dataset and evaluator validation before creating the RFT job | -| `--ignore-docker` | | false | No | Ignore Dockerfile even if present; run pytest on host during evaluator validation | -| `--docker-build-extra` | | `` | No | Extra flags to pass to 'docker build' when validating evaluator (quoted string, e.g. "--no-cache --pull --progress=plain") | -| `--docker-run-extra` | | `` | No | Extra flags to pass to 'docker run' when validating evaluator (quoted string, e.g. "--env-file .env --memory=8g") | - -### `ep local-test` - -Select an evaluation test and run it locally. If a Dockerfile exists, build and run via Docker; otherwise run on host. - -| Option | Type | Default | Required | Description | -|--------|------|---------|----------|-------------| -| `--entry` | | - | No | Entrypoint to run (path::function or path). If not provided, a selector will be shown (unless --yes). | -| `--ignore-docker` | | false | No | Ignore Dockerfile even if present; run pytest on host | -| `--yes`, `-y` | | false | No | Non-interactive: if multiple tests exist and no --entry, fails with guidance | -| `--docker-build-extra` | | `` | No | Extra flags to pass to 'docker build' (quoted string, e.g. "--no-cache --pull --progress=plain") | -| `--docker-run-extra` | | `` | No | Extra flags to pass to 'docker run' (quoted string, e.g. 
"--env-file .env --memory=8g") | From cefc4618d91022b60cd70835f716c120609e5f85 Mon Sep 17 00:00:00 2001 From: Dylan Huang Date: Mon, 15 Dec 2025 22:43:45 -0800 Subject: [PATCH 5/5] update --- eval_protocol/cli.py | 63 ++++++---- eval_protocol/cli_commands/export_docs.py | 136 ++++++++++++++++------ 2 files changed, 139 insertions(+), 60 deletions(-) diff --git a/eval_protocol/cli.py b/eval_protocol/cli.py index 3bb455d7..471a5bae 100644 --- a/eval_protocol/cli.py +++ b/eval_protocol/cli.py @@ -34,7 +34,9 @@ def build_parser() -> argparse.ArgumentParser: """Build and return the argument parser for the CLI.""" - parser = argparse.ArgumentParser(description="eval-protocol: Tools for evaluation and reward modeling") + parser = argparse.ArgumentParser( + description="Inspect evaluation runs locally, upload evaluators, and create reinforcement fine-tuning jobs on Fireworks" + ) return _configure_parser(parser) @@ -401,39 +403,52 @@ def _configure_parser(parser: argparse.ArgumentParser) -> argparse.ArgumentParse rft_parser.add_argument("--base-model", help="Base model resource id") rft_parser.add_argument("--warm-start-from", help="Addon model to warm start from") rft_parser.add_argument("--output-model", help="Output model id (defaults from evaluator)") - rft_parser.add_argument("--epochs", type=int, default=1) - rft_parser.add_argument("--batch-size", type=int, default=128000) - rft_parser.add_argument("--learning-rate", type=float, default=3e-5) - rft_parser.add_argument("--max-context-length", type=int, default=65536) - rft_parser.add_argument("--lora-rank", type=int, default=16) + rft_parser.add_argument("--epochs", type=int, default=1, help="Number of training epochs") + rft_parser.add_argument("--batch-size", type=int, default=128000, help="Training batch size in tokens") + rft_parser.add_argument("--learning-rate", type=float, default=3e-5, help="Learning rate for training") + rft_parser.add_argument("--max-context-length", type=int, default=65536, help="Maximum 
context length in tokens") + rft_parser.add_argument("--lora-rank", type=int, default=16, help="LoRA rank for fine-tuning") rft_parser.add_argument("--gradient-accumulation-steps", type=int, help="Number of gradient accumulation steps") - rft_parser.add_argument("--learning-rate-warmup-steps", type=int, help="Number of LR warmup steps") - rft_parser.add_argument("--accelerator-count", type=int) - rft_parser.add_argument("--region", help="Fireworks region enum value") - rft_parser.add_argument("--display-name", help="RFT job display name") - rft_parser.add_argument("--evaluation-dataset", help="Optional separate eval dataset id") - rft_parser.add_argument("--eval-auto-carveout", dest="eval_auto_carveout", action="store_true", default=True) - rft_parser.add_argument("--no-eval-auto-carveout", dest="eval_auto_carveout", action="store_false") + rft_parser.add_argument("--learning-rate-warmup-steps", type=int, help="Number of learning rate warmup steps") + rft_parser.add_argument("--accelerator-count", type=int, help="Number of accelerators (GPUs) to use") + rft_parser.add_argument("--region", help="Fireworks region for training") + rft_parser.add_argument("--display-name", help="Display name for the RFT job") + rft_parser.add_argument("--evaluation-dataset", help="Separate dataset id for evaluation") + rft_parser.add_argument( + "--eval-auto-carveout", + dest="eval_auto_carveout", + action="store_true", + default=True, + help="Automatically carve out evaluation data from training set", + ) + rft_parser.add_argument( + "--no-eval-auto-carveout", + dest="eval_auto_carveout", + action="store_false", + help="Disable automatic evaluation data carveout", + ) # Rollout chunking rft_parser.add_argument("--chunk-size", type=int, default=100, help="Data chunk size for rollout batching") # Inference params - rft_parser.add_argument("--temperature", type=float) - rft_parser.add_argument("--top-p", type=float) - rft_parser.add_argument("--top-k", type=int) - 
rft_parser.add_argument("--max-output-tokens", type=int, default=32768) - rft_parser.add_argument("--response-candidates-count", type=int, default=8) + rft_parser.add_argument("--temperature", type=float, help="Sampling temperature for rollouts") + rft_parser.add_argument("--top-p", type=float, help="Top-p (nucleus) sampling parameter") + rft_parser.add_argument("--top-k", type=int, help="Top-k sampling parameter") + rft_parser.add_argument("--max-output-tokens", type=int, default=32768, help="Maximum output tokens per rollout") + rft_parser.add_argument( + "--response-candidates-count", type=int, default=8, help="Number of response candidates per prompt" + ) rft_parser.add_argument("--extra-body", help="JSON string for extra inference params") # MCP server (optional) rft_parser.add_argument( "--mcp-server", - help="The MCP server resource name to use for the reinforcement fine-tuning job.", + help="MCP server resource name for agentic rollouts", ) # Wandb - rft_parser.add_argument("--wandb-enabled", action="store_true") - rft_parser.add_argument("--wandb-project") - rft_parser.add_argument("--wandb-entity") - rft_parser.add_argument("--wandb-run-id") - rft_parser.add_argument("--wandb-api-key") + rft_parser.add_argument("--wandb-enabled", action="store_true", help="Enable Weights & Biases logging") + rft_parser.add_argument("--wandb-project", help="Weights & Biases project name") + rft_parser.add_argument("--wandb-entity", help="Weights & Biases entity (username or team)") + rft_parser.add_argument("--wandb-run-id", help="Weights & Biases run id for resuming") + rft_parser.add_argument("--wandb-api-key", help="Weights & Biases API key") # Misc rft_parser.add_argument("--job-id", help="Specify an explicit RFT job id") rft_parser.add_argument("--yes", "-y", action="store_true", help="Non-interactive mode") diff --git a/eval_protocol/cli_commands/export_docs.py b/eval_protocol/cli_commands/export_docs.py index feb083b0..4240b5c3 100644 --- 
a/eval_protocol/cli_commands/export_docs.py +++ b/eval_protocol/cli_commands/export_docs.py @@ -59,39 +59,83 @@ def _get_parser_info(parser: argparse.ArgumentParser, subparser_help: str = "") return info -def _format_argument_row(arg: Dict) -> str: - """Format a single argument as a markdown table row.""" - # Build the flag/argument name +def _format_argument_item(arg: Dict) -> List[str]: + """Format a single argument as a Mintlify ParamField component.""" + lines = [] + + # Build the flag name if arg["option_strings"]: - name = ", ".join(f"`{opt}`" for opt in arg["option_strings"]) + long_opts = [o for o in arg["option_strings"] if o.startswith("--")] + short_opts = [o for o in arg["option_strings"] if not o.startswith("--")] + primary = long_opts[0] if long_opts else arg["option_strings"][0] else: - name = f"`{arg['dest']}`" + primary = arg["dest"] + short_opts = [] - # Build type info + # Map Python types to ParamField types type_str = "" if arg["type"]: - type_str = getattr(arg["type"], "__name__", str(arg["type"])) - if arg["choices"]: - type_str = f"choices: {arg['choices']}" - - # Format default value + python_type = getattr(arg["type"], "__name__", str(arg["type"])) + type_map = {"int": "number", "float": "number", "str": "string", "bool": "boolean"} + type_str = type_map.get(python_type, python_type) + elif arg["default"] is not None: + # Infer type from default + if isinstance(arg["default"], bool): + type_str = "boolean" + elif isinstance(arg["default"], int): + type_str = "number" + elif isinstance(arg["default"], float): + type_str = "number" + elif isinstance(arg["default"], str): + type_str = "string" + + # Build ParamField attributes + attrs = [f'path="{primary}"'] + + if type_str: + attrs.append(f'type="{type_str}"') + + # Default value default = arg["default"] - if default is None: - default_str = "-" - elif default == argparse.SUPPRESS: - default_str = "-" - elif isinstance(default, bool): - default_str = str(default).lower() - else: - 
default_str = f"`{default}`"
+    if default is not None and default != argparse.SUPPRESS:
+        if isinstance(default, bool):
+            default_str = str(default).lower()
+        elif isinstance(default, str):
+            # Escape quotes in string defaults
+            default_str = default.replace('"', '\\"')
+        else:
+            default_str = str(default)
+        attrs.append(f'default="{default_str}"')
+
+    if arg["required"]:
+        attrs.append("required")
+
+    # Build description with short alias mention
+    help_text = (arg["help"] or "").replace("<", "&lt;").replace(">", "&gt;")
+    if short_opts:
+        alias_note = f"Short: `{short_opts[0]}`"
+        if help_text:
+            help_text = f"{help_text} ({alias_note})"
+        else:
+            help_text = alias_note
 
-    # Help text (escape pipe characters for markdown tables)
-    help_text = (arg["help"] or "-").replace("|", "\\|")
+    # Add choices info to description
+    if arg["choices"]:
+        choices_str = ", ".join(f"`{c}`" for c in arg["choices"])
+        choices_note = f"Choices: {choices_str}"
+        if help_text:
+            help_text = f"{help_text}. {choices_note}"
+        else:
+            help_text = choices_note
 
-    # Required indicator
-    required = "Yes" if arg["required"] else "No"
+    # Generate ParamField
+    lines.append(f"<ParamField {' '.join(attrs)}>")
+    if help_text:
+        lines.append(f"  {help_text}")
+    lines.append("</ParamField>")
+    lines.append("")
 
-    return f"| {name} | {type_str} | {default_str} | {required} | {help_text} |"
+    return lines
 
 
 def _generate_command_section(
@@ -105,6 +149,21 @@ def _generate_command_section(
     full_command = f"{parent_command} {name}".strip()
     heading = "#" * heading_level
 
+    # Skip commands that have no arguments and only subparsers (like "ep create")
+    # Instead, just render the subcommands directly at the same level
+    if not info["arguments"] and info["subparsers"]:
+        # Skip this level, render subcommands directly
+        for subname, subinfo in info["subparsers"].items():
+            lines.extend(
+                _generate_command_section(
+                    subname,
+                    subinfo,
+                    full_command,
+                    heading_level,  # Keep same heading level
+                )
+            )
+        return lines
+
     lines.append(f"{heading} `{full_command}`")
lines.append("") @@ -114,13 +173,10 @@ def _generate_command_section( lines.append(description) lines.append("") - # Arguments table + # Arguments (no extra heading to keep TOC clean) if info["arguments"]: - lines.append("| Option | Type | Default | Required | Description |") - lines.append("|--------|------|---------|----------|-------------|") for arg in info["arguments"]: - lines.append(_format_argument_row(arg)) - lines.append("") + lines.extend(_format_argument_item(arg)) # Handle nested subparsers recursively if info["subparsers"]: @@ -162,22 +218,30 @@ def generate_cli_docs(parser: argparse.ArgumentParser, output_path: str) -> int: if name != "export-docs" # Don't document the hidden command } - # Generate single page + # Generate single page with Mintlify frontmatter lines = [] - lines.append("# CLI Reference") + lines.append("---") + lines.append("title: CLI") + lines.append("icon: terminal") + lines.append("---") lines.append("") - lines.append(f"**{info['prog']}** - {info['description']}") + lines.append( + f"The `{info['prog']}` command-line interface can {info['description'][0].lower()}{info['description'][1:]}." + ) + lines.append("") + lines.append("```bash") + lines.append(f"{info['prog']} [global options] [command options]") + lines.append("```") lines.append("") # Global options if info["arguments"]: lines.append("## Global Options") lines.append("") - lines.append("| Option | Type | Default | Required | Description |") - lines.append("|--------|------|---------|----------|-------------|") - for arg in info["arguments"]: - lines.append(_format_argument_row(arg)) + lines.append("These options can be used with any command:") lines.append("") + for arg in info["arguments"]: + lines.extend(_format_argument_item(arg)) # Commands section if visible_subparsers: