From e595e40dc5e97b632d40ac7e7984aed84079ab86 Mon Sep 17 00:00:00 2001
From: Mike McKiernan <mmckiernan@nvidia.com>
Date: Wed, 6 May 2026 15:22:57 -0400
Subject: [PATCH 1/2] docs: SDG and domain data set example

Signed-off-by: Mike McKiernan <mmckiernan@nvidia.com>
---
 docs/_static/customize.css                    |   2 +
 docs/index.md                                 |  10 +
 docs/sdg/_snippets/input/greenteme.yaml       |  75 ++++
 .../input/greenteme_inquiry_seeds.jsonl       |  12 +
 .../input/step-with-person-datetime.py        | 327 ++++++++++++++++++
 docs/sdg/getting-started.md                   | 138 ++++++++
 docs/sdg/how-to/create-domain-dataset.md      | 131 +++++++
 docs/sdg/how-to/dispatch-to-cluster.md        | 162 +++++++++
 docs/sdg/how-to/index.md                      |  77 +++++
 docs/sdg/how-to/preference-data.md            |  93 +++++
 docs/sdg/how-to/run.md                        |  61 ++++
 docs/sdg/how-to/tool-call-data.md             | 120 +++++++
 docs/sdg/index.md                             | 137 ++++++++
 docs/sdg/reference/cli-reference.md           | 111 ++++++
 docs/sdg/reference/config-schema.md           | 213 ++++++++++++
 docs/sdg/reference/index.md                   |  68 ++++
 docs/sdg/reference/output-projections.md      | 148 ++++++++
 docs/sdg/reference/troubleshooting.md         | 112 ++++++
 18 files changed, 1997 insertions(+)
 create mode 100644 docs/sdg/_snippets/input/greenteme.yaml
 create mode 100644 docs/sdg/_snippets/input/greenteme_inquiry_seeds.jsonl
 create mode 100644 docs/sdg/_snippets/input/step-with-person-datetime.py
 create mode 100644 docs/sdg/getting-started.md
 create mode 100644 docs/sdg/how-to/create-domain-dataset.md
 create mode 100644 docs/sdg/how-to/dispatch-to-cluster.md
 create mode 100644 docs/sdg/how-to/index.md
 create mode 100644 docs/sdg/how-to/preference-data.md
 create mode 100644 docs/sdg/how-to/run.md
 create mode 100644 docs/sdg/how-to/tool-call-data.md
 create mode 100644 docs/sdg/index.md
 create mode 100644 docs/sdg/reference/cli-reference.md
 create mode 100644 docs/sdg/reference/config-schema.md
 create mode 100644 docs/sdg/reference/index.md
 create mode 100644 docs/sdg/reference/output-projections.md
 create mode 100644 docs/sdg/reference/troubleshooting.md

diff --git a/docs/_static/customize.css b/docs/_static/customize.css
index 82510d30f..0359a65db 100644
--- a/docs/_static/customize.css
+++ b/docs/_static/customize.css
@@ -1,2 +1,4 @@
 .admonition.pattern-metadata { border-left-color: #f0ad4e; }
 .admonition.paper-reference { border-left-color: #5bc0de; }
+
+.scrollable pre { max-height: 400px; overflow-y: auto; }
diff --git a/docs/index.md b/docs/index.md
index 336f9e9a1..8ba98b838 100644
--- a/docs/index.md
+++ b/docs/index.md
@@ -171,6 +171,16 @@ nemotron/embed/README.md
 nemotron/artifacts.md
 ```
 
+```{toctree}
+:caption: Synthetic Data Generation
+:hidden:
+
+About <sdg/index>
+Getting Started <sdg/getting-started>
+Tasks <sdg/how-to/index>
+Reference <sdg/reference/index>
+```
+
 ```{toctree}
 :caption: Customization
 :hidden:
diff --git a/docs/sdg/_snippets/input/greenteme.yaml b/docs/sdg/_snippets/input/greenteme.yaml
new file mode 100644
index 000000000..d4f88c796
--- /dev/null
+++ b/docs/sdg/_snippets/input/greenteme.yaml
@@ -0,0 +1,75 @@
+output_dir: ${oc.env:SDG_OUTPUT_DIR,${oc.env:NEMO_RUN_DIR,${oc.env:PWD}/output}/sdg}
+output_path: ${output_dir}/greenteme_sft.jsonl
+num_records: 100
+
+seed_dataset:
+  path: ${oc.env:PWD}/src/nemotron/steps/sdg/data_designer/data/greenteme_inquiry_seeds.jsonl
+  strategy: shuffle
+  fields: [scenario]
+
+models:
+  - alias: nvidia-text
+    model: nvidia/nemotron-3-nano-30b-a3b
+    provider: nvidia
+    skip_health_check: true
+    inference_parameters:
+      temperature: 0.8
+      top_p: 1.0
+      max_tokens: 1200
+
+columns:
+  - name: traveler_segment
+    type: category
+    values:
+      - frequent_flyer
+      - business_traveler
+      - family_with_children
+      - first_time_international
+      - elite_loyalty_member
+      - leisure_couple
+
+  - name: inquiry_type
+    type: category
+    values:
+      - rebooking
+      - baggage_issue
+      - refund_request
+      - loyalty_status
+      - fare_rules
+      - flight_status
+
+  - name: channel
+    type: category
+    values: [chat, phone, app]
+
+  - name: user_query
+    type: llm_text
+    model_alias: nvidia-text
+    prompt: |
+      You are role-playing a {{ traveler_segment }} contacting Greenteme Airlines
+      via {{ channel }} about a {{ inquiry_type }}. The scenario is:
+      "{{ scenario }}"
+
+      Write the customer's first message. Keep it natural, 1-3 sentences.
+      Do not reference any real airline name, real flight number, or real
+      loyalty program.
+
+  - name: assistant_response
+    type: llm_text
+    model_alias: nvidia-text
+    prompt: |
+      You are a customer-service agent at Greenteme Airlines, a fictional airline.
+      Reply to this customer message:
+
+      "{{ user_query }}"
+
+      Provide a concise, professional, compliant response, 2-4 sentences. Stay
+      realistic and grounded in standard airline policy. Do not invent real
+      airline names, real flight numbers, real PNR codes, or real loyalty
+      program details. No markdown.
+
+output_projection:
+  type: openai_messages
+  user_field: user_query
+  assistant_field: assistant_response
+  metadata_fields: [traveler_segment, inquiry_type, channel, scenario]
diff --git a/docs/sdg/_snippets/input/greenteme_inquiry_seeds.jsonl b/docs/sdg/_snippets/input/greenteme_inquiry_seeds.jsonl
new file mode 100644
index 000000000..a20321e1e
--- /dev/null
+++ b/docs/sdg/_snippets/input/greenteme_inquiry_seeds.jsonl
@@ -0,0 +1,12 @@
+{"scenario": "Connecting flight cancelled due to weather; customer needs to arrive at destination by tomorrow morning for a wedding."}
+{"scenario": "Checked baggage missing on arrival; flight landed two hours ago and the bag did not appear at the carousel."}
+{"scenario": "Customer wants a refund on a non-refundable ticket due to a documented medical emergency."}
+{"scenario": "Customer is unsure why their loyalty status was downgraded this year and wants to understand the qualifying criteria."}
+{"scenario": "Customer wants to change a fare class on an existing booking and needs to know the fare difference and any change fees."}
+{"scenario": "Flight is showing a four-hour delay and the customer wants to know whether they will make their connection."}
+{"scenario": "Customer was double-charged for a seat upgrade and wants the duplicate charge reversed."}
+{"scenario": "Customer needs to add a service animal to an upcoming international flight and wants to know what documentation is required."}
+{"scenario": "Bag damaged in transit; customer needs to file a claim and wants the timeline and required documentation."}
+{"scenario": "Customer rebooked through self-service and is now seated apart from a travel companion; they want to be reseated together."}
+{"scenario": "Customer wants to use a travel credit from a previous cancellation but cannot find the credit number in their account."}
+{"scenario": "Customer's payment method was declined when trying to complete a booking and they want to know what to do."}
diff --git a/docs/sdg/_snippets/input/step-with-person-datetime.py b/docs/sdg/_snippets/input/step-with-person-datetime.py
new file mode 100644
index 000000000..cde4f083b
--- /dev/null
+++ b/docs/sdg/_snippets/input/step-with-person-datetime.py
@@ -0,0 +1,327 @@
+#!/usr/bin/env python3
+# /// script
+# [tool.runspec]
+# schema = "1"
+# name = "steps/sdg/data_designer"
+#
+# [tool.runspec.run]
+# launch = "python"
+#
+# [tool.runspec.config]
+# dir = "./config"
+# default = "default"
+# format = "omegaconf"
+#
+# [tool.runspec.resources]
+# nodes = 1
+# gpus_per_node = 0
+# ///
+
+# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Generate synthetic SFT or RL preference data with NeMo Data Designer.
+
+Mirrors the upstream NVIDIA-NeMo/DataDesigner Python SDK: build a
+``DataDesignerConfigBuilder`` from a declarative YAML column spec, then call
+``client.preview(builder)`` (fast iteration) or ``client.create(builder, …)``
+(full dataset).
+
+Two configs ship out of the box:
+  - ``default.yaml``  — SFT chat data (sampler ``persona`` × seed ``topic`` +
+    LLM-generated ``user_query`` / ``assistant_response``).
+  - ``rl_pref.yaml``  — DPO preference data (two LLM-generated responses + an
+    LLM judge to label chosen / rejected).
+
+Generation uses a remote inference endpoint, so this step needs no GPUs of its
+own — only network access to the configured model service. Customisation lives
+entirely in YAML.
+"""
+
+from __future__ import annotations
+
+import json
+from pathlib import Path
+from typing import Any
+
+from omegaconf import OmegaConf
+
+from nemotron.kit.train_script import (
+    apply_hydra_overrides,
+    load_omegaconf_yaml,
+    parse_config_and_overrides,
+)
+
+DEFAULT_CONFIG = Path(__file__).parent / "config" / "default.yaml"
+
+
+def build_columns(builder: Any, columns: list[dict[str, Any]], dd: Any) -> None:
+    """Translate declarative column specs into typed Data Designer column configs.
+
+    Supported ``type``s:
+      - ``category``        — pick uniformly from a fixed list of values.
+      - ``person``          — Census-grounded persona profile via the person sampler.
+      - ``datetime``        — random datetime within a start/end range.
+      - ``seed``            — surface a column from the seed dataset by name.
+      - ``llm_text``        — generate free text via an LLM.
+      - ``llm_structured``  — generate structured JSON via an LLM (provide ``output_format``).
+      - ``llm_judge``       — alias for ``llm_structured``.
+    """
+    for spec in columns:
+        kind = spec["type"]
+        name = spec["name"]
+
+        if kind == "category":
+            builder.add_column(
+                dd.SamplerColumnConfig(
+                    name=name,
+                    sampler_type=dd.SamplerType.CATEGORY,
+                    params=dd.CategorySamplerParams(values=spec["values"]),
+                )
+            )
+
+        elif kind == "person":
+            builder.add_column(
+                dd.SamplerColumnConfig(
+                    name=name,
+                    sampler_type=dd.SamplerType.PERSON,
+                    params=dd.PersonSamplerParams(
+                        locale=spec.get("locale", "en_US"),
+                        age_range=spec.get("age_range"),
+                        with_synthetic_personas=spec.get("with_synthetic_personas", True),
+                    ),
+                )
+            )
+
+        elif kind == "datetime":
+            builder.add_column(
+                dd.SamplerColumnConfig(
+                    name=name,
+                    sampler_type=dd.SamplerType.DATETIME,
+                    params=dd.DatetimeSamplerParams(
+                        start=spec["start"],
+                        end=spec["end"],
+                    ),
+                )
+            )
+
+        elif kind == "seed":
+            # The column name must match the field in the seed dataset.
+            builder.add_column(dd.SeedDatasetColumnConfig(name=name))
+
+        elif kind == "llm_text":
+            builder.add_column(
+                dd.LLMTextColumnConfig(
+                    name=name,
+                    model_alias=spec.get("model_alias", "nvidia-text"),
+                    prompt=spec["prompt"],
+                )
+            )
+
+        elif kind in ("llm_structured", "llm_judge"):
+            builder.add_column(
+                dd.LLMStructuredColumnConfig(
+                    name=name,
+                    model_alias=spec.get("model_alias", "nvidia-text"),
+                    prompt=spec["prompt"],
+                    output_format=spec["output_format"],
+                )
+            )
+
+        else:
+            raise ValueError(f"Unknown column type: {kind!r}")
+
+
+def project_records(records: list[dict[str, Any]], projection: dict[str, Any] | None) -> list[dict[str, Any]]:
+    """Project Data Designer records into training-ready JSONL schemas."""
+    if not projection:
+        return records
+
+    kind = projection.get("type")
+    if kind == "structured_messages":
+        source_field = projection.get("source_field", "conversation")
+        messages_field = projection.get("messages_field", "messages")
+        tools_field = projection.get("tools_field", "tools")
+        metadata_fields = projection.get("metadata_fields") or []
+        projected = []
+        for record in records:
+            source = record[source_field]
+            if isinstance(source, str):
+                source = json.loads(source)
+            if not isinstance(source, dict):
+                raise ValueError(f"{source_field!r} must be a mapping or JSON object string")
+
+            item = {"messages": source[messages_field]}
+            if tools_field in source:
+                item["tools"] = source[tools_field]
+            for field in metadata_fields:
+                if field in record:
+                    item[field] = record[field]
+            projected.append(item)
+        return projected
+
+    if kind == "openai_messages":
+        user_field = projection.get("user_field", "user_query")
+        assistant_field = projection.get("assistant_field", "assistant_response")
+        metadata_fields = projection.get("metadata_fields") or []
+        projected = []
+        for record in records:
+            item = {
+                "messages": [
+                    {"role": "user", "content": record[user_field]},
+                    {"role": "assistant", "content": record[assistant_field]},
+                ]
+            }
+            for field in metadata_fields:
+                if field in record:
+                    item[field] = record[field]
+            projected.append(item)
+        return projected
+
+    if kind == "dpo_preference":
+        prompt_field = projection.get("prompt_field", "prompt")
+        response_a_field = projection.get("response_a_field", "response_a")
+        response_b_field = projection.get("response_b_field", "response_b")
+        judge_field = projection.get("judge_field", "judge")
+        winner_field = projection.get("winner_field", "winner")
+        projected = []
+        for record in records:
+            judge = record.get(judge_field)
+            if isinstance(judge, str):
+                judge = json.loads(judge)
+            if not isinstance(judge, dict):
+                raise ValueError(f"{judge_field!r} must be a mapping or JSON object string")
+
+            winner = str(judge.get(winner_field, "")).upper()
+            if winner == "A":
+                chosen = record[response_a_field]
+                rejected = record[response_b_field]
+            elif winner == "B":
+                chosen = record[response_b_field]
+                rejected = record[response_a_field]
+            else:
+                raise ValueError(f"Unsupported preference winner {winner!r}; expected 'A' or 'B'")
+
+            projected.append(
+                {
+                    "prompt": record[prompt_field],
+                    "chosen": chosen,
+                    "rejected": rejected,
+                }
+            )
+        return projected
+
+    raise ValueError(f"Unknown output_projection type: {kind!r}")
+
+
+def records_from_designer_result(result: Any) -> list[dict[str, Any]]:
+    """Extract records from either preview or dataset-creation results."""
+    if hasattr(result, "load_dataset"):
+        dataset = result.load_dataset()
+    elif hasattr(result, "dataset"):
+        dataset = result.dataset
+    else:
+        raise TypeError(
+            "Data Designer result must expose either `load_dataset()` "
+            "or an in-memory `dataset` attribute"
+        )
+
+    if dataset is None:
+        raise ValueError("Data Designer returned an empty dataset result")
+
+    if isinstance(dataset, list):
+        return dataset
+
+    if hasattr(dataset, "to_pandas"):
+        dataset = dataset.to_pandas()
+
+    if hasattr(dataset, "to_dict"):
+        return dataset.to_dict(orient="records")
+
+    raise TypeError(f"Unsupported Data Designer dataset type: {type(dataset).__name__}")
+
+
+def main() -> None:
+    config_path, cli_overrides = parse_config_and_overrides(default_config=DEFAULT_CONFIG)
+    raw = apply_hydra_overrides(load_omegaconf_yaml(config_path), cli_overrides)
+    cfg = OmegaConf.to_container(raw, resolve=True)
+
+    columns = cfg.get("columns")
+    if not columns:
+        raise ValueError(f"{config_path}: config must declare a non-empty `columns:` list")
+
+    output_path = Path(cfg["output_path"])
+    output_path.parent.mkdir(parents=True, exist_ok=True)
+
+    # Deferred imports keep the module importable on dev hosts without
+    # data_designer installed.
+    import data_designer.config as dd
+    from data_designer.interface import DataDesigner
+
+    builder = dd.DataDesignerConfigBuilder()
+
+    # Models — translate the YAML `models:` list into typed ModelConfig objects.
+    # The builder ships with default model aliases; replace them when the YAML
+    # declares the same alias so our endpoint / parameters win.
+    for spec in cfg.get("models") or []:
+        alias = spec["alias"]
+        try:
+            builder.delete_model_config(alias)
+        except Exception:
+            pass  # alias not yet registered — fine, just add it.
+
+        params = spec.get("inference_parameters") or {}
+        builder.add_model_config(
+            dd.ModelConfig(
+                alias=alias,
+                model=spec["model"],
+                provider=spec.get("provider"),
+                skip_health_check=spec.get("skip_health_check", False),
+                inference_parameters=dd.ChatCompletionInferenceParams(**params),
+            )
+        )
+
+    seed = cfg.get("seed_dataset")
+    if seed:
+        strategy_name = seed.get("strategy", "shuffle").upper()
+        builder.with_seed_dataset(
+            dd.LocalFileSeedSource(path=seed["path"]),
+            sampling_strategy=dd.SamplingStrategy[strategy_name],
+        )
+
+    build_columns(builder, columns, dd)
+
+    client = DataDesigner()
+
+    if cfg.get("preview", False):
+        result = client.preview(builder, num_records=cfg["num_records"])
+        verb = "Preview"
+    else:
+        result = client.create(
+            builder,
+            num_records=cfg["num_records"],
+        )
+        verb = "Generated"
+
+    records = records_from_designer_result(result)
+    records = project_records(records, cfg.get("output_projection"))
+
+    with output_path.open("w") as f:
+        for record in records:
+            f.write(json.dumps(record) + "\n")
+    print(f"{verb} {len(records)} records → {output_path}")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/docs/sdg/getting-started.md b/docs/sdg/getting-started.md
new file mode 100644
index 000000000..cd635b4cf
--- /dev/null
+++ b/docs/sdg/getting-started.md
@@ -0,0 +1,138 @@
+<!--
+SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+SPDX-License-Identifier: Apache-2.0
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+-->
+
+(sdg-getting-started)=
+# Generate Your First Synthetic Dataset
+
+::::{grid} 2
+
+:::{grid-item-card}
+:columns: 8
+
+**What You'll Build**: A small synthetic SFT chat dataset in OpenAI format: five records grounded in the bundled `sft_topic_seeds.jsonl` seed file, generated through Data Designer against an NVIDIA-hosted LLM endpoint.
+
+^^^
+
+**In this tutorial, you will**:
+
+1. Set up prerequisites: the repository and an NVIDIA API key.
+2. Read the bundled pipeline configuration.
+3. Run a preview to verify the pipeline and model.
+4. Generate a small dataset of five records.
+5. Locate and inspect the output JSONL.
+
+{octicon}`clock;1.5em;sd-mr-1` This tutorial requires between 5 and 10 minutes to complete.
+:::
+
+:::{grid-item-card}
+:columns: 4
+
+{octicon}`flame;1.5em;sd-mr-1` **Sample Prompt**
+
+^^^
+
+Run a 2-record preview of the default SDG pipeline, then generate 5 records and show me the first output record.
+
+:::
+::::
+
+## Start Here
+
+- Run all commands from the repository root.
+- Data generation uses an NVIDIA-hosted endpoint, so the step needs no local GPUs.
+  However, you must set the `NVIDIA_API_KEY` environment variable and you must have network access.
+
+## Prerequisites
+
+- ✅ Repository cloned and `uv sync` complete. Refer to [Quick Start](../index.md) if you have not done this yet.
+- ✅ `NVIDIA_API_KEY` for the default model, `nvidia/nemotron-3-nano-30b-a3b`.
+
+## How the Default Pipeline Works
+
+The default configuration, `src/nemotron/steps/sdg/data_designer/config/default.yaml`, combines two sources of variation to generate each record.
+A seed topic, such as "safe deployment of AI assistants in enterprise support workflows" or
+"ways to monitor data drift in production machine learning systems", is drawn from `.../data/sft_topic_seeds.jsonl`.
+A persona, such as teacher or engineer, is sampled from a fixed list of category values.
+Together they anchor the user prompt: a researcher might ask a concise technical question about RAG and a student might ask about the same topic more tentatively.
+
+The pipeline generates a matching assistant response and then projects the result into OpenAI chat-format messages.
+
+The full configuration is stored at `src/nemotron/steps/sdg/data_designer/config/default.yaml`.
+
+```{literalinclude} ../../src/nemotron/steps/sdg/data_designer/config/default.yaml
+:language: yaml
+:lines: 15-
+:class: scrollable
+```
+
+## Procedure
+
+1. Set your API key:
+
+   ```console
+   $ export NVIDIA_API_KEY="<your-api-key>"
+   ```
+
+1. Run a two-record preview.
+   Preview mode runs the same pipeline against a tiny record count so you can verify the model alias, prompts, and column wiring cheaply before generating at scale.
+
+   ```console
+   $ nemotron step run sdg/data_designer -c default preview=true num_records=2
+   ```
+
+   The pipeline registers the model alias, generates two rows, and prints a summary:
+
+   ````{dropdown} Example Output
+   :icon: code-square
+
+   ```{literalinclude} _snippets/output/preview.txt
+   :language: text
+   ```
+   ````
+
+   The default output path is `./output/sdg/sft.jsonl`.
+   You can override it by setting `SDG_OUTPUT_DIR` or specifying `output_path=...` on the command line.
+
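+1. Generate five records by running `nemotron step run sdg/data_designer -c default num_records=5`, and then inspect the output.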
+   Each line is one chat record.
+   The `openai_messages` projection emits a `messages` array plus the seed `topic` and sampled `persona` as metadata for traceability.
+   The following shows one sample record from the `sft.jsonl` file.
+
+   ```{literalinclude} _snippets/output/sft_first_record.jsonl
+   :language: json
+   ```
+
+## Summary
+
+What you learned:
+
+- ✅ Ran a two-record preview to verify the pipeline and model.
+- ✅ Generated a five-record SFT chat dataset with `default.yaml`.
+- ✅ Located the OpenAI-format JSONL output.
+
+Key takeaways:
+
+- **Preview first.** `preview=true num_records=N` runs the same pipeline against a tiny record count. Use it to iterate on column specs and prompts before scaling `num_records` up.
+- **Output format matches the trainer.** The `openai_messages` projection emits records ready for `prep/sft_packing` or AutoModel SFT.
+
+## Next Steps
+
+- **Adapt the pipeline to a domain you care about**: {doc}`how-to/create-domain-dataset`.
+- **Preview, generate, and customize output**: {doc}`how-to/run`.
+- **Generate preference pairs for DPO**: {doc}`how-to/preference-data`.
+- **Dispatch to a cluster**: {doc}`how-to/dispatch-to-cluster` — env.toml profiles and container images.
+- **Look up flags and config fields**: {doc}`reference/cli-reference`, {doc}`reference/config-schema`.
diff --git a/docs/sdg/how-to/create-domain-dataset.md b/docs/sdg/how-to/create-domain-dataset.md
new file mode 100644
index 000000000..70e872c16
--- /dev/null
+++ b/docs/sdg/how-to/create-domain-dataset.md
@@ -0,0 +1,131 @@
+<!--
+SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+SPDX-License-Identifier: Apache-2.0
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+-->
+
+(sdg-create-greenteme-airlines-dataset)=
+# Create a Domain Dataset for Airlines Customer Service
+
+::::{grid} 2
+
+:::{grid-item-card}
+:columns: 8
+
+**What You'll Build**: A domain-adapted SFT chat dataset modeled on fictional airlines customer-service conversations.
+
+^^^
+
+**In this how-to guide, you will**:
+
+1. Create an airline-domain pipeline config.
+2. Create a seed file of airline inquiry scenarios.
+3. Swap the category columns for three airline-relevant dimensions.
+4. Rewrite the LLM prompts for the airline domain.
+5. Update the output projection and output path.
+6. Run a preview to verify, then generate 100 records.
+
+{octicon}`clock;1.5em;sd-mr-1` This guide requires between 20 and 30 minutes to complete.
+:::
+
+:::{grid-item-card}
+:columns: 4
+
+{octicon}`flame;1.5em;sd-mr-1` **Sample Prompt**
+
+^^^
+
+Adapt the default SDG pipeline for Greenteme Airlines customer service with three category dimensions, run a 2-record preview, then generate 100 records and show me one output record.
+
+:::
+::::
+
+## Prerequisites
+
+- ✅ Completed {doc}`../getting-started` — at least one successful preview and full run of `default.yaml` so you know the pipeline works end-to-end.
+- ✅ `NVIDIA_API_KEY` set in your environment.
+
+## How This Differs From the Default Pipeline
+
+The default pipeline mixes a single category dimension, `persona`, with seed topics.
+This example replaces `persona` with three category dimensions, `traveler_segment`, `inquiry_type`, and `channel`, on top of seed scenarios so that diversity comes from explicit, controllable values.
+
+## Procedure
+
+1. Create a `src/nemotron/steps/sdg/data_designer/config/greenteme.yaml` ([download](../_snippets/input/greenteme.yaml)) file like the following example:
+
+   ```{literalinclude} ../_snippets/input/greenteme.yaml
+   :language: yaml
+   :class: scrollable
+   ```
+
+   The key differences from the default pipeline:
+   - The variation for traveler segment, inquiry type, and channel is provided by category-type columns.
+   - The variation for the scenarios is provided by the seed JSONL file from the next step.
+   - The system-style instruction lives at the top of each prompt rather than as a separate field. The LLM text columns take a single prompt that includes the role for the LLM to assume.
+   - The `output_projection` field includes the new metadata fields.
+
+1. Create a seed file, `src/nemotron/steps/sdg/data_designer/data/greenteme_inquiry_seeds.jsonl` ([download](../_snippets/input/greenteme_inquiry_seeds.jsonl)), like the following example:
+
+   ```{literalinclude} ../_snippets/input/greenteme_inquiry_seeds.jsonl
+   :language: json
+   :class: scrollable
+   ```
+
+1. Run a preview by specifying `preview=true num_records=2` to verify the pipeline before scaling:
+
+   ```console
+   $ nemotron step run sdg/data_designer -c greenteme preview=true num_records=2
+   ```
+
+   ````{dropdown} Example Output
+   :icon: code-square
+
+   ```{literalinclude} ../_snippets/output/greenteme_preview.jsonl
+   :language: json
+   ```
+   ````
+
+1. Generate the dataset by raising `num_records` after the preview output looks correct:
+
+   ```console
+   $ nemotron step run sdg/data_designer -c greenteme num_records=100
+   ```
+
+## Going Further
+
+**Locale-aware persona profiles.** The current YAML schema supports category, seed, and LLM column types. To replace the static `traveler_segment` category with Census-grounded persona profiles, extend the step with a `person` column type backed by Data Designer's [person sampler](https://nvidia-nemo.github.io/DataDesigner/latest/concepts/person_sampling/), which can take a locale, an age range, and a synthetic-personas option.
+
+**Multi-turn conversations.** The example shows a single user and assistant exchange.
+For multi-turn dialogue, swap the two `llm_text` columns for one `llm_structured` column whose `output_format` is a Pydantic conversation schema.
+Refer to the `customer_support_tools.yaml` in the config directory for the structured-output pattern.
+
+**Dispatch to a cluster.** Generation runs locally against the NVIDIA-hosted endpoint by default. To run on Lepton or Slurm, see {doc}`dispatch-to-cluster` — env.toml profiles, container images, and the gotchas that bite first-time cluster runs.
+
+## Schema and Downstream Use
+
+The `openai_messages` projection emits records with a `messages` array plus the metadata fields you list. These flow directly into:
+
+- `prep/sft_packing` for Megatron-Bridge-style training, or
+- AutoModel SFT, which consumes the chat format directly.
+
+For a full reference of available projection shapes, see {doc}`../reference/output-projections`.
+
+## Next Steps
+
+- **Generate preference pairs for DPO**: {doc}`preference-data` — the `rl_pref.yaml` pattern.
+- **Generate tool-calling SFT data**: {doc}`tool-call-data` — multi-turn with `output_format=Conversation`.
+- **CLI flags and overrides**: {doc}`../reference/cli-reference`.
+- **Config schema**: {doc}`../reference/config-schema` — full reference for column types, samplers, and projections.
+- **Pipeline overview**: {doc}`../index`.
diff --git a/docs/sdg/how-to/dispatch-to-cluster.md b/docs/sdg/how-to/dispatch-to-cluster.md
new file mode 100644
index 000000000..b7943e9e8
--- /dev/null
+++ b/docs/sdg/how-to/dispatch-to-cluster.md
@@ -0,0 +1,162 @@
+<!--
+SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+SPDX-License-Identifier: Apache-2.0
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+-->
+
+(sdg-dispatch-to-cluster)=
+# Dispatch SDG to a Cluster
+
+This guide covers configuring an env.toml profile and running `sdg/data_designer` on Lepton or Slurm. Generation is CPU-only (no GPUs needed) and calls a remote LLM endpoint, so the step fits naturally on a CPU node with outbound network access.
+
+## env.toml Profile Shape
+
+Add a profile to `env.toml` (repository root). The example below targets a Lepton CPU node:
+
+```toml
+[lepton-sdg]
+executor = "lepton"
+container_image = "nvcr.io/nvidia/nemo:25.11.nemotron_3_nano"
+nemo_run_dir = "/mnt/shared/nemo-run"
+nodes = 1
+gpus_per_node = 0
+resource_shape = "cpu.large"
+node_group = "your-node-group"
+shared_memory_size = 1024
+can_be_preempted = true
+queue_priority = "mid-4000"
+startup_commands = [
+    "python -m pip install --quiet --break-system-packages 'data-designer>=0.5.6'"
+]
+mounts = [
+    { path = "/your-nfs-source", mount_path = "/mnt/shared", from = "node-nfs:your-nfs-id" }
+]
+
+[lepton-sdg.env_vars]
+NVIDIA_API_KEY = "${oc.env:NVIDIA_API_KEY}"
+```
+
+## Run
+
+```console
+$ nemotron step run sdg/data_designer -c default --batch lepton-sdg num_records=1000
+```
+
+Use `--run` instead of `--batch` to stream logs interactively.
+
+## Known Gotchas
+
+These are the failure modes that commonly affect first-time cluster SDG runs.
+
+### `data-designer` is not pre-installed in the container
+
+The NeMo container image does not include `data-designer`. Install it at startup via `startup_commands`:
+
+```toml
+startup_commands = [
+    "python -m pip install --quiet --break-system-packages 'data-designer>=0.5.6'"
+]
+```
+
+Do not omit `--break-system-packages` — without it pip refuses to install into the system Python on recent NeMo images.
+
+### Default `shared_memory_size` crashes a CPU node
+
+The runspec default for `shared_memory_size` is 65536 MB (64 GB), which exceeds the RAM of most CPU node types and causes the job to be rejected or OOM-killed immediately. Set it to a small value; this step makes no use of shared memory:
+
+```toml
+shared_memory_size = 1024
+```
+
+### `nemo_run_dir` must be on shared storage
+
+`nemo-run` uses a busybox data-mover sidecar to stage the launch script into `nemo_run_dir`. If this path is not visible to both the data-mover and the main container — specifically if it is local to one node — the main container never finds the script and the job fails with `No such file or directory`.
+
+Set `nemo_run_dir` to a path on the shared NFS mount and include the mount in your profile:
+
+```toml
+nemo_run_dir = "/mnt/shared/nemo-run"
+mounts = [
+    { path = "/your-nfs-source", mount_path = "/mnt/shared", from = "node-nfs:your-nfs-id" }
+]
+```
+
+:::{note}
+In the `mounts` table, `path` is the NFS **source** path on the NFS server — not the in-container destination. `mount_path` is the in-container path.
+:::
+
+### `NVIDIA_API_KEY` is not forwarded automatically
+
+Unlike `HF_TOKEN` and `WANDB_API_KEY`, `NVIDIA_API_KEY` is not automatically forwarded to the container. Declare it explicitly in the `env_vars` section:
+
+```toml
+[lepton-sdg.env_vars]
+NVIDIA_API_KEY = "${oc.env:NVIDIA_API_KEY}"
+```
+
+Set it in your local shell before submitting the job:
+
+```console
+$ export NVIDIA_API_KEY="your-api-key"
+$ nemotron step run sdg/data_designer -c default --batch lepton-sdg num_records=1000
+```
+
+### Container image: always look up, never guess
+
+Do not invent image tags. `nemo:latest` does not exist on `nvcr.io`. Check `src/nemotron/steps/sdg/data_designer/step.py` header comments or `src/nemotron/steps/env/env_toml/config/lepton.yaml` for known-good image references before setting `container_image`.
+
+### Preemption and queue-priority fields were not wired (now fixed)
+
+`can_be_preempted`, `can_preempt`, and `queue_priority` are now forwarded from env.toml to `LeptonExecutor`. If you are on an older version of the repo where these were silently ignored, upgrade before expecting preemption scheduling to take effect.
+
+## Slurm Profile
+
+For Slurm, replace the Lepton-specific fields with Slurm equivalents. The `startup_commands` and `env_vars` gotchas apply equally:
+
+```toml
+[slurm-sdg]
+executor = "slurm"
+container_image = "nvcr.io/nvidia/nemo:25.11.nemotron_3_nano"
+nemo_run_dir = "/lustre/team/nemo-run"
+nodes = 1
+gpus_per_node = 0
+run_partition = "cpu"
+batch_partition = "cpu"
+startup_commands = [
+    "python -m pip install --quiet --break-system-packages 'data-designer>=0.5.6'"
+]
+
+[slurm-sdg.env_vars]
+NVIDIA_API_KEY = "${oc.env:NVIDIA_API_KEY}"
+```
+
+:::{tip}
+On clusters where the default partition requires GPUs (for example, NVIDIA's `dlw` cluster), set `run_partition` and `batch_partition` to a CPU-capable partition. `gpus_per_node = 0` alone is not sufficient — the partition itself must accept zero-GPU jobs.
+:::
+
+## Verify Before Scaling
+
+Run a preview via the cluster profile before a large batch:
+
+```console
+$ nemotron step run sdg/data_designer -c default --run lepton-sdg preview=true num_records=2
+```
+
+Confirm the job reaches `Running`, the model alias check succeeds, and two records are returned before submitting the full job.
+
+## Next Steps
+
+- **env.toml reference**: `docs/nemo_runspec/nemo-run.md` — full profile field reference.
+- **CLI flags**: {doc}`../reference/cli-reference`.
+- **Troubleshooting**: {doc}`../reference/troubleshooting` — full failure-mode reference.
diff --git a/docs/sdg/how-to/index.md b/docs/sdg/how-to/index.md
new file mode 100644
index 000000000..1b4f23333
--- /dev/null
+++ b/docs/sdg/how-to/index.md
@@ -0,0 +1,77 @@
+<!--
+SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+SPDX-License-Identifier: Apache-2.0
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+-->
+
+(sdg-how-to-index)=
+# Synthetic Data How-To Guides
+
+Task-focused guides for common SDG workflows. For pipeline overview and when to use it, refer to {doc}`../index`. For your first run, start with {doc}`../getting-started`.
+
+::::{grid} 1 2 2 2
+:gutter: 1 1 1 2
+
+:::{grid-item-card} {octicon}`play;1.5em;sd-mr-1` Run the Pipeline
+:link: run
+:link-type: doc
+Preview, generate, and customize output path and projection.
++++
+{bdg-success}`10 min` {bdg-secondary}`intermediate`
+:::
+
+:::{grid-item-card} {octicon}`briefcase;1.5em;sd-mr-1` Create a Domain Dataset
+:link: create-domain-dataset
+:link-type: doc
+Adapt the pipeline to a custom domain with a seed file and multiple category dimensions.
++++
+{bdg-success}`20 min` {bdg-secondary}`intermediate`
+:::
+
+:::{grid-item-card} {octicon}`tools;1.5em;sd-mr-1` Generate Tool-Call Data
+:link: tool-call-data
+:link-type: doc
+Generate multi-turn conversations with OpenAI-style tool calls for tool-use SFT.
++++
+{bdg-success}`15 min` {bdg-secondary}`intermediate`
+:::
+
+:::{grid-item-card} {octicon}`git-compare;1.5em;sd-mr-1` Generate Preference Data
+:link: preference-data
+:link-type: doc
+Generate DPO preference pairs (prompt / chosen / rejected) from `rl_pref.yaml`.
++++
+{bdg-success}`15 min` {bdg-secondary}`intermediate`
+:::
+
+:::{grid-item-card} {octicon}`server;1.5em;sd-mr-1` Dispatch to a Cluster
+:link: dispatch-to-cluster
+:link-type: doc
+Configure an env.toml profile and run SDG on Lepton or Slurm.
++++
+{bdg-success}`30 min` {bdg-secondary}`intermediate`
+:::
+
+::::
+
+```{toctree}
+:hidden:
+:maxdepth: 1
+
+Create a Domain Dataset <create-domain-dataset>
+Create Tool-Calling Dataset <tool-call-data>
+preference-data
+dispatch-to-cluster
+run
+```
diff --git a/docs/sdg/how-to/preference-data.md b/docs/sdg/how-to/preference-data.md
new file mode 100644
index 000000000..f78238369
--- /dev/null
+++ b/docs/sdg/how-to/preference-data.md
@@ -0,0 +1,93 @@
+<!--
+SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+SPDX-License-Identifier: Apache-2.0
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+-->
+
+(sdg-preference-data)=
+# Generate Preference Data for DPO
+
+This example shows how to use the `rl_pref.yaml` configuration file.
+The example generates _prompt_, _chosen_, and _rejected_ triples for direct preference optimization (DPO) training.
+Output flows directly into `prep/rl_prep` and then `rl/nemo_rl/dpo`.
+
+## How It Works
+
+The `rl_pref.yaml` file registers two model aliases at different temperatures:
+a high-temperature creative model and a low-temperature precise model.
+The goal is to produce two responses per prompt that are distinct:
+
+```{literalinclude} ../../../src/nemotron/steps/sdg/data_designer/config/rl_pref.yaml
+:language: yaml
+:lines: 15-
+:class: scrollable
+```
+
+For each seed prompt the pipeline:
+
+1. Generates `response_a` (high temperature) and `response_b` (low temperature) independently.
+2. Makes a third LLM call (`judge` column, `llm_judge` type) to compare them and return `{"winner": "A"}` or `{"winner": "B"}`.
+3. Applies the `dpo_preference` projection, which maps winner → chosen / rejected and writes `{"prompt": "...", "chosen": "...", "rejected": "..."}`.
+
+## Prerequisites
+
+- `NVIDIA_API_KEY` set in your environment.
+- A seed file with one `prompt` field per line. The bundled `rl_pref_prompt_seeds.jsonl` contains general reasoning prompts. Replace it with domain-specific prompts for targeted preference data.
+
+## Procedure
+
+1. Preview two records to verify the judge returns valid `winner` values:
+
+   ```console
+   $ nemotron step run sdg/data_designer -c rl_pref preview=true num_records=2
+   ```
+
+2. Generate the dataset. The default is 2000 records; the following command overrides it with `num_records=500`:
+
+   ```console
+   $ nemotron step run sdg/data_designer -c rl_pref num_records=500
+   ```
+
+   Output is written to `./output/sdg/rl_pref.jsonl`.
+
+   Inspect the output. Each line is a preference triple:
+
+   ```json
+   {"prompt": "Explain why retrieval-augmented generation can reduce hallucinations.", "chosen": "RAG grounds the model in retrieved documents, so claims are tied to specific passages rather than purely to weights.", "rejected": "RAG is better because it uses more data and is generally smarter than standard models."}
+   ```
+
+## Adapt the Seed File
+
+Swap `seed_dataset.path` to point at your own prompt seed file. Each line must be valid JSON with a `prompt` field:
+
+```json
+{"prompt": "Describe the tradeoffs between batch and streaming inference for real-time applications."}
+```
+
+Keep seed prompts representative of the target capability and diverse across difficulty levels.
+The judge performs better when the two responses have a clear quality difference — consider widening the temperature gap between the two model aliases if the judge returns many ties or unexpected results.
+
+## Downstream Pipeline
+
+```text
+rl_pref.jsonl  →  prep/rl_prep  →  rl/nemo_rl/dpo
+```
+
+`prep/rl_prep` tokenizes and packs preference pairs. `rl/nemo_rl/dpo` consumes the packed dataset. Verify the `prompt`, `chosen`, and `rejected` fields are present in every record before handing off.
+
+## Next Steps
+
+- **Output projection reference**: {doc}`../reference/output-projections` — `dpo_preference` schema.
+- **Config schema**: {doc}`../reference/config-schema` — `llm_judge` column type and `dpo_preference` projection fields.
+- **Dispatch to a cluster**: {doc}`dispatch-to-cluster`.
diff --git a/docs/sdg/how-to/run.md b/docs/sdg/how-to/run.md
new file mode 100644
index 000000000..91ec2e0e3
--- /dev/null
+++ b/docs/sdg/how-to/run.md
@@ -0,0 +1,61 @@
+<!--
+SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+SPDX-License-Identifier: Apache-2.0
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+-->
+
+(sdg-run)=
+# Tips for the Data Generation Pipeline
+
+## Preview Before Generating
+
+Always preview before running a full generation job. Preview mode calls the same pipeline but returns a small number of records without writing the final JSONL:
+
+```console
+$ nemotron step run sdg/data_designer -c default preview=true num_records=2
+```
+
+Use preview to verify:
+
+- Column references in prompts (`{{ column_name }}`) resolve to the expected values.
+- Seed fields, such as `{{ scenario }}` and `{{ prompt }}`, are populated from the seed file.
+- The model returns text that matches the prompt's intent.
+- The `output_projection` produces the schema downstream steps expect.
+
+## Specify a Configuration File
+
+The repository includes the following sample config files in the `src/nemotron/steps/sdg/data_designer/config` directory:
+
+| Config | Output | Use for |
+|---|---|---|
+| `default.yaml` | SFT chat (`openai_messages`) | General chat SFT |
+| `customer_support_tools.yaml` | Tool-call SFT (`structured_messages`) | Tool-use SFT |
+| `rl_pref.yaml` | Preference pairs (`dpo_preference`) | DPO / RLHF |
+| `tiny.yaml` | SFT chat, 10 records, short tokens | Fast iteration |
+
+Specify the file in the `-c` argument:
+
+```console
+$ nemotron step run sdg/data_designer -c customer_support_tools preview=true num_records=2
+```
+
+## Run Attached on a Cluster Profile
+
+To dispatch to a Lepton or Slurm profile configured in `env.toml`, use `--run` (attached, streams logs) or `--batch` (detached):
+
+```console
+$ nemotron step run sdg/data_designer -c default --run my-lepton-profile num_records=1000
+```
+
+For cluster setup, see {doc}`dispatch-to-cluster`.
diff --git a/docs/sdg/how-to/tool-call-data.md b/docs/sdg/how-to/tool-call-data.md
new file mode 100644
index 000000000..812a822f7
--- /dev/null
+++ b/docs/sdg/how-to/tool-call-data.md
@@ -0,0 +1,120 @@
+<!--
+SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+SPDX-License-Identifier: Apache-2.0
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+-->
+
+(sdg-tool-call-data)=
+# Generate Tool-Calling Data for SFT
+
+Use this guide when you need multi-turn chat JSONL where the assistant issues OpenAI-style `tool_calls` and a `tool` role returns structured results, suitable for supervised fine-tuning (SFT) with a `tools` definition array.
+
+You will use the sample config `customer_support_tools.yaml`, which produces ecommerce-style support threads. Each output row includes a `messages` array (with tool turns) and a `tools` array, ready for packing and training.
+
+## Outcomes
+
+- Understand how one `llm_structured` column can emit a full multi-turn trace in a single model call.
+- Preview, generate, and validate records before training.
+- Know how to retarget seeds, prompts, and schema for your own domain.
+
+## How It Works
+
+Compared with single-turn configs such as `default.yaml`, this setup drives the whole conversation from one `llm_structured` column. That column’s `output_format` is a JSON schema that fixes roles, tool-call shape, and approximate turn count so the model cannot drift into invalid shapes.
+
+```{literalinclude} ../../../src/nemotron/steps/sdg/data_designer/config/customer_support_tools.yaml
+:language: yaml
+:lines: 15-
+:class: scrollable
+```
+
+Each seed row supplies five anchor fields the prompt interpolates: `customer_name`, `issue`, `order_id`, `product`, and `policy_hint`. Two extra category columns (`urgency`, `channel`) add variety without multiplying seed rows for every combination.
+
+## Prerequisites
+
+- Nemotron CLI available and working; if this is your first SDG run, complete {doc}`../getting-started`.
+- `NVIDIA_API_KEY` set in the environment.
+- The bundled seed file `data/customer_support_tool_seeds.jsonl` (shipped with the step). Add rows, or point the config at your own JSONL.
+
+## Procedure
+
+1. Preview two records to confirm that the structured output matches the schema:
+
+   ```console
+   $ nemotron step run sdg/data_designer -c customer_support_tools preview=true num_records=2
+   ```
+
+   In the preview, confirm:
+
+   - Exactly one assistant message with `tool_calls`.
+   - Exactly one `tool` message whose `tool_call_id` matches the call.
+   - `function.arguments` is a JSON string, not a nested object.
+   - The assistant’s closing turn references the tool result (not a generic reply).
+   - No markdown in message `content` if your trainer expects plain text.
+
+2. Generate the dataset:
+
+   ```console
+   $ nemotron step run sdg/data_designer -c customer_support_tools num_records=200
+   ```
+
+   Output path: `./output/sdg/customer_support_tool_sft.jsonl`.
+   Spot-check a few lines. Each record exposes top-level `messages` and `tools` plus metadata, like the following example:
+
+   ```text
+   {
+     "messages": [
+       {"role": "system", "content": "You are a helpful ecommerce support agent..."},
+       {"role": "user", "content": "Hi, I haven't received my headphones yet..."},
+       {"role": "assistant", "content": "I'd be happy to help. Could you share your order number?"},
+       {"role": "user", "content": "It's ORD-10492."},
+       {"role": "assistant", "content": "", "tool_calls": [{"id": "call_001", "type": "function", "function": {"name": "lookup_order", "arguments": "{\"order_id\":\"ORD-10492\"}"}}]},
+       {"role": "tool", "tool_call_id": "call_001", "name": "lookup_order", "content": "{\"status\":\"delayed\",\"eta\":\"tomorrow\"}"},
+       {"role": "assistant", "content": "Your order is delayed and should arrive tomorrow. Per our policy, I can arrange an expedited replacement if you prefer."}
+     ],
+     "tools": [{"type": "function", "function": {"name": "lookup_order", "description": "...", "parameters": {...}}}],
+     "customer_name": "Priya", "issue": "late delivery", "urgency": "frustrated", "channel": "web_chat"
+   }
+   ```
+
+## Adapt to Your Domain
+
+1. Replace or extend the seed file so rows cover your entities. You may rename the five anchor fields as long as the prompt and YAML refer to the same names.
+2. Update `seed_dataset.fields` in the YAML to match those names.
+3. Rewrite the `prompt` for your scenario and tool surface.
+4. Adjust `output_format` if the message layout changes (for example, multiple tool calls per conversation).
+
+Keep `output_projection` as `structured_messages` so the step extracts `messages` and `tools` from the structured column and merges category metadata onto each record.
+
+## Validation Checklist
+
+Before training, sample at least 50 records and verify:
+
+- [ ] Every `tool_calls` block has a matching `tool` message with the same `tool_call_id`.
+- [ ] `function.arguments` values are JSON strings, not nested objects.
+- [ ] The assistant’s final reply uses the tool result (not a canned answer that ignores it).
+- [ ] No unexpected markdown in `content` if the trainer assumes plain text.
+- [ ] `tools` is present and non-empty on every record.
+
+## Downstream Use
+
+```text
+customer_support_tool_sft.jsonl  →  prep/sft_packing  →  SFT training
+```
+
+The `structured_messages` projection writes `messages` and `tools` at the top level, matching formats common to AutoModel-style SFT and Megatron-Bridge-style workflows. Run `prep/sft_packing` in dry-run mode before a large training job to confirm the packer accepts your file.
+
+## Next Steps
+
+- Output projection reference: {doc}`../reference/output-projections` to learn the `structured_messages` schema.
+- Config schema: {doc}`../reference/config-schema` for information about the `llm_structured` column type and `output_format`.
diff --git a/docs/sdg/index.md b/docs/sdg/index.md
new file mode 100644
index 000000000..29f036f04
--- /dev/null
+++ b/docs/sdg/index.md
@@ -0,0 +1,137 @@
+<!--
+SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+SPDX-License-Identifier: Apache-2.0
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+-->
+
+(sdg-index)=
+# About Synthetic Data Generation
+
+Generate synthetic training data with [NeMo Data Designer](https://nvidia-nemo.github.io/DataDesigner/) using a declarative YAML pipeline. Seed a generation run with your domain-specific topics, scenarios, or personas; define the column structure and prompts in YAML; and produce training-ready JSONL without writing Python.
+
+Three output shapes ship out of the box: SFT chat data, tool-calling SFT data, and DPO preference pairs.
+
+:::{tip}
+New to SDG? Start with the {doc}`getting-started` tutorial to run the bundled pipeline and produce your first dataset in 5–10 minutes.
+:::
+
+## When to Use
+
+Use SDG when you need training data that does not already exist in sufficient quantity or quality for your target domain or task.
+
+- **SFT chat data** — Generate user/assistant conversation pairs grounded in domain-specific topics, scenarios, or personas. Use `default.yaml` as a starting point and adapt it to your domain.
+- **Tool-calling SFT data** — Generate multi-turn conversations that include assistant tool calls and tool responses in OpenAI format. Use `customer_support_tools.yaml` as a starting point.
+- **DPO preference data** — Generate prompt / chosen / rejected triples for preference learning. Use `rl_pref.yaml`.
+- **Custom domains** — Swap the seed file, category columns, and prompts to target any domain. The pipeline is fully declarative; customization does not require editing Python.
+- **Cluster-scale generation** — Dispatch generation to Lepton or Slurm via env.toml profiles when local throughput is insufficient.
+
+## Pipeline at a Glance
+
+```mermaid
+%%{init: {'theme': 'base', 'themeVariables': { 'primaryBorderColor': '#333333', 'lineColor': '#333333', 'primaryTextColor': '#333333', 'clusterBkg': '#ffffff', 'clusterBorder': '#333333'}}}%%
+flowchart TB
+    seed["Seed file (optional)"] --> dsg
+    cat["Category samplers"] --> dsg
+    per["Person sampler (optional)"] --> dsg
+    dsg["Data Designer column graph<br/>Jinja2 prompts · LLM calls"]
+    dsg --> proj["output_projection"]
+    proj --> om["openai_messages"]
+    proj --> dpo["dpo_preference"]
+    proj --> sm["structured_messages"]
+    om --> jsonl["JSONL"]
+    dpo --> jsonl
+    sm --> jsonl
+    jsonl --> train["prep/sft_packing or AutoModel SFT"]
+```
+
+Each run is reproducible: the seed file, column specs, model alias, inference parameters, and projection rules are all version-controlled in a single YAML file.
+
+## Documentation
+
+::::{grid} 1 2 2 2
+:gutter: 1 1 1 2
+
+:::{grid-item-card} {octicon}`rocket;1.5em;sd-mr-1` Getting Started
+:link: getting-started
+:link-type: doc
+Run the bundled pipeline end-to-end: preview two records, generate five, inspect the output JSONL.
++++
+{bdg-success}`5–10 min` {bdg-secondary}`tutorial`
+:::
+
+:::{grid-item-card} {octicon}`checklist;1.5em;sd-mr-1` How-To Guides
+:link: how-to/index
+:link-type: doc
+Task-focused guides: adapt the pipeline to a domain, generate preference pairs, dispatch to a cluster.
++++
+{bdg-success}`5 guides` {bdg-secondary}`task-focused`
+:::
+
+:::{grid-item-card} {octicon}`list-unordered;1.5em;sd-mr-1` Reference
+:link: reference/index
+:link-type: doc
+YAML config schema, CLI flags, output projection shapes, and troubleshooting.
++++
+{bdg-success}`4 references` {bdg-secondary}`lookup`
+:::
+
+::::
+
+## All Documentation
+
+````{tab-set}
+
+```{tab-item} Getting Started
+
+| Guide | What You'll Do | Time |
+|---|---|---|
+| {doc}`getting-started` | Preview and generate your first synthetic SFT dataset | 5–10 min |
+
+```
+
+```{tab-item} How-To Guides
+
+| Guide | What You'll Do |
+|---|---|
+| {doc}`how-to/run` | Preview, generate, and customize output path and projection |
+| {doc}`how-to/create-domain-dataset` | Adapt the pipeline to a custom domain with a seed file and multiple category dimensions |
+| {doc}`how-to/tool-call-data` | Generate multi-turn tool-calling SFT data |
+| {doc}`how-to/preference-data` | Generate DPO preference pairs from `rl_pref.yaml` |
+| {doc}`how-to/dispatch-to-cluster` | Dispatch generation to Lepton or Slurm via env.toml |
+
+```
+
+```{tab-item} Reference
+
+| Reference | What You'll Find |
+|---|---|
+| {doc}`reference/config-schema` | Full YAML column types, sampler parameters, and projection fields |
+| {doc}`reference/cli-reference` | `nemotron step run sdg/data_designer` flags and hydra overrides |
+| {doc}`reference/output-projections` | The three projection shapes with annotated JSONL examples |
+| {doc}`reference/troubleshooting` | Dispatch failures, image pull errors, API key issues, schema drift |
+
+```
+
+````
+
+## Before You Start
+
+- The `NVIDIA_API_KEY` environment variable is required for the default model, `nvidia/nemotron-3-nano-30b-a3b`, hosted on `integrate.api.nvidia.com`.
+
+## Limitations and Considerations
+
+- **Cost**: Generation calls a hosted LLM endpoint; each record incurs API cost.
+- **Quality**: After generating records, review them before training.
+- **Scale**: API rate limits apply. For large generation runs, dispatch to a cluster and consider batching across multiple nodes.
+- **Reproducibility**: Seed files, column specs, model aliases, and inference parameters should all be version-controlled together. Changing any one of them changes the output distribution.
diff --git a/docs/sdg/reference/cli-reference.md b/docs/sdg/reference/cli-reference.md
new file mode 100644
index 000000000..af466f608
--- /dev/null
+++ b/docs/sdg/reference/cli-reference.md
@@ -0,0 +1,111 @@
+<!--
+SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+SPDX-License-Identifier: Apache-2.0
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+-->
+
+(sdg-cli-reference)=
+# CLI Reference
+
+Command-line reference for `nemotron step run sdg/data_designer`. For pipeline overview, see {doc}`../index`.
+
+## Syntax
+
+```console
+$ nemotron step run sdg/data_designer \
+    [-c CONFIG] \
+    [--run PROFILE | --batch PROFILE] \
+    [--dry-run] \
+    [KEY=VALUE ...]
+```
+
+## Flags
+
+```{option} -c, --config CONFIG
+
+Config name (resolved from the step's `config/` directory) or an absolute/relative path to a YAML file.
+
+Bundled names: `default`, `customer_support_tools`, `rl_pref`, `tiny`.
+
+**Default**: `default`
+```
+
+```{option} -r, --run PROFILE
+
+Run attached using the env.toml profile named `PROFILE`. Job output streams to the terminal. Use for short interactive runs.
+```
+
+```{option} -b, --batch PROFILE
+
+Run detached using the env.toml profile named `PROFILE`. Job is submitted and the command returns immediately. Use for long cluster jobs.
+```
+
+```{option} -d, --dry-run
+
+Compile the config and print the resolved job spec without executing. Useful for verifying hydra overrides before submission.
+```
+
+## Hydra Overrides
+
+Any `KEY=VALUE` argument after the flags is passed as a Hydra dotlist override and merged into the resolved config. Overrides take precedence over YAML values.
+
+| Override | Example | Effect |
+|---|---|---|
+| `num_records=N` | `num_records=50` | Generate N records |
+| `preview=true` | `preview=true` | Run in preview mode |
+| `output_path=PATH` | `output_path=/data/out.jsonl` | Write output to PATH |
+| `seed_dataset.path=PATH` | `seed_dataset.path=/data/seeds.jsonl` | Override seed file |
+| `models.0.inference_parameters.temperature=T` | `models.0.inference_parameters.temperature=0.5` | Override first model's temperature |
+
+The dotlist path follows the YAML structure. Nested keys use `.` as the separator; list items use `.N` (zero-indexed).
+
+## Examples
+
+Preview the default config with two records:
+
+```console
+$ nemotron step run sdg/data_designer -c default preview=true num_records=2
+```
+
+Generate 100 SFT records with a custom output path:
+
+```console
+$ nemotron step run sdg/data_designer -c default \
+    num_records=100 \
+    output_path=/data/my-project/sft.jsonl
+```
+
+Dry-run a cluster submission to check the resolved config:
+
+```console
+$ nemotron step run sdg/data_designer -c default --run my-profile --dry-run
+```
+
+Run attached on a Lepton profile with 500 records:
+
+```console
+$ nemotron step run sdg/data_designer -c default --run lepton-sdg num_records=500
+```
+
+Use a config at an arbitrary path:
+
+```console
+$ nemotron step run sdg/data_designer -c /path/to/my-config.yaml preview=true num_records=2
+```
+
+## Related
+
+- {doc}`../how-to/run` — Preview, generate, and customize output.
+- {doc}`../how-to/dispatch-to-cluster` — env.toml profile setup.
+- {doc}`config-schema` — YAML config field reference.
diff --git a/docs/sdg/reference/config-schema.md b/docs/sdg/reference/config-schema.md
new file mode 100644
index 000000000..ff23a2568
--- /dev/null
+++ b/docs/sdg/reference/config-schema.md
@@ -0,0 +1,213 @@
+<!--
+SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+SPDX-License-Identifier: Apache-2.0
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+-->
+
+(sdg-config-schema)=
+# Config Schema
+
+This page provides the reference information for the YAML config file consumed by `sdg/data_designer`.
+
+## Simple Fields
+
+| Field | Type | Required | Description |
+|---|---|---|---|
+| `output_dir` | string | no | Base output directory. Supports OmegaConf env-var interpolation. Default resolves `$SDG_OUTPUT_DIR`, then `$NEMO_RUN_DIR/sdg`, then `./output/sdg`. |
+| `output_path` | string | yes | Full path for the output JSONL file. Typically `${output_dir}/my-dataset.jsonl`. |
+| `num_records` | int | yes | Number of records to generate (`client.create`) or preview (`client.preview`). |
+| `preview` | bool | no | When `true`, calls `client.preview()` instead of `client.create()`. Default: `false`. Prefer setting this as a CLI override (`preview=true`) rather than in the YAML. |
+
+## seed_dataset
+
+Optional top-level field.
+When present, Data Designer samples one row per generated record from the seed file and makes the fields available to column prompts by using Jinja2.
+
+| Field | Type | Required | Description |
+|---|---|---|---|
+| `path` | string | yes | Path to a JSONL file. Each line is a JSON object. |
+| `strategy` | string | no | `shuffle` (default) or `ordered`. |
+| `fields` | list[string] | yes | Column names to expose. Must match keys in the seed JSONL objects. These become available as `{{ field_name }}` in prompts without being declared in `columns`. |
+
+## models
+
+A required top-level field.
+The field specifies a list of model configurations.
+Each entry defines one alias that column specs reference by name.
+
+| Field | Type | Required | Description |
+|---|---|---|---|
+| `alias` | string | yes | Short name referenced by `model_alias` in column specs. |
+| `model` | string | yes | Model identifier, such as `nvidia/nemotron-3-nano-30b-a3b` or `openai/gpt-oss-20b`. |
+| `provider` | string | no | Provider name, such as `nvidia` or `anthropic`. |
+| `skip_health_check` | bool | no | Skip the startup probe against the model provider. Useful for local or offline endpoints. Default: `false`. |
+| `inference_parameters.temperature` | float | no | Sampling temperature. |
+| `inference_parameters.top_p` | float | no | Top-p nucleus sampling. |
+| `inference_parameters.max_tokens` | int | no | Maximum output tokens per call. |
+
+## columns
+
+A required top-level field.
+This field is an ordered list of column specs.
+Each column has a `name`, a `type`, and type-specific fields.
+Columns can reference earlier columns and seed fields in prompts by using Jinja2 syntax like `{{ column_name }}`.
+
+### Categorical Columns
+
+Samples uniformly from a fixed list of string or numeric values, as in the following example.
+
+```yaml
+- name: persona
+  type: category
+  values: [teacher, engineer, student, researcher]
+```
+
+| Field | Required | Description |
+|---|---|---|
+| `name` | yes | Column name. |
+| `values` | yes | List of values to sample from. |
+
+### Seed Columns
+
+Provides a named field from the seed dataset as a column.
+Use this column type when a seed field needs to appear in `metadata_fields` or must be referenced in a way that requires it to be an explicit column.
+
+```yaml
+- name: topic
+  type: seed
+```
+
+| Field | Required | Description |
+|---|---|---|
+| `name` | yes | Must match a field name in `seed_dataset.fields`. |
+
+Seed fields declared in `seed_dataset.fields` are available directly in prompts without this column type.
+Use `seed` only when you need the field as a named column in the output schema.
+
+### LLM Text Columns
+
+Generates free-form text using an LLM call.
+These columns can reference earlier columns and seed fields in `prompt` by using Jinja2 syntax.
+
+```yaml
+- name: user_query
+  type: llm_text
+  model_alias: nvidia-text
+  prompt: |
+    Write a message from a {{ persona }} asking about: {{ topic }}.
+```
+
+| Field | Required | Description |
+|---|---|---|
+| `name` | yes | Column name. |
+| `model_alias` | no | Alias from `models`. Default: `nvidia-text`. |
+| `prompt` | yes | Jinja2 template. Reference any earlier column or seed field with `{{ name }}`. |
+
+### LLM Structured Columns
+
+This column type generates structured JSON by making an LLM call.
+The column definition instructs the model to return JSON matching `output_format`.
+Use this column type for multi-turn conversations, preference judges, and any output that must conform to a schema.
+
+```yaml
+- name: conversation
+  type: llm_structured
+  model_alias: nvidia-text
+  prompt: |
+    Generate a support conversation for customer {{ customer_name }}...
+  output_format:
+    type: object
+    properties:
+      messages:
+        type: array
+        ...
+    required: [messages]
+```
+
+| Field | Required | Description |
+|---|---|---|
+| `name` | yes | Column name. |
+| `model_alias` | no | Alias from `models`. Default: `nvidia-text`. |
+| `prompt` | yes | Jinja2 template. |
+| `output_format` | yes | JSON Schema dict describing the expected output structure. |
+
+### LLM Judge Columns
+
+This type is an alias for `llm_structured`.
+This type is typically used for columns that compare or evaluate other columns.
+
+```yaml
+- name: judge
+  type: llm_judge
+  model_alias: nvidia-text
+  prompt: |
+    Compare response A and B for: {{ prompt }}
+    A: {{ response_a }}
+    B: {{ response_b }}
+  output_format:
+    type: object
+    properties:
+      winner:
+        type: string
+        enum: [A, B]
+    required: [winner]
+```
+
+## output_projection
+
+This top-level field maps raw Data Designer records into the schema expected by downstream steps.
+Refer to {doc}`output-projections` for full field tables and annotated JSONL examples for each type.
+
+| `type` | Use for | Downstream |
+|---|---|---|
+| `openai_messages` | Single-turn SFT chat | `prep/sft_packing`, AutoModel SFT |
+| `dpo_preference` | Preference pairs | `prep/rl_prep`, `rl/nemo_rl/dpo` |
+| `structured_messages` | Multi-turn with tool calls | `prep/sft_packing`, AutoModel SFT |
+
+## Extending the Schema: `person` and `datetime` Samplers
+
+The current `step.py` supports the column types above. To use Data Designer's locale-aware person sampler or datetime sampler, `step.py`'s `build_columns()` function must be extended with `person` and `datetime` branches. A reference implementation showing both additions is in:
+
+```{literalinclude} ../_snippets/input/step-with-person-datetime.py
+:language: python
+:start-at: "        elif kind == \"person\":"
+:end-before: "        elif kind == \"seed\":"
+```
+
+Once merged, configs can declare:
+
+```yaml
+- name: traveler
+  type: person
+  locale: en_US
+  age_range: [22, 75]
+  with_synthetic_personas: true
+
+- name: booking_date
+  type: datetime
+  start: "2024-01-01"
+  end: "2025-12-31"
+```
+
+Download personas for the locale before running:
+
+```console
+$ data-designer download personas --locale en_US
+```
+
+## Related Information
+
+- {doc}`output-projections` — projection field reference and JSONL examples.
+- {doc}`cli-reference` — flags and hydra override syntax.
+- {doc}`../how-to/run` — preview and generate workflow.
diff --git a/docs/sdg/reference/index.md b/docs/sdg/reference/index.md
new file mode 100644
index 000000000..570c00156
--- /dev/null
+++ b/docs/sdg/reference/index.md
@@ -0,0 +1,68 @@
+<!--
+SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+SPDX-License-Identifier: Apache-2.0
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+-->
+
+(sdg-reference-index)=
+# SDG Reference
+
+Complete specifications for the SDG pipeline. For pipeline overview and when to use it, refer to {doc}`../index`.
+
+::::{grid} 1 2 2 2
+:gutter: 1 1 1 2
+
+:::{grid-item-card} {octicon}`file-code;1.5em;sd-mr-1` Config Schema
+:link: config-schema
+:link-type: doc
+All YAML fields: top-level settings, seed dataset, model aliases, column types, and output projections.
++++
+{bdg-secondary}`lookup`
+:::
+
+:::{grid-item-card} {octicon}`terminal;1.5em;sd-mr-1` CLI Reference
+:link: cli-reference
+:link-type: doc
+`nemotron step run sdg/data_designer` flags and hydra override syntax.
++++
+{bdg-secondary}`lookup`
+:::
+
+:::{grid-item-card} {octicon}`arrow-switch;1.5em;sd-mr-1` Output Projections
+:link: output-projections
+:link-type: doc
+The three projection shapes with annotated JSONL examples.
++++
+{bdg-secondary}`lookup`
+:::
+
+:::{grid-item-card} {octicon}`alert;1.5em;sd-mr-1` Troubleshooting
+:link: troubleshooting
+:link-type: doc
+Failure modes for local runs and cluster dispatch.
++++
+{bdg-secondary}`lookup`
+:::
+
+::::
+
+```{toctree}
+:hidden:
+:maxdepth: 1
+
+config-schema
+cli-reference
+output-projections
+troubleshooting
+```
diff --git a/docs/sdg/reference/output-projections.md b/docs/sdg/reference/output-projections.md
new file mode 100644
index 000000000..8a4ec6a0a
--- /dev/null
+++ b/docs/sdg/reference/output-projections.md
@@ -0,0 +1,148 @@
+<!--
+SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+SPDX-License-Identifier: Apache-2.0
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+-->
+
+(sdg-output-projections)=
+# Output Projections
+
+The `output_projection` block in a config maps raw Data Designer records into the schema expected by downstream training steps. Each projection type extracts specific columns and writes one JSON object per line.
+
+## OpenAI Messages
+
+Produces single-turn OpenAI chat-format records. Use for SFT chat data that feeds `prep/sft_packing` or AutoModel SFT.
+
+**YAML**:
+
+```yaml
+output_projection:
+  type: openai_messages
+  user_field: user_query        # column containing the user turn
+  assistant_field: assistant_response  # column containing the assistant turn
+  metadata_fields: [persona, topic]    # additional columns to include at top level
+```
+
+**Output** (one JSON object per line):
+
+```json
+{
+  "messages": [
+    {"role": "user", "content": "How do I calibrate the sensor threshold?"},
+    {"role": "assistant", "content": "Set the threshold in the device settings under Calibration → Sensor Range. A value of 0.85 works well for most environments."}
+  ],
+  "persona": "engineer",
+  "topic": "industrial sensor calibration"
+}
+```
+
+Fields:
+
+| Field | Required | Description |
+|---|---|---|
+| `type` | yes | `"openai_messages"` |
+| `user_field` | yes | Column name for the user message content |
+| `assistant_field` | yes | Column name for the assistant message content |
+| `metadata_fields` | no | List of additional column names to include at the top level |
+
+## DPO Preference
+
+Produces preference triples for DPO training. Use with `rl_pref.yaml` and the `llm_judge` column pattern. Output feeds `prep/rl_prep`.
+
+**YAML**:
+
+```yaml
+output_projection:
+  type: dpo_preference
+  prompt_field: prompt          # column containing the input prompt
+  response_a_field: response_a  # column containing the first candidate response
+  response_b_field: response_b  # column containing the second candidate response
+  judge_field: judge            # column containing the judge's structured output
+  winner_field: winner          # key inside the judge output that holds "A" or "B"
+```
+
+**Output** (one JSON object per line):
+
+```json
+{
+  "prompt": "Explain why retrieval-augmented generation can reduce hallucinations in enterprise assistants.",
+  "chosen": "RAG grounds the model in retrieved passages, so factual claims are tied to source documents rather than purely to learned weights.",
+  "rejected": "RAG is better because it uses the internet and knows more things than a regular model."
+}
+```
+
+Fields:
+
+| Field | Required | Description |
+|---|---|---|
+| `type` | yes | `"dpo_preference"` |
+| `prompt_field` | yes | Column name for the input prompt |
+| `response_a_field` | yes | Column name for candidate A |
+| `response_b_field` | yes | Column name for candidate B |
+| `judge_field` | yes | Column name for the judge's structured output |
+| `winner_field` | yes | Key within the judge output JSON that holds `"A"` or `"B"` |
+
+The projection raises `ValueError` if the value stored under the `winner_field` key is not `"A"` or `"B"`. The `llm_judge` column must be configured to return exactly this structure.
+
+## Structured Messages
+
+Produces multi-turn records with `messages` and an optional `tools` array. Use for tool-calling SFT data generated by an `llm_structured` column. Output feeds `prep/sft_packing` or AutoModel SFT.
+
+**YAML**:
+
+```yaml
+output_projection:
+  type: structured_messages
+  source_field: conversation    # column containing the structured JSON object
+  messages_field: messages      # key inside the structured object for the messages array
+  tools_field: tools            # key inside the structured object for the tools array
+  metadata_fields: [customer_name, issue, urgency, channel]
+```
+
+**Output** (one JSON object per line):
+
+```json
+{
+  "messages": [
+    {"role": "system", "content": "You are a helpful ecommerce support agent."},
+    {"role": "user", "content": "I haven't received my order yet."},
+    {"role": "assistant", "content": "", "tool_calls": [{"id": "call_001", "type": "function", "function": {"name": "lookup_order", "arguments": "{\"order_id\":\"ORD-10492\"}"}}]},
+    {"role": "tool", "tool_call_id": "call_001", "name": "lookup_order", "content": "{\"status\":\"delayed\",\"eta\":\"tomorrow\"}"},
+    {"role": "assistant", "content": "Your order is delayed and will arrive tomorrow. I can arrange an expedited replacement if needed."}
+  ],
+  "tools": [{"type": "function", "function": {"name": "lookup_order", "description": "Look up order status by ID.", "parameters": {"type": "object", "properties": {"order_id": {"type": "string"}}, "required": ["order_id"]}}}],
+  "customer_name": "Priya",
+  "issue": "late delivery",
+  "urgency": "frustrated",
+  "channel": "web_chat"
+}
+```
+
+Fields:
+
+| Field | Required | Description |
+|---|---|---|
+| `type` | yes | `"structured_messages"` |
+| `source_field` | yes | Column containing the structured JSON conversation object |
+| `messages_field` | no | Key in `source_field` for the messages array. Default: `"messages"` |
+| `tools_field` | no | Key in `source_field` for the tools array. Omitted from output if not present in the record |
+| `metadata_fields` | no | List of additional column names to include at the top level |
+
+The `source_field` column value may be a JSON string or a dict; both are handled.
+
+## Related Information
+
+- {doc}`config-schema` — Full YAML config field reference.
+- {doc}`../how-to/tool-call-data` — Using `structured_messages` with `customer_support_tools.yaml`.
+- {doc}`../how-to/preference-data` — Using `dpo_preference` with `rl_pref.yaml`.
diff --git a/docs/sdg/reference/troubleshooting.md b/docs/sdg/reference/troubleshooting.md
new file mode 100644
index 000000000..9cb4ad69a
--- /dev/null
+++ b/docs/sdg/reference/troubleshooting.md
@@ -0,0 +1,112 @@
+<!--
+SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+SPDX-License-Identifier: Apache-2.0
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+-->
+
+(sdg-troubleshooting)=
+# Troubleshooting
+
+Failure modes for local runs and cluster dispatch. For cluster-specific setup, see {doc}`../how-to/dispatch-to-cluster`.
+
+## Local Run Failures
+
+::::{dropdown} `Unknown column type: 'person'` or similar ValueError
+
+**Cause**: The YAML declares a column `type` that `step.py`'s `build_columns()` does not recognize. Currently supported types: `category`, `seed`, `llm_text`, `llm_structured`, `llm_judge`.
+
+**Solution**: Check the spelling. For `person` and `datetime` sampler support, `step.py` must be extended — see the extension reference in {doc}`config-schema`.
+::::
+
+::::{dropdown} `config must declare a non-empty columns: list`
+
+**Cause**: The YAML has an empty or missing `columns:` block.
+
+**Solution**: Add at least one column spec. A minimal config must include at least one `llm_text` or `llm_structured` column that produces output content.
+::::
+
+::::{dropdown} `Jinja2` template references an undefined variable
+
+**Cause**: A prompt uses `{{ column_name }}`, but `column_name` is neither a seed field in `seed_dataset.fields` nor a column declared earlier in the `columns` list.
+
+**Solution**: Add the column or seed field, or fix the typo. Run `preview=true num_records=2` to catch this cheaply before a full generation job.
+::::
+
+::::{dropdown} Model health check fails at startup
+
+**Cause**: Data Designer probes the model endpoint at startup. If the model is not available from the configured provider, or if `NVIDIA_API_KEY` is not set, the probe fails and the step exits before generating any records.
+
+**Solution**:
+- Confirm `export NVIDIA_API_KEY="..."` is set.
+- Add `skip_health_check: true` to the model spec to bypass the probe (useful for local or vLLM endpoints that aren't in the provider catalog).
+::::
+
+::::{dropdown} Output JSONL is empty or has fewer records than `num_records`
+
+**Cause**: Data Designer skips or drops records where the structured output doesn't validate against `output_format`, or where the LLM returns a refusal.
+
+**Solution**:
+- Run `preview=true` and inspect a sample for refusals or schema mismatches.
+- Simplify the `output_format` if the model consistently fails to match a complex schema.
+- Raise `max_tokens` if responses are being cut off mid-JSON.
+::::
+
+## Cluster Dispatch Failures
+
+::::{dropdown} Job exits immediately with `No such file or directory` (launch script)
+
+**Cause**: `nemo_run_dir` is not on shared storage. The data-mover sidecar writes the launch script to `nemo_run_dir`, but the main container cannot see it if the path is local to a different node or not mounted.
+
+**Solution**: Set `nemo_run_dir` to a path on the shared NFS mount and add the corresponding `mounts` entry to the env.toml profile. See {doc}`../how-to/dispatch-to-cluster`.
+::::
+
+::::{dropdown} `data-designer` import error inside the container
+
+**Cause**: The NeMo container image does not pre-install `data-designer`.
+
+**Solution**: Add to `startup_commands`:
+
+```toml
+startup_commands = [
+    "python -m pip install --quiet --break-system-packages 'data-designer>=0.5.6'"
+]
+```
+::::
+
+::::{dropdown} Job rejected or OOM-killed immediately on a CPU node
+
+**Cause**: The default `shared_memory_size` (65536 MB) exceeds the available RAM on the CPU node type.
+
+**Solution**: Set `shared_memory_size = 1024` in the env.toml profile. The SDG step makes no use of shared memory.
+::::
+
+::::{dropdown} `NVIDIA_API_KEY` not available inside the container
+
+**Cause**: `NVIDIA_API_KEY` is not automatically forwarded to the job environment the way `HF_TOKEN` and `WANDB_API_KEY` are.
+
+**Solution**: Declare it explicitly in the env.toml profile:
+
+```toml
+[lepton-sdg.env_vars]
+NVIDIA_API_KEY = "${oc.env:NVIDIA_API_KEY}"
+```
+
+And set it in your shell before submitting: `export NVIDIA_API_KEY="..."`.
+::::
+
+## Related
+
+- {doc}`../how-to/dispatch-to-cluster` — Full cluster setup walkthrough.
+- {doc}`cli-reference` — Flags and hydra overrides.
+- {doc}`config-schema` — YAML field reference.

From 3c84fa1c03b220f847c9412ea2eff76de3b2355c Mon Sep 17 00:00:00 2001
From: Mike McKiernan <mmckiernan@nvidia.com>
Date: Wed, 13 May 2026 10:44:27 -0400
Subject: [PATCH 2/2] docs: SDG tips for using agents

Signed-off-by: Mike McKiernan <mmckiernan@nvidia.com>
---
 docs/index.md            |  1 +
 docs/sdg/how-to/index.md |  5 ++-
 docs/sdg/index.md        | 11 ++++-
 docs/sdg/using-skills.md | 90 ++++++++++++++++++++++++++++++++++++++++
 4 files changed, 105 insertions(+), 2 deletions(-)
 create mode 100644 docs/sdg/using-skills.md

diff --git a/docs/index.md b/docs/index.md
index 8ba98b838..9e221df2b 100644
--- a/docs/index.md
+++ b/docs/index.md
@@ -177,6 +177,7 @@ nemotron/artifacts.md
 
 About <sdg/index>
 Getting Started <sdg/getting-started>
+Tips for Using Agents <sdg/using-skills>
 Tasks <sdg/how-to/index>
 Reference <sdg/reference/index>
 ```
diff --git a/docs/sdg/how-to/index.md b/docs/sdg/how-to/index.md
index 1b4f23333..9b459de6f 100644
--- a/docs/sdg/how-to/index.md
+++ b/docs/sdg/how-to/index.md
@@ -18,7 +18,10 @@ limitations under the License.
 (sdg-how-to-index)=
 # Synthetic Data How-To Guides
 
-Task-focused guides for common SDG workflows. For pipeline overview and when to use it, refer to {doc}`../index`. For your first run, start with {doc}`../getting-started`.
+This section provides task-focused guides for common SDG workflows.
+For your first run, start with {doc}`../getting-started`.
+
+If you are new to model training or want a calmer on-ramp before tasks, read {doc}`../using-skills` for how to run a productive session with a coding agent.
 
 ::::{grid} 1 2 2 2
 :gutter: 1 1 1 2
diff --git a/docs/sdg/index.md b/docs/sdg/index.md
index 29f036f04..b148c9694 100644
--- a/docs/sdg/index.md
+++ b/docs/sdg/index.md
@@ -23,7 +23,7 @@ Generate synthetic training data with [NeMo Data Designer](https://nvidia-nemo.g
 Three output shapes ship out of the box: SFT chat data, tool-calling SFT data, and DPO preference pairs.
 
 :::{tip}
-New to SDG? Start with the {doc}`getting-started` tutorial to run the bundled pipeline and produce your first dataset in 5–10 minutes.
+New to SDG or new to model training? Read {doc}`using-skills` for a short guide to productive agent sessions, then start the {doc}`getting-started` tutorial to run the bundled pipeline and produce your first dataset in 5 to 10 minutes.
 :::
 
 ## When to Use
@@ -70,6 +70,14 @@ Run the bundled pipeline end-to-end: preview two records, generate five, inspect
 {bdg-success}`5–10 min` {bdg-secondary}`tutorial`
 :::
 
+:::{grid-item-card} {octicon}`heart;1.5em;sd-mr-1` Use the SDG Skill With Confidence
+:link: using-skills
+:link-type: doc
+Prepare for a focused chat with a coding agent: opening brief, seed ideas, and how `SKILL.md` supports the session without memorization.
++++
+{bdg-success}`10 min read` {bdg-secondary}`newcomer`
+:::
+
 :::{grid-item-card} {octicon}`checklist;1.5em;sd-mr-1` How-To Guides
 :link: how-to/index
 :link-type: doc
@@ -97,6 +105,7 @@ YAML config schema, CLI flags, output projection shapes, and troubleshooting.
 | Guide | What You'll Do | Time |
 |---|---|---|
 | {doc}`getting-started` | Preview and generate your first synthetic SFT dataset | 5–10 min |
+| {doc}`using-skills` | Run a productive agent session: brief, seeds, plain terms, and light use of `SKILL.md` | 10 min read |
 
 ```
 
diff --git a/docs/sdg/using-skills.md b/docs/sdg/using-skills.md
new file mode 100644
index 000000000..7c7171d1b
--- /dev/null
+++ b/docs/sdg/using-skills.md
@@ -0,0 +1,90 @@
+<!--
+SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+SPDX-License-Identifier: Apache-2.0
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+-->
+
+(sdg-using-skills)=
+# Use the SDG Skill With Confidence
+
+This page is for newcomers to model training and to *synthetic data generation (SDG)*.
+The main goal is to help you run a productive, efficient session with a coding agent: less back-and-forth, fewer clarifying questions, and clearer handoffs between what you decide and what the agent edits in the repository.
+
+This page aligns with the `nemotron step run sdg/data_designer` command.
+Use an agent to translate your intent into the right YAML, seed files, and `nemotron` commands.
+
+## Keeping an Agent Session Productive
+
+Provide a short brief you write yourself, not something the agent drafts for you:
+
+```{div} sd-font-italic sd-font-weight-lighter
+- "We need data for a model that can answer short questions about our company’s travel and expense policy."
+- "I need multi-turn conversations for a retail support bot that can call tools such as order lookup and return eligibility. The tone must be friendly and concise."
+```
+
+Ask the agent to start from shipped configs and the {doc}`getting-started` flow unless there is a strong reason to invent a new layout.
+
+If you want a reusable shape, you can copy the following block into the chat and fill in the bracketed lines.
+
+```text
+Context: [product or domain in one sentence]
+Goal for this session: [one outcome, for example ten seed ideas or a preview command that works]
+“Good” means: [two bullets]
+Hard limits: [language, tone, privacy, or “do not touch cluster dispatch yet”]
+Please: [one request]. Use Nemotron SDG defaults from the repo unless something blocks that.
+```
+
+## What Success Looks Like on Day One
+
+A reasonable first success is a small preview run that prints plausible rows, plus a short list of seed ideas you believe are on-brand for your domain.
+If you have that, you are already operating SDG: iterate small, then scale record counts.
+
+The hands-on path is {doc}`getting-started`.
+When you are ready to attach your own domain, follow {doc}`how-to/create-domain-dataset`.
+
+## Where Domain-Specific Ideas Come From
+
+Seed data can be short anchors that tell the generator which slice of the world each row should reflect.
+A newcomer can build a first seed list the same way a product owner scopes a feature.
+
+Runbooks, internal FAQs, and training decks can inspire situations when policy allows.
+If you cannot paste source text, a neutral rewrite still carries domain truth, for example “partial refund after a split shipment” instead of a ticket ID.
+
+Standards, regulator explainers, textbooks, and course outlines supply topics and jargon.
+Your operator value is the twist that matches your product, not the generic paragraph anyone could find online.
+
+## Ask the Agent to Propose, Then You Curate
+
+Paste a product brief or policy summary and then ask for candidate seed lines.
+The opening brief from Keeping an Agent Session Productive keeps this step short: one propose-and-curate round per session is usually enough before you run a preview again.
+
+## Staying Grounded on Policy and Quality
+
+Check licensing and confidentiality before you drop internal documents into an agent or into a seed file.
+Keep evaluation benchmarks separate from training seeds so synthetic items do not leak into the set you use to claim quality.
+Skim `src/nemotron/steps/sdg/SKILL.md` for the short list of pattern links on blending, versioning, and benchmarks when you move past experiments.
+
+## How SKILL.md Fits Your Session
+
+`src/nemotron/steps/sdg/SKILL.md` is written for assistants that route work into the right shipped YAML profile and guardrails.
+You do not need to memorize it.
+
+Skim the decision table once so you know which bundled config matches which need, then let the agent open that file when you change output format or scale record counts.
+You can also say in the chat, “follow `src/nemotron/steps/sdg/SKILL.md` for SDG,” so guardrails land in the thread without a long lecture.
+
+## Next Steps
+
+- Run the tutorial: {doc}`getting-started`.
+- Adapt seeds and YAML to your domain: {doc}`how-to/create-domain-dataset`.
+- Look up flags and fields when the agent names them: {doc}`reference/cli-reference` and {doc}`reference/config-schema`.