From e595e40dc5e97b632d40ac7e7984aed84079ab86 Mon Sep 17 00:00:00 2001 From: Mike McKiernan Date: Wed, 6 May 2026 15:22:57 -0400 Subject: [PATCH 1/2] docs: SDG and domain data set example Signed-off-by: Mike McKiernan --- docs/_static/customize.css | 2 + docs/index.md | 10 + docs/sdg/_snippets/input/greenteme.yaml | 75 ++++ .../input/greenteme_inquiry_seeds.jsonl | 12 + .../input/step-with-person-datetime.py | 327 ++++++++++++++++++ docs/sdg/getting-started.md | 138 ++++++++ docs/sdg/how-to/create-domain-dataset.md | 131 +++++++ docs/sdg/how-to/dispatch-to-cluster.md | 162 +++++++++ docs/sdg/how-to/index.md | 77 +++++ docs/sdg/how-to/preference-data.md | 93 +++++ docs/sdg/how-to/run.md | 61 ++++ docs/sdg/how-to/tool-call-data.md | 120 +++++++ docs/sdg/index.md | 137 ++++++++ docs/sdg/reference/cli-reference.md | 111 ++++++ docs/sdg/reference/config-schema.md | 213 ++++++++++++ docs/sdg/reference/index.md | 68 ++++ docs/sdg/reference/output-projections.md | 148 ++++++++ docs/sdg/reference/troubleshooting.md | 112 ++++++ 18 files changed, 1997 insertions(+) create mode 100644 docs/sdg/_snippets/input/greenteme.yaml create mode 100644 docs/sdg/_snippets/input/greenteme_inquiry_seeds.jsonl create mode 100644 docs/sdg/_snippets/input/step-with-person-datetime.py create mode 100644 docs/sdg/getting-started.md create mode 100644 docs/sdg/how-to/create-domain-dataset.md create mode 100644 docs/sdg/how-to/dispatch-to-cluster.md create mode 100644 docs/sdg/how-to/index.md create mode 100644 docs/sdg/how-to/preference-data.md create mode 100644 docs/sdg/how-to/run.md create mode 100644 docs/sdg/how-to/tool-call-data.md create mode 100644 docs/sdg/index.md create mode 100644 docs/sdg/reference/cli-reference.md create mode 100644 docs/sdg/reference/config-schema.md create mode 100644 docs/sdg/reference/index.md create mode 100644 docs/sdg/reference/output-projections.md create mode 100644 docs/sdg/reference/troubleshooting.md diff --git a/docs/_static/customize.css 
b/docs/_static/customize.css index 82510d30f..0359a65db 100644 --- a/docs/_static/customize.css +++ b/docs/_static/customize.css @@ -1,2 +1,4 @@ .admonition.pattern-metadata { border-left-color: #f0ad4e; } .admonition.paper-reference { border-left-color: #5bc0de; } + +.scrollable pre { max-height: 400px; overflow-y: auto; } diff --git a/docs/index.md b/docs/index.md index 336f9e9a1..8ba98b838 100644 --- a/docs/index.md +++ b/docs/index.md @@ -171,6 +171,16 @@ nemotron/embed/README.md nemotron/artifacts.md ``` +```{toctree} +:caption: Synthetic Data Generation +:hidden: + +About +Getting Started +Tasks +Reference +``` + ```{toctree} :caption: Customization :hidden: diff --git a/docs/sdg/_snippets/input/greenteme.yaml b/docs/sdg/_snippets/input/greenteme.yaml new file mode 100644 index 000000000..d4f88c796 --- /dev/null +++ b/docs/sdg/_snippets/input/greenteme.yaml @@ -0,0 +1,75 @@ +output_dir: ${oc.env:SDG_OUTPUT_DIR,${oc.env:NEMO_RUN_DIR,${oc.env:PWD}/output}/sdg} +output_path: ${output_dir}/greenteme_sft.jsonl +num_records: 100 + +seed_dataset: + path: ${oc.env:PWD}/src/nemotron/steps/sdg/data_designer/data/greenteme_inquiry_seeds.jsonl + strategy: shuffle + fields: [scenario] + +models: + - alias: nvidia-text + model: nvidia/nemotron-3-nano-30b-a3b + provider: nvidia + skip_health_check: true + inference_parameters: + temperature: 0.8 + top_p: 1.0 + max_tokens: 1200 + +columns: + - name: traveler_segment + type: category + values: + - frequent_flyer + - business_traveler + - family_with_children + - first_time_international + - elite_loyalty_member + - leisure_couple + + - name: inquiry_type + type: category + values: + - rebooking + - baggage_issue + - refund_request + - loyalty_status + - fare_rules + - flight_status + + - name: channel + type: category + values: [chat, phone, app] + + - name: user_query + type: llm_text + model_alias: nvidia-text + prompt: | + You are role-playing a {{ traveler_segment }} contacting Greenteme Airlines + via {{ channel }} about 
a {{ inquiry_type }}. The scenario is: + "{{ scenario }}" + + Write the customer's first message. Keep it natural, 1-3 sentences. + Do not reference any real airline name, real flight number, or real + loyalty program. + + - name: assistant_response + type: llm_text + model_alias: nvidia-text + prompt: | + You are a customer-service agent at Greenteme Airlines, a fictional airline. + Reply to this customer message: + + "{{ user_query }}" + + Provide a concise, professional, compliant response, 2-4 sentences. Stay + realistic and grounded in standard airline policy. Do not invent real + airline names, real flight numbers, real PNR codes, or real loyalty + program details. No markdown. + +output_projection: + type: openai_messages + user_field: user_query + assistant_field: assistant_response + metadata_fields: [traveler_segment, inquiry_type, channel, scenario] diff --git a/docs/sdg/_snippets/input/greenteme_inquiry_seeds.jsonl b/docs/sdg/_snippets/input/greenteme_inquiry_seeds.jsonl new file mode 100644 index 000000000..a20321e1e --- /dev/null +++ b/docs/sdg/_snippets/input/greenteme_inquiry_seeds.jsonl @@ -0,0 +1,12 @@ +{"scenario": "Connecting flight cancelled due to weather; customer needs to arrive at destination by tomorrow morning for a wedding."} +{"scenario": "Checked baggage missing on arrival; flight landed two hours ago and the bag did not appear at the carousel."} +{"scenario": "Customer wants a refund on a non-refundable ticket due to a documented medical emergency."} +{"scenario": "Customer is unsure why their loyalty status was downgraded this year and wants to understand the qualifying criteria."} +{"scenario": "Customer wants to change a fare class on an existing booking and needs to know the fare difference and any change fees."} +{"scenario": "Flight is showing a four-hour delay and the customer wants to know whether they will make their connection."} +{"scenario": "Customer was double-charged for a seat upgrade and wants the duplicate charge 
reversed."} +{"scenario": "Customer needs to add a service animal to an upcoming international flight and wants to know what documentation is required."} +{"scenario": "Bag damaged in transit; customer needs to file a claim and wants the timeline and required documentation."} +{"scenario": "Customer rebooked through self-service and is now seated apart from a travel companion; they want to be reseated together."} +{"scenario": "Customer wants to use a travel credit from a previous cancellation but cannot find the credit number in their account."} +{"scenario": "Customer's payment method was declined when trying to complete a booking and they want to know what to do."} diff --git a/docs/sdg/_snippets/input/step-with-person-datetime.py b/docs/sdg/_snippets/input/step-with-person-datetime.py new file mode 100644 index 000000000..cde4f083b --- /dev/null +++ b/docs/sdg/_snippets/input/step-with-person-datetime.py @@ -0,0 +1,327 @@ +#!/usr/bin/env python3 +# /// script +# [tool.runspec] +# schema = "1" +# name = "steps/sdg/data_designer" +# +# [tool.runspec.run] +# launch = "python" +# +# [tool.runspec.config] +# dir = "./config" +# default = "default" +# format = "omegaconf" +# +# [tool.runspec.resources] +# nodes = 1 +# gpus_per_node = 0 +# /// + +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Generate synthetic SFT or RL preference data with NeMo Data Designer. 
+ +Mirrors the upstream NVIDIA-NeMo/DataDesigner Python SDK: build a +``DataDesignerConfigBuilder`` from a declarative YAML column spec, then call +``client.preview(builder)`` (fast iteration) or ``client.create(builder, …)`` +(full dataset). + +Two configs ship out of the box: + - ``default.yaml`` — SFT chat data (sampler ``persona`` × seed ``topic`` + + LLM-generated ``user_query`` / ``assistant_response``). + - ``rl_pref.yaml`` — DPO preference data (two LLM-generated responses + an + LLM judge to label chosen / rejected). + +Generation uses a remote inference endpoint, so this step needs no GPUs of its +own — only network access to the configured model service. Customisation lives +entirely in YAML. +""" + +from __future__ import annotations + +import json +from pathlib import Path +from typing import Any + +from omegaconf import OmegaConf + +from nemotron.kit.train_script import ( + apply_hydra_overrides, + load_omegaconf_yaml, + parse_config_and_overrides, +) + +DEFAULT_CONFIG = Path(__file__).parent / "config" / "default.yaml" + + +def build_columns(builder: Any, columns: list[dict[str, Any]], dd: Any) -> None: + """Translate declarative column specs into typed Data Designer column configs. + + Supported ``type``s: + - ``category`` — pick uniformly from a fixed list of values. + - ``person`` — Census-grounded persona profile via the person sampler. + - ``datetime`` — random datetime within a start/end range. + - ``seed`` — surface a column from the seed dataset by name. + - ``llm_text`` — generate free text via an LLM. + - ``llm_structured`` — generate structured JSON via an LLM (provide ``output_format``). + - ``llm_judge`` — alias for ``llm_structured``. 
+ """ + for spec in columns: + kind = spec["type"] + name = spec["name"] + + if kind == "category": + builder.add_column( + dd.SamplerColumnConfig( + name=name, + sampler_type=dd.SamplerType.CATEGORY, + params=dd.CategorySamplerParams(values=spec["values"]), + ) + ) + + elif kind == "person": + builder.add_column( + dd.SamplerColumnConfig( + name=name, + sampler_type=dd.SamplerType.PERSON, + params=dd.PersonSamplerParams( + locale=spec.get("locale", "en_US"), + age_range=spec.get("age_range"), + with_synthetic_personas=spec.get("with_synthetic_personas", True), + ), + ) + ) + + elif kind == "datetime": + builder.add_column( + dd.SamplerColumnConfig( + name=name, + sampler_type=dd.SamplerType.DATETIME, + params=dd.DatetimeSamplerParams( + start=spec["start"], + end=spec["end"], + ), + ) + ) + + elif kind == "seed": + # The column name must match the field in the seed dataset. + builder.add_column(dd.SeedDatasetColumnConfig(name=name)) + + elif kind == "llm_text": + builder.add_column( + dd.LLMTextColumnConfig( + name=name, + model_alias=spec.get("model_alias", "nvidia-text"), + prompt=spec["prompt"], + ) + ) + + elif kind in ("llm_structured", "llm_judge"): + builder.add_column( + dd.LLMStructuredColumnConfig( + name=name, + model_alias=spec.get("model_alias", "nvidia-text"), + prompt=spec["prompt"], + output_format=spec["output_format"], + ) + ) + + else: + raise ValueError(f"Unknown column type: {kind!r}") + + +def project_records(records: list[dict[str, Any]], projection: dict[str, Any] | None) -> list[dict[str, Any]]: + """Project Data Designer records into training-ready JSONL schemas.""" + if not projection: + return records + + kind = projection.get("type") + if kind == "structured_messages": + source_field = projection.get("source_field", "conversation") + messages_field = projection.get("messages_field", "messages") + tools_field = projection.get("tools_field", "tools") + metadata_fields = projection.get("metadata_fields") or [] + projected = [] + for 
record in records: + source = record[source_field] + if isinstance(source, str): + source = json.loads(source) + if not isinstance(source, dict): + raise ValueError(f"{source_field!r} must be a mapping or JSON object string") + + item = {"messages": source[messages_field]} + if tools_field in source: + item["tools"] = source[tools_field] + for field in metadata_fields: + if field in record: + item[field] = record[field] + projected.append(item) + return projected + + if kind == "openai_messages": + user_field = projection.get("user_field", "user_query") + assistant_field = projection.get("assistant_field", "assistant_response") + metadata_fields = projection.get("metadata_fields") or [] + projected = [] + for record in records: + item = { + "messages": [ + {"role": "user", "content": record[user_field]}, + {"role": "assistant", "content": record[assistant_field]}, + ] + } + for field in metadata_fields: + if field in record: + item[field] = record[field] + projected.append(item) + return projected + + if kind == "dpo_preference": + prompt_field = projection.get("prompt_field", "prompt") + response_a_field = projection.get("response_a_field", "response_a") + response_b_field = projection.get("response_b_field", "response_b") + judge_field = projection.get("judge_field", "judge") + winner_field = projection.get("winner_field", "winner") + projected = [] + for record in records: + judge = record.get(judge_field) + if isinstance(judge, str): + judge = json.loads(judge) + if not isinstance(judge, dict): + raise ValueError(f"{judge_field!r} must be a mapping or JSON object string") + + winner = str(judge.get(winner_field, "")).upper() + if winner == "A": + chosen = record[response_a_field] + rejected = record[response_b_field] + elif winner == "B": + chosen = record[response_b_field] + rejected = record[response_a_field] + else: + raise ValueError(f"Unsupported preference winner {winner!r}; expected 'A' or 'B'") + + projected.append( + { + "prompt": record[prompt_field], 
+ "chosen": chosen, + "rejected": rejected, + } + ) + return projected + + raise ValueError(f"Unknown output_projection type: {kind!r}") + + +def records_from_designer_result(result: Any) -> list[dict[str, Any]]: + """Extract records from either preview or dataset-creation results.""" + if hasattr(result, "load_dataset"): + dataset = result.load_dataset() + elif hasattr(result, "dataset"): + dataset = result.dataset + else: + raise TypeError( + "Data Designer result must expose either `load_dataset()` " + "or an in-memory `dataset` attribute" + ) + + if dataset is None: + raise ValueError("Data Designer returned an empty dataset result") + + if isinstance(dataset, list): + return dataset + + if hasattr(dataset, "to_pandas"): + dataset = dataset.to_pandas() + + if hasattr(dataset, "to_dict"): + return dataset.to_dict(orient="records") + + raise TypeError(f"Unsupported Data Designer dataset type: {type(dataset).__name__}") + + +def main() -> None: + config_path, cli_overrides = parse_config_and_overrides(default_config=DEFAULT_CONFIG) + raw = apply_hydra_overrides(load_omegaconf_yaml(config_path), cli_overrides) + cfg = OmegaConf.to_container(raw, resolve=True) + + columns = cfg.get("columns") + if not columns: + raise ValueError(f"{config_path}: config must declare a non-empty `columns:` list") + + output_path = Path(cfg["output_path"]) + output_path.parent.mkdir(parents=True, exist_ok=True) + + # Deferred imports keep the module importable on dev hosts without + # data_designer installed. + import data_designer.config as dd + from data_designer.interface import DataDesigner + + builder = dd.DataDesignerConfigBuilder() + + # Models — translate the YAML `models:` list into typed ModelConfig objects. + # The builder ships with default model aliases; replace them when the YAML + # declares the same alias so our endpoint / parameters win. 
+ for spec in cfg.get("models") or []: + alias = spec["alias"] + try: + builder.delete_model_config(alias) + except Exception: + pass # alias not yet registered — fine, just add it. + + params = spec.get("inference_parameters") or {} + builder.add_model_config( + dd.ModelConfig( + alias=alias, + model=spec["model"], + provider=spec.get("provider"), + skip_health_check=spec.get("skip_health_check", False), + inference_parameters=dd.ChatCompletionInferenceParams(**params), + ) + ) + + seed = cfg.get("seed_dataset") + if seed: + strategy_name = seed.get("strategy", "shuffle").upper() + builder.with_seed_dataset( + dd.LocalFileSeedSource(path=seed["path"]), + sampling_strategy=dd.SamplingStrategy[strategy_name], + ) + + build_columns(builder, columns, dd) + + client = DataDesigner() + + if cfg.get("preview", False): + result = client.preview(builder, num_records=cfg["num_records"]) + verb = "Preview" + else: + result = client.create( + builder, + num_records=cfg["num_records"], + ) + verb = "Generated" + + records = records_from_designer_result(result) + records = project_records(records, cfg.get("output_projection")) + + with output_path.open("w") as f: + for record in records: + f.write(json.dumps(record) + "\n") + print(f"{verb} {len(records)} records → {output_path}") + + +if __name__ == "__main__": + main() diff --git a/docs/sdg/getting-started.md b/docs/sdg/getting-started.md new file mode 100644 index 000000000..cd635b4cf --- /dev/null +++ b/docs/sdg/getting-started.md @@ -0,0 +1,138 @@ + + +(sdg-getting-started)= +# Generate Your First Synthetic Dataset + +::::{grid} 2 + +:::{grid-item-card} +:columns: 8 + +**What You'll Build**: A small synthetic SFT chat dataset in OpenAI format--five records grounded in the bundled `sft_topic_seeds.jsonl` seed file, generated through Data Designer against an NVIDIA-hosted LLM endpoint. + +^^^ + +**In this tutorial, you will**: + +1. Set up prerequisites: the repository and an NVIDIA API key. +2. 
Read the bundled pipeline configuration. +3. Run a preview to verify the pipeline and model. +4. Generate a small dataset of five records. +5. Locate and inspect the output JSONL. + +{octicon}`clock;1.5em;sd-mr-1` This tutorial requires between 5 and 10 minutes to complete. +::: + +:::{grid-item-card} +:columns: 4 + +{octicon}`flame;1.5em;sd-mr-1` **Sample Prompt** + +^^^ + +Run a 2-record preview of the default SDG pipeline, then generate 5 records and show me the first output record. + +::: +:::: + +## Start Here + +- Run all commands from the repository root. +- Data generation uses an NVIDIA-hosted endpoint, so the step needs no local GPUs. + However, you must set the `NVIDIA_API_KEY` environment variable and you must have network access. + +## Prerequisites + +- ✅ Repository cloned and `uv sync` complete. Refer to [Quick Start](../index.md) if you have not done this yet. +- ✅ `NVIDIA_API_KEY` for the default model, `nvidia/nemotron-3-nano-30b-a3b`. + +## How the Default Pipeline Works + +The `src/nemotron/steps/sdg/data_designer/config/default.yaml` configuration combines two sources of variation to generate each record. +A seed topic, such as "safe deployment of AI assistants in enterprise support workflows" or +"ways to monitor data drift in production machine learning systems", is drawn from `.../data/sft_topic_seeds.jsonl`. +A persona category, such as teacher or engineer, is sampled from a fixed list of values. +Together they anchor the user prompt: a researcher might ask a concise technical question about RAG and a student might ask about the same topic more tentatively. + +The pipeline generates a matching assistant response and then projects the result into OpenAI chat-format messages. + +The full configuration is stored at `src/nemotron/steps/sdg/data_designer/config/default.yaml`. + +```{literalinclude} ../../src/nemotron/steps/sdg/data_designer/config/default.yaml +:language: yaml +:lines: 15- +:class: scrollable +``` + +## Procedure + +1. 
Set your API key: + + ```console + $ export NVIDIA_API_KEY="your-api-key" + ``` + +1. Run a two-record preview. + Preview mode runs the same pipeline against a tiny record count so you can verify the model alias, prompts, and column wiring cheaply before generating at scale. + + ```console + $ nemotron step run sdg/data_designer -c default preview=true num_records=2 + ``` + + The pipeline registers the model alias, generates two rows, and prints a summary: + + ````{dropdown} Example Output + :icon: code-square + + ```{literalinclude} _snippets/output/preview.txt + :language: text + ``` + ```` + + The default output path is `./output/sdg/sft.jsonl`. + You can override it by setting `SDG_OUTPUT_DIR` or specifying `output_path=...` on the command line. + + Inspect the output. + Each line is one chat record. + The `openai_messages` projection emits a `messages` array plus the seed `topic` and sampled `persona` as metadata for traceability. + The following shows one sample record from the `sft.jsonl` file. + + ```{literalinclude} _snippets/output/sft_first_record.jsonl + :language: json + ``` + +## Summary + +What you learned: + +- ✅ Ran a two-record preview to verify the pipeline and model. +- ✅ Generated a five-record SFT chat dataset with `default.yaml`. +- ✅ Located the OpenAI-format JSONL output. + +Key takeaways: + +- **Preview first.** `preview=true num_records=N` runs the same pipeline against a tiny record count. Use it to iterate on column specs and prompts before scaling `num_records` up. +- **Output format matches the trainer.** The `openai_messages` projection emits records ready for `prep/sft_packing` or AutoModel SFT. + +## Next Steps + +- **Adapt the pipeline to a domain you care about**: {doc}`how-to/create-domain-dataset`. +- **Preview, generate, and customize output**: {doc}`how-to/run`. +- **Generate preference pairs for DPO**: {doc}`how-to/preference-data`. 
+- **Dispatch to a cluster**: {doc}`how-to/dispatch-to-cluster` — learn about env.toml profiles and container images. +- **Look up flags and config fields**: {doc}`reference/cli-reference`, {doc}`reference/config-schema`. diff --git a/docs/sdg/how-to/create-domain-dataset.md b/docs/sdg/how-to/create-domain-dataset.md new file mode 100644 index 000000000..70e872c16 --- /dev/null +++ b/docs/sdg/how-to/create-domain-dataset.md @@ -0,0 +1,131 @@ + + +(sdg-create-greenteme-airlines-dataset)= +# Create a Domain Dataset for Airlines Customer Service + +::::{grid} 2 + +:::{grid-item-card} +:columns: 8 + +**What You'll Build**: A domain-adapted SFT chat dataset modeled on fictional airline customer-service conversations. + +^^^ + +**In this how-to guide, you will**: + +1. Create an airline-domain pipeline config. +2. Create a seed file of airline inquiry scenarios. +3. Swap the category columns for three airline-relevant dimensions. +4. Rewrite the LLM prompts for the airline domain. +5. Update the output projection and output path. +6. Run a preview to verify, then generate 100 records. + +{octicon}`clock;1.5em;sd-mr-1` This guide requires between 20 and 30 minutes to complete. +::: + +:::{grid-item-card} +:columns: 4 + +{octicon}`flame;1.5em;sd-mr-1` **Sample Prompt** + +^^^ + +Adapt the default SDG pipeline for Greenteme Airlines customer service with three category dimensions, run a 2-record preview, then generate 100 records and show me one output record. + +::: +:::: + +## Prerequisites + +- ✅ Completed {doc}`../getting-started` — at least one successful preview and full run of `default.yaml` so you know the pipeline works end-to-end. +- ✅ `NVIDIA_API_KEY` set in your environment. + +## How This Differs From the Default Pipeline + +The default pipeline mixes a single category dimension, `persona`, with seed topics. 
+This example uses three category dimensions, `traveler_segment`, `inquiry_type`, and `channel`, on top of seed scenarios so that diversity comes from explicit, controllable values. + +## Procedure + +1. Create a `src/nemotron/steps/sdg/data_designer/config/greenteme.yaml` ([download](../_snippets/input/greenteme.yaml)) file like the following example: + + ```{literalinclude} ../_snippets/input/greenteme.yaml + :language: yaml + :class: scrollable + ``` + + The key differences from the default pipeline: + - The variation for traveler segment, inquiry type, and channel is provided entirely by category-type columns. + - The variation for the scenarios is provided by the seed JSONL file from the next step. + - The system-style instruction lives at the top of each prompt rather than as a separate field. The LLM text columns take a single prompt that includes the role for the LLM to assume. + - The `output_projection` field includes the new metadata fields. + +2. Create a seed file, `src/nemotron/steps/sdg/data_designer/data/greenteme_inquiry_seeds.jsonl`, ([download](../_snippets/input/greenteme_inquiry_seeds.jsonl)) like the following example: + + ```{literalinclude} ../_snippets/input/greenteme_inquiry_seeds.jsonl + :language: json + :class: scrollable + ``` + +1. Run a preview by specifying `preview=true num_records=2` to verify the pipeline before scaling: + + ```console + $ nemotron step run sdg/data_designer -c greenteme preview=true num_records=2 + ``` + + ````{dropdown} Example Output + :icon: code-square + + ```{literalinclude} ../_snippets/output/greenteme_preview.jsonl + :language: json + ``` + ```` + +1. Generate the dataset by raising `num_records` after the preview output looks correct: + + ```console + $ nemotron step run sdg/data_designer -c greenteme num_records=100 + ``` + +## Going Further + +**Locale-aware persona profiles.** The current YAML schema supports category, seed, and LLM column types. 
To replace the static `traveler_segment` category with Census-grounded persona profiles using Data Designer's [person sampler](https://nvidia-nemo.github.io/DataDesigner/latest/concepts/person_sampling/), you can include locale, age range, and synthetic-personas integration. + +**Multi-turn conversations.** The example shows a single user and assistant exchange. +For multi-turn dialogue, swap the two `llm_text` columns for one `llm_structured` column whose `output_format` is a Pydantic conversation schema. +Refer to the `customer_support_tools.yaml` in the config directory for the structured-output pattern. + +**Dispatch to a cluster.** Generation runs locally against the NVIDIA-hosted endpoint by default. To run on Lepton or Slurm, see {doc}`dispatch-to-cluster` — env.toml profiles, container images, and the gotchas that bite first-time cluster runs. + +## Schema and Downstream Use + +The `openai_messages` projection emits records with a `messages` array plus the metadata fields you list. These flow directly into: + +- `prep/sft_packing` for Megatron-Bridge-style training, or +- AutoModel SFT, which consumes the chat format directly. + +For a full reference of available projection shapes, see {doc}`../reference/output-projections`. + +## Next Steps + +- **Generate preference pairs for DPO**: {doc}`preference-data` — the `rl_pref.yaml` pattern. +- **Generate tool-calling SFT data**: {doc}`tool-call-data` — multi-turn with `output_format=Conversation`. +- **CLI flags and overrides**: {doc}`../reference/cli-reference`. +- **Config schema**: {doc}`../reference/config-schema` — full reference for column types, samplers, and projections. +- **Pipeline overview**: {doc}`../index`. 
diff --git a/docs/sdg/how-to/dispatch-to-cluster.md b/docs/sdg/how-to/dispatch-to-cluster.md new file mode 100644 index 000000000..b7943e9e8 --- /dev/null +++ b/docs/sdg/how-to/dispatch-to-cluster.md @@ -0,0 +1,162 @@ + + +(sdg-dispatch-to-cluster)= +# Dispatch SDG to a Cluster + +This guide covers configuring an env.toml profile and running `sdg/data_designer` on Lepton or Slurm. Generation is CPU-only (no GPUs needed) and calls a remote LLM endpoint, so the step fits naturally on a CPU node with outbound network access. + +## env.toml Profile Shape + +Add a profile to `env.toml` (repository root). The example below targets a Lepton CPU node: + +```toml +[lepton-sdg] +executor = "lepton" +container_image = "nvcr.io/nvidia/nemo:25.11.nemotron_3_nano" +nemo_run_dir = "/mnt/shared/nemo-run" +nodes = 1 +gpus_per_node = 0 +resource_shape = "cpu.large" +node_group = "your-node-group" +shared_memory_size = 1024 +can_be_preempted = true +queue_priority = "mid-4000" +startup_commands = [ + "python -m pip install --quiet --break-system-packages 'data-designer>=0.5.6'" +] +mounts = [ + { path = "/your-nfs-source", mount_path = "/mnt/shared", from = "node-nfs:your-nfs-id" } +] + +[lepton-sdg.env_vars] +NVIDIA_API_KEY = "${oc.env:NVIDIA_API_KEY}" +``` + +## Run + +```console +$ nemotron step run sdg/data_designer -c default --batch lepton-sdg num_records=1000 +``` + +Use `--run` instead of `--batch` to stream logs interactively. + +## Known Gotchas + +These are the failure modes that commonly affect first-time cluster SDG runs. + +### `data-designer` is not pre-installed in the container + +The NeMo container image does not include `data-designer`. Install it at startup via `startup_commands`: + +```toml +startup_commands = [ + "python -m pip install --quiet --break-system-packages 'data-designer>=0.5.6'" +] +``` + +Do not omit `--break-system-packages` — without it pip refuses to install into the system Python on recent NeMo images. 
+ +### Default `shared_memory_size` crashes a CPU node + +The runspec default for `shared_memory_size` is 65536 MB (64 GB), which exceeds the RAM of most CPU node types and causes the job to be rejected or OOM-killed immediately. Set it to a small value; this step makes no use of shared memory: + +```toml +shared_memory_size = 1024 +``` + +### `nemo_run_dir` must be on shared storage + +`nemo-run` uses a busybox data-mover sidecar to stage the launch script into `nemo_run_dir`. If this path is not visible to both the data-mover and the main container — specifically if it is local to one node — the main container never finds the script and the job fails with `No such file or directory`. + +Set `nemo_run_dir` to a path on the shared NFS mount and include the mount in your profile: + +```toml +nemo_run_dir = "/mnt/shared/nemo-run" +mounts = [ + { path = "/your-nfs-source", mount_path = "/mnt/shared", from = "node-nfs:your-nfs-id" } +] +``` + +:::{note} +In the `mounts` table, `path` is the NFS **source** path on the NFS server — not the in-container destination. `mount_path` is the in-container path. +::: + +### `NVIDIA_API_KEY` is not forwarded automatically + +Unlike `HF_TOKEN` and `WANDB_API_KEY`, `NVIDIA_API_KEY` is not automatically forwarded to the container. Declare it explicitly in the `env_vars` section: + +```toml +[lepton-sdg.env_vars] +NVIDIA_API_KEY = "${oc.env:NVIDIA_API_KEY}" +``` + +Set it in your local shell before submitting the job: + +```console +$ export NVIDIA_API_KEY="your-api-key" +$ nemotron step run sdg/data_designer -c default --batch lepton-sdg num_records=1000 +``` + +### Container image: always look up, never guess + +Do not invent image tags. `nemo:latest` does not exist on `nvcr.io`. Check `src/nemotron/steps/sdg/data_designer/step.py` header comments or `src/nemotron/steps/env/env_toml/config/lepton.yaml` for known-good image references before setting `container_image`. 
+ +### Preemption and queue-priority fields were not wired (now fixed) + +`can_be_preempted`, `can_preempt`, and `queue_priority` are now forwarded from env.toml to `LeptonExecutor`. If you are on an older version of the repo where these were silently ignored, upgrade before expecting preemption scheduling to take effect. + +## Slurm Profile + +For Slurm, replace the Lepton-specific fields with Slurm equivalents. The `startup_commands` and `env_vars` gotchas apply equally: + +```toml +[slurm-sdg] +executor = "slurm" +container_image = "nvcr.io/nvidia/nemo:25.11.nemotron_3_nano" +nemo_run_dir = "/lustre/team/nemo-run" +nodes = 1 +gpus_per_node = 0 +run_partition = "cpu" +batch_partition = "cpu" +startup_commands = [ + "python -m pip install --quiet --break-system-packages 'data-designer>=0.5.6'" +] + +[slurm-sdg.env_vars] +NVIDIA_API_KEY = "${oc.env:NVIDIA_API_KEY}" +``` + +:::{tip} +On clusters where the default partition requires GPUs (for example, NVIDIA's `dlw` cluster), set `run_partition` and `batch_partition` to a CPU-capable partition. `gpus_per_node = 0` alone is not sufficient — the partition itself must accept zero-GPU jobs. +::: + +## Verify Before Scaling + +Run a preview via the cluster profile before a large batch: + +```console +$ nemotron step run sdg/data_designer -c default --run lepton-sdg preview=true num_records=2 +``` + +Confirm the job reaches `Running`, the model alias check succeeds, and two records are returned before submitting the full job. + +## Next Steps + +- **env.toml reference**: `docs/nemo_runspec/nemo-run.md` — full profile field reference. +- **CLI flags**: {doc}`../reference/cli-reference`. +- **Troubleshooting**: {doc}`../reference/troubleshooting` — full failure-mode reference. 
diff --git a/docs/sdg/how-to/index.md b/docs/sdg/how-to/index.md new file mode 100644 index 000000000..1b4f23333 --- /dev/null +++ b/docs/sdg/how-to/index.md @@ -0,0 +1,77 @@ + + +(sdg-how-to-index)= +# Synthetic Data How-To Guides + +Task-focused guides for common SDG workflows. For pipeline overview and when to use it, refer to {doc}`../index`. For your first run, start with {doc}`../getting-started`. + +::::{grid} 1 2 2 2 +:gutter: 1 1 1 2 + +:::{grid-item-card} {octicon}`play;1.5em;sd-mr-1` Run the Pipeline +:link: run +:link-type: doc +Preview, generate, and customize output path and projection. ++++ +{bdg-success}`10 min` {bdg-secondary}`intermediate` +::: + +:::{grid-item-card} {octicon}`briefcase;1.5em;sd-mr-1` Create a Domain Dataset +:link: create-domain-dataset +:link-type: doc +Adapt the pipeline to a custom domain with a seed file and multiple category dimensions. ++++ +{bdg-success}`20 min` {bdg-secondary}`intermediate` +::: + +:::{grid-item-card} {octicon}`tools;1.5em;sd-mr-1` Generate Tool-Call Data +:link: tool-call-data +:link-type: doc +Generate multi-turn conversations with OpenAI-style tool calls for tool-use SFT. ++++ +{bdg-success}`15 min` {bdg-secondary}`intermediate` +::: + +:::{grid-item-card} {octicon}`git-compare;1.5em;sd-mr-1` Generate Preference Data +:link: preference-data +:link-type: doc +Generate DPO preference pairs (prompt / chosen / rejected) from `rl_pref.yaml`. ++++ +{bdg-success}`15 min` {bdg-secondary}`intermediate` +::: + +:::{grid-item-card} {octicon}`server;1.5em;sd-mr-1` Dispatch to a Cluster +:link: dispatch-to-cluster +:link-type: doc +Configure an env.toml profile and run SDG on Lepton or Slurm. 
++++ +{bdg-success}`30 min` {bdg-secondary}`intermediate` +::: + +:::: + +```{toctree} +:hidden: +:maxdepth: 1 + +Create a Domain Dataset <create-domain-dataset> +Create Tool-Calling Dataset <tool-call-data> +preference-data +dispatch-to-cluster +run +``` diff --git a/docs/sdg/how-to/preference-data.md b/docs/sdg/how-to/preference-data.md new file mode 100644 index 000000000..f78238369 --- /dev/null +++ b/docs/sdg/how-to/preference-data.md @@ -0,0 +1,93 @@ + + +(sdg-preference-data)= +# Generate Preference Data for DPO + +This example shows how to use the `rl_pref.yaml` configuration file. +The example generates _prompt_, _chosen_, and _rejected_ triples for direct preference optimization (DPO) training. +Output flows directly into `prep/rl_prep` and then `rl/nemo_rl/dpo`. + +## How It Works + +The `rl_pref.yaml` file registers two model aliases at different temperatures: +a high-temperature creative model and a low-temperature precise model. +The goal is to produce two responses per prompt that are distinct: + +```{literalinclude} ../../../src/nemotron/steps/sdg/data_designer/config/rl_pref.yaml +:language: yaml +:lines: 15- +:class: scrollable +``` + +For each seed prompt the pipeline: + +1. Generates `response_a` (high temperature) and `response_b` (low temperature) independently. +2. Asks a third LLM call (`judge` column, `llm_judge` type) to compare them and return `{"winner": "A"}` or `{"winner": "B"}`. +3. Applies the `dpo_preference` projection, which maps the winner to chosen / rejected and writes `{"prompt": "...", "chosen": "...", "rejected": "..."}`. + +## Prerequisites + +- `NVIDIA_API_KEY` set in your environment. +- A seed file with one `prompt` field per line. The bundled `rl_pref_prompt_seeds.jsonl` contains general reasoning prompts. Replace it with domain-specific prompts for targeted preference data. + +## Procedure + +1. Preview two records to verify the judge returns valid `winner` values: + + ```console + $ nemotron step run sdg/data_designer -c rl_pref preview=true num_records=2 + ``` + +2.
Generate the dataset. The config default is 2000 records; this example overrides it with `num_records=500`: + + ```console + $ nemotron step run sdg/data_designer -c rl_pref num_records=500 + ``` + + Output is written to `./output/sdg/rl_pref.jsonl`. + + Inspect the output. Each line is a preference triple: + + ```json + {"prompt": "Explain why retrieval-augmented generation can reduce hallucinations.", "chosen": "RAG grounds the model in retrieved documents, so claims are tied to specific passages rather than purely to weights.", "rejected": "RAG is better because it uses more data and is generally smarter than standard models."} + ``` + +## Adapt the Seed File + +Swap `seed_dataset.path` to point at your own prompt seed file. Each line must be valid JSON with a `prompt` field: + +```json +{"prompt": "Describe the tradeoffs between batch and streaming inference for real-time applications."} +``` + +Keep seed prompts representative of the target capability and diverse across difficulty levels. +The judge performs better when the two responses have a clear quality difference. Consider widening the temperature gap between the two model aliases if the judge's choices look arbitrary or otherwise unexpected. + +## Downstream Pipeline + +```text +rl_pref.jsonl → prep/rl_prep → rl/nemo_rl/dpo +``` + +`prep/rl_prep` tokenizes and packs preference pairs. `rl/nemo_rl/dpo` consumes the packed dataset. Verify the `prompt`, `chosen`, and `rejected` fields are present in every record before handing off. + +## Next Steps + +- **Output projection reference**: {doc}`../reference/output-projections` — `dpo_preference` schema. +- **Config schema**: {doc}`../reference/config-schema` — `llm_judge` column type and `dpo_preference` projection fields. +- **Dispatch to a cluster**: {doc}`dispatch-to-cluster`. 
diff --git a/docs/sdg/how-to/run.md b/docs/sdg/how-to/run.md new file mode 100644 index 000000000..91ec2e0e3 --- /dev/null +++ b/docs/sdg/how-to/run.md @@ -0,0 +1,61 @@ + + +(sdg-run)= +# Tips for the Data Generation Pipeline + +## Preview Before Generating + +Always preview before running a full generation job. Preview mode calls the same pipeline but returns a small number of records without writing the final JSONL: + +```console +$ nemotron step run sdg/data_designer -c default preview=true num_records=2 +``` + +Use preview to verify: + +- Column references in prompts (`{{ column_name }}`) resolve to the expected values. +- Seed fields, such as `{{ scenario }}`, `{{ prompt }}`, and so on, are populated from the seed file. +- The model returns text that matches the prompt's intent. +- The `output_projection` produces the schema downstream steps expect. + +## Specify a Configuration File + +The repository includes the following sample config files in the `src/nemotron/steps/sdg/data_designer/config` directory: + +| Config | Output | Use for | +|---|---|---| +| `default.yaml` | SFT chat (`openai_messages`) | General chat SFT | +| `customer_support_tools.yaml` | Tool-call SFT (`structured_messages`) | Tool-use SFT | +| `rl_pref.yaml` | Preference pairs (`dpo_preference`) | DPO / RLHF | +| `tiny.yaml` | SFT chat, 10 records, short tokens | Fast iteration | + +Specify the file in the `-c` argument: + +```console +$ nemotron step run sdg/data_designer -c customer_support_tools preview=true num_records=2 +``` + +## Run Attached on a Cluster Profile + +To dispatch to a Lepton or Slurm profile configured in `env.toml`, use `--run` (attached, streams logs) or `--batch` (detached): + +```console +$ nemotron step run sdg/data_designer -c default --run my-lepton-profile num_records=1000 +``` + +For cluster setup, see {doc}`dispatch-to-cluster`. 
diff --git a/docs/sdg/how-to/tool-call-data.md b/docs/sdg/how-to/tool-call-data.md new file mode 100644 index 000000000..812a822f7 --- /dev/null +++ b/docs/sdg/how-to/tool-call-data.md @@ -0,0 +1,120 @@ + + +(sdg-tool-call-data)= +# Generate Tool-Calling Data for SFT + +Use this guide when you need multi-turn chat JSONL where the assistant issues OpenAI-style `tool_calls` and a `tool` role returns structured results, suitable for supervised fine-tuning (SFT) with a `tools` definition array. + +You will use the sample config `customer_support_tools.yaml`, which produces ecommerce-style support threads. Each output row includes a `messages` array (with tool turns) and a `tools` array, ready for packing and training. + +## Outcomes + +- Understand how one `llm_structured` column can emit a full multi-turn trace in a single model call. +- Preview, generate, and validate records before training. +- Know how to retarget seeds, prompts, and schema for your own domain. + +## How It Works + +Compared with single-turn configs such as `default.yaml`, this setup drives the whole conversation from one `llm_structured` column. That column’s `output_format` is a JSON schema that fixes roles, tool-call shape, and approximate turn count so the model cannot drift into invalid shapes. + +```{literalinclude} ../../../src/nemotron/steps/sdg/data_designer/config/customer_support_tools.yaml +:language: yaml +:lines: 15- +:class: scrollable +``` + +Each seed row supplies five anchor fields the prompt interpolates: `customer_name`, `issue`, `order_id`, `product`, and `policy_hint`. Two extra category columns (`urgency`, `channel`) add variety without multiplying seed rows for every combination. + +## Prerequisites + +- Nemotron CLI available and working; if this is your first SDG run, complete {doc}`../getting-started`. +- `NVIDIA_API_KEY` set in the environment. +- The bundled seed file `data/customer_support_tool_seeds.jsonl` (shipped with the step). 
Add rows, or point the config at your own JSONL. + +## Procedure + +1. Preview two records so structured output matches the schema: + + ```console + $ nemotron step run sdg/data_designer -c customer_support_tools preview=true num_records=2 + ``` + + In the preview, confirm: + + - Exactly one assistant message with `tool_calls`. + - Exactly one `tool` message whose `tool_call_id` matches the call. + - `function.arguments` is a JSON string, not a nested object. + - The assistant’s closing turn references the tool result (not a generic reply). + - No markdown in message `content` if your trainer expects plain text. + +2. Generate the dataset: + + ```console + $ nemotron step run sdg/data_designer -c customer_support_tools num_records=200 + ``` + + Output path: `./output/sdg/customer_support_tool_sft.jsonl`. + Spot-check a few lines. Each record exposes top-level `messages` and `tools` plus metadata, like the following example: + + ```text + { + "messages": [ + {"role": "system", "content": "You are a helpful ecommerce support agent..."}, + {"role": "user", "content": "Hi, I haven't received my headphones yet..."}, + {"role": "assistant", "content": "I'd be happy to help. Could you share your order number?"}, + {"role": "user", "content": "It's ORD-10492."}, + {"role": "assistant", "content": "", "tool_calls": [{"id": "call_001", "type": "function", "function": {"name": "lookup_order", "arguments": "{\"order_id\":\"ORD-10492\"}"}}]}, + {"role": "tool", "tool_call_id": "call_001", "name": "lookup_order", "content": "{\"status\":\"delayed\",\"eta\":\"tomorrow\"}"}, + {"role": "assistant", "content": "Your order is delayed and should arrive tomorrow. 
Per our policy, I can arrange an expedited replacement if you prefer."} + ], + "tools": [{"type": "function", "function": {"name": "lookup_order", "description": "...", "parameters": {...}}}], + "customer_name": "Priya", "issue": "late delivery", "urgency": "frustrated", "channel": "web_chat" + } + ``` + +## Adapt to Your Domain + +1. Replace or extend the seed file so rows cover your entities. You may rename the five anchor fields as long as the prompt and YAML refer to the same names. +2. Update `seed_dataset.fields` in the YAML to match those names. +3. Rewrite the `prompt` for your scenario and tool surface. +4. Adjust `output_format` if the message layout changes (for example, multiple tool calls per conversation). + +Keep `output_projection` as `structured_messages` so the step extracts `messages` and `tools` from the structured column and merges category metadata onto each record. + +## Validation Checklist + +Before training, sample at least 50 records and verify: + +- [ ] Every `tool_calls` block has a matching `tool` message with the same `tool_call_id`. +- [ ] `function.arguments` values are JSON strings, not nested objects. +- [ ] The assistant’s final reply uses the tool result (not a canned answer that ignores it). +- [ ] No unexpected markdown in `content` if the trainer assumes plain text. +- [ ] `tools` is present and non-empty on every record. + +## Downstream Use + +```text +customer_support_tool_sft.jsonl → prep/sft_packing → SFT training +``` + +The `structured_messages` projection writes `messages` and `tools` at the top level, matching formats common to AutoModel-style SFT and Megatron-Bridge-style workflows. Run `prep/sft_packing` in dry-run mode before a large training job to confirm the packer accepts your file. + +## Next Steps + +- Output projection reference: {doc}`../reference/output-projections` to learn the `structured_messages` schema. 
+- Config schema: {doc}`../reference/config-schema` for information about the `llm_structured` column type and `output_format`. diff --git a/docs/sdg/index.md b/docs/sdg/index.md new file mode 100644 index 000000000..29f036f04 --- /dev/null +++ b/docs/sdg/index.md @@ -0,0 +1,137 @@ + + +(sdg-index)= +# About Synthetic Data Generation + +Generate synthetic training data with [NeMo Data Designer](https://nvidia-nemo.github.io/DataDesigner/) using a declarative YAML pipeline. Seed a generation run with your domain-specific topics, scenarios, or personas; define the column structure and prompts in YAML; and produce training-ready JSONL without writing Python. + +Three output shapes ship out of the box: SFT chat data, tool-calling SFT data, and DPO preference pairs. + +:::{tip} +New to SDG? Start with the {doc}`getting-started` tutorial to run the bundled pipeline and produce your first dataset in 5–10 minutes. +::: + +## When to Use + +Use SDG when you need training data that does not already exist in sufficient quantity or quality for your target domain or task. + +- **SFT chat data** — Generate user/assistant conversation pairs grounded in domain-specific topics, scenarios, or personas. Use `default.yaml` as a starting point and adapt it to your domain. +- **Tool-calling SFT data** — Generate multi-turn conversations that include assistant tool calls and tool responses in OpenAI format. Use `customer_support_tools.yaml` as a starting point. +- **DPO preference data** — Generate prompt / chosen / rejected triples for preference learning. Use `rl_pref.yaml`. +- **Custom domains** — Swap the seed file, category columns, and prompts to target any domain. The pipeline is fully declarative; customisation does not require editing Python. +- **Cluster-scale generation** — Dispatch generation to Lepton or Slurm via env.toml profiles when local throughput is insufficient. 
+ +## Pipeline at a Glance + +```mermaid +%%{init: {'theme': 'base', 'themeVariables': { 'primaryBorderColor': '#333333', 'lineColor': '#333333', 'primaryTextColor': '#333333', 'clusterBkg': '#ffffff', 'clusterBorder': '#333333'}}}%% +flowchart TB + seed["Seed file (optional)"] --> dsg + cat["Category samplers"] --> dsg + per["Person sampler (optional)"] --> dsg + dsg["Data Designer column graph<br/>Jinja2 prompts · LLM calls"] + dsg --> proj["output_projection"] + proj --> om["openai_messages"] + proj --> dpo["dpo_preference"] + proj --> sm["structured_messages"] + om --> jsonl["JSONL"] + dpo --> jsonl + sm --> jsonl + jsonl --> train["prep/sft_packing or AutoModel SFT"] +``` + +Each run is reproducible: the seed file, column specs, model alias, inference parameters, and projection rules are all version-controlled in a single YAML file. + +## Documentation + +::::{grid} 1 2 2 2 +:gutter: 1 1 1 2 + +:::{grid-item-card} {octicon}`rocket;1.5em;sd-mr-1` Getting Started +:link: getting-started +:link-type: doc +Run the bundled pipeline end-to-end: preview two records, generate five, inspect the output JSONL. ++++ +{bdg-success}`5–10 min` {bdg-secondary}`tutorial` +::: + +:::{grid-item-card} {octicon}`checklist;1.5em;sd-mr-1` How-To Guides +:link: how-to/index +:link-type: doc +Task-focused guides: adapt the pipeline to a domain, generate preference pairs, dispatch to a cluster. ++++ +{bdg-success}`5 guides` {bdg-secondary}`task-focused` +::: + +:::{grid-item-card} {octicon}`list-unordered;1.5em;sd-mr-1` Reference +:link: reference/index +:link-type: doc +YAML config schema, CLI flags, output projection shapes, and troubleshooting. 
++++ +{bdg-success}`4 references` {bdg-secondary}`lookup` +::: + +:::: + +## All Documentation + +````{tab-set} + +```{tab-item} Getting Started + +| Guide | What You'll Do | Time | +|---|---|---| +| {doc}`getting-started` | Preview and generate your first synthetic SFT dataset | 5–10 min | + +``` + +```{tab-item} How-To Guides + +| Guide | What You'll Do | +|---|---| +| {doc}`how-to/run` | Preview, generate, and customize output path and projection | +| {doc}`how-to/create-domain-dataset` | Adapt the pipeline to a custom domain with a seed file and multiple category dimensions | +| {doc}`how-to/tool-call-data` | Generate multi-turn tool-calling SFT data | +| {doc}`how-to/preference-data` | Generate DPO preference pairs from `rl_pref.yaml` | +| {doc}`how-to/dispatch-to-cluster` | Dispatch generation to Lepton or Slurm via env.toml | + +``` + +```{tab-item} Reference + +| Reference | What You'll Find | +|---|---| +| {doc}`reference/config-schema` | Full YAML column types, sampler parameters, and projection fields | +| {doc}`reference/cli-reference` | `nemotron step run sdg/data_designer` flags and hydra overrides | +| {doc}`reference/output-projections` | The three projection shapes with annotated JSONL examples | +| {doc}`reference/troubleshooting` | Dispatch failures, image pull errors, API key issues, schema drift | + +``` + +```` + +## Before You Start + +- The `NVIDIA_API_KEY` environment variable is required for the default model, `nvidia/nemotron-3-nano-30b-a3b`, hosted on integrate.nvidia.com. + +## Limitations and Considerations + +- **Cost**: Generation calls a hosted LLM endpoint; each record incurs API cost. +- **Quality**: After generating records, review them before training. +- **Scale**: API rate limits apply. For large generation runs, dispatch to a cluster and consider batching across multiple nodes. +- **Reproducibility**: Seed files, column specs, model aliases, and inference parameters should all be version-controlled together. 
Changing any one of them changes the output distribution. diff --git a/docs/sdg/reference/cli-reference.md b/docs/sdg/reference/cli-reference.md new file mode 100644 index 000000000..af466f608 --- /dev/null +++ b/docs/sdg/reference/cli-reference.md @@ -0,0 +1,111 @@ + + +(sdg-cli-reference)= +# CLI Reference + +Command-line reference for `nemotron step run sdg/data_designer`. For pipeline overview, see {doc}`../index`. + +## Syntax + +```console +$ nemotron step run sdg/data_designer \ + [-c CONFIG] \ + [--run PROFILE | --batch PROFILE] \ + [--dry-run] \ + [KEY=VALUE ...] +``` + +## Flags + +```{option} -c, --config CONFIG + +Config name (resolved from the step's `config/` directory) or an absolute/relative path to a YAML file. + +Bundled names: `default`, `customer_support_tools`, `rl_pref`, `tiny`. + +**Default**: `default` +``` + +```{option} -r, --run PROFILE + +Run attached using the env.toml profile named `PROFILE`. Job output streams to the terminal. Use for short interactive runs. +``` + +```{option} -b, --batch PROFILE + +Run detached using the env.toml profile named `PROFILE`. Job is submitted and the command returns immediately. Use for long cluster jobs. +``` + +```{option} -d, --dry-run + +Compile the config and print the resolved job spec without executing. Useful for verifying hydra overrides before submission. +``` + +## Hydra Overrides + +Any `KEY=VALUE` argument after the flags is passed as a hydra dotlist override and merged into the resolved config. Overrides take precedence over YAML values. 
+ +| Override | Example | Effect | +|---|---|---| +| `num_records=N` | `num_records=50` | Generate N records | +| `preview=true` | `preview=true` | Run in preview mode | +| `output_path=PATH` | `output_path=/data/out.jsonl` | Write output to PATH | +| `seed_dataset.path=PATH` | `seed_dataset.path=/data/seeds.jsonl` | Override seed file | +| `models.0.inference_parameters.temperature=T` | `models.0.inference_parameters.temperature=0.5` | Override first model's temperature | + +Dotlist path follows the YAML structure. Nested keys use `.` as separator; list items use `.N` (zero-indexed). + +## Examples + +Preview the default config with two records: + +```console +$ nemotron step run sdg/data_designer -c default preview=true num_records=2 +``` + +Generate 100 SFT records with a custom output path: + +```console +$ nemotron step run sdg/data_designer -c default \ + num_records=100 \ + output_path=/data/my-project/sft.jsonl +``` + +Dry-run a cluster submission to check the resolved config: + +```console +$ nemotron step run sdg/data_designer -c default --run my-profile --dry-run +``` + +Run attached on a Lepton profile with 500 records: + +```console +$ nemotron step run sdg/data_designer -c default --run lepton-sdg num_records=500 +``` + +Use a config at an arbitrary path: + +```console +$ nemotron step run sdg/data_designer -c /path/to/my-config.yaml preview=true num_records=2 +``` + +## Related + +- {doc}`../how-to/run` — Preview, generate, and customize output. +- {doc}`../how-to/dispatch-to-cluster` — env.toml profile setup. +- {doc}`config-schema` — YAML config field reference. diff --git a/docs/sdg/reference/config-schema.md b/docs/sdg/reference/config-schema.md new file mode 100644 index 000000000..ff23a2568 --- /dev/null +++ b/docs/sdg/reference/config-schema.md @@ -0,0 +1,213 @@ + + +(sdg-config-schema)= +# Config Schema + +This page provides the reference information for the YAML config file consumed by `sdg/data_designer`. 
+ +## Simple Fields + +| Field | Type | Required | Description | +|---|---|---|---| +| `output_dir` | string | no | Base output directory. Supports OmegaConf env-var interpolation. Default resolves `$SDG_OUTPUT_DIR`, then `$NEMO_RUN_DIR/sdg`, then `./output/sdg`. | +| `output_path` | string | yes | Full path for the output JSONL file. Typically `${output_dir}/my-dataset.jsonl`. | +| `num_records` | int | yes | Number of records to generate (`client.create`) or preview (`client.preview`). | +| `preview` | bool | no | When `true`, calls `client.preview()` instead of `client.create()`. Default: `false`. Prefer setting this as a CLI override (`preview=true`) rather than in the YAML. | + +## seed_dataset + +Optional top-level field. +When present, Data Designer samples one row per generated record from the seed file and makes the fields available to column prompts by using Jinja2. + +| Field | Type | Required | Description | +|---|---|---|---| +| `path` | string | yes | Path to a JSONL file. Each line is a JSON object. | +| `strategy` | string | no | `shuffle` (default) or `ordered`. | +| `fields` | list[string] | yes | Column names to expose. Must match keys in the seed JSONL objects. These become available as `{{ field_name }}` in prompts without being declared in `columns`. | + +## models + +A required top-level field. +The field specifies a list of model configurations. +Each entry defines one alias that column specs reference by name. + +| Field | Type | Required | Description | +|---|---|---|---| +| `alias` | string | yes | Short name referenced by `model_alias` in column specs. | +| `model` | string | yes | Model identifier such as `nvidia/nemotron-3-nano-30b-a3b` and `openai/gpt-oss-20b`. | +| `provider` | string | no | Provider name, such as `nvidia` or `anthropic`. | +| `skip_health_check` | bool | no | Skip the startup probe against the model provider. Useful for local or offline endpoints. Default: `false`. 
| +| `inference_parameters.temperature` | float | no | Sampling temperature. | +| `inference_parameters.top_p` | float | no | Top-p nucleus sampling. | +| `inference_parameters.max_tokens` | int | no | Maximum output tokens per call. | + +## columns + +A required top-level field. +This field is an ordered list of column specs. +Each column has a `name`, a `type`, and type-specific fields. +Columns can reference earlier columns and seed fields in prompts by using Jinja2 syntax like `{{ column_name }}`. + +### Categorical Columns + +Samples uniformly from a fixed list of string or numeric values, as in the following example. + +```yaml +- name: persona + type: category + values: [teacher, engineer, student, researcher] +``` + +| Field | Required | Description | +|---|---|---| +| `name` | yes | Column name. | +| `values` | yes | List of values to sample from. | + +### Seed Columns + +Provides a named field from the seed dataset as a column. +Use this column type when a seed field needs to appear in `metadata_fields` or must be referenced in a way that requires it to be an explicit column. + +```yaml +- name: topic + type: seed +``` + +| Field | Required | Description | +|---|---|---| +| `name` | yes | Must match a field name in `seed_dataset.fields`. | + +Seed fields declared in `seed_dataset.fields` are available directly in prompts without this column type. +Use `seed` only when you need the field as a named column in the output schema. + +### LLM Text Columns + +Generates free-form text using an LLM call. +These columns can reference previously specified columns and seed fields in `prompt` by using Jinja2 syntax. + +```yaml +- name: user_query + type: llm_text + model_alias: nvidia-text + prompt: | + Write a message from a {{ persona }} asking about: {{ topic }}. +``` + +| Field | Required | Description | +|---|---|---| +| `name` | yes | Column name. | +| `model_alias` | no | Alias from `models`. Default: `nvidia-text`. | +| `prompt` | yes | Jinja2 template. 
Reference any earlier column or seed field with `{{ name }}`. | + +### LLM Structured Columns + +This column type generates structured JSON by making an LLM call. +The column definition instructs the model to return JSON matching `output_format`. +Use this column type for multi-turn conversations, preference judges, and any output that must conform to a schema. + +```yaml +- name: conversation + type: llm_structured + model_alias: nvidia-text + prompt: | + Generate a support conversation for customer {{ customer_name }}... + output_format: + type: object + properties: + messages: + type: array + ... + required: [messages] +``` + +| Field | Required | Description | +|---|---|---| +| `name` | yes | Column name. | +| `model_alias` | no | Alias from `models`. Default: `nvidia-text`. | +| `prompt` | yes | Jinja2 template. | +| `output_format` | yes | JSON Schema dict describing the expected output structure. | + +### LLM Judge Columns + +This type is an alias for `llm_structured`. +This type is typically used for columns that compare or evaluate other columns. + +```yaml +- name: judge + type: llm_judge + model_alias: nvidia-text + prompt: | + Compare response A and B for: {{ prompt }} + A: {{ response_a }} + B: {{ response_b }} + output_format: + type: object + properties: + winner: + type: string + enum: [A, B] + required: [winner] +``` + +## output_projection + +This top-level field maps raw Data Designer records into the schema expected by downstream steps. +Refer to {doc}`output-projections` for full field tables and annotated JSONL examples for each type. 
+ +| `type` | Use for | Downstream | +|---|---|---| +| `openai_messages` | Single-turn SFT chat | `prep/sft_packing`, AutoModel SFT | +| `dpo_preference` | Preference pairs | `prep/rl_prep`, `rl/nemo_rl/dpo` | +| `structured_messages` | Multi-turn with tool calls | `prep/sft_packing`, AutoModel SFT | + +## Extending the Schema: `person` and `datetime` Samplers + +The current `step.py` supports the column types above. To use Data Designer's locale-aware person sampler or datetime sampler, `step.py`'s `build_columns()` function must be extended with `person` and `datetime` branches. A reference implementation showing both additions is in: + +```{literalinclude} ../_snippets/input/step-with-person-datetime.py +:language: python +:start-at: " elif kind == \"person\":" +:end-before: " elif kind == \"seed\":" +``` + +Once merged, configs can declare: + +```yaml +- name: traveler + type: person + locale: en_US + age_range: [22, 75] + with_synthetic_personas: true + +- name: booking_date + type: datetime + start: "2024-01-01" + end: "2025-12-31" +``` + +Download personas for the locale before running: + +```console +$ data-designer download personas --locale en_US +``` + +## Related Information + +- {doc}`output-projections` — projection field reference and JSONL examples. +- {doc}`cli-reference` — flags and hydra override syntax. +- {doc}`../how-to/run` — preview and generate workflow. diff --git a/docs/sdg/reference/index.md b/docs/sdg/reference/index.md new file mode 100644 index 000000000..570c00156 --- /dev/null +++ b/docs/sdg/reference/index.md @@ -0,0 +1,68 @@ + + +(sdg-reference-index)= +# SDG Reference + +Complete specifications for the SDG pipeline. For pipeline overview and when to use it, refer to {doc}`../index`. 
+ +::::{grid} 1 2 2 2 +:gutter: 1 1 1 2 + +:::{grid-item-card} {octicon}`file-code;1.5em;sd-mr-1` Config Schema +:link: config-schema +:link-type: doc +All YAML fields: top-level settings, seed dataset, model aliases, column types, and output projections. ++++ +{bdg-secondary}`lookup` +::: + +:::{grid-item-card} {octicon}`terminal;1.5em;sd-mr-1` CLI Reference +:link: cli-reference +:link-type: doc +`nemotron step run sdg/data_designer` flags and hydra override syntax. ++++ +{bdg-secondary}`lookup` +::: + +:::{grid-item-card} {octicon}`arrow-switch;1.5em;sd-mr-1` Output Projections +:link: output-projections +:link-type: doc +The three projection shapes with annotated JSONL examples. ++++ +{bdg-secondary}`lookup` +::: + +:::{grid-item-card} {octicon}`alert;1.5em;sd-mr-1` Troubleshooting +:link: troubleshooting +:link-type: doc +Failure modes for local runs and cluster dispatch. ++++ +{bdg-secondary}`lookup` +::: + +:::: + +```{toctree} +:hidden: +:maxdepth: 1 + +config-schema +cli-reference +output-projections +troubleshooting +``` diff --git a/docs/sdg/reference/output-projections.md b/docs/sdg/reference/output-projections.md new file mode 100644 index 000000000..8a4ec6a0a --- /dev/null +++ b/docs/sdg/reference/output-projections.md @@ -0,0 +1,148 @@ + + +(sdg-output-projections)= +# Output Projections + +The `output_projection` block in a config maps raw Data Designer records into the schema expected by downstream training steps. Each projection type extracts specific columns and writes one JSON object per line. + +## OpenAI Messages + +Produces single-turn OpenAI chat-format records. Use for SFT chat data that feeds `prep/sft_packing` or AutoModel SFT. 
+ +**YAML**: + +```yaml +output_projection: + type: openai_messages + user_field: user_query # column containing the user turn + assistant_field: assistant_response # column containing the assistant turn + metadata_fields: [persona, topic] # additional columns to include at top level +``` + +**Output** (one JSON object per line): + +```json +{ + "messages": [ + {"role": "user", "content": "How do I calibrate the sensor threshold?"}, + {"role": "assistant", "content": "Set the threshold in the device settings under Calibration → Sensor Range. A value of 0.85 works well for most environments."} + ], + "persona": "engineer", + "topic": "industrial sensor calibration" +} +``` + +Fields: + +| Field | Required | Description | +|---|---|---| +| `type` | yes | `"openai_messages"` | +| `user_field` | yes | Column name for the user message content | +| `assistant_field` | yes | Column name for the assistant message content | +| `metadata_fields` | no | List of additional column names to include at the top level | + +## DPO Preference + +Produces preference triples for DPO training. Use with `rl_pref.yaml` and the `llm_judge` column pattern. Output feeds `prep/rl_prep`. 
+ +**YAML**: + +```yaml +output_projection: + type: dpo_preference + prompt_field: prompt # column containing the input prompt + response_a_field: response_a # column containing the first candidate response + response_b_field: response_b # column containing the second candidate response + judge_field: judge # column containing the judge's structured output + winner_field: winner # key inside the judge output that holds "A" or "B" +``` + +**Output** (one JSON object per line): + +```json +{ + "prompt": "Explain why retrieval-augmented generation can reduce hallucinations in enterprise assistants.", + "chosen": "RAG grounds the model in retrieved passages, so factual claims are tied to source documents rather than purely to learned weights.", + "rejected": "RAG is better because it uses the internet and knows more things than a regular model." +} +``` + +Fields: + +| Field | Required | Description | +|---|---|---| +| `type` | yes | `"dpo_preference"` | +| `prompt_field` | yes | Column name for the input prompt | +| `response_a_field` | yes | Column name for candidate A | +| `response_b_field` | yes | Column name for candidate B | +| `judge_field` | yes | Column name for the judge's structured output | +| `winner_field` | yes | Key within the judge output JSON that holds `"A"` or `"B"` | + +The projection raises `ValueError` if `winner` is not `"A"` or `"B"`. The `llm_judge` column must be configured to return exactly this structure. + +## Structured Messages + +Produces multi-turn records with `messages` and an optional `tools` array. Use for tool-calling SFT data generated by an `llm_structured` column. Output feeds `prep/sft_packing` or AutoModel SFT. 
+ +**YAML**: + +```yaml +output_projection: + type: structured_messages + source_field: conversation # column containing the structured JSON object + messages_field: messages # key inside the structured object for the messages array + tools_field: tools # key inside the structured object for the tools array + metadata_fields: [customer_name, issue, urgency, channel] +``` + +**Output** (one JSON object per line): + +```json +{ + "messages": [ + {"role": "system", "content": "You are a helpful ecommerce support agent."}, + {"role": "user", "content": "I haven't received my order yet."}, + {"role": "assistant", "content": "", "tool_calls": [{"id": "call_001", "type": "function", "function": {"name": "lookup_order", "arguments": "{\"order_id\":\"ORD-10492\"}"}}]}, + {"role": "tool", "tool_call_id": "call_001", "name": "lookup_order", "content": "{\"status\":\"delayed\",\"eta\":\"tomorrow\"}"}, + {"role": "assistant", "content": "Your order is delayed and will arrive tomorrow. I can arrange an expedited replacement if needed."} + ], + "tools": [{"type": "function", "function": {"name": "lookup_order", "description": "Look up order status by ID.", "parameters": {"type": "object", "properties": {"order_id": {"type": "string"}}, "required": ["order_id"]}}}], + "customer_name": "Priya", + "issue": "late delivery", + "urgency": "frustrated", + "channel": "web_chat" +} +``` + +Fields: + +| Field | Required | Description | +|---|---|---| +| `type` | yes | `"structured_messages"` | +| `source_field` | yes | Column containing the structured JSON conversation object | +| `messages_field` | no | Key in `source_field` for the messages array. Default: `"messages"` | +| `tools_field` | no | Key in `source_field` for the tools array. Omitted from output if not present in the record | +| `metadata_fields` | no | List of additional column names to include at the top level | + +The `source_field` column value may be a JSON string or a dict; both are handled. 
+ +## Related Information + +- {doc}`config-schema` — Full YAML config field reference. +- {doc}`../how-to/tool-call-data` — Using `structured_messages` with `customer_support_tools.yaml`. +- {doc}`../how-to/preference-data` — Using `dpo_preference` with `rl_pref.yaml`. diff --git a/docs/sdg/reference/troubleshooting.md b/docs/sdg/reference/troubleshooting.md new file mode 100644 index 000000000..9cb4ad69a --- /dev/null +++ b/docs/sdg/reference/troubleshooting.md @@ -0,0 +1,112 @@ + + +(sdg-troubleshooting)= +# Troubleshooting + +Failure modes for local runs and cluster dispatch. For cluster-specific setup, see {doc}`../how-to/dispatch-to-cluster`. + +## Local Run Failures + +::::{dropdown} `Unknown column type: 'person'` or similar ValueError + +**Cause**: The YAML declares a column `type` that `step.py`'s `build_columns()` does not recognize. Currently supported types: `category`, `seed`, `llm_text`, `llm_structured`, `llm_judge`. + +**Solution**: Check the spelling. For `person` and `datetime` sampler support, `step.py` must be extended — see the extension reference in {doc}`config-schema`. +:::: + +::::{dropdown} `config must declare a non-empty columns: list` + +**Cause**: The YAML has an empty or missing `columns:` block. + +**Solution**: Add at least one column spec. A minimal config must include at least one `llm_text` or `llm_structured` column that produces output content. +:::: + +::::{dropdown} `Jinja2` template references an undefined variable + +**Cause**: A prompt uses `{{ column_name }}`, but `column_name` is not a declared column, a seed field in `seed_dataset.fields`, or an earlier column in the list. + +**Solution**: Add the column or seed field, or fix the typo. Run with `preview=true num_records=2` to catch this cheaply before a full generation job. +:::: + +::::{dropdown} Model health check fails at startup + +**Cause**: Data Designer probes the model endpoint at startup. 
If the model is not available from the configured provider, or if `NVIDIA_API_KEY` is not set, the probe fails and the step exits before generating any records. + +**Solution**: +- Confirm `export NVIDIA_API_KEY="..."` is set. +- Add `skip_health_check: true` to the model spec to bypass the probe (useful for local or vLLM endpoints that aren't in the provider catalog). +:::: + +::::{dropdown} Output JSONL is empty or has fewer records than `num_records` + +**Cause**: Data Designer skips or drops records where the structured output doesn't validate against `output_format`, or where the LLM returns a refusal. + +**Solution**: +- Run `preview=true` and inspect a sample for refusals or schema mismatches. +- Simplify the `output_format` if the model consistently fails to match a complex schema. +- Raise `max_tokens` if responses are being cut off mid-JSON. +:::: + +## Cluster Dispatch Failures + +::::{dropdown} Job exits immediately with `No such file or directory` (launch script) + +**Cause**: `nemo_run_dir` is not on shared storage. The data-mover sidecar writes the launch script to `nemo_run_dir`, but the main container cannot see it if the path is local to a different node or not mounted. + +**Solution**: Set `nemo_run_dir` to a path on the shared NFS mount and add the corresponding `mounts` entry to the env.toml profile. See {doc}`../how-to/dispatch-to-cluster`. +:::: + +::::{dropdown} `data-designer` import error inside the container + +**Cause**: The NeMo container image does not pre-install `data-designer`. + +**Solution**: Add to `startup_commands`: + +```toml +startup_commands = [ + "python -m pip install --quiet --break-system-packages 'data-designer>=0.5.6'" +] +``` +:::: + +::::{dropdown} Job rejected or OOM-killed immediately on a CPU node + +**Cause**: The default `shared_memory_size` (65536 MB) exceeds the available RAM on the CPU node type. + +**Solution**: Set `shared_memory_size = 1024` in the env.toml profile. 
The SDG step makes no use of shared memory. +:::: + +::::{dropdown} `NVIDIA_API_KEY` not available inside the container + +**Cause**: `NVIDIA_API_KEY` is not automatically forwarded to the job environment the way `HF_TOKEN` and `WANDB_API_KEY` are. + +**Solution**: Declare it explicitly in the env.toml profile: + +```toml +[lepton-sdg.env_vars] +NVIDIA_API_KEY = "${oc.env:NVIDIA_API_KEY}" +``` + +And set it in your shell before submitting: `export NVIDIA_API_KEY="..."`. +:::: + +## Related + +- {doc}`../how-to/dispatch-to-cluster` — Full cluster setup walkthrough. +- {doc}`cli-reference` — Flags and hydra overrides. +- {doc}`config-schema` — YAML field reference. From 3c84fa1c03b220f847c9412ea2eff76de3b2355c Mon Sep 17 00:00:00 2001 From: Mike McKiernan Date: Wed, 13 May 2026 10:44:27 -0400 Subject: [PATCH 2/2] docs: SDG tips for using agents Signed-off-by: Mike McKiernan --- docs/index.md | 1 + docs/sdg/how-to/index.md | 5 ++- docs/sdg/index.md | 11 ++++- docs/sdg/using-skills.md | 90 ++++++++++++++++++++++++++++++++++++++++ 4 files changed, 105 insertions(+), 2 deletions(-) create mode 100644 docs/sdg/using-skills.md diff --git a/docs/index.md b/docs/index.md index 8ba98b838..9e221df2b 100644 --- a/docs/index.md +++ b/docs/index.md @@ -177,6 +177,7 @@ nemotron/artifacts.md About Getting Started +Tips for Using Agents Tasks Reference ``` diff --git a/docs/sdg/how-to/index.md b/docs/sdg/how-to/index.md index 1b4f23333..9b459de6f 100644 --- a/docs/sdg/how-to/index.md +++ b/docs/sdg/how-to/index.md @@ -18,7 +18,10 @@ limitations under the License. (sdg-how-to-index)= # Synthetic Data How-To Guides -Task-focused guides for common SDG workflows. For pipeline overview and when to use it, refer to {doc}`../index`. For your first run, start with {doc}`../getting-started`. +This section provides task-focused guides for common SDG workflows. +For your first run, start with {doc}`../getting-started`. 
+ +If you are new to model training or want a calmer on-ramp before tasks, read {doc}`../using-skills` for how to run a productive session with a coding agent. ::::{grid} 1 2 2 2 :gutter: 1 1 1 2 diff --git a/docs/sdg/index.md b/docs/sdg/index.md index 29f036f04..b148c9694 100644 --- a/docs/sdg/index.md +++ b/docs/sdg/index.md @@ -23,7 +23,7 @@ Generate synthetic training data with [NeMo Data Designer](https://nvidia-nemo.g Three output shapes ship out of the box: SFT chat data, tool-calling SFT data, and DPO preference pairs. :::{tip} -New to SDG? Start with the {doc}`getting-started` tutorial to run the bundled pipeline and produce your first dataset in 5–10 minutes. +New to SDG or new to model training? Read {doc}`using-skills` for a short guide to productive agent sessions, then start the {doc}`getting-started` tutorial to run the bundled pipeline and produce your first dataset in 5 to 10 minutes. ::: ## When to Use @@ -70,6 +70,14 @@ Run the bundled pipeline end-to-end: preview two records, generate five, inspect {bdg-success}`5–10 min` {bdg-secondary}`tutorial` ::: +:::{grid-item-card} {octicon}`heart;1.5em;sd-mr-1` Use the SDG Skill With Confidence +:link: using-skills +:link-type: doc +Prepare for a focused chat with a coding agent: opening brief, seed ideas, and how `SKILL.md` supports the session without memorization. ++++ +{bdg-success}`10 min read` {bdg-secondary}`newcomer` +::: + :::{grid-item-card} {octicon}`checklist;1.5em;sd-mr-1` How-To Guides :link: how-to/index :link-type: doc @@ -97,6 +105,7 @@ YAML config schema, CLI flags, output projection shapes, and troubleshooting. 
| Guide | What You'll Do | Time | |---|---|---| | {doc}`getting-started` | Preview and generate your first synthetic SFT dataset | 5–10 min | +| {doc}`using-skills` | Run a productive agent session: brief, seeds, plain terms, and light use of `SKILL.md` | 10 min read | ``` diff --git a/docs/sdg/using-skills.md b/docs/sdg/using-skills.md new file mode 100644 index 000000000..7c7171d1b --- /dev/null +++ b/docs/sdg/using-skills.md @@ -0,0 +1,90 @@ + + +(sdg-using-skills)= +# Use the SDG Skill With Confidence + +This page is for newcomers to model training and to *synthetic data generation (SDG)*. +The main goal is to help you run a productive, efficient session with a coding agent: less back-and-forth, fewer clarifying questions, and clearer handoffs between what you decide and what the agent edits in the repository. + +This page aligns with the `nemotron step run sdg/data_designer` command. +Use an agent to translate your intent into the right YAML, seed files, and `nemotron` commands. + +## Keeping an Agent Session Productive + +Provide a short brief you write yourself, not something the agent drafts for you: + +```{div} sd-font-italic sd-font-weight-lighter +- "We need data for a model that can answer short questions about our company’s travel and expense policy." +- "I need multi-turn conversations for a retail support bot that can call tools such as order lookup and return eligibility. The tone must be friendly and concise." +``` + +Ask the agent to start from shipped configs and the {doc}`getting-started` flow unless there is a strong reason to invent a new layout. + +If you want a reusable shape, you can copy the following block into the chat and fill in the bracketed lines. + +```text +Context: [product or domain in one sentence] +Goal for this session: [one outcome, for example ten seed ideas or a preview command that works] +“Good” means: [two bullets] +Hard limits: [language, tone, privacy, or “do not touch cluster dispatch yet”] +Please: [one request]. 
Use Nemotron SDG defaults from the repo unless something blocks that. +``` + +## What Success Looks Like on Day One + +A reasonable first success is a small preview run that prints plausible rows, plus a short list of seed ideas you believe are on-brand for your domain. +If you have that, you are already operating SDG: iterate small, then scale record counts. + +The hands-on path is {doc}`getting-started`. +When you are ready to attach your own domain, follow {doc}`how-to/create-domain-dataset`. + +## Where Domain-Specific Ideas Come From + +Seed data can be short anchors that tell the generator which slice of the world each row should reflect. +A newcomer can build a first seed list the same way a product owner scopes a feature. + +Runbooks, internal FAQs, and training decks can inspire situations when policy allows. +If you cannot paste source text, a neutral rewrite still carries domain truth, for example “partial refund after a split shipment” instead of a ticket ID. + +Standards, regulator explainers, textbooks, and course outlines supply topics and jargon. +Your operator value is the twist that matches your product, not the generic paragraph anyone could find online. + +## Ask the Agent to Propose, Then You Curate + +Paste a product brief or policy summary and then ask for candidate seed lines. +The opening brief described in the *Keeping an Agent Session Productive* section keeps this step short: one propose-and-curate round per session is usually enough before you run a preview again. + +## Staying Grounded on Policy and Quality + +Check licensing and confidentiality before you drop internal documents into an agent or into a seed file. +Keep evaluation benchmarks separate from training seeds so synthetic items do not leak into the set you use to claim quality. +Skim `src/nemotron/steps/sdg/SKILL.md` for the short list of pattern links on blending, versioning, and benchmarks when you move past experiments. 
+ +## How SKILL.md Fits Your Session + +`src/nemotron/steps/sdg/SKILL.md` is written for assistants that route work into the right shipped YAML profile and guardrails. +You do not need to memorize it. + +Skim the decision table once so you know which bundled config matches which need, then let the agent open that file when you change output format or scale record counts. +You can also say in the chat, “follow `src/nemotron/steps/sdg/SKILL.md` for SDG,” so guardrails land in the thread without a long lecture. + +## Next Steps + +- Run the tutorial: {doc}`getting-started`. +- Adapt seeds and YAML to your domain: {doc}`how-to/create-domain-dataset`. +- Look up flags and fields when the agent names them: {doc}`reference/cli-reference` and {doc}`reference/config-schema`.