Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions src/nemotron/steps/sdg/data_designer/SKILL.md
Original file line number Diff line number Diff line change
Expand Up @@ -14,11 +14,17 @@ Before changing configs or code, read `step.toml` to understand the step flow, c
- SFT SDG: use `config/default.yaml` or `config/customer_support_tools.yaml`.
- RL preference SDG: use `config/rl_pref.yaml` for chosen and rejected preference pairs.
- Tiny validation: use `config/tiny.yaml` or preview mode while editing columns.
- Custom endpoint example: see the commented `providers:` block in
`config/customer_support_tools.yaml`.

## Configure

- Set `num_records` to the target generated count only after preview output looks correct.
- Set `seed_dataset.path` for seed-typed columns.
- For custom inference endpoints, add `providers:` and point each
`models[].provider` at a declared provider name.
- In `providers[].api_key`, write the environment variable name such as
`OPENAI_API_KEY`; do not resolve the secret into YAML with `${oc.env:...}`.
- Add post-processing or projection columns so downstream steps receive the expected schema.
- Use SFT output with AutoModel directly only after it is projected to chat `messages`.
- Use preference output with `rl/nemo_rl/dpo` only after prompt, chosen, and rejected fields are present.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,30 @@ seed_dataset:
strategy: shuffle
fields: [customer_name, issue, order_id, product, policy_hint]

# Optional custom endpoint example:
#
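# To route this pipeline through an OpenAI-compatible endpoint instead of the
# built-in NVIDIA provider, uncomment and edit both blocks below. Merge the
# uncommented `models:` entry into the active `models:` block further down
# instead of keeping two top-level `models:` keys in this file.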
# Keep providers[].api_key as the environment variable name. Data Designer
# resolves it at request time; using `${oc.env:OPENAI_API_KEY}` here would put
# the secret into the resolved config.
#
# providers:
# - name: my-provider
# endpoint: ${oc.env:OPENAI_BASE_URL}
# provider_type: openai
# api_key: OPENAI_API_KEY
#
# models:
# - alias: nvidia-text
# model: google/gemma-4-31B-it
# provider: my-provider
# skip_health_check: true
# inference_parameters:
# temperature: 0.75
# top_p: 0.95
# max_tokens: 1800

models:
- alias: nvidia-text
model: openai/gpt-oss-20b
Expand Down
27 changes: 26 additions & 1 deletion src/nemotron/steps/sdg/data_designer/step.py
Original file line number Diff line number Diff line change
Expand Up @@ -292,6 +292,31 @@ def records_from_designer_result(result: Any) -> list[dict[str, Any]]:
raise TypeError(f"Unsupported Data Designer dataset type: {type(dataset).__name__}")


def build_model_providers(cfg: dict[str, Any], dd: Any) -> list[Any] | None:
    """Build custom Data Designer model providers from optional YAML config.

    Args:
        cfg: Step configuration mapping. Only the optional ``providers`` key
            is read; each entry is a mapping with required ``name`` and
            ``endpoint`` and optional ``provider_type`` (defaults to
            ``"openai"``), ``api_key``, ``extra_body`` and ``extra_headers``.
        dd: Namespace exposing a ``ModelProvider`` factory that accepts the
            keyword arguments listed above.

    Returns:
        One ``dd.ModelProvider(...)`` result per declared entry, in
        declaration order, or ``None`` when ``providers`` is absent or empty.

    Raises:
        ValueError: If ``providers`` is not a list, an entry is not a mapping,
            or an entry lacks a non-empty ``name`` or ``endpoint``.
    """
    providers = cfg.get("providers")
    if not providers:
        # Nothing declared: return None rather than an empty list.
        return None
    if not isinstance(providers, list):
        raise ValueError("`providers:` must be a list when declared")

    model_providers = []
    for index, spec in enumerate(providers):
        if not isinstance(spec, dict):
            raise ValueError("each `providers:` entry must be a mapping")

        # Report missing required keys as a config error that names the
        # offending entry, instead of leaking a context-free KeyError.
        missing = [key for key in ("name", "endpoint") if not spec.get(key)]
        if missing:
            raise ValueError(
                f"`providers:` entry {index} ({spec.get('name')!r}) "
                f"is missing required key(s): {', '.join(missing)}"
            )

        model_providers.append(
            dd.ModelProvider(
                name=spec["name"],
                endpoint=spec["endpoint"],
                provider_type=spec.get("provider_type", "openai"),
                # An empty or missing api_key is normalized to None.
                api_key=spec.get("api_key") or None,
                extra_body=spec.get("extra_body"),
                extra_headers=spec.get("extra_headers"),
            )
        )
    return model_providers


def main() -> None:
config_path, cli_overrides = parse_config_and_overrides(default_config=DEFAULT_CONFIG)
raw = apply_hydra_overrides(load_omegaconf_yaml(config_path), cli_overrides)
Expand Down Expand Up @@ -345,7 +370,7 @@ def main() -> None:

build_columns(builder, columns, dd)

client = DataDesigner()
client = DataDesigner(model_providers=build_model_providers(cfg, dd))

if cfg.get("preview", False):
result = client.preview(builder, num_records=cfg["num_records"])
Expand Down
12 changes: 12 additions & 0 deletions src/nemotron/steps/sdg/data_designer/step.toml
Original file line number Diff line number Diff line change
Expand Up @@ -43,10 +43,22 @@ default = 1000
name = "seed_dataset.path"
description = "Path to seed JSONL referenced by 'seed'-typed columns."

[[parameters]]
name = "providers"
description = "Optional custom Data Designer model providers for OpenAI-compatible or Anthropic endpoints. Provider api_key values should be environment variable names, not resolved secret values."

[[parameters]]
name = "models.provider"
description = "Provider name for each model alias. Use a built-in provider such as 'nvidia', or a name declared under providers."

[[strategies]]
when = "Iterating on column specs"
then = "Run with --preview to emit a small batch via client.preview() before scaling to client.create()."

[[strategies]]
when = "Using a self-hosted or non-NVIDIA OpenAI-compatible endpoint"
then = "Use the commented providers example in config/customer_support_tools.yaml and set OPENAI_BASE_URL plus OPENAI_API_KEY in the execution environment."

[[strategies]]
when = "Generating preference data for DPO"
then = "Use config/rl_pref.yaml — it emits chosen / rejected fields ready for rl/nemo_rl/dpo."
Expand Down
82 changes: 82 additions & 0 deletions tests/steps/sdg/test_data_designer.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
import yaml

from nemotron.steps.sdg.data_designer.step import (
build_model_providers,
parse_json_object,
project_records,
records_from_designer_result,
Expand All @@ -27,6 +28,7 @@

VALID_COLUMN_TYPES = {"category", "seed", "llm_text", "llm_structured", "llm_judge"}
LLM_COLUMN_TYPES = {"llm_text", "llm_structured", "llm_judge"}
BUILTIN_PROVIDER_NAMES = {"anthropic", "nvidia", "openai", "openrouter"}

STEP = step_dir(__file__, "sdg", "data_designer")
REPO_ROOT = STEP.parents[4]
Expand Down Expand Up @@ -121,6 +123,86 @@ def test_llm_columns_reference_declared_model_aliases() -> None:
assert alias in aliases, f"{path.name}: column {col['name']!r} references unknown model {alias!r}"


def test_custom_providers_are_well_formed() -> None:
    """Every shipped config declares `providers:` entries with a valid shape.

    Checks, per config file: `providers` is a list of mappings; each entry has
    a name, an endpoint and a supported provider type; the API key is not an
    OmegaConf `${oc.env:...}` interpolation; provider names are unique.
    """
    supported_types = {"anthropic", "openai"}

    for path in _config_paths():
        declared = _load_config(path).get("providers") or []
        assert isinstance(declared, list), f"{path.name}: providers must be a list"

        seen_names = []
        for entry in declared:
            assert isinstance(entry, dict), f"{path.name}: providers entries must be mappings"
            assert entry.get("name"), f"{path.name}: providers entries require name"
            assert entry.get("endpoint"), f"{path.name}: provider {entry.get('name')!r} requires endpoint"

            kind = entry.get("provider_type", "openai")
            assert kind in supported_types, (
                f"{path.name}: provider {entry['name']!r} has unsupported provider_type {kind!r}"
            )

            # The key must be an env var *name*; an interpolation would resolve
            # the secret into the config itself.
            key = entry.get("api_key")
            assert not isinstance(key, str) or not key.startswith("${oc.env:"), (
                f"{path.name}: provider {entry['name']!r} should reference the API key env var name, "
                "not resolve the secret through OmegaConf"
            )
            seen_names.append(entry["name"])

        assert len(set(seen_names)) == len(seen_names), f"{path.name}: provider names must be unique"


def test_model_providers_reference_declared_or_builtin_providers() -> None:
    """Each model's `provider` names a declared or built-in provider.

    When a config declares custom providers, every model must set `provider`
    explicitly; any provider that is set must be resolvable.
    """
    for path in _config_paths():
        config = _load_config(path)
        custom_names = {entry["name"] for entry in config.get("providers") or []}
        resolvable = custom_names | BUILTIN_PROVIDER_NAMES

        for model in config.get("models") or []:
            chosen = model.get("provider")
            # An explicit provider is mandatory only when custom ones exist.
            assert chosen or not custom_names, (
                f"{path.name}: models[].provider is required when custom providers are declared"
            )
            if not chosen:
                continue
            assert chosen in resolvable, (
                f"{path.name}: model {model['alias']!r} references unknown provider {chosen!r}"
            )


def test_build_model_providers_from_config() -> None:
    """`build_model_providers` forwards each spec to `dd.ModelProvider`.

    Uses a stand-in provider class that records its keyword arguments, then
    checks a fully specified entry is passed through unchanged and that an
    empty `api_key` is normalized to `None`.
    """

    class RecordingProvider:
        def __init__(self, **kwargs):
            self.kwargs = kwargs

    class StubDD:
        ModelProvider = RecordingProvider

    full_spec = {
        "name": "my-provider",
        "endpoint": "https://example.test/v1",
        "provider_type": "openai",
        "api_key": "OPENAI_API_KEY",
        "extra_body": {"foo": "bar"},
        "extra_headers": {"X-Test": "1"},
    }
    keyless_spec = {
        "name": "no-auth-provider",
        "endpoint": "http://localhost:8000/v1",
        "api_key": "",
    }

    providers = build_model_providers({"providers": [full_spec, keyless_spec]}, StubDD)

    assert providers is not None
    assert providers[0].kwargs == {
        "name": "my-provider",
        "endpoint": "https://example.test/v1",
        "provider_type": "openai",
        "api_key": "OPENAI_API_KEY",
        "extra_body": {"foo": "bar"},
        "extra_headers": {"X-Test": "1"},
    }
    assert providers[1].kwargs["api_key"] is None


def test_structured_llm_columns_have_output_format() -> None:
for path in _config_paths():
for col in _load_columns(path):
Expand Down