From 5807439028c9613482b2d81ca526d42bdcdbab7d Mon Sep 17 00:00:00 2001 From: Rakesh Paul Date: Mon, 11 May 2026 11:19:09 +0530 Subject: [PATCH 1/3] Add configurable Data Designer providers Signed-off-by: Rakesh Paul --- src/nemotron/steps/sdg/data_designer/SKILL.md | 4 + src/nemotron/steps/sdg/data_designer/step.py | 27 +++++- .../steps/sdg/data_designer/step.toml | 8 ++ tests/steps/sdg/test_data_designer.py | 82 +++++++++++++++++++ 4 files changed, 120 insertions(+), 1 deletion(-) diff --git a/src/nemotron/steps/sdg/data_designer/SKILL.md b/src/nemotron/steps/sdg/data_designer/SKILL.md index 90be93d10..03babfdbd 100644 --- a/src/nemotron/steps/sdg/data_designer/SKILL.md +++ b/src/nemotron/steps/sdg/data_designer/SKILL.md @@ -19,6 +19,10 @@ Before changing configs or code, read `step.toml` to understand the step flow, c - Set `num_records` to the target generated count only after preview output looks correct. - Set `seed_dataset.path` for seed-typed columns. +- For custom inference endpoints, add `providers:` and point each + `models[].provider` at a declared provider name. +- In `providers[].api_key`, write the environment variable name such as + `OPENAI_API_KEY`; do not resolve the secret into YAML with `${oc.env:...}`. - Add post-processing or projection columns so downstream steps receive the expected schema. - Use SFT output with AutoModel directly only after it is projected to chat `messages`. - Use preference output with `rl/nemo_rl/dpo` only after prompt, chosen, and rejected fields are present. 
diff --git a/src/nemotron/steps/sdg/data_designer/step.py b/src/nemotron/steps/sdg/data_designer/step.py
index 4e1db779b..e172d087c 100644
--- a/src/nemotron/steps/sdg/data_designer/step.py
+++ b/src/nemotron/steps/sdg/data_designer/step.py
@@ -292,6 +292,60 @@ def records_from_designer_result(result: Any) -> list[dict[str, Any]]:
     raise TypeError(f"Unsupported Data Designer dataset type: {type(dataset).__name__}")
 
 
+def build_model_providers(cfg: dict[str, Any], dd: Any) -> list[Any] | None:
+    """Build custom Data Designer model providers from optional YAML config.
+
+    Args:
+        cfg: Step config mapping; only the optional ``providers`` key is read.
+        dd: Object exposing a ``ModelProvider`` callable.
+
+    Returns:
+        One ``dd.ModelProvider`` per declared entry, in declaration order, or
+        ``None`` when no providers are declared.
+
+    Raises:
+        ValueError: If ``providers`` is not a list, an entry is not a mapping,
+            an entry lacks ``name`` or ``endpoint``, or a name is repeated.
+    """
+    providers = cfg.get("providers")
+    if providers is None:
+        return None
+    # Type-check before the emptiness test so `providers: {}` or `providers: ""`
+    # is reported instead of being silently treated as "not declared".
+    if not isinstance(providers, list):
+        raise ValueError("`providers:` must be a list when declared")
+    if not providers:
+        return None
+
+    model_providers = []
+    seen_names = []
+    for index, spec in enumerate(providers):
+        if not isinstance(spec, dict):
+            raise ValueError("each `providers:` entry must be a mapping")
+        # Fail with the entry index instead of a bare KeyError from spec[...].
+        missing = [key for key in ("name", "endpoint") if not spec.get(key)]
+        if missing:
+            raise ValueError(
+                f"`providers[{index}]` is missing required key(s): {', '.join(missing)}"
+            )
+        # models[].provider refers to providers by name, so names must be unique.
+        if spec["name"] in seen_names:
+            raise ValueError(f"duplicate provider name in `providers:`: {spec['name']!r}")
+        seen_names.append(spec["name"])
+        model_providers.append(
+            dd.ModelProvider(
+                name=spec["name"],
+                endpoint=spec["endpoint"],
+                provider_type=spec.get("provider_type", "openai"),
+                # Empty api_key is normalized to None.
+                api_key=spec.get("api_key") or None,
+                extra_body=spec.get("extra_body"),
+                extra_headers=spec.get("extra_headers"),
+            )
+        )
+    return model_providers
+
+
 def main() -> None:
     config_path, cli_overrides = parse_config_and_overrides(default_config=DEFAULT_CONFIG)
     raw = apply_hydra_overrides(load_omegaconf_yaml(config_path), cli_overrides)
@@ -345,7 +399,7 @@ def main() -> None:
 
     build_columns(builder, columns, dd)
 
-    client = DataDesigner()
+    client = DataDesigner(model_providers=build_model_providers(cfg, dd))
 
     if cfg.get("preview", False):
         result = client.preview(builder, num_records=cfg["num_records"])
diff --git a/src/nemotron/steps/sdg/data_designer/step.toml b/src/nemotron/steps/sdg/data_designer/step.toml
index 87d036f7f..27c7ad491 100644
--- a/src/nemotron/steps/sdg/data_designer/step.toml
+++ b/src/nemotron/steps/sdg/data_designer/step.toml
@@ -43,6 +43,14 @@ default = 1000
 name = "seed_dataset.path"
 description = "Path to seed JSONL
referenced by 'seed'-typed columns." +[[parameters]] +name = "providers" +description = "Optional custom Data Designer model providers for OpenAI-compatible or Anthropic endpoints. Provider api_key values should be environment variable names, not resolved secret values." + +[[parameters]] +name = "models.provider" +description = "Provider name for each model alias. Use a built-in provider such as 'nvidia', or a name declared under providers." + [[strategies]] when = "Iterating on column specs" then = "Run with --preview to emit a small batch via client.preview() before scaling to client.create()." diff --git a/tests/steps/sdg/test_data_designer.py b/tests/steps/sdg/test_data_designer.py index aebf791c2..77adfa055 100644 --- a/tests/steps/sdg/test_data_designer.py +++ b/tests/steps/sdg/test_data_designer.py @@ -18,6 +18,7 @@ import yaml from nemotron.steps.sdg.data_designer.step import ( + build_model_providers, parse_json_object, project_records, records_from_designer_result, @@ -27,6 +28,7 @@ VALID_COLUMN_TYPES = {"category", "seed", "llm_text", "llm_structured", "llm_judge"} LLM_COLUMN_TYPES = {"llm_text", "llm_structured", "llm_judge"} +BUILTIN_PROVIDER_NAMES = {"anthropic", "nvidia", "openai", "openrouter"} STEP = step_dir(__file__, "sdg", "data_designer") REPO_ROOT = STEP.parents[4] @@ -121,6 +123,86 @@ def test_llm_columns_reference_declared_model_aliases() -> None: assert alias in aliases, f"{path.name}: column {col['name']!r} references unknown model {alias!r}" +def test_custom_providers_are_well_formed() -> None: + for path in _config_paths(): + cfg = _load_config(path) + providers = cfg.get("providers") or [] + assert isinstance(providers, list), f"{path.name}: providers must be a list" + + names = [] + for provider in providers: + assert isinstance(provider, dict), f"{path.name}: providers entries must be mappings" + assert provider.get("name"), f"{path.name}: providers entries require name" + assert provider.get("endpoint"), f"{path.name}: provider 
{provider.get('name')!r} requires endpoint" + provider_type = provider.get("provider_type", "openai") + assert provider_type in {"anthropic", "openai"}, ( + f"{path.name}: provider {provider['name']!r} has unsupported provider_type {provider_type!r}" + ) + api_key = provider.get("api_key") + assert not (isinstance(api_key, str) and api_key.startswith("${oc.env:")), ( + f"{path.name}: provider {provider['name']!r} should reference the API key env var name, " + "not resolve the secret through OmegaConf" + ) + names.append(provider["name"]) + + assert len(names) == len(set(names)), f"{path.name}: provider names must be unique" + + +def test_model_providers_reference_declared_or_builtin_providers() -> None: + for path in _config_paths(): + cfg = _load_config(path) + declared_providers = {provider["name"] for provider in cfg.get("providers") or []} + for model in cfg.get("models") or []: + provider = model.get("provider") + if declared_providers: + assert provider, f"{path.name}: models[].provider is required when custom providers are declared" + if provider: + assert provider in declared_providers | BUILTIN_PROVIDER_NAMES, ( + f"{path.name}: model {model['alias']!r} references unknown provider {provider!r}" + ) + + +def test_build_model_providers_from_config() -> None: + class FakeProvider: + def __init__(self, **kwargs): + self.kwargs = kwargs + + class FakeDD: + ModelProvider = FakeProvider + + providers = build_model_providers( + { + "providers": [ + { + "name": "my-provider", + "endpoint": "https://example.test/v1", + "provider_type": "openai", + "api_key": "OPENAI_API_KEY", + "extra_body": {"foo": "bar"}, + "extra_headers": {"X-Test": "1"}, + }, + { + "name": "no-auth-provider", + "endpoint": "http://localhost:8000/v1", + "api_key": "", + }, + ] + }, + FakeDD, + ) + + assert providers is not None + assert providers[0].kwargs == { + "name": "my-provider", + "endpoint": "https://example.test/v1", + "provider_type": "openai", + "api_key": "OPENAI_API_KEY", + 
"extra_body": {"foo": "bar"}, + "extra_headers": {"X-Test": "1"}, + } + assert providers[1].kwargs["api_key"] is None + + def test_structured_llm_columns_have_output_format() -> None: for path in _config_paths(): for col in _load_columns(path): From 7d9244b63ab69fbd402d2e4ded2e955dc8c9a758 Mon Sep 17 00:00:00 2001 From: Rakesh Paul Date: Mon, 11 May 2026 11:29:18 +0530 Subject: [PATCH 2/3] Add Data Designer custom provider example Signed-off-by: Rakesh Paul --- src/nemotron/steps/sdg/data_designer/SKILL.md | 4 +- .../config/custom_provider_example.yaml | 41 +++++++++++++++++++ .../steps/sdg/data_designer/step.toml | 4 ++ 3 files changed, 48 insertions(+), 1 deletion(-) create mode 100644 src/nemotron/steps/sdg/data_designer/config/custom_provider_example.yaml diff --git a/src/nemotron/steps/sdg/data_designer/SKILL.md b/src/nemotron/steps/sdg/data_designer/SKILL.md index 03babfdbd..632c3265c 100644 --- a/src/nemotron/steps/sdg/data_designer/SKILL.md +++ b/src/nemotron/steps/sdg/data_designer/SKILL.md @@ -14,6 +14,8 @@ Before changing configs or code, read `step.toml` to understand the step flow, c - SFT SDG: use `config/default.yaml` or `config/customer_support_tools.yaml`. - RL preference SDG: use `config/rl_pref.yaml` for chosen and rejected preference pairs. - Tiny validation: use `config/tiny.yaml` or preview mode while editing columns. +- Custom endpoint example: use `config/custom_provider_example.yaml` after + setting `OPENAI_BASE_URL` and `OPENAI_API_KEY`. 
## Configure @@ -32,7 +34,7 @@ Before changing configs or code, read `step.toml` to understand the step flow, c - Contract: `src/nemotron/steps/sdg/data_designer/step.toml` - Runner: `src/nemotron/steps/sdg/data_designer/step.py` -- Configs: `config/default.yaml`, `config/customer_support_tools.yaml`, `config/rl_pref.yaml`, `config/tiny.yaml` +- Configs: `config/default.yaml`, `config/customer_support_tools.yaml`, `config/rl_pref.yaml`, `config/tiny.yaml`, `config/custom_provider_example.yaml` - Seeds: `data/sft_topic_seeds.jsonl`, `data/customer_support_tool_seeds.jsonl`, `data/rl_pref_prompt_seeds.jsonl` ## Guardrails diff --git a/src/nemotron/steps/sdg/data_designer/config/custom_provider_example.yaml b/src/nemotron/steps/sdg/data_designer/config/custom_provider_example.yaml new file mode 100644 index 000000000..b4547c81f --- /dev/null +++ b/src/nemotron/steps/sdg/data_designer/config/custom_provider_example.yaml @@ -0,0 +1,41 @@ +# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +defaults: customer_support_tools.yaml + +# Example: route the customer-support SDG pipeline through a custom +# OpenAI-compatible endpoint instead of the built-in NVIDIA provider. +# +# Required environment: +# OPENAI_BASE_URL=https://your-endpoint.example/v1 +# OPENAI_API_KEY= +# +# Keep api_key as the environment variable name. 
Data Designer resolves it at +# request time; resolving the secret here with ${oc.env:OPENAI_API_KEY} would +# materialize the secret in the compiled config. +providers: + - name: my-provider + endpoint: ${oc.env:OPENAI_BASE_URL} + provider_type: openai + api_key: OPENAI_API_KEY + +models: + - alias: nvidia-text + model: google/gemma-4-31B-it + provider: my-provider + skip_health_check: true + inference_parameters: + temperature: 0.75 + top_p: 0.95 + max_tokens: 1800 diff --git a/src/nemotron/steps/sdg/data_designer/step.toml b/src/nemotron/steps/sdg/data_designer/step.toml index 27c7ad491..89e8097bb 100644 --- a/src/nemotron/steps/sdg/data_designer/step.toml +++ b/src/nemotron/steps/sdg/data_designer/step.toml @@ -55,6 +55,10 @@ description = "Provider name for each model alias. Use a built-in provider such when = "Iterating on column specs" then = "Run with --preview to emit a small batch via client.preview() before scaling to client.create()." +[[strategies]] +when = "Using a self-hosted or non-NVIDIA OpenAI-compatible endpoint" +then = "Start from config/custom_provider_example.yaml and set OPENAI_BASE_URL plus OPENAI_API_KEY in the execution environment." + [[strategies]] when = "Generating preference data for DPO" then = "Use config/rl_pref.yaml — it emits chosen / rejected fields ready for rl/nemo_rl/dpo." 
From 321f71cb60448680af7a05a4cee5e0f91fc7e34d Mon Sep 17 00:00:00 2001 From: Rakesh Paul Date: Mon, 11 May 2026 11:43:02 +0530 Subject: [PATCH 3/3] Move Data Designer provider example into config comments Signed-off-by: Rakesh Paul --- src/nemotron/steps/sdg/data_designer/SKILL.md | 6 +-- .../config/custom_provider_example.yaml | 41 ------------------- .../config/customer_support_tools.yaml | 24 +++++++++++ .../steps/sdg/data_designer/step.toml | 2 +- 4 files changed, 28 insertions(+), 45 deletions(-) delete mode 100644 src/nemotron/steps/sdg/data_designer/config/custom_provider_example.yaml diff --git a/src/nemotron/steps/sdg/data_designer/SKILL.md b/src/nemotron/steps/sdg/data_designer/SKILL.md index 632c3265c..0e3e2fbbc 100644 --- a/src/nemotron/steps/sdg/data_designer/SKILL.md +++ b/src/nemotron/steps/sdg/data_designer/SKILL.md @@ -14,8 +14,8 @@ Before changing configs or code, read `step.toml` to understand the step flow, c - SFT SDG: use `config/default.yaml` or `config/customer_support_tools.yaml`. - RL preference SDG: use `config/rl_pref.yaml` for chosen and rejected preference pairs. - Tiny validation: use `config/tiny.yaml` or preview mode while editing columns. -- Custom endpoint example: use `config/custom_provider_example.yaml` after - setting `OPENAI_BASE_URL` and `OPENAI_API_KEY`. +- Custom endpoint example: see the commented `providers:` block in + `config/customer_support_tools.yaml`. 
## Configure @@ -34,7 +34,7 @@ Before changing configs or code, read `step.toml` to understand the step flow, c - Contract: `src/nemotron/steps/sdg/data_designer/step.toml` - Runner: `src/nemotron/steps/sdg/data_designer/step.py` -- Configs: `config/default.yaml`, `config/customer_support_tools.yaml`, `config/rl_pref.yaml`, `config/tiny.yaml`, `config/custom_provider_example.yaml` +- Configs: `config/default.yaml`, `config/customer_support_tools.yaml`, `config/rl_pref.yaml`, `config/tiny.yaml` - Seeds: `data/sft_topic_seeds.jsonl`, `data/customer_support_tool_seeds.jsonl`, `data/rl_pref_prompt_seeds.jsonl` ## Guardrails diff --git a/src/nemotron/steps/sdg/data_designer/config/custom_provider_example.yaml b/src/nemotron/steps/sdg/data_designer/config/custom_provider_example.yaml deleted file mode 100644 index b4547c81f..000000000 --- a/src/nemotron/steps/sdg/data_designer/config/custom_provider_example.yaml +++ /dev/null @@ -1,41 +0,0 @@ -# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -defaults: customer_support_tools.yaml - -# Example: route the customer-support SDG pipeline through a custom -# OpenAI-compatible endpoint instead of the built-in NVIDIA provider. -# -# Required environment: -# OPENAI_BASE_URL=https://your-endpoint.example/v1 -# OPENAI_API_KEY= -# -# Keep api_key as the environment variable name. 
Data Designer resolves it at -# request time; resolving the secret here with ${oc.env:OPENAI_API_KEY} would -# materialize the secret in the compiled config. -providers: - - name: my-provider - endpoint: ${oc.env:OPENAI_BASE_URL} - provider_type: openai - api_key: OPENAI_API_KEY - -models: - - alias: nvidia-text - model: google/gemma-4-31B-it - provider: my-provider - skip_health_check: true - inference_parameters: - temperature: 0.75 - top_p: 0.95 - max_tokens: 1800 diff --git a/src/nemotron/steps/sdg/data_designer/config/customer_support_tools.yaml b/src/nemotron/steps/sdg/data_designer/config/customer_support_tools.yaml index 41809096a..915ec453e 100644 --- a/src/nemotron/steps/sdg/data_designer/config/customer_support_tools.yaml +++ b/src/nemotron/steps/sdg/data_designer/config/customer_support_tools.yaml @@ -29,6 +29,30 @@ seed_dataset: strategy: shuffle fields: [customer_name, issue, order_id, product, policy_hint] +# Optional custom endpoint example: +# +# To route this pipeline through an OpenAI-compatible endpoint instead of the +# built-in NVIDIA provider, uncomment and edit both blocks below. +# Keep providers[].api_key as the environment variable name. Data Designer +# resolves it at request time; using `${oc.env:OPENAI_API_KEY}` here would put +# the secret into the resolved config. 
+# +# providers: +# - name: my-provider +# endpoint: ${oc.env:OPENAI_BASE_URL} +# provider_type: openai +# api_key: OPENAI_API_KEY +# +# models: +# - alias: nvidia-text +# model: google/gemma-4-31B-it +# provider: my-provider +# skip_health_check: true +# inference_parameters: +# temperature: 0.75 +# top_p: 0.95 +# max_tokens: 1800 + models: - alias: nvidia-text model: openai/gpt-oss-20b diff --git a/src/nemotron/steps/sdg/data_designer/step.toml b/src/nemotron/steps/sdg/data_designer/step.toml index 89e8097bb..f1d12a845 100644 --- a/src/nemotron/steps/sdg/data_designer/step.toml +++ b/src/nemotron/steps/sdg/data_designer/step.toml @@ -57,7 +57,7 @@ then = "Run with --preview to emit a small batch via client.preview() before sca [[strategies]] when = "Using a self-hosted or non-NVIDIA OpenAI-compatible endpoint" -then = "Start from config/custom_provider_example.yaml and set OPENAI_BASE_URL plus OPENAI_API_KEY in the execution environment." +then = "Use the commented providers example in config/customer_support_tools.yaml and set OPENAI_BASE_URL plus OPENAI_API_KEY in the execution environment." [[strategies]] when = "Generating preference data for DPO"