From 5807439028c9613482b2d81ca526d42bdcdbab7d Mon Sep 17 00:00:00 2001
From: Rakesh Paul <rapaul@nvidia.com>
Date: Mon, 11 May 2026 11:19:09 +0530
Subject: [PATCH 1/3] Add configurable Data Designer providers

Signed-off-by: Rakesh Paul <rapaul@nvidia.com>
---
 src/nemotron/steps/sdg/data_designer/SKILL.md |  4 +
 src/nemotron/steps/sdg/data_designer/step.py  | 27 +++++-
 .../steps/sdg/data_designer/step.toml         |  8 ++
 tests/steps/sdg/test_data_designer.py         | 82 +++++++++++++++++++
 4 files changed, 120 insertions(+), 1 deletion(-)

diff --git a/src/nemotron/steps/sdg/data_designer/SKILL.md b/src/nemotron/steps/sdg/data_designer/SKILL.md
index 90be93d10..03babfdbd 100644
--- a/src/nemotron/steps/sdg/data_designer/SKILL.md
+++ b/src/nemotron/steps/sdg/data_designer/SKILL.md
@@ -19,6 +19,10 @@ Before changing configs or code, read `step.toml` to understand the step flow, c
 
 - Set `num_records` to the target generated count only after preview output looks correct.
 - Set `seed_dataset.path` for seed-typed columns.
+- For custom inference endpoints, add `providers:` and point each
+  `models[].provider` at a declared provider name.
+- In `providers[].api_key`, write the environment variable name such as
+  `OPENAI_API_KEY`; do not resolve the secret into YAML with `${oc.env:...}`.
 - Add post-processing or projection columns so downstream steps receive the expected schema.
 - Use SFT output with AutoModel directly only after it is projected to chat `messages`.
 - Use preference output with `rl/nemo_rl/dpo` only after prompt, chosen, and rejected fields are present.
diff --git a/src/nemotron/steps/sdg/data_designer/step.py b/src/nemotron/steps/sdg/data_designer/step.py
index 4e1db779b..e172d087c 100644
--- a/src/nemotron/steps/sdg/data_designer/step.py
+++ b/src/nemotron/steps/sdg/data_designer/step.py
@@ -292,6 +292,31 @@ def records_from_designer_result(result: Any) -> list[dict[str, Any]]:
     raise TypeError(f"Unsupported Data Designer dataset type: {type(dataset).__name__}")
 
 
+def build_model_providers(cfg: dict[str, Any], dd: Any) -> list[Any] | None:
+    """Build ``dd.ModelProvider`` objects from ``cfg["providers"]``; return None when unset or empty."""
+    providers = cfg.get("providers") or []
+    if not providers:  # also covers falsy non-list values such as {} or ""
+        return None
+    if not isinstance(providers, list):  # NOTE(review): assumes a plain list, not an OmegaConf ListConfig - confirm
+        raise ValueError("`providers:` must be a list when declared")
+
+    model_providers = []
+    for spec in providers:
+        if not isinstance(spec, dict):
+            raise ValueError("each `providers:` entry must be a mapping")
+        model_providers.append(
+            dd.ModelProvider(
+                name=spec["name"],  # required; a missing key raises KeyError
+                endpoint=spec["endpoint"],  # required; a missing key raises KeyError
+                provider_type=spec.get("provider_type", "openai"),
+                api_key=spec.get("api_key") or None,  # empty string is normalised to None
+                extra_body=spec.get("extra_body"),  # None when the key is absent
+                extra_headers=spec.get("extra_headers"),  # None when the key is absent
+            )
+        )
+    return model_providers
+
+
 def main() -> None:
     config_path, cli_overrides = parse_config_and_overrides(default_config=DEFAULT_CONFIG)
     raw = apply_hydra_overrides(load_omegaconf_yaml(config_path), cli_overrides)
@@ -345,7 +370,7 @@ def main() -> None:
 
     build_columns(builder, columns, dd)
 
-    client = DataDesigner()
+    client = DataDesigner(model_providers=build_model_providers(cfg, dd))
 
     if cfg.get("preview", False):
         result = client.preview(builder, num_records=cfg["num_records"])
diff --git a/src/nemotron/steps/sdg/data_designer/step.toml b/src/nemotron/steps/sdg/data_designer/step.toml
index 87d036f7f..27c7ad491 100644
--- a/src/nemotron/steps/sdg/data_designer/step.toml
+++ b/src/nemotron/steps/sdg/data_designer/step.toml
@@ -43,6 +43,14 @@ default = 1000
 name = "seed_dataset.path"
 description = "Path to seed JSONL referenced by 'seed'-typed columns."
 
+[[parameters]]
+name = "providers"
+description = "Optional custom Data Designer model providers for OpenAI-compatible or Anthropic endpoints. Provider api_key values should be environment variable names, not resolved secret values."
+
+[[parameters]]
+name = "models.provider"
+description = "Provider name for each model alias. Use a built-in provider such as 'nvidia', or a name declared under providers."
+
 [[strategies]]
 when = "Iterating on column specs"
 then = "Run with --preview to emit a small batch via client.preview() before scaling to client.create()."
diff --git a/tests/steps/sdg/test_data_designer.py b/tests/steps/sdg/test_data_designer.py
index aebf791c2..77adfa055 100644
--- a/tests/steps/sdg/test_data_designer.py
+++ b/tests/steps/sdg/test_data_designer.py
@@ -18,6 +18,7 @@
 import yaml
 
 from nemotron.steps.sdg.data_designer.step import (
+    build_model_providers,
     parse_json_object,
     project_records,
     records_from_designer_result,
@@ -27,6 +28,7 @@
 
 VALID_COLUMN_TYPES = {"category", "seed", "llm_text", "llm_structured", "llm_judge"}
 LLM_COLUMN_TYPES = {"llm_text", "llm_structured", "llm_judge"}
+BUILTIN_PROVIDER_NAMES = {"anthropic", "nvidia", "openai", "openrouter"}
 
 STEP = step_dir(__file__, "sdg", "data_designer")
 REPO_ROOT = STEP.parents[4]
@@ -121,6 +123,86 @@ def test_llm_columns_reference_declared_model_aliases() -> None:
                 assert alias in aliases, f"{path.name}: column {col['name']!r} references unknown model {alias!r}"
 
 
+def test_custom_providers_are_well_formed() -> None:  # lints the optional `providers` list of each config
+    for path in _config_paths():
+        cfg = _load_config(path)
+        providers = cfg.get("providers") or []  # configs without providers pass trivially
+        assert isinstance(providers, list), f"{path.name}: providers must be a list"
+
+        names = []  # gathered for the uniqueness assertion after the loop
+        for provider in providers:
+            assert isinstance(provider, dict), f"{path.name}: providers entries must be mappings"
+            assert provider.get("name"), f"{path.name}: providers entries require name"
+            assert provider.get("endpoint"), f"{path.name}: provider {provider.get('name')!r} requires endpoint"
+            provider_type = provider.get("provider_type", "openai")  # same default build_model_providers applies
+            assert provider_type in {"anthropic", "openai"}, (
+                f"{path.name}: provider {provider['name']!r} has unsupported provider_type {provider_type!r}"
+            )
+            api_key = provider.get("api_key")  # may be absent; only string values are inspected
+            assert not (isinstance(api_key, str) and api_key.startswith("${oc.env:")), (
+                f"{path.name}: provider {provider['name']!r} should reference the API key env var name, "
+                "not resolve the secret through OmegaConf"
+            )
+            names.append(provider["name"])
+
+        assert len(names) == len(set(names)), f"{path.name}: provider names must be unique"
+
+
+def test_model_providers_reference_declared_or_builtin_providers() -> None:  # cross-checks models[].provider
+    for path in _config_paths():
+        cfg = _load_config(path)
+        declared_providers = {provider["name"] for provider in cfg.get("providers") or []}
+        for model in cfg.get("models") or []:
+            provider = model.get("provider")
+            if declared_providers:  # custom providers exist, so every model must name a provider
+                assert provider, f"{path.name}: models[].provider is required when custom providers are declared"
+            if provider:  # a model without a provider is not checked any further
+                assert provider in declared_providers | BUILTIN_PROVIDER_NAMES, (
+                    f"{path.name}: model {model['alias']!r} references unknown provider {provider!r}"
+                )
+
+
+def test_build_model_providers_from_config() -> None:
+    class FakeProvider:  # stand-in for dd.ModelProvider that records its keyword arguments
+        def __init__(self, **kwargs):
+            self.kwargs = kwargs
+
+    class FakeDD:  # minimal `dd` namespace exposing only ModelProvider
+        ModelProvider = FakeProvider
+
+    providers = build_model_providers(
+        {
+            "providers": [
+                {
+                    "name": "my-provider",
+                    "endpoint": "https://example.test/v1",
+                    "provider_type": "openai",
+                    "api_key": "OPENAI_API_KEY",
+                    "extra_body": {"foo": "bar"},
+                    "extra_headers": {"X-Test": "1"},
+                },
+                {
+                    "name": "no-auth-provider",
+                    "endpoint": "http://localhost:8000/v1",
+                    "api_key": "",  # empty string, expected to be normalised to None
+                },
+            ]
+        },
+        FakeDD,
+    )
+
+    assert providers is not None
+    assert providers[0].kwargs == {  # a fully specified entry is forwarded unchanged
+        "name": "my-provider",
+        "endpoint": "https://example.test/v1",
+        "provider_type": "openai",
+        "api_key": "OPENAI_API_KEY",
+        "extra_body": {"foo": "bar"},
+        "extra_headers": {"X-Test": "1"},
+    }
+    assert providers[1].kwargs["api_key"] is None  # "" must not be forwarded as a key
+
+
 def test_structured_llm_columns_have_output_format() -> None:
     for path in _config_paths():
         for col in _load_columns(path):

From 7d9244b63ab69fbd402d2e4ded2e955dc8c9a758 Mon Sep 17 00:00:00 2001
From: Rakesh Paul <rapaul@nvidia.com>
Date: Mon, 11 May 2026 11:29:18 +0530
Subject: [PATCH 2/3] Add Data Designer custom provider example

Signed-off-by: Rakesh Paul <rapaul@nvidia.com>
---
 src/nemotron/steps/sdg/data_designer/SKILL.md |  4 +-
 .../config/custom_provider_example.yaml       | 41 +++++++++++++++++++
 .../steps/sdg/data_designer/step.toml         |  4 ++
 3 files changed, 48 insertions(+), 1 deletion(-)
 create mode 100644 src/nemotron/steps/sdg/data_designer/config/custom_provider_example.yaml

diff --git a/src/nemotron/steps/sdg/data_designer/SKILL.md b/src/nemotron/steps/sdg/data_designer/SKILL.md
index 03babfdbd..632c3265c 100644
--- a/src/nemotron/steps/sdg/data_designer/SKILL.md
+++ b/src/nemotron/steps/sdg/data_designer/SKILL.md
@@ -14,6 +14,8 @@ Before changing configs or code, read `step.toml` to understand the step flow, c
 - SFT SDG: use `config/default.yaml` or `config/customer_support_tools.yaml`.
 - RL preference SDG: use `config/rl_pref.yaml` for chosen and rejected preference pairs.
 - Tiny validation: use `config/tiny.yaml` or preview mode while editing columns.
+- Custom endpoint example: use `config/custom_provider_example.yaml` after
+  setting `OPENAI_BASE_URL` and `OPENAI_API_KEY`.
 
 ## Configure
 
@@ -32,7 +34,7 @@ Before changing configs or code, read `step.toml` to understand the step flow, c
 
 - Contract: `src/nemotron/steps/sdg/data_designer/step.toml`
 - Runner: `src/nemotron/steps/sdg/data_designer/step.py`
-- Configs: `config/default.yaml`, `config/customer_support_tools.yaml`, `config/rl_pref.yaml`, `config/tiny.yaml`
+- Configs: `config/default.yaml`, `config/customer_support_tools.yaml`, `config/rl_pref.yaml`, `config/tiny.yaml`, `config/custom_provider_example.yaml`
 - Seeds: `data/sft_topic_seeds.jsonl`, `data/customer_support_tool_seeds.jsonl`, `data/rl_pref_prompt_seeds.jsonl`
 
 ## Guardrails
diff --git a/src/nemotron/steps/sdg/data_designer/config/custom_provider_example.yaml b/src/nemotron/steps/sdg/data_designer/config/custom_provider_example.yaml
new file mode 100644
index 000000000..b4547c81f
--- /dev/null
+++ b/src/nemotron/steps/sdg/data_designer/config/custom_provider_example.yaml
@@ -0,0 +1,41 @@
+# Copyright (c) 2026, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+defaults: customer_support_tools.yaml
+
+# Example: route the customer-support SDG pipeline through a custom
+# OpenAI-compatible endpoint instead of the built-in NVIDIA provider.
+#
+# Required environment:
+#   OPENAI_BASE_URL=https://your-endpoint.example/v1
+#   OPENAI_API_KEY=<secret>
+#
+# Keep api_key as the environment variable name. Data Designer resolves it at
+# request time; resolving the secret here with ${oc.env:OPENAI_API_KEY} would
+# materialize the secret in the compiled config.
+providers:
+  - name: my-provider
+    endpoint: ${oc.env:OPENAI_BASE_URL}
+    provider_type: openai
+    api_key: OPENAI_API_KEY
+
+models:
+  - alias: nvidia-text
+    model: google/gemma-4-31B-it
+    provider: my-provider
+    skip_health_check: true
+    inference_parameters:
+      temperature: 0.75
+      top_p: 0.95
+      max_tokens: 1800
diff --git a/src/nemotron/steps/sdg/data_designer/step.toml b/src/nemotron/steps/sdg/data_designer/step.toml
index 27c7ad491..89e8097bb 100644
--- a/src/nemotron/steps/sdg/data_designer/step.toml
+++ b/src/nemotron/steps/sdg/data_designer/step.toml
@@ -55,6 +55,10 @@ description = "Provider name for each model alias. Use a built-in provider such
 when = "Iterating on column specs"
 then = "Run with --preview to emit a small batch via client.preview() before scaling to client.create()."
 
+[[strategies]]
+when = "Using a self-hosted or non-NVIDIA OpenAI-compatible endpoint"
+then = "Start from config/custom_provider_example.yaml and set OPENAI_BASE_URL plus OPENAI_API_KEY in the execution environment."
+
 [[strategies]]
 when = "Generating preference data for DPO"
 then = "Use config/rl_pref.yaml — it emits chosen / rejected fields ready for rl/nemo_rl/dpo."

From 321f71cb60448680af7a05a4cee5e0f91fc7e34d Mon Sep 17 00:00:00 2001
From: Rakesh Paul <rapaul@nvidia.com>
Date: Mon, 11 May 2026 11:43:02 +0530
Subject: [PATCH 3/3] Move Data Designer provider example into config comments

Signed-off-by: Rakesh Paul <rapaul@nvidia.com>
---
 src/nemotron/steps/sdg/data_designer/SKILL.md |  6 +--
 .../config/custom_provider_example.yaml       | 41 -------------------
 .../config/customer_support_tools.yaml        | 24 +++++++++++
 .../steps/sdg/data_designer/step.toml         |  2 +-
 4 files changed, 28 insertions(+), 45 deletions(-)
 delete mode 100644 src/nemotron/steps/sdg/data_designer/config/custom_provider_example.yaml

diff --git a/src/nemotron/steps/sdg/data_designer/SKILL.md b/src/nemotron/steps/sdg/data_designer/SKILL.md
index 632c3265c..0e3e2fbbc 100644
--- a/src/nemotron/steps/sdg/data_designer/SKILL.md
+++ b/src/nemotron/steps/sdg/data_designer/SKILL.md
@@ -14,8 +14,8 @@ Before changing configs or code, read `step.toml` to understand the step flow, c
 - SFT SDG: use `config/default.yaml` or `config/customer_support_tools.yaml`.
 - RL preference SDG: use `config/rl_pref.yaml` for chosen and rejected preference pairs.
 - Tiny validation: use `config/tiny.yaml` or preview mode while editing columns.
-- Custom endpoint example: use `config/custom_provider_example.yaml` after
-  setting `OPENAI_BASE_URL` and `OPENAI_API_KEY`.
+- Custom endpoint example: see the commented `providers:` block in
+  `config/customer_support_tools.yaml`; it needs `OPENAI_BASE_URL` and `OPENAI_API_KEY` set.
 
 ## Configure
 
@@ -34,7 +34,7 @@ Before changing configs or code, read `step.toml` to understand the step flow, c
 
 - Contract: `src/nemotron/steps/sdg/data_designer/step.toml`
 - Runner: `src/nemotron/steps/sdg/data_designer/step.py`
-- Configs: `config/default.yaml`, `config/customer_support_tools.yaml`, `config/rl_pref.yaml`, `config/tiny.yaml`, `config/custom_provider_example.yaml`
+- Configs: `config/default.yaml`, `config/customer_support_tools.yaml`, `config/rl_pref.yaml`, `config/tiny.yaml`
 - Seeds: `data/sft_topic_seeds.jsonl`, `data/customer_support_tool_seeds.jsonl`, `data/rl_pref_prompt_seeds.jsonl`
 
 ## Guardrails
diff --git a/src/nemotron/steps/sdg/data_designer/config/custom_provider_example.yaml b/src/nemotron/steps/sdg/data_designer/config/custom_provider_example.yaml
deleted file mode 100644
index b4547c81f..000000000
--- a/src/nemotron/steps/sdg/data_designer/config/custom_provider_example.yaml
+++ /dev/null
@@ -1,41 +0,0 @@
-# Copyright (c) 2026, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-defaults: customer_support_tools.yaml
-
-# Example: route the customer-support SDG pipeline through a custom
-# OpenAI-compatible endpoint instead of the built-in NVIDIA provider.
-#
-# Required environment:
-#   OPENAI_BASE_URL=https://your-endpoint.example/v1
-#   OPENAI_API_KEY=<secret>
-#
-# Keep api_key as the environment variable name. Data Designer resolves it at
-# request time; resolving the secret here with ${oc.env:OPENAI_API_KEY} would
-# materialize the secret in the compiled config.
-providers:
-  - name: my-provider
-    endpoint: ${oc.env:OPENAI_BASE_URL}
-    provider_type: openai
-    api_key: OPENAI_API_KEY
-
-models:
-  - alias: nvidia-text
-    model: google/gemma-4-31B-it
-    provider: my-provider
-    skip_health_check: true
-    inference_parameters:
-      temperature: 0.75
-      top_p: 0.95
-      max_tokens: 1800
diff --git a/src/nemotron/steps/sdg/data_designer/config/customer_support_tools.yaml b/src/nemotron/steps/sdg/data_designer/config/customer_support_tools.yaml
index 41809096a..915ec453e 100644
--- a/src/nemotron/steps/sdg/data_designer/config/customer_support_tools.yaml
+++ b/src/nemotron/steps/sdg/data_designer/config/customer_support_tools.yaml
@@ -29,6 +29,30 @@ seed_dataset:
   strategy: shuffle
   fields: [customer_name, issue, order_id, product, policy_hint]
 
+# Optional custom endpoint example:
+#
+# To route this pipeline through an OpenAI-compatible endpoint instead of the
+# built-in NVIDIA provider, uncomment both blocks below, edit them, and remove
+# the active `models:` block that follows (a YAML mapping cannot repeat a key).
+# Keep providers[].api_key as the environment variable name. Data Designer resolves
+# it at request time; `${oc.env:OPENAI_API_KEY}` would put the secret into the resolved config.
+#
+# providers:
+#   - name: my-provider
+#     endpoint: ${oc.env:OPENAI_BASE_URL}
+#     provider_type: openai
+#     api_key: OPENAI_API_KEY
+#
+# models:
+#   - alias: nvidia-text
+#     model: google/gemma-4-31B-it
+#     provider: my-provider
+#     skip_health_check: true
+#     inference_parameters:
+#       temperature: 0.75
+#       top_p: 0.95
+#       max_tokens: 1800
+
 models:
   - alias: nvidia-text
     model: openai/gpt-oss-20b
diff --git a/src/nemotron/steps/sdg/data_designer/step.toml b/src/nemotron/steps/sdg/data_designer/step.toml
index 89e8097bb..f1d12a845 100644
--- a/src/nemotron/steps/sdg/data_designer/step.toml
+++ b/src/nemotron/steps/sdg/data_designer/step.toml
@@ -57,7 +57,7 @@ then = "Run with --preview to emit a small batch via client.preview() before sca
 
 [[strategies]]
 when = "Using a self-hosted or non-NVIDIA OpenAI-compatible endpoint"
-then = "Start from config/custom_provider_example.yaml and set OPENAI_BASE_URL plus OPENAI_API_KEY in the execution environment."
+then = "Use the commented providers example in config/customer_support_tools.yaml and set OPENAI_BASE_URL plus OPENAI_API_KEY in the execution environment."
 
 [[strategies]]
 when = "Generating preference data for DPO"