Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
53 changes: 39 additions & 14 deletions api/aicr/v1/server.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -214,13 +214,15 @@ paths:
os: any
componentRefs:
- name: gpu-operator
type: Helm
version: v25.3.3
order: 1
repository: https://helm.ngc.nvidia.com/nvidia
source: https://helm.ngc.nvidia.com/nvidia
chart: gpu-operator
- name: network-operator
type: Helm
version: v25.4.0
order: 2
repository: https://helm.ngc.nvidia.com/nvidia
source: https://helm.ngc.nvidia.com/nvidia
chart: network-operator
constraints:
driver:
version: "580.82.07"
Expand Down Expand Up @@ -947,9 +949,10 @@ paths:
kind: Recipe
componentRefs:
- name: gpu-operator
type: Helm
version: v25.3.3
type: helm
repository: https://helm.ngc.nvidia.com/nvidia
source: https://helm.ngc.nvidia.com/nvidia
chart: gpu-operator
fullRecipe:
summary: Full recipe with metadata
value:
Expand Down Expand Up @@ -979,9 +982,10 @@ paths:
os: any
componentRefs:
- name: gpu-operator
type: Helm
version: v25.3.3
order: 1
repository: https://helm.ngc.nvidia.com/nvidia
source: https://helm.ngc.nvidia.com/nvidia
chart: gpu-operator
responses:
"200":
description: Zip archive containing generated bundles
Expand Down Expand Up @@ -1457,19 +1461,40 @@ components:
type: array
items:
type: object
required: [name]
properties:
name:
type: string
description: Component name
type:
type: string
enum: [Helm, Kustomize]
description: >-
Deployment type; send the canonical value `Helm` or
`Kustomize` (the normative contract). A Helm ref must not carry
`tag`/`path`; a Kustomize ref requires `path` (and, when it
sets `tag`, a `source`). Incoherent combinations are rejected
with 400. For back-compatibility the server back-fills a
missing type on an enabled ref from the registry when the
component is known (a missing type on an unknown component is
rejected with 400), and accepts other casings (e.g. `helm`)
and normalizes them — but clients should send the canonical
values above.
version:
type: string
description: Component version
order:
type: integer
description: Deployment order
repository:
description: Chart version (Helm)
tag:
type: string
description: Git tag/ref (Kustomize); only meaningful with a source
source:
type: string
description: Chart repository URL (Helm) or OCI/Git source reference (Kustomize)
chart:
type: string
description: Helm chart name
path:
type: string
description: Helm repository URL
description: Path within the source to the kustomization (Kustomize)
description: List of component references with deployment order
constraints:
type: object
Expand Down
1 change: 1 addition & 0 deletions docs/user/cli-reference.md
Original file line number Diff line number Diff line change
Expand Up @@ -492,6 +492,7 @@ criteria:
os: any
componentRefs:
- name: gpu-operator
type: Helm
version: vXX.Y.Z # illustrative; see Component Catalog for current pins
source: https://helm.ngc.nvidia.com/nvidia
deploymentOrder:
Expand Down
15 changes: 15 additions & 0 deletions pkg/bundler/bundler.go
Original file line number Diff line number Diff line change
Expand Up @@ -219,6 +219,21 @@ func (b *DefaultBundler) Make(ctx context.Context, recipeResult *recipe.RecipeRe
"recipe must contain at least one component reference")
}

// Reject incoherent component refs (e.g. a Helm ref that also carries a
// Kustomize tag/path, which the deployers silently build as Kustomize)
// before generating anything. DefaultBundler.Make is a public entry point
// (docs/integrator/public-api.md) reachable without the CLI/server
// validation boundaries, so it must apply the same coherence gate. Validate
// a provider-preserving defensive copy — a struct copy keeps the bound
// provider, and a fresh ComponentRefs slice keeps type back-fill/
// canonicalization from mutating the caller's RecipeResult. See #1584.
validated := *recipeResult
validated.ComponentRefs = append([]recipe.ComponentRef(nil), recipeResult.ComponentRefs...)
if err := validated.PrepareAndValidate(); err != nil {
return nil, err
}
recipeResult = &validated

enabledRefs, filteredOrder, filterErr := b.filterEnabledComponents(recipeResult)
if filterErr != nil {
return nil, filterErr
Expand Down
31 changes: 31 additions & 0 deletions pkg/bundler/bundler_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -3162,3 +3162,34 @@ func TestDynamicTolerationPathExcludedFromBakeIn(t *testing.T) {
t.Errorf("daemonsets.tolerations should not be baked in when declared dynamic, got: %v", val)
}
}

// TestMake_RejectsIncoherentRef verifies the public DefaultBundler.Make entry
// point rejects an incoherent ref (a Helm component carrying a Kustomize path,
// which the deployers would silently build as Kustomize) rather than producing
// a mismatched bundle. Pins issue #1584 at the direct-bundler boundary.
func TestMake_RejectsIncoherentRef(t *testing.T) {
bundler, err := New()
if err != nil {
t.Fatalf("New() error = %v", err)
}
recipeResult := &recipe.RecipeResult{
ComponentRefs: []recipe.ComponentRef{
{Name: "gpu-operator", Type: recipe.ComponentTypeHelm, Version: "v1", Path: "deploy"},
},
}
_, err = bundler.Make(context.Background(), recipeResult, t.TempDir())
if err == nil {
t.Fatal("expected Make to reject an incoherent Helm+path ref, got nil")
}
var se *errors.StructuredError
if !stderrors.As(err, &se) {
t.Fatalf("expected *errors.StructuredError, got %T: %v", err, err)
}
if se.Code != errors.ErrCodeInvalidRequest {
t.Errorf("expected ErrCodeInvalidRequest, got %s: %v", se.Code, err)
}
// The caller's RecipeResult must not have been mutated by validation.
if got := recipeResult.ComponentRefs[0].Type; got != recipe.ComponentTypeHelm {
t.Errorf("caller's ref was mutated: type=%q", got)
}
}
157 changes: 157 additions & 0 deletions pkg/client/v1/aicr_internal_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -1046,3 +1046,160 @@ func TestClient_NoCacheGrowthAcrossManyCloseCycles(t *testing.T) {
}
}
}

// TestAdoptRecipe_RejectsIncoherentRef pins issue #1584 at the REST/adopt
// boundary: POST /v1/bundle decodes a RecipeResult and calls adoptRecipe,
// which never runs the resolver — so coherence must be enforced here. An
// incoherent ref (Helm carrying a Kustomize tag) is rejected, and a coherent
// lowercase-typed ref (the OpenAPI wire form) is accepted case-insensitively.
func TestAdoptRecipe_RejectsIncoherentRef(t *testing.T) {
t.Parallel()

client, err := NewClient(WithRecipeSource(EmbeddedSource()))
if err != nil {
t.Fatalf("NewClient: %v", err)
}
t.Cleanup(func() { _ = client.Close() })

base := func(refs []recipe.ComponentRef) *recipe.RecipeResult {
return &recipe.RecipeResult{
Kind: recipe.RecipeResultKind,
APIVersion: recipe.RecipeAPIVersion,
Criteria: &recipe.Criteria{Service: recipe.CriteriaServiceEKS},
ComponentRefs: refs,
}
}

// Incoherent: Helm ref carrying a Kustomize tag -> rejected.
_, err = client.adoptRecipe(t.Context(), base([]recipe.ComponentRef{
{Name: "gpu-operator", Type: recipe.ComponentTypeHelm, Version: "v1", Tag: "v2"},
}))
if err == nil {
t.Fatal("adoptRecipe accepted an incoherent Helm+tag ref; want ErrCodeInvalidRequest")
}
var se *aicrerrors.StructuredError
if !stderrors.As(err, &se) {
t.Fatalf("expected *aicrerrors.StructuredError, got %T: %v", err, err)
}
if se.Code != aicrerrors.ErrCodeInvalidRequest {
t.Errorf("expected ErrCodeInvalidRequest, got %s", se.Code)
}

// Coherent, lowercase type (OpenAPI wire form) -> accepted AND canonicalized
// so downstream deployers see the canonical constant.
res, err := client.adoptRecipe(t.Context(), base([]recipe.ComponentRef{
{Name: "gpu-operator", Type: recipe.ComponentType("helm"), Version: "v1"},
}))
if err != nil {
t.Fatalf("adoptRecipe rejected a coherent lowercase-typed ref: %v", err)
}
if got := res.internal.ComponentRefs[0].Type; got != recipe.ComponentTypeHelm {
t.Errorf("adopted lowercase type not canonicalized: got %q, want %q", got, recipe.ComponentTypeHelm)
}

// A type-less ref for a registry component (valid before #1584 — the
// deployers derive the type from fields) must be back-filled from the
// registry, not rejected.
res2, err := client.adoptRecipe(t.Context(), base([]recipe.ComponentRef{
{Name: "gpu-operator", Version: "v1"},
}))
if err != nil {
t.Fatalf("adoptRecipe rejected a type-less registry ref instead of back-filling: %v", err)
}
if got := res2.internal.ComponentRefs[0].Type; got != recipe.ComponentTypeHelm {
t.Errorf("type-less registry ref not back-filled: got %q, want %q", got, recipe.ComponentTypeHelm)
}
}

// blockingReadFileProvider parks ReadFile of a target file (e.g. registry.yaml)
// on a signal, so a test can deterministically pin adoptRecipe inside its
// registry-backed type back-fill while a second goroutine calls Close.
type blockingReadFileProvider struct {
underlying recipe.DataProvider
target string
readStarted chan struct{}
readUnblock chan struct{}
}

func (b *blockingReadFileProvider) ReadFile(ctx context.Context, path string) ([]byte, error) {
if strings.HasSuffix(path, b.target) {
select {
case <-b.readStarted:
default:
close(b.readStarted)
}
<-b.readUnblock
}
return b.underlying.ReadFile(ctx, path)
}

func (b *blockingReadFileProvider) WalkDir(ctx context.Context, root string, fn fs.WalkDirFunc) error {
return b.underlying.WalkDir(ctx, root, fn)
}

func (b *blockingReadFileProvider) Source(path string) string { return b.underlying.Source(path) }

// TestClient_CloseDrainsInflightAdopt pins the inflight registration added for
// adoptRecipe (issue #1584): PrepareAndValidate reads the component registry to
// back-fill a missing type, so a concurrent Close must drain the adopt before
// evicting caches. A type-less gpu-operator ref forces the registry ReadFile,
// which parks; Close must block on inflight.Wait until the adopt completes.
func TestClient_CloseDrainsInflightAdopt(t *testing.T) {
t.Parallel()

embedded := recipe.NewEmbeddedDataProvider(recipe.GetEmbeddedFS(), ".")
blockedDP := &blockingReadFileProvider{
underlying: embedded,
target: "registry.yaml",
readStarted: make(chan struct{}),
readUnblock: make(chan struct{}),
}
c := &Client{
builder: recipe.NewBuilder(recipe.WithDataProvider(blockedDP)),
dp: blockedDP,
}

adoptDone := make(chan struct{})
go func() {
defer close(adoptDone)
_, _ = c.adoptRecipe(context.Background(), &recipe.RecipeResult{
Kind: recipe.RecipeResultKind,
APIVersion: recipe.RecipeAPIVersion,
Criteria: &recipe.Criteria{Service: recipe.CriteriaServiceEKS},
ComponentRefs: []recipe.ComponentRef{
{Name: "gpu-operator", Version: "v1"}, // type-less -> forces registry back-fill
},
})
}()

select {
case <-blockedDP.readStarted:
case <-time.After(5 * time.Second):
t.Fatal("adoptRecipe never read registry.yaml within 5s")
}

closeDone := make(chan struct{})
go func() {
defer close(closeDone)
_ = c.Close()
}()

select {
case <-closeDone:
t.Fatal("Close returned before in-flight adoptRecipe drained; inflight registration is missing")
case <-time.After(100 * time.Millisecond):
// Expected: Close parked on inflight.Wait().
}

close(blockedDP.readUnblock)
select {
case <-adoptDone:
case <-time.After(10 * time.Second):
t.Fatal("adoptRecipe did not complete within 10s after unblock")
}
select {
case <-closeDone:
case <-time.After(10 * time.Second):
t.Fatal("Close did not complete within 10s after adopt drained")
}
}
30 changes: 26 additions & 4 deletions pkg/client/v1/bundle.go
Original file line number Diff line number Diff line change
Expand Up @@ -94,10 +94,11 @@ type BundleOptions struct {
// takes an already-decoded RecipeResult and binds the same provider.
//
// Synchronization mirrors LoadRecipe: snapshot the per-Client provider under
// the read lock so a concurrent Close can't race the read. Unlike the
// resolve/load paths it does not register in the inflight WaitGroup — it does
// no cache-using work itself (the subsequent MakeBundle call does, and
// registers there).
// the read lock and register in the inflight WaitGroup so Close drains before
// evicting the cache. PrepareAndValidate reads the component registry (to
// back-fill missing types), so — unlike before #1584 — this path does do
// cache-using work of its own and must register, not only rely on the
// subsequent MakeBundle call.
//
// Errors:
// - ErrCodeInvalidRequest when the Client, ctx, or recipe is nil, or when
Expand All @@ -113,13 +114,21 @@ func (c *Client) adoptRecipe(ctx context.Context, rec *recipe.RecipeResult) (*Re
return nil, errors.New(errors.ErrCodeInvalidRequest, "nil RecipeResult")
}

// Snapshot the provider and register as an in-flight cache user under the
// read lock: PrepareAndValidate below reads the component registry (to
// back-fill missing types), so Close must drain this before evicting the
// cache — otherwise a concurrent Close could evict, this call repopulate,
// and we would return a result owned by a closed Client. Same protocol as
// the resolve/load paths.
c.mu.RLock()
if c.builder == nil {
c.mu.RUnlock()
return nil, errors.New(errors.ErrCodeInvalidRequest, "aicr client not initialized (or already closed)")
}
dp := c.dp
c.inflight.Add(1)
c.mu.RUnlock()
defer c.inflight.Done()

// Deep-copy the caller-supplied recipe BEFORE binding the provider.
// BindDataProvider mutates the receiver's unexported provider field, and
Expand All @@ -131,6 +140,19 @@ func (c *Client) adoptRecipe(ctx context.Context, rec *recipe.RecipeResult) (*Re
cp := rec.DeepCopy()
cp.BindDataProvider(dp)

// An adopted RecipeResult is decoded from an external source (e.g. the
// POST /v1/bundle body) and never passes through the resolver, so
// canonicalize its component types and validate coherence here — otherwise
// an incoherent ref (e.g. Helm + Kustomize tag/path) would deploy as a
// different type than declared and mismatch the attestation BOM, and a
// lowercase type would deploy inconsistently downstream. PrepareAndValidate
// back-fills missing types from the registry first (this boundary does not
// run ApplyRegistryDefaults) so a type-less registry ref — valid before
// #1584 — still resolves rather than being rejected. See issue #1584.
if err := cp.PrepareAndValidate(); err != nil {
return nil, err
}

result, err := loadedResultFromInternal(cp)
if err != nil {
return nil, err
Expand Down
Loading
Loading