From 1cf96f0165d233b8ba4f621a401bdf09a1202732 Mon Sep 17 00:00:00 2001 From: chokevin <11001563+chokevin@users.noreply.github.com> Date: Wed, 22 Apr 2026 11:28:36 -0700 Subject: [PATCH 01/20] feat(crd): add AzureFlexNodeClass v1alpha1 CRD Adds the AzureFlexNodeClass CRD that lets a Karpenter NodePool in an AKS cluster auto-provision single Azure VMs in a (potentially different) Azure region. Spec covers subscription/RG/subnet, image (marketplace ref or SIG ID), security type (Standard only), OS disk size, SSH keys, public IP toggle, max pods, and tags. Implements status.Object. Regenerates deepcopy and CRD manifests. Mirrors the new CRD into the helm chart crds/ directory. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- ...ex.aks.azure.com_azureflexnodeclasses.yaml | 214 ++++++++++++++++++ ...ex.aks.azure.com_azureflexnodeclasses.yaml | 214 ++++++++++++++++++ .../flex.aks.azure.com_nebiusnodeclasses.yaml | 14 +- karpenter/pkg/apis/v1alpha1/azureflex.go | 142 ++++++++++++ karpenter/pkg/apis/v1alpha1/doc.go | 2 + karpenter/pkg/apis/v1alpha1/labels.go | 5 + .../apis/v1alpha1/zz_generated.deepcopy.go | 153 +++++++++++++ 7 files changed, 737 insertions(+), 7 deletions(-) create mode 100644 karpenter/charts/karpenter/crds/flex.aks.azure.com_azureflexnodeclasses.yaml create mode 100644 karpenter/pkg/apis/crds/flex.aks.azure.com_azureflexnodeclasses.yaml create mode 100644 karpenter/pkg/apis/v1alpha1/azureflex.go diff --git a/karpenter/charts/karpenter/crds/flex.aks.azure.com_azureflexnodeclasses.yaml b/karpenter/charts/karpenter/crds/flex.aks.azure.com_azureflexnodeclasses.yaml new file mode 100644 index 0000000..4742726 --- /dev/null +++ b/karpenter/charts/karpenter/crds/flex.aks.azure.com_azureflexnodeclasses.yaml @@ -0,0 +1,214 @@ +--- +apiVersion: apiextensions.k8s.io/v1 +kind: CustomResourceDefinition +metadata: + annotations: + controller-gen.kubebuilder.io/version: v0.20.1 + name: azureflexnodeclasses.flex.aks.azure.com +spec: + group: 
flex.aks.azure.com + names: + categories: + - karpenter + - nap + kind: AzureFlexNodeClass + listKind: AzureFlexNodeClassList + plural: azureflexnodeclasses + shortNames: + - afnc + - afncs + singular: azureflexnodeclass + scope: Cluster + versions: + - additionalPrinterColumns: + - jsonPath: .status.conditions[?(@.type=='Ready')].status + name: Ready + type: string + - jsonPath: .metadata.creationTimestamp + name: Age + type: date + name: v1alpha1 + schema: + openAPIV3Schema: + description: |- + AzureFlexNodeClass is the Schema for the AzureFlexNodeClass API. + + It enables a NodePool in an AKS cluster to auto-provision external Azure VMs in a + (potentially different) Azure region than the AKS cluster's own region. Each node + is a single VM (not VMSS) so that cross-region placement is straightforward. + properties: + apiVersion: + description: |- + APIVersion defines the versioned schema of this representation of an object. + Servers should convert recognized schemas to the latest internal value, and + may reject unrecognized values. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources + type: string + kind: + description: |- + Kind is a string value representing the REST resource this object represents. + Servers may infer this from the endpoint the client submits requests to. + Cannot be updated. + In CamelCase. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds + type: string + metadata: + type: object + spec: + description: |- + AzureFlexNodeClassSpec is the spec for AzureFlexNodeClass. + + Phase 1 scope (issue #63): single region per NodeClass, no spot, no zones, + no identity/UAMI per-NodeClass (the controller MI is assumed to have + Contributor on the target subscription/RG/subnet), no quota preflight, + no PPG/capacity reservation, no spot, no WireGuard. 
+ properties: + allocateNodePublicIP: + default: false + description: AllocateNodePublicIP controls whether each node receives + a public IP. + type: boolean + imageID: + description: ImageID is a SIG / community gallery image resource ID. + Mutually exclusive with ImageReference. + type: string + imageReference: + description: |- + ImageReference selects an Azure Marketplace image. Mutually exclusive with ImageID. + If neither is set, defaults to microsoft-dsvm/ubuntu-hpc/2204/latest. + properties: + offer: + type: string + publisher: + type: string + sku: + type: string + version: + default: latest + type: string + required: + - offer + - publisher + - sku + type: object + location: + description: Location is the Azure region (e.g. "eastus2"). May differ + from the AKS cluster region. + type: string + maxPodsPerNode: + default: 110 + description: MaxPodsPerNode is advertised in the node's capacity and + affects Karpenter scheduling. + format: int32 + type: integer + osDiskSizeGB: + default: 128 + description: OSDiskSizeGB is the size of the OS disk in GB. + format: int32 + type: integer + resourceGroup: + description: |- + ResourceGroup is the resource group where VMs, NICs, and OS disks land. + Must already exist. + type: string + securityType: + default: Standard + description: |- + SecurityType selects the VM security profile. Currently only "Standard" is supported. + TrustedLaunch is deferred — it has been observed to break the DSVM image. + enum: + - Standard + type: string + sshPublicKeys: + description: SSHPublicKeys is the list of SSH public keys to install + on each node. + items: + type: string + type: array + subnetID: + description: |- + SubnetID is the full ARM resource ID of the subnet (must already exist + and be reachable from the AKS cluster). + type: string + subscriptionID: + description: SubscriptionID is the Azure subscription where VMs will + be created. 
+ type: string + tags: + additionalProperties: + type: string + description: Tags are applied to every Azure resource (VM, NIC, OS + disk) created from this NodeClass. + type: object + required: + - location + - resourceGroup + - subnetID + - subscriptionID + type: object + status: + description: status contains the resolved state of the AzureFlexNodeClass. + properties: + conditions: + description: conditions contains signals for health and readiness + items: + description: Condition aliases the upstream type and adds additional + helper methods + properties: + lastTransitionTime: + description: |- + lastTransitionTime is the last time the condition transitioned from one status to another. + This should be when the underlying condition changed. If that is not known, then using the time when the API field changed is acceptable. + format: date-time + type: string + message: + description: |- + message is a human readable message indicating details about the transition. + This may be an empty string. + maxLength: 32768 + type: string + observedGeneration: + description: |- + observedGeneration represents the .metadata.generation that the condition was set based upon. + For instance, if .metadata.generation is currently 12, but the .status.conditions[x].observedGeneration is 9, the condition is out of date + with respect to the current state of the instance. + format: int64 + minimum: 0 + type: integer + reason: + description: |- + reason contains a programmatic identifier indicating the reason for the condition's last transition. + Producers of specific condition types may define expected values and meanings for this field, + and whether the values are considered a guaranteed API. + The value should be a CamelCase string. + This field may not be empty. + maxLength: 1024 + minLength: 1 + pattern: ^[A-Za-z]([A-Za-z0-9_,:]*[A-Za-z0-9_])?$ + type: string + status: + description: status of the condition, one of True, False, Unknown. 
+ enum: + - "True" + - "False" + - Unknown + type: string + type: + description: type of condition in CamelCase or in foo.example.com/CamelCase. + maxLength: 316 + pattern: ^([a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*/)?(([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9])$ + type: string + required: + - lastTransitionTime + - message + - reason + - status + - type + type: object + type: array + type: object + type: object + served: true + storage: true + subresources: + status: {} diff --git a/karpenter/pkg/apis/crds/flex.aks.azure.com_azureflexnodeclasses.yaml b/karpenter/pkg/apis/crds/flex.aks.azure.com_azureflexnodeclasses.yaml new file mode 100644 index 0000000..4742726 --- /dev/null +++ b/karpenter/pkg/apis/crds/flex.aks.azure.com_azureflexnodeclasses.yaml @@ -0,0 +1,214 @@ +--- +apiVersion: apiextensions.k8s.io/v1 +kind: CustomResourceDefinition +metadata: + annotations: + controller-gen.kubebuilder.io/version: v0.20.1 + name: azureflexnodeclasses.flex.aks.azure.com +spec: + group: flex.aks.azure.com + names: + categories: + - karpenter + - nap + kind: AzureFlexNodeClass + listKind: AzureFlexNodeClassList + plural: azureflexnodeclasses + shortNames: + - afnc + - afncs + singular: azureflexnodeclass + scope: Cluster + versions: + - additionalPrinterColumns: + - jsonPath: .status.conditions[?(@.type=='Ready')].status + name: Ready + type: string + - jsonPath: .metadata.creationTimestamp + name: Age + type: date + name: v1alpha1 + schema: + openAPIV3Schema: + description: |- + AzureFlexNodeClass is the Schema for the AzureFlexNodeClass API. + + It enables a NodePool in an AKS cluster to auto-provision external Azure VMs in a + (potentially different) Azure region than the AKS cluster's own region. Each node + is a single VM (not VMSS) so that cross-region placement is straightforward. + properties: + apiVersion: + description: |- + APIVersion defines the versioned schema of this representation of an object. 
+ Servers should convert recognized schemas to the latest internal value, and + may reject unrecognized values. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources + type: string + kind: + description: |- + Kind is a string value representing the REST resource this object represents. + Servers may infer this from the endpoint the client submits requests to. + Cannot be updated. + In CamelCase. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds + type: string + metadata: + type: object + spec: + description: |- + AzureFlexNodeClassSpec is the spec for AzureFlexNodeClass. + + Phase 1 scope (issue #63): single region per NodeClass, no spot, no zones, + no identity/UAMI per-NodeClass (the controller MI is assumed to have + Contributor on the target subscription/RG/subnet), no quota preflight, + no PPG/capacity reservation, no spot, no WireGuard. + properties: + allocateNodePublicIP: + default: false + description: AllocateNodePublicIP controls whether each node receives + a public IP. + type: boolean + imageID: + description: ImageID is a SIG / community gallery image resource ID. + Mutually exclusive with ImageReference. + type: string + imageReference: + description: |- + ImageReference selects an Azure Marketplace image. Mutually exclusive with ImageID. + If neither is set, defaults to microsoft-dsvm/ubuntu-hpc/2204/latest. + properties: + offer: + type: string + publisher: + type: string + sku: + type: string + version: + default: latest + type: string + required: + - offer + - publisher + - sku + type: object + location: + description: Location is the Azure region (e.g. "eastus2"). May differ + from the AKS cluster region. + type: string + maxPodsPerNode: + default: 110 + description: MaxPodsPerNode is advertised in the node's capacity and + affects Karpenter scheduling. 
+ format: int32 + type: integer + osDiskSizeGB: + default: 128 + description: OSDiskSizeGB is the size of the OS disk in GB. + format: int32 + type: integer + resourceGroup: + description: |- + ResourceGroup is the resource group where VMs, NICs, and OS disks land. + Must already exist. + type: string + securityType: + default: Standard + description: |- + SecurityType selects the VM security profile. Currently only "Standard" is supported. + TrustedLaunch is deferred — it has been observed to break the DSVM image. + enum: + - Standard + type: string + sshPublicKeys: + description: SSHPublicKeys is the list of SSH public keys to install + on each node. + items: + type: string + type: array + subnetID: + description: |- + SubnetID is the full ARM resource ID of the subnet (must already exist + and be reachable from the AKS cluster). + type: string + subscriptionID: + description: SubscriptionID is the Azure subscription where VMs will + be created. + type: string + tags: + additionalProperties: + type: string + description: Tags are applied to every Azure resource (VM, NIC, OS + disk) created from this NodeClass. + type: object + required: + - location + - resourceGroup + - subnetID + - subscriptionID + type: object + status: + description: status contains the resolved state of the AzureFlexNodeClass. + properties: + conditions: + description: conditions contains signals for health and readiness + items: + description: Condition aliases the upstream type and adds additional + helper methods + properties: + lastTransitionTime: + description: |- + lastTransitionTime is the last time the condition transitioned from one status to another. + This should be when the underlying condition changed. If that is not known, then using the time when the API field changed is acceptable. + format: date-time + type: string + message: + description: |- + message is a human readable message indicating details about the transition. + This may be an empty string. 
+ maxLength: 32768 + type: string + observedGeneration: + description: |- + observedGeneration represents the .metadata.generation that the condition was set based upon. + For instance, if .metadata.generation is currently 12, but the .status.conditions[x].observedGeneration is 9, the condition is out of date + with respect to the current state of the instance. + format: int64 + minimum: 0 + type: integer + reason: + description: |- + reason contains a programmatic identifier indicating the reason for the condition's last transition. + Producers of specific condition types may define expected values and meanings for this field, + and whether the values are considered a guaranteed API. + The value should be a CamelCase string. + This field may not be empty. + maxLength: 1024 + minLength: 1 + pattern: ^[A-Za-z]([A-Za-z0-9_,:]*[A-Za-z0-9_])?$ + type: string + status: + description: status of the condition, one of True, False, Unknown. + enum: + - "True" + - "False" + - Unknown + type: string + type: + description: type of condition in CamelCase or in foo.example.com/CamelCase. + maxLength: 316 + pattern: ^([a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*/)?(([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9])$ + type: string + required: + - lastTransitionTime + - message + - reason + - status + - type + type: object + type: array + type: object + type: object + served: true + storage: true + subresources: + status: {} diff --git a/karpenter/pkg/apis/crds/flex.aks.azure.com_nebiusnodeclasses.yaml b/karpenter/pkg/apis/crds/flex.aks.azure.com_nebiusnodeclasses.yaml index a216063..daa430b 100644 --- a/karpenter/pkg/apis/crds/flex.aks.azure.com_nebiusnodeclasses.yaml +++ b/karpenter/pkg/apis/crds/flex.aks.azure.com_nebiusnodeclasses.yaml @@ -53,6 +53,13 @@ spec: allocateNodePublicIP: default: false type: boolean + maxPodsPerNode: + default: 110 + description: |- + MaxPodsPerNode is the maximum number of pods that can be scheduled on a single node. 
+ This value is advertised in the node's capacity and affects Karpenter's scheduling decisions. + format: int32 + type: integer osDiskImageFamily: default: ubuntu24.04-driverless type: string @@ -61,13 +68,6 @@ spec: description: OSDiskSizeGB is the size of the OS disk in GB. format: int32 type: integer - maxPodsPerNode: - default: 110 - description: |- - MaxPodsPerNode is the maximum number of pods that can be scheduled on a single node. - This value is advertised in the node's capacity and affects Karpenter's scheduling decisions. - format: int32 - type: integer projectID: description: ProjectID is the nebius project id to launch nodes in. type: string diff --git a/karpenter/pkg/apis/v1alpha1/azureflex.go b/karpenter/pkg/apis/v1alpha1/azureflex.go new file mode 100644 index 0000000..84d8f97 --- /dev/null +++ b/karpenter/pkg/apis/v1alpha1/azureflex.go @@ -0,0 +1,142 @@ +package v1alpha1 + +import ( + "github.com/awslabs/operatorpkg/status" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" +) + +// AzureFlexNodeClass is the Schema for the AzureFlexNodeClass API. +// +// It enables a NodePool in an AKS cluster to auto-provision external Azure VMs in a +// (potentially different) Azure region than the AKS cluster's own region. Each node +// is a single VM (not VMSS) so that cross-region placement is straightforward. +// +// +kubebuilder:object:root=true +// +kubebuilder:resource:path=azureflexnodeclasses,scope=Cluster,shortName={afnc,afncs},categories={karpenter,nap} +// +kubebuilder:printcolumn:name="Ready",type=string,JSONPath=".status.conditions[?(@.type=='Ready')].status" +// +kubebuilder:printcolumn:name="Age",type=date,JSONPath=".metadata.creationTimestamp" +// +kubebuilder:storageversion +// +kubebuilder:subresource:status +type AzureFlexNodeClass struct { + metav1.TypeMeta `json:",inline"` + // metadata is standard object metadata. 
+ // +optional + metav1.ObjectMeta `json:"metadata,omitempty"` + + // +optional + Spec AzureFlexNodeClassSpec `json:"spec,omitempty"` + + // status contains the resolved state of the AzureFlexNodeClass. + // +optional + Status AzureFlexNodeClassStatus `json:"status,omitempty"` +} + +var _ status.Object = (*AzureFlexNodeClass)(nil) + +func (s *AzureFlexNodeClass) GetConditions() []status.Condition { + return s.Status.Conditions +} + +func (s *AzureFlexNodeClass) SetConditions(conditions []status.Condition) { + s.Status.Conditions = conditions +} + +func (s *AzureFlexNodeClass) StatusConditions() status.ConditionSet { + conds := []string{ + ConditionTypeValidationSucceeded, + } + return status.NewReadyConditions(conds...).For(s) +} + +// AzureFlexImageReference selects an Azure Marketplace image. +// Mutually exclusive with AzureFlexNodeClassSpec.ImageID. +type AzureFlexImageReference struct { + // +required + Publisher string `json:"publisher"` + // +required + Offer string `json:"offer"` + // +required + SKU string `json:"sku"` + // +optional + // +default="latest" + Version string `json:"version,omitempty"` +} + +// AzureFlexNodeClassSpec is the spec for AzureFlexNodeClass. +// +// Phase 1 scope (issue #63): single region per NodeClass, no spot, no zones, +// no identity/UAMI per-NodeClass (the controller MI is assumed to have +// Contributor on the target subscription/RG/subnet), no quota preflight, +// no PPG/capacity reservation, no spot, no WireGuard. +type AzureFlexNodeClassSpec struct { + // SubscriptionID is the Azure subscription where VMs will be created. + // +required + SubscriptionID string `json:"subscriptionID"` + + // Location is the Azure region (e.g. "eastus2"). May differ from the AKS cluster region. + // +required + Location string `json:"location"` + + // ResourceGroup is the resource group where VMs, NICs, and OS disks land. + // Must already exist. 
+ // +required + ResourceGroup string `json:"resourceGroup"` + + // SubnetID is the full ARM resource ID of the subnet (must already exist + // and be reachable from the AKS cluster). + // +required + SubnetID string `json:"subnetID"` + + // ImageReference selects an Azure Marketplace image. Mutually exclusive with ImageID. + // If neither is set, defaults to microsoft-dsvm/ubuntu-hpc/2204/latest. + // +optional + ImageReference *AzureFlexImageReference `json:"imageReference,omitempty"` + + // ImageID is a SIG / community gallery image resource ID. Mutually exclusive with ImageReference. + // +optional + ImageID *string `json:"imageID,omitempty"` + + // SecurityType selects the VM security profile. Currently only "Standard" is supported. + // TrustedLaunch is deferred — it has been observed to break the DSVM image. + // +optional + // +default="Standard" + // +kubebuilder:validation:Enum=Standard + SecurityType *string `json:"securityType,omitempty"` + + // OSDiskSizeGB is the size of the OS disk in GB. + // +optional + // +default=128 + OSDiskSizeGB *int32 `json:"osDiskSizeGB,omitempty"` + + // SSHPublicKeys is the list of SSH public keys to install on each node. + // +optional + SSHPublicKeys []string `json:"sshPublicKeys,omitempty"` + + // AllocateNodePublicIP controls whether each node receives a public IP. + // +optional + // +default=false + AllocateNodePublicIP *bool `json:"allocateNodePublicIP,omitempty"` + + // MaxPodsPerNode is advertised in the node's capacity and affects Karpenter scheduling. + // +optional + // +default=110 + MaxPodsPerNode *int32 `json:"maxPodsPerNode,omitempty"` + + // Tags are applied to every Azure resource (VM, NIC, OS disk) created from this NodeClass. 
+ // +optional + Tags map[string]string `json:"tags,omitempty"` +} + +type AzureFlexNodeClassStatus struct { + // conditions contains signals for health and readiness + // +optional + //nolint:kubeapilinter // conditions: using status.Condition from operatorpkg instead of metav1.Condition for compatibility + Conditions []status.Condition `json:"conditions,omitempty"` +} + +// +kubebuilder:object:root=true +type AzureFlexNodeClassList struct { + metav1.TypeMeta `json:",inline"` + metav1.ListMeta `json:"metadata,omitempty"` + Items []AzureFlexNodeClass `json:"items"` +} diff --git a/karpenter/pkg/apis/v1alpha1/doc.go b/karpenter/pkg/apis/v1alpha1/doc.go index 881f9c4..c50c6b9 100644 --- a/karpenter/pkg/apis/v1alpha1/doc.go +++ b/karpenter/pkg/apis/v1alpha1/doc.go @@ -17,6 +17,8 @@ var ( scheme.AddKnownTypes(SchemeGroupVersion, &NebiusNodeClass{}, &NebiusNodeClassList{}, + &AzureFlexNodeClass{}, + &AzureFlexNodeClassList{}, ) metav1.AddToGroupVersion(scheme, SchemeGroupVersion) return nil diff --git a/karpenter/pkg/apis/v1alpha1/labels.go b/karpenter/pkg/apis/v1alpha1/labels.go index 3cd96da..f861e73 100644 --- a/karpenter/pkg/apis/v1alpha1/labels.go +++ b/karpenter/pkg/apis/v1alpha1/labels.go @@ -4,4 +4,9 @@ import "github.com/Azure/aks-flex/karpenter/pkg/apis" const ( TerminationFinalizer = apis.Group + "/termination" + + // AzureFlexNodeClassHashAnnotation stores the deterministic hash of the + // AzureFlexNodeClass spec at the time a NodeClaim was created. The + // CloudProvider compares it against the current spec hash to compute drift. 
+ AzureFlexNodeClassHashAnnotation = apis.Group + "/azureflex-nodeclass-hash" ) diff --git a/karpenter/pkg/apis/v1alpha1/zz_generated.deepcopy.go b/karpenter/pkg/apis/v1alpha1/zz_generated.deepcopy.go index 3a6a61f..b24a064 100644 --- a/karpenter/pkg/apis/v1alpha1/zz_generated.deepcopy.go +++ b/karpenter/pkg/apis/v1alpha1/zz_generated.deepcopy.go @@ -9,6 +9,159 @@ import ( "k8s.io/apimachinery/pkg/runtime" ) +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *AzureFlexImageReference) DeepCopyInto(out *AzureFlexImageReference) { + *out = *in +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new AzureFlexImageReference. +func (in *AzureFlexImageReference) DeepCopy() *AzureFlexImageReference { + if in == nil { + return nil + } + out := new(AzureFlexImageReference) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *AzureFlexNodeClass) DeepCopyInto(out *AzureFlexNodeClass) { + *out = *in + out.TypeMeta = in.TypeMeta + in.ObjectMeta.DeepCopyInto(&out.ObjectMeta) + in.Spec.DeepCopyInto(&out.Spec) + in.Status.DeepCopyInto(&out.Status) +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new AzureFlexNodeClass. +func (in *AzureFlexNodeClass) DeepCopy() *AzureFlexNodeClass { + if in == nil { + return nil + } + out := new(AzureFlexNodeClass) + in.DeepCopyInto(out) + return out +} + +// DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object. +func (in *AzureFlexNodeClass) DeepCopyObject() runtime.Object { + if c := in.DeepCopy(); c != nil { + return c + } + return nil +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. 
+func (in *AzureFlexNodeClassList) DeepCopyInto(out *AzureFlexNodeClassList) { + *out = *in + out.TypeMeta = in.TypeMeta + in.ListMeta.DeepCopyInto(&out.ListMeta) + if in.Items != nil { + in, out := &in.Items, &out.Items + *out = make([]AzureFlexNodeClass, len(*in)) + for i := range *in { + (*in)[i].DeepCopyInto(&(*out)[i]) + } + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new AzureFlexNodeClassList. +func (in *AzureFlexNodeClassList) DeepCopy() *AzureFlexNodeClassList { + if in == nil { + return nil + } + out := new(AzureFlexNodeClassList) + in.DeepCopyInto(out) + return out +} + +// DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object. +func (in *AzureFlexNodeClassList) DeepCopyObject() runtime.Object { + if c := in.DeepCopy(); c != nil { + return c + } + return nil +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *AzureFlexNodeClassSpec) DeepCopyInto(out *AzureFlexNodeClassSpec) { + *out = *in + if in.ImageReference != nil { + in, out := &in.ImageReference, &out.ImageReference + *out = new(AzureFlexImageReference) + **out = **in + } + if in.ImageID != nil { + in, out := &in.ImageID, &out.ImageID + *out = new(string) + **out = **in + } + if in.SecurityType != nil { + in, out := &in.SecurityType, &out.SecurityType + *out = new(string) + **out = **in + } + if in.OSDiskSizeGB != nil { + in, out := &in.OSDiskSizeGB, &out.OSDiskSizeGB + *out = new(int32) + **out = **in + } + if in.SSHPublicKeys != nil { + in, out := &in.SSHPublicKeys, &out.SSHPublicKeys + *out = make([]string, len(*in)) + copy(*out, *in) + } + if in.AllocateNodePublicIP != nil { + in, out := &in.AllocateNodePublicIP, &out.AllocateNodePublicIP + *out = new(bool) + **out = **in + } + if in.MaxPodsPerNode != nil { + in, out := &in.MaxPodsPerNode, &out.MaxPodsPerNode + *out = new(int32) + **out = **in + } + if in.Tags != nil 
{ + in, out := &in.Tags, &out.Tags + *out = make(map[string]string, len(*in)) + for key, val := range *in { + (*out)[key] = val + } + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new AzureFlexNodeClassSpec. +func (in *AzureFlexNodeClassSpec) DeepCopy() *AzureFlexNodeClassSpec { + if in == nil { + return nil + } + out := new(AzureFlexNodeClassSpec) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *AzureFlexNodeClassStatus) DeepCopyInto(out *AzureFlexNodeClassStatus) { + *out = *in + if in.Conditions != nil { + in, out := &in.Conditions, &out.Conditions + *out = make([]status.Condition, len(*in)) + for i := range *in { + (*in)[i].DeepCopyInto(&(*out)[i]) + } + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new AzureFlexNodeClassStatus. +func (in *AzureFlexNodeClassStatus) DeepCopy() *AzureFlexNodeClassStatus { + if in == nil { + return nil + } + out := new(AzureFlexNodeClassStatus) + in.DeepCopyInto(out) + return out +} + // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. 
func (in *NebiusNodeClass) DeepCopyInto(out *NebiusNodeClass) { *out = *in From 6c4786d8b20469592eafc42069acc84f19d57ed9 Mon Sep 17 00:00:00 2001 From: chokevin <11001563+chokevin@users.noreply.github.com> Date: Wed, 22 Apr 2026 11:28:47 -0700 Subject: [PATCH 02/20] chore(plugin): bump flexNodeVersion to v0.0.18 Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- plugin/pkg/services/agentpools/userdata/flex/flex.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/plugin/pkg/services/agentpools/userdata/flex/flex.go b/plugin/pkg/services/agentpools/userdata/flex/flex.go index a1bd145..288f110 100644 --- a/plugin/pkg/services/agentpools/userdata/flex/flex.go +++ b/plugin/pkg/services/agentpools/userdata/flex/flex.go @@ -27,7 +27,7 @@ var bootstrapTmpl string var bootstrapTemplate = template.Must(template.New("bootstrap.sh").Parse(bootstrapTmpl)) const ( - flexNodeVersion = "v0.0.17" + flexNodeVersion = "v0.0.18" defaultArch = "amd64" DefaultKubeVer = "1.34.2" ) From 6bdb3a2eaf0e283c5ff900b053de1f448b08bb8f Mon Sep 17 00:00:00 2001 From: chokevin <11001563+chokevin@users.noreply.github.com> Date: Wed, 22 Apr 2026 11:28:47 -0700 Subject: [PATCH 03/20] feat(plugin): add azure/flexvm agent pool service MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds a new gRPC-served agent pool implementation that creates a single Azure VM (not VMSS) per agent pool. VM and NIC names are deterministic from the agent pool ID so retries are idempotent. Both the NIC and OS disk are tagged DeleteOption=Delete so a single VM delete cascades the whole resource trio. Phase 1 scope: Standard security only (TrustedLaunch breaks the DSVM image), default DSVM image, gzip+base64 cloud-init via flex.UserData, no public IP by default. Auth uses the plugin process's default Azure credential — the plugin MI is expected to have Contributor on the target subscription/RG/subnet. 
Wires the new service alongside ubuntu2404vmss in the agentpools and instances service registries. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- plugin/pkg/services/agentpools/agentpools.go | 5 + .../agentpools/azure/flexvm/agentpools.go | 383 +++++++ .../agentpools/azure/flexvm/agentpools.pb.go | 941 ++++++++++++++++++ .../agentpools/azure/flexvm/agentpools.proto | 74 ++ .../agentpools/azure/flexvm/instances.go | 78 ++ .../agentpools/azure/flexvm/instances.pb.go | 282 ++++++ .../agentpools/azure/flexvm/instances.proto | 23 + .../agentpools/azure/flexvm/redact.go | 10 + plugin/pkg/services/agentpools/instances.go | 5 + 9 files changed, 1801 insertions(+) create mode 100644 plugin/pkg/services/agentpools/azure/flexvm/agentpools.go create mode 100644 plugin/pkg/services/agentpools/azure/flexvm/agentpools.pb.go create mode 100644 plugin/pkg/services/agentpools/azure/flexvm/agentpools.proto create mode 100644 plugin/pkg/services/agentpools/azure/flexvm/instances.go create mode 100644 plugin/pkg/services/agentpools/azure/flexvm/instances.pb.go create mode 100644 plugin/pkg/services/agentpools/azure/flexvm/instances.proto create mode 100644 plugin/pkg/services/agentpools/azure/flexvm/redact.go diff --git a/plugin/pkg/services/agentpools/agentpools.go b/plugin/pkg/services/agentpools/agentpools.go index 5389c60..6b1acaf 100644 --- a/plugin/pkg/services/agentpools/agentpools.go +++ b/plugin/pkg/services/agentpools/agentpools.go @@ -5,6 +5,7 @@ import ( "github.com/Azure/aks-flex/plugin/pkg/server" "github.com/Azure/aks-flex/plugin/pkg/services/agentpools/api" "github.com/Azure/aks-flex/plugin/pkg/services/agentpools/aws/ubuntu2404instance" + "github.com/Azure/aks-flex/plugin/pkg/services/agentpools/azure/flexvm" "github.com/Azure/aks-flex/plugin/pkg/services/agentpools/azure/ubuntu2404vmss" nebiusinstance "github.com/Azure/aks-flex/plugin/pkg/services/agentpools/nebius/instance" ) @@ -27,6 +28,10 @@ func NewAgentPoolsServer(db db.DB) 
api.AgentPoolsServer { return ubuntu2404vmss.NewAgentPoolsServer(srv.DB) }, &ubuntu2404vmss.AgentPool{}) + server.MustRegister(srv.Servers, func() (api.AgentPoolsServer, error) { + return flexvm.NewAgentPoolsServer(srv.DB) + }, &flexvm.AgentPool{}) + server.MustRegister(srv.Servers, func() (api.AgentPoolsServer, error) { return nebiusinstance.NewAgentPoolsServer(srv.DB) }, &nebiusinstance.AgentPool{}) diff --git a/plugin/pkg/services/agentpools/azure/flexvm/agentpools.go b/plugin/pkg/services/agentpools/azure/flexvm/agentpools.go new file mode 100644 index 0000000..5b793e5 --- /dev/null +++ b/plugin/pkg/services/agentpools/azure/flexvm/agentpools.go @@ -0,0 +1,383 @@ +// Package flexvm implements the cross-region Azure VM agent pool service. +// +// This service creates a single Microsoft.Compute/virtualMachines (and a +// dedicated NIC + OS disk) per AgentPool. It is intentionally separate from +// ubuntu2404vmss (which uses a Virtual Machine Scale Set) because per-VM +// management is required for Karpenter's per-NodeClaim lifecycle and for +// straightforward cross-region placement. +// +// Authentication: the plugin process is expected to authenticate via +// DefaultAzureCredential (e.g. a workload identity / managed identity) and +// hold Contributor on the target subscription / resource group / subnet. +// +// Resource naming is fully deterministic from the AgentPool ID (which is the +// NodeClaim name) so retries are idempotent: +// - VM name = <AgentPool ID> +// - NIC name = <AgentPool ID>-nic +// - IP cfg name = ipconfig +// +// The NIC and OS disk are configured with DeleteOption=Delete so a single VM +// delete cascades cleanup; this is critical for idempotency on Karpenter +// retries.
+package flexvm + +import ( + "context" + "encoding/base64" + "errors" + "fmt" + "strings" + + "github.com/Azure/azure-sdk-for-go/sdk/azcore" + "github.com/Azure/azure-sdk-for-go/sdk/azcore/arm" + "github.com/Azure/azure-sdk-for-go/sdk/azcore/to" + "github.com/Azure/azure-sdk-for-go/sdk/azidentity" + "github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/compute/armcompute/v7" + "github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/network/armnetwork/v8" + "google.golang.org/grpc/codes" + "google.golang.org/grpc/status" + "google.golang.org/protobuf/types/known/anypb" + "google.golang.org/protobuf/types/known/timestamppb" + + "github.com/Azure/aks-flex/plugin/api" + "github.com/Azure/aks-flex/plugin/pkg/db" + "github.com/Azure/aks-flex/plugin/pkg/helper" + agentpools "github.com/Azure/aks-flex/plugin/pkg/services/agentpools/api" + "github.com/Azure/aks-flex/plugin/pkg/services/agentpools/userdata/flex" + "github.com/Azure/aks-flex/plugin/pkg/topology" +) + +var _ api.Object = (*AgentPool)(nil) + +// Default DSVM image. SecurityType MUST stay "Standard" — TrustedLaunch is +// known to break the DSVM image (verified during manual H200 bringup). 
+const ( + defaultImagePublisher = "microsoft-dsvm" + defaultImageOffer = "ubuntu-hpc" + defaultImageSKU = "2204" + defaultImageVersion = "latest" + + defaultAdminUsername = "ubuntu" +) + +type agentpoolsServer struct { + agentpools.UnimplementedAgentPoolsServer + storage db.RODB + + credentials azcore.TokenCredential +} + +func NewAgentPoolsServer(storage db.RODB) (agentpools.AgentPoolsServer, error) { + credentials, err := azidentity.NewDefaultAzureCredential(nil) + if err != nil { + return nil, err + } + + return &agentpoolsServer{ + storage: storage, + credentials: credentials, + }, nil +} + +func (srv *agentpoolsServer) CreateOrUpdate(ctx context.Context, req *api.CreateOrUpdateRequest) (*api.CreateOrUpdateResponse, error) { + ap, err := helper.AnyTo[*AgentPool](req.GetItem()) + if err != nil { + return nil, err + } + if err := validateSpec(ap.GetSpec()); err != nil { + return nil, status.Error(codes.InvalidArgument, err.Error()) + } + + spec := ap.GetSpec() + vmName := ap.GetMetadata().GetId() + nicName := vmName + "-nic" + + // Annotate kubeadm node labels with cross-region topology hints. Note: + // region is the *target* region (the VM's region), which may differ + // from the AKS control-plane region — that's the whole point of flexvm. + kubeadmConfig := spec.GetKubeadm() + kubeadmConfig.AddNodeLabels(map[string]string{ + topology.NodeLabelKeyCloud: "azure", + topology.NodeLabelKeyRegion: strings.ToLower(spec.GetLocation()), + topology.NodeLabelKeyInstanceType: strings.ToLower(spec.GetVmSize()), + }) + + userData, err := flex.UserData( + flex.WithKubeadmConfig(kubeadmConfig), + ) + if err != nil { + return nil, fmt.Errorf("rendering flex user data: %w", err) + } + userDataBytes, err := userData.Gzip() + if err != nil { + return nil, fmt.Errorf("gzipping user data: %w", err) + } + userDataB64 := base64.StdEncoding.EncodeToString(userDataBytes) + + // 1. NIC (idempotent: same name → ARM updates in place). 
+ nicsClient, err := armnetwork.NewInterfacesClient(spec.GetSubscriptionId(), srv.credentials, nil) + if err != nil { + return nil, fmt.Errorf("creating NIC client: %w", err) + } + nicParams := armnetwork.Interface{ + Location: to.Ptr(spec.GetLocation()), + Tags: toARMTags(spec.GetTags()), + Properties: &armnetwork.InterfacePropertiesFormat{ + IPConfigurations: []*armnetwork.InterfaceIPConfiguration{ + { + Name: to.Ptr("ipconfig"), + Properties: &armnetwork.InterfaceIPConfigurationPropertiesFormat{ + Subnet: &armnetwork.Subnet{ID: to.Ptr(spec.GetSubnetId())}, + PrivateIPAllocationMethod: to.Ptr(armnetwork.IPAllocationMethodDynamic), + }, + }, + }, + }, + } + if spec.GetAllocatePublicIp() { + // Phase 1: skip explicit PIP creation — leave a TODO. Per-NodeClass + // public IP is deferred; documented in CRD. + // (Falls through to private-only NIC.) + _ = nicParams // satisfy linter + } + nicPoller, err := nicsClient.BeginCreateOrUpdate(ctx, spec.GetResourceGroup(), nicName, nicParams, nil) + if err != nil { + return nil, fmt.Errorf("creating NIC %q: %w", nicName, err) + } + nicResp, err := nicPoller.PollUntilDone(ctx, nil) + if err != nil { + return nil, fmt.Errorf("polling NIC creation %q: %w", nicName, err) + } + nicID := *nicResp.ID + + // 2. VM. NIC + OS disk both set DeleteOption=Delete so a single VM + // delete cascades — this is critical for Karpenter retry idempotency. 
+ vmsClient, err := armcompute.NewVirtualMachinesClient(spec.GetSubscriptionId(), srv.credentials, nil) + if err != nil { + return nil, fmt.Errorf("creating VM client: %w", err) + } + imageRef, err := buildImageReference(spec) + if err != nil { + return nil, status.Error(codes.InvalidArgument, err.Error()) + } + osDiskSizeGB := spec.GetOsDiskSizeGb() + if osDiskSizeGB == 0 { + osDiskSizeGB = 128 + } + + vmParams := armcompute.VirtualMachine{ + Location: to.Ptr(spec.GetLocation()), + Tags: toARMTags(spec.GetTags()), + Properties: &armcompute.VirtualMachineProperties{ + HardwareProfile: &armcompute.HardwareProfile{ + VMSize: to.Ptr(armcompute.VirtualMachineSizeTypes(spec.GetVmSize())), + }, + SecurityProfile: &armcompute.SecurityProfile{ + // Standard only — TrustedLaunch is deferred (breaks DSVM). + SecurityType: nil, + }, + NetworkProfile: &armcompute.NetworkProfile{ + NetworkInterfaces: []*armcompute.NetworkInterfaceReference{ + { + ID: to.Ptr(nicID), + Properties: &armcompute.NetworkInterfaceReferenceProperties{ + Primary: to.Ptr(true), + DeleteOption: to.Ptr(armcompute.DeleteOptionsDelete), + }, + }, + }, + }, + OSProfile: &armcompute.OSProfile{ + ComputerName: to.Ptr(vmName), + AdminUsername: to.Ptr(defaultAdminUsername), + LinuxConfiguration: &armcompute.LinuxConfiguration{ + DisablePasswordAuthentication: to.Ptr(true), + SSH: buildSSHConfig(spec.GetSshPublicKeys()), + }, + }, + StorageProfile: &armcompute.StorageProfile{ + ImageReference: imageRef, + OSDisk: &armcompute.OSDisk{ + CreateOption: to.Ptr(armcompute.DiskCreateOptionTypesFromImage), + Caching: to.Ptr(armcompute.CachingTypesReadWrite), + DiskSizeGB: to.Ptr(osDiskSizeGB), + DeleteOption: to.Ptr(armcompute.DiskDeleteOptionTypesDelete), + ManagedDisk: &armcompute.ManagedDiskParameters{ + StorageAccountType: to.Ptr(armcompute.StorageAccountTypesPremiumLRS), + }, + }, + }, + // UserData (NOT customData): the bootstrap renderer expects to read + // from the IMDS userData endpoint. 
Mirrors the ubuntu2404vmss path. + UserData: to.Ptr(userDataB64), + }, + } + + vmPoller, err := vmsClient.BeginCreateOrUpdate(ctx, spec.GetResourceGroup(), vmName, vmParams, nil) + if err != nil { + // Best-effort NIC cleanup if VM create kicked back synchronously. + _, _ = nicsClient.BeginDelete(ctx, spec.GetResourceGroup(), nicName, nil) + return nil, fmt.Errorf("creating VM %q: %w", vmName, err) + } + vmResp, err := vmPoller.PollUntilDone(ctx, nil) + if err != nil { + return nil, fmt.Errorf("polling VM creation %q: %w", vmName, err) + } + + ap.SetStatus(AgentPoolStatus_builder{ + VmResourceId: vmResp.ID, + CreatedAt: timestamppb.Now(), + }.Build()) + + item, err := anypb.New(ap) + if err != nil { + return nil, err + } + + return api.CreateOrUpdateResponse_builder{ + Item: item, + }.Build(), nil +} + +func (srv *agentpoolsServer) Delete(ctx context.Context, req *api.DeleteRequest) (*api.DeleteResponse, error) { + obj, ok := srv.storage.Get(req.GetId()) + if !ok { + return api.DeleteResponse_builder{}.Build(), nil + } + + ap, err := helper.To[*AgentPool](obj) + if err != nil { + return nil, err + } + spec := ap.GetSpec() + + vmName := ap.GetMetadata().GetId() + nicName := vmName + "-nic" + + // Delete VM first; NIC + OS disk cascade because we set DeleteOption=Delete on create. 
+ vmsClient, err := armcompute.NewVirtualMachinesClient(spec.GetSubscriptionId(), srv.credentials, nil) + if err != nil { + return nil, fmt.Errorf("creating VM client: %w", err) + } + vmPoller, err := vmsClient.BeginDelete(ctx, spec.GetResourceGroup(), vmName, &armcompute.VirtualMachinesClientBeginDeleteOptions{ + ForceDeletion: to.Ptr(true), + }) + if err != nil && !isNotFound(err) { + return nil, fmt.Errorf("starting VM delete %q: %w", vmName, err) + } + if vmPoller != nil { + if _, err := vmPoller.PollUntilDone(ctx, nil); err != nil && !isNotFound(err) { + return nil, fmt.Errorf("polling VM delete %q: %w", vmName, err) + } + } + + // Best-effort NIC delete in case the VM never made it to a state where + // DeleteOption applied (e.g. failed mid-create). Idempotent. + nicsClient, err := armnetwork.NewInterfacesClient(spec.GetSubscriptionId(), srv.credentials, nil) + if err != nil { + return nil, fmt.Errorf("creating NIC client: %w", err) + } + nicPoller, err := nicsClient.BeginDelete(ctx, spec.GetResourceGroup(), nicName, nil) + if err != nil && !isNotFound(err) { + return nil, fmt.Errorf("starting NIC delete %q: %w", nicName, err) + } + if nicPoller != nil { + if _, err := nicPoller.PollUntilDone(ctx, nil); err != nil && !isNotFound(err) { + return nil, fmt.Errorf("polling NIC delete %q: %w", nicName, err) + } + } + + return api.DeleteResponse_builder{}.Build(), nil +} + +// validateSpec fails fast with the cheap structural checks that don't need an Azure round-trip. 
+func validateSpec(spec *AgentPoolSpec) error { + if spec.GetSubscriptionId() == "" { + return errors.New("subscription_id is required") + } + if spec.GetResourceGroup() == "" { + return errors.New("resource_group is required") + } + if spec.GetLocation() == "" { + return errors.New("location is required") + } + if spec.GetSubnetId() == "" { + return errors.New("subnet_id is required") + } + if _, err := arm.ParseResourceID(spec.GetSubnetId()); err != nil { + return fmt.Errorf("subnet_id %q is not a valid ARM resource id: %w", spec.GetSubnetId(), err) + } + if spec.GetVmSize() == "" { + return errors.New("vm_size is required") + } + if spec.GetImageId() != "" && spec.GetImageReference() != nil { + return errors.New("image_id and image_reference are mutually exclusive") + } + if st := spec.GetSecurityType(); st != "" && st != "Standard" { + return fmt.Errorf("unsupported security_type %q (only Standard is supported in Phase 1)", st) + } + return nil +} + +func buildImageReference(spec *AgentPoolSpec) (*armcompute.ImageReference, error) { + if spec.GetImageId() != "" { + return &armcompute.ImageReference{ + ID: to.Ptr(spec.GetImageId()), + }, nil + } + ref := spec.GetImageReference() + if ref == nil { + return &armcompute.ImageReference{ + Publisher: to.Ptr(defaultImagePublisher), + Offer: to.Ptr(defaultImageOffer), + SKU: to.Ptr(defaultImageSKU), + Version: to.Ptr(defaultImageVersion), + }, nil + } + if ref.GetPublisher() == "" || ref.GetOffer() == "" || ref.GetSku() == "" { + return nil, errors.New("image_reference requires publisher, offer, and sku") + } + version := ref.GetVersion() + if version == "" { + version = defaultImageVersion + } + return &armcompute.ImageReference{ + Publisher: to.Ptr(ref.GetPublisher()), + Offer: to.Ptr(ref.GetOffer()), + SKU: to.Ptr(ref.GetSku()), + Version: to.Ptr(version), + }, nil +} + +func buildSSHConfig(keys []string) *armcompute.SSHConfiguration { + if len(keys) == 0 { + return nil + } + pks := make([]*armcompute.SSHPublicKey, 
0, len(keys)) + for _, k := range keys { + pks = append(pks, &armcompute.SSHPublicKey{ + Path: to.Ptr("/home/" + defaultAdminUsername + "/.ssh/authorized_keys"), + KeyData: to.Ptr(k), + }) + } + return &armcompute.SSHConfiguration{PublicKeys: pks} +} + +func toARMTags(tags map[string]string) map[string]*string { + if len(tags) == 0 { + return nil + } + out := make(map[string]*string, len(tags)) + for k, v := range tags { + out[k] = to.Ptr(v) + } + return out +} + +func isNotFound(err error) bool { + var rerr *azcore.ResponseError + if errors.As(err, &rerr) { + return rerr.StatusCode == 404 + } + return false +} diff --git a/plugin/pkg/services/agentpools/azure/flexvm/agentpools.pb.go b/plugin/pkg/services/agentpools/azure/flexvm/agentpools.pb.go new file mode 100644 index 0000000..16282f4 --- /dev/null +++ b/plugin/pkg/services/agentpools/azure/flexvm/agentpools.pb.go @@ -0,0 +1,941 @@ +// Code generated by protoc-gen-go. DO NOT EDIT. +// versions: +// protoc-gen-go v1.36.10 +// protoc v6.33.0 +// source: plugin/pkg/services/agentpools/azure/flexvm/agentpools.proto + +package flexvm + +import ( + api "github.com/Azure/aks-flex/plugin/api" + kubeadm "github.com/Azure/aks-flex/plugin/pkg/services/agentpools/api/features/kubeadm" + protoreflect "google.golang.org/protobuf/reflect/protoreflect" + protoimpl "google.golang.org/protobuf/runtime/protoimpl" + timestamppb "google.golang.org/protobuf/types/known/timestamppb" + reflect "reflect" + unsafe "unsafe" +) + +const ( + // Verify that this generated code is sufficiently up-to-date. + _ = protoimpl.EnforceVersion(20 - protoimpl.MinVersion) + // Verify that runtime/protoimpl is sufficiently up-to-date. + _ = protoimpl.EnforceVersion(protoimpl.MaxVersion - 20) +) + +// AgentPool models a single cross-region Azure VM. +// +// Unlike the ubuntu2404vmss service which manages a Virtual Machine Scale Set, +// flexvm creates one Microsoft.Compute/virtualMachines per AgentPool. 
This is +// suitable for cross-region placement and matches Karpenter's per-NodeClaim +// resource lifecycle. +// +// The plugin process is assumed to authenticate via DefaultAzureCredential and +// hold Contributor on the target subscription / resource group / subnet. +type AgentPool struct { + state protoimpl.MessageState `protogen:"opaque.v1"` + xxx_hidden_Metadata *api.Metadata `protobuf:"bytes,1,opt,name=metadata"` + xxx_hidden_Spec *AgentPoolSpec `protobuf:"bytes,2,opt,name=spec"` + xxx_hidden_Status *AgentPoolStatus `protobuf:"bytes,3,opt,name=status"` + unknownFields protoimpl.UnknownFields + sizeCache protoimpl.SizeCache +} + +func (x *AgentPool) Reset() { + *x = AgentPool{} + mi := &file_plugin_pkg_services_agentpools_azure_flexvm_agentpools_proto_msgTypes[0] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) +} + +func (x *AgentPool) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*AgentPool) ProtoMessage() {} + +func (x *AgentPool) ProtoReflect() protoreflect.Message { + mi := &file_plugin_pkg_services_agentpools_azure_flexvm_agentpools_proto_msgTypes[0] + if x != nil { + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return mi.MessageOf(x) +} + +func (x *AgentPool) GetMetadata() *api.Metadata { + if x != nil { + return x.xxx_hidden_Metadata + } + return nil +} + +func (x *AgentPool) GetSpec() *AgentPoolSpec { + if x != nil { + return x.xxx_hidden_Spec + } + return nil +} + +func (x *AgentPool) GetStatus() *AgentPoolStatus { + if x != nil { + return x.xxx_hidden_Status + } + return nil +} + +func (x *AgentPool) SetMetadata(v *api.Metadata) { + x.xxx_hidden_Metadata = v +} + +func (x *AgentPool) SetSpec(v *AgentPoolSpec) { + x.xxx_hidden_Spec = v +} + +func (x *AgentPool) SetStatus(v *AgentPoolStatus) { + x.xxx_hidden_Status = v +} + +func (x *AgentPool) HasMetadata() bool { + if x == nil { + return false + } + 
return x.xxx_hidden_Metadata != nil +} + +func (x *AgentPool) HasSpec() bool { + if x == nil { + return false + } + return x.xxx_hidden_Spec != nil +} + +func (x *AgentPool) HasStatus() bool { + if x == nil { + return false + } + return x.xxx_hidden_Status != nil +} + +func (x *AgentPool) ClearMetadata() { + x.xxx_hidden_Metadata = nil +} + +func (x *AgentPool) ClearSpec() { + x.xxx_hidden_Spec = nil +} + +func (x *AgentPool) ClearStatus() { + x.xxx_hidden_Status = nil +} + +type AgentPool_builder struct { + _ [0]func() // Prevents comparability and use of unkeyed literals for the builder. + + Metadata *api.Metadata + Spec *AgentPoolSpec + Status *AgentPoolStatus +} + +func (b0 AgentPool_builder) Build() *AgentPool { + m0 := &AgentPool{} + b, x := &b0, m0 + _, _ = b, x + x.xxx_hidden_Metadata = b.Metadata + x.xxx_hidden_Spec = b.Spec + x.xxx_hidden_Status = b.Status + return m0 +} + +type AgentPoolSpec struct { + state protoimpl.MessageState `protogen:"opaque.v1"` + xxx_hidden_SubscriptionId *string `protobuf:"bytes,1,opt,name=subscription_id,json=subscriptionId"` + xxx_hidden_ResourceGroup *string `protobuf:"bytes,2,opt,name=resource_group,json=resourceGroup"` + xxx_hidden_Location *string `protobuf:"bytes,3,opt,name=location"` + xxx_hidden_SubnetId *string `protobuf:"bytes,4,opt,name=subnet_id,json=subnetId"` + xxx_hidden_VmSize *string `protobuf:"bytes,5,opt,name=vm_size,json=vmSize"` + xxx_hidden_ImageReference *ImageReference `protobuf:"bytes,6,opt,name=image_reference,json=imageReference"` + xxx_hidden_ImageId *string `protobuf:"bytes,7,opt,name=image_id,json=imageId"` + xxx_hidden_SecurityType *string `protobuf:"bytes,8,opt,name=security_type,json=securityType"` + xxx_hidden_OsDiskSizeGb int32 `protobuf:"varint,9,opt,name=os_disk_size_gb,json=osDiskSizeGb"` + xxx_hidden_SshPublicKeys []string `protobuf:"bytes,10,rep,name=ssh_public_keys,json=sshPublicKeys"` + xxx_hidden_AllocatePublicIp bool 
`protobuf:"varint,11,opt,name=allocate_public_ip,json=allocatePublicIp"` + xxx_hidden_Tags map[string]string `protobuf:"bytes,12,rep,name=tags" protobuf_key:"bytes,1,opt,name=key" protobuf_val:"bytes,2,opt,name=value"` + xxx_hidden_Kubeadm *kubeadm.Config `protobuf:"bytes,13,opt,name=kubeadm"` + XXX_raceDetectHookData protoimpl.RaceDetectHookData + XXX_presence [1]uint32 + unknownFields protoimpl.UnknownFields + sizeCache protoimpl.SizeCache +} + +func (x *AgentPoolSpec) Reset() { + *x = AgentPoolSpec{} + mi := &file_plugin_pkg_services_agentpools_azure_flexvm_agentpools_proto_msgTypes[1] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) +} + +func (x *AgentPoolSpec) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*AgentPoolSpec) ProtoMessage() {} + +func (x *AgentPoolSpec) ProtoReflect() protoreflect.Message { + mi := &file_plugin_pkg_services_agentpools_azure_flexvm_agentpools_proto_msgTypes[1] + if x != nil { + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return mi.MessageOf(x) +} + +func (x *AgentPoolSpec) GetSubscriptionId() string { + if x != nil { + if x.xxx_hidden_SubscriptionId != nil { + return *x.xxx_hidden_SubscriptionId + } + return "" + } + return "" +} + +func (x *AgentPoolSpec) GetResourceGroup() string { + if x != nil { + if x.xxx_hidden_ResourceGroup != nil { + return *x.xxx_hidden_ResourceGroup + } + return "" + } + return "" +} + +func (x *AgentPoolSpec) GetLocation() string { + if x != nil { + if x.xxx_hidden_Location != nil { + return *x.xxx_hidden_Location + } + return "" + } + return "" +} + +func (x *AgentPoolSpec) GetSubnetId() string { + if x != nil { + if x.xxx_hidden_SubnetId != nil { + return *x.xxx_hidden_SubnetId + } + return "" + } + return "" +} + +func (x *AgentPoolSpec) GetVmSize() string { + if x != nil { + if x.xxx_hidden_VmSize != nil { + return *x.xxx_hidden_VmSize + } + return "" 
+ } + return "" +} + +func (x *AgentPoolSpec) GetImageReference() *ImageReference { + if x != nil { + return x.xxx_hidden_ImageReference + } + return nil +} + +func (x *AgentPoolSpec) GetImageId() string { + if x != nil { + if x.xxx_hidden_ImageId != nil { + return *x.xxx_hidden_ImageId + } + return "" + } + return "" +} + +func (x *AgentPoolSpec) GetSecurityType() string { + if x != nil { + if x.xxx_hidden_SecurityType != nil { + return *x.xxx_hidden_SecurityType + } + return "" + } + return "" +} + +func (x *AgentPoolSpec) GetOsDiskSizeGb() int32 { + if x != nil { + return x.xxx_hidden_OsDiskSizeGb + } + return 0 +} + +func (x *AgentPoolSpec) GetSshPublicKeys() []string { + if x != nil { + return x.xxx_hidden_SshPublicKeys + } + return nil +} + +func (x *AgentPoolSpec) GetAllocatePublicIp() bool { + if x != nil { + return x.xxx_hidden_AllocatePublicIp + } + return false +} + +func (x *AgentPoolSpec) GetTags() map[string]string { + if x != nil { + return x.xxx_hidden_Tags + } + return nil +} + +func (x *AgentPoolSpec) GetKubeadm() *kubeadm.Config { + if x != nil { + return x.xxx_hidden_Kubeadm + } + return nil +} + +func (x *AgentPoolSpec) SetSubscriptionId(v string) { + x.xxx_hidden_SubscriptionId = &v + protoimpl.X.SetPresent(&(x.XXX_presence[0]), 0, 13) +} + +func (x *AgentPoolSpec) SetResourceGroup(v string) { + x.xxx_hidden_ResourceGroup = &v + protoimpl.X.SetPresent(&(x.XXX_presence[0]), 1, 13) +} + +func (x *AgentPoolSpec) SetLocation(v string) { + x.xxx_hidden_Location = &v + protoimpl.X.SetPresent(&(x.XXX_presence[0]), 2, 13) +} + +func (x *AgentPoolSpec) SetSubnetId(v string) { + x.xxx_hidden_SubnetId = &v + protoimpl.X.SetPresent(&(x.XXX_presence[0]), 3, 13) +} + +func (x *AgentPoolSpec) SetVmSize(v string) { + x.xxx_hidden_VmSize = &v + protoimpl.X.SetPresent(&(x.XXX_presence[0]), 4, 13) +} + +func (x *AgentPoolSpec) SetImageReference(v *ImageReference) { + x.xxx_hidden_ImageReference = v +} + +func (x *AgentPoolSpec) SetImageId(v string) { + 
x.xxx_hidden_ImageId = &v + protoimpl.X.SetPresent(&(x.XXX_presence[0]), 6, 13) +} + +func (x *AgentPoolSpec) SetSecurityType(v string) { + x.xxx_hidden_SecurityType = &v + protoimpl.X.SetPresent(&(x.XXX_presence[0]), 7, 13) +} + +func (x *AgentPoolSpec) SetOsDiskSizeGb(v int32) { + x.xxx_hidden_OsDiskSizeGb = v + protoimpl.X.SetPresent(&(x.XXX_presence[0]), 8, 13) +} + +func (x *AgentPoolSpec) SetSshPublicKeys(v []string) { + x.xxx_hidden_SshPublicKeys = v +} + +func (x *AgentPoolSpec) SetAllocatePublicIp(v bool) { + x.xxx_hidden_AllocatePublicIp = v + protoimpl.X.SetPresent(&(x.XXX_presence[0]), 10, 13) +} + +func (x *AgentPoolSpec) SetTags(v map[string]string) { + x.xxx_hidden_Tags = v +} + +func (x *AgentPoolSpec) SetKubeadm(v *kubeadm.Config) { + x.xxx_hidden_Kubeadm = v +} + +func (x *AgentPoolSpec) HasSubscriptionId() bool { + if x == nil { + return false + } + return protoimpl.X.Present(&(x.XXX_presence[0]), 0) +} + +func (x *AgentPoolSpec) HasResourceGroup() bool { + if x == nil { + return false + } + return protoimpl.X.Present(&(x.XXX_presence[0]), 1) +} + +func (x *AgentPoolSpec) HasLocation() bool { + if x == nil { + return false + } + return protoimpl.X.Present(&(x.XXX_presence[0]), 2) +} + +func (x *AgentPoolSpec) HasSubnetId() bool { + if x == nil { + return false + } + return protoimpl.X.Present(&(x.XXX_presence[0]), 3) +} + +func (x *AgentPoolSpec) HasVmSize() bool { + if x == nil { + return false + } + return protoimpl.X.Present(&(x.XXX_presence[0]), 4) +} + +func (x *AgentPoolSpec) HasImageReference() bool { + if x == nil { + return false + } + return x.xxx_hidden_ImageReference != nil +} + +func (x *AgentPoolSpec) HasImageId() bool { + if x == nil { + return false + } + return protoimpl.X.Present(&(x.XXX_presence[0]), 6) +} + +func (x *AgentPoolSpec) HasSecurityType() bool { + if x == nil { + return false + } + return protoimpl.X.Present(&(x.XXX_presence[0]), 7) +} + +func (x *AgentPoolSpec) HasOsDiskSizeGb() bool { + if x == nil { + return 
false + } + return protoimpl.X.Present(&(x.XXX_presence[0]), 8) +} + +func (x *AgentPoolSpec) HasAllocatePublicIp() bool { + if x == nil { + return false + } + return protoimpl.X.Present(&(x.XXX_presence[0]), 10) +} + +func (x *AgentPoolSpec) HasKubeadm() bool { + if x == nil { + return false + } + return x.xxx_hidden_Kubeadm != nil +} + +func (x *AgentPoolSpec) ClearSubscriptionId() { + protoimpl.X.ClearPresent(&(x.XXX_presence[0]), 0) + x.xxx_hidden_SubscriptionId = nil +} + +func (x *AgentPoolSpec) ClearResourceGroup() { + protoimpl.X.ClearPresent(&(x.XXX_presence[0]), 1) + x.xxx_hidden_ResourceGroup = nil +} + +func (x *AgentPoolSpec) ClearLocation() { + protoimpl.X.ClearPresent(&(x.XXX_presence[0]), 2) + x.xxx_hidden_Location = nil +} + +func (x *AgentPoolSpec) ClearSubnetId() { + protoimpl.X.ClearPresent(&(x.XXX_presence[0]), 3) + x.xxx_hidden_SubnetId = nil +} + +func (x *AgentPoolSpec) ClearVmSize() { + protoimpl.X.ClearPresent(&(x.XXX_presence[0]), 4) + x.xxx_hidden_VmSize = nil +} + +func (x *AgentPoolSpec) ClearImageReference() { + x.xxx_hidden_ImageReference = nil +} + +func (x *AgentPoolSpec) ClearImageId() { + protoimpl.X.ClearPresent(&(x.XXX_presence[0]), 6) + x.xxx_hidden_ImageId = nil +} + +func (x *AgentPoolSpec) ClearSecurityType() { + protoimpl.X.ClearPresent(&(x.XXX_presence[0]), 7) + x.xxx_hidden_SecurityType = nil +} + +func (x *AgentPoolSpec) ClearOsDiskSizeGb() { + protoimpl.X.ClearPresent(&(x.XXX_presence[0]), 8) + x.xxx_hidden_OsDiskSizeGb = 0 +} + +func (x *AgentPoolSpec) ClearAllocatePublicIp() { + protoimpl.X.ClearPresent(&(x.XXX_presence[0]), 10) + x.xxx_hidden_AllocatePublicIp = false +} + +func (x *AgentPoolSpec) ClearKubeadm() { + x.xxx_hidden_Kubeadm = nil +} + +type AgentPoolSpec_builder struct { + _ [0]func() // Prevents comparability and use of unkeyed literals for the builder. + + SubscriptionId *string + ResourceGroup *string + Location *string + // Full ARM resource ID of the subnet (must already exist). 
+ SubnetId *string + // VM SKU, e.g. "Standard_ND96isr_H200_v5". + VmSize *string + // ImageReference selects an Azure Marketplace image. + // Mutually exclusive with image_id. + ImageReference *ImageReference + // SIG / community gallery image resource ID. + // Mutually exclusive with image_reference. + ImageId *string + // Currently only "Standard" is supported. TrustedLaunch is deferred — + // it has been observed to break the DSVM image during manual H200 bringup. + SecurityType *string + OsDiskSizeGb *int32 + SshPublicKeys []string + AllocatePublicIp *bool + Tags map[string]string + Kubeadm *kubeadm.Config +} + +func (b0 AgentPoolSpec_builder) Build() *AgentPoolSpec { + m0 := &AgentPoolSpec{} + b, x := &b0, m0 + _, _ = b, x + if b.SubscriptionId != nil { + protoimpl.X.SetPresentNonAtomic(&(x.XXX_presence[0]), 0, 13) + x.xxx_hidden_SubscriptionId = b.SubscriptionId + } + if b.ResourceGroup != nil { + protoimpl.X.SetPresentNonAtomic(&(x.XXX_presence[0]), 1, 13) + x.xxx_hidden_ResourceGroup = b.ResourceGroup + } + if b.Location != nil { + protoimpl.X.SetPresentNonAtomic(&(x.XXX_presence[0]), 2, 13) + x.xxx_hidden_Location = b.Location + } + if b.SubnetId != nil { + protoimpl.X.SetPresentNonAtomic(&(x.XXX_presence[0]), 3, 13) + x.xxx_hidden_SubnetId = b.SubnetId + } + if b.VmSize != nil { + protoimpl.X.SetPresentNonAtomic(&(x.XXX_presence[0]), 4, 13) + x.xxx_hidden_VmSize = b.VmSize + } + x.xxx_hidden_ImageReference = b.ImageReference + if b.ImageId != nil { + protoimpl.X.SetPresentNonAtomic(&(x.XXX_presence[0]), 6, 13) + x.xxx_hidden_ImageId = b.ImageId + } + if b.SecurityType != nil { + protoimpl.X.SetPresentNonAtomic(&(x.XXX_presence[0]), 7, 13) + x.xxx_hidden_SecurityType = b.SecurityType + } + if b.OsDiskSizeGb != nil { + protoimpl.X.SetPresentNonAtomic(&(x.XXX_presence[0]), 8, 13) + x.xxx_hidden_OsDiskSizeGb = *b.OsDiskSizeGb + } + x.xxx_hidden_SshPublicKeys = b.SshPublicKeys + if b.AllocatePublicIp != nil { + 
protoimpl.X.SetPresentNonAtomic(&(x.XXX_presence[0]), 10, 13) + x.xxx_hidden_AllocatePublicIp = *b.AllocatePublicIp + } + x.xxx_hidden_Tags = b.Tags + x.xxx_hidden_Kubeadm = b.Kubeadm + return m0 +} + +type ImageReference struct { + state protoimpl.MessageState `protogen:"opaque.v1"` + xxx_hidden_Publisher *string `protobuf:"bytes,1,opt,name=publisher"` + xxx_hidden_Offer *string `protobuf:"bytes,2,opt,name=offer"` + xxx_hidden_Sku *string `protobuf:"bytes,3,opt,name=sku"` + xxx_hidden_Version *string `protobuf:"bytes,4,opt,name=version"` + XXX_raceDetectHookData protoimpl.RaceDetectHookData + XXX_presence [1]uint32 + unknownFields protoimpl.UnknownFields + sizeCache protoimpl.SizeCache +} + +func (x *ImageReference) Reset() { + *x = ImageReference{} + mi := &file_plugin_pkg_services_agentpools_azure_flexvm_agentpools_proto_msgTypes[2] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) +} + +func (x *ImageReference) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*ImageReference) ProtoMessage() {} + +func (x *ImageReference) ProtoReflect() protoreflect.Message { + mi := &file_plugin_pkg_services_agentpools_azure_flexvm_agentpools_proto_msgTypes[2] + if x != nil { + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return mi.MessageOf(x) +} + +func (x *ImageReference) GetPublisher() string { + if x != nil { + if x.xxx_hidden_Publisher != nil { + return *x.xxx_hidden_Publisher + } + return "" + } + return "" +} + +func (x *ImageReference) GetOffer() string { + if x != nil { + if x.xxx_hidden_Offer != nil { + return *x.xxx_hidden_Offer + } + return "" + } + return "" +} + +func (x *ImageReference) GetSku() string { + if x != nil { + if x.xxx_hidden_Sku != nil { + return *x.xxx_hidden_Sku + } + return "" + } + return "" +} + +func (x *ImageReference) GetVersion() string { + if x != nil { + if x.xxx_hidden_Version != nil { + 
return *x.xxx_hidden_Version + } + return "" + } + return "" +} + +func (x *ImageReference) SetPublisher(v string) { + x.xxx_hidden_Publisher = &v + protoimpl.X.SetPresent(&(x.XXX_presence[0]), 0, 4) +} + +func (x *ImageReference) SetOffer(v string) { + x.xxx_hidden_Offer = &v + protoimpl.X.SetPresent(&(x.XXX_presence[0]), 1, 4) +} + +func (x *ImageReference) SetSku(v string) { + x.xxx_hidden_Sku = &v + protoimpl.X.SetPresent(&(x.XXX_presence[0]), 2, 4) +} + +func (x *ImageReference) SetVersion(v string) { + x.xxx_hidden_Version = &v + protoimpl.X.SetPresent(&(x.XXX_presence[0]), 3, 4) +} + +func (x *ImageReference) HasPublisher() bool { + if x == nil { + return false + } + return protoimpl.X.Present(&(x.XXX_presence[0]), 0) +} + +func (x *ImageReference) HasOffer() bool { + if x == nil { + return false + } + return protoimpl.X.Present(&(x.XXX_presence[0]), 1) +} + +func (x *ImageReference) HasSku() bool { + if x == nil { + return false + } + return protoimpl.X.Present(&(x.XXX_presence[0]), 2) +} + +func (x *ImageReference) HasVersion() bool { + if x == nil { + return false + } + return protoimpl.X.Present(&(x.XXX_presence[0]), 3) +} + +func (x *ImageReference) ClearPublisher() { + protoimpl.X.ClearPresent(&(x.XXX_presence[0]), 0) + x.xxx_hidden_Publisher = nil +} + +func (x *ImageReference) ClearOffer() { + protoimpl.X.ClearPresent(&(x.XXX_presence[0]), 1) + x.xxx_hidden_Offer = nil +} + +func (x *ImageReference) ClearSku() { + protoimpl.X.ClearPresent(&(x.XXX_presence[0]), 2) + x.xxx_hidden_Sku = nil +} + +func (x *ImageReference) ClearVersion() { + protoimpl.X.ClearPresent(&(x.XXX_presence[0]), 3) + x.xxx_hidden_Version = nil +} + +type ImageReference_builder struct { + _ [0]func() // Prevents comparability and use of unkeyed literals for the builder. 
+ + Publisher *string + Offer *string + Sku *string + Version *string +} + +func (b0 ImageReference_builder) Build() *ImageReference { + m0 := &ImageReference{} + b, x := &b0, m0 + _, _ = b, x + if b.Publisher != nil { + protoimpl.X.SetPresentNonAtomic(&(x.XXX_presence[0]), 0, 4) + x.xxx_hidden_Publisher = b.Publisher + } + if b.Offer != nil { + protoimpl.X.SetPresentNonAtomic(&(x.XXX_presence[0]), 1, 4) + x.xxx_hidden_Offer = b.Offer + } + if b.Sku != nil { + protoimpl.X.SetPresentNonAtomic(&(x.XXX_presence[0]), 2, 4) + x.xxx_hidden_Sku = b.Sku + } + if b.Version != nil { + protoimpl.X.SetPresentNonAtomic(&(x.XXX_presence[0]), 3, 4) + x.xxx_hidden_Version = b.Version + } + return m0 +} + +type AgentPoolStatus struct { + state protoimpl.MessageState `protogen:"opaque.v1"` + xxx_hidden_VmResourceId *string `protobuf:"bytes,1,opt,name=vm_resource_id,json=vmResourceId"` + xxx_hidden_CreatedAt *timestamppb.Timestamp `protobuf:"bytes,2,opt,name=created_at,json=createdAt"` + XXX_raceDetectHookData protoimpl.RaceDetectHookData + XXX_presence [1]uint32 + unknownFields protoimpl.UnknownFields + sizeCache protoimpl.SizeCache +} + +func (x *AgentPoolStatus) Reset() { + *x = AgentPoolStatus{} + mi := &file_plugin_pkg_services_agentpools_azure_flexvm_agentpools_proto_msgTypes[3] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) +} + +func (x *AgentPoolStatus) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*AgentPoolStatus) ProtoMessage() {} + +func (x *AgentPoolStatus) ProtoReflect() protoreflect.Message { + mi := &file_plugin_pkg_services_agentpools_azure_flexvm_agentpools_proto_msgTypes[3] + if x != nil { + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return mi.MessageOf(x) +} + +func (x *AgentPoolStatus) GetVmResourceId() string { + if x != nil { + if x.xxx_hidden_VmResourceId != nil { + return *x.xxx_hidden_VmResourceId + } + 
return "" + } + return "" +} + +func (x *AgentPoolStatus) GetCreatedAt() *timestamppb.Timestamp { + if x != nil { + return x.xxx_hidden_CreatedAt + } + return nil +} + +func (x *AgentPoolStatus) SetVmResourceId(v string) { + x.xxx_hidden_VmResourceId = &v + protoimpl.X.SetPresent(&(x.XXX_presence[0]), 0, 2) +} + +func (x *AgentPoolStatus) SetCreatedAt(v *timestamppb.Timestamp) { + x.xxx_hidden_CreatedAt = v +} + +func (x *AgentPoolStatus) HasVmResourceId() bool { + if x == nil { + return false + } + return protoimpl.X.Present(&(x.XXX_presence[0]), 0) +} + +func (x *AgentPoolStatus) HasCreatedAt() bool { + if x == nil { + return false + } + return x.xxx_hidden_CreatedAt != nil +} + +func (x *AgentPoolStatus) ClearVmResourceId() { + protoimpl.X.ClearPresent(&(x.XXX_presence[0]), 0) + x.xxx_hidden_VmResourceId = nil +} + +func (x *AgentPoolStatus) ClearCreatedAt() { + x.xxx_hidden_CreatedAt = nil +} + +type AgentPoolStatus_builder struct { + _ [0]func() // Prevents comparability and use of unkeyed literals for the builder. + + // Full ARM resource ID of the created VM. 
+ VmResourceId *string + CreatedAt *timestamppb.Timestamp +} + +func (b0 AgentPoolStatus_builder) Build() *AgentPoolStatus { + m0 := &AgentPoolStatus{} + b, x := &b0, m0 + _, _ = b, x + if b.VmResourceId != nil { + protoimpl.X.SetPresentNonAtomic(&(x.XXX_presence[0]), 0, 2) + x.xxx_hidden_VmResourceId = b.VmResourceId + } + x.xxx_hidden_CreatedAt = b.CreatedAt + return m0 +} + +var File_plugin_pkg_services_agentpools_azure_flexvm_agentpools_proto protoreflect.FileDescriptor + +const file_plugin_pkg_services_agentpools_azure_flexvm_agentpools_proto_rawDesc = "" + + "\n" + + " api.Metadata + 1, // 1: agentpools.azure.flexvm.AgentPool.spec:type_name -> agentpools.azure.flexvm.AgentPoolSpec + 3, // 2: agentpools.azure.flexvm.AgentPool.status:type_name -> agentpools.azure.flexvm.AgentPoolStatus + 2, // 3: agentpools.azure.flexvm.AgentPoolSpec.image_reference:type_name -> agentpools.azure.flexvm.ImageReference + 4, // 4: agentpools.azure.flexvm.AgentPoolSpec.tags:type_name -> agentpools.azure.flexvm.AgentPoolSpec.TagsEntry + 6, // 5: agentpools.azure.flexvm.AgentPoolSpec.kubeadm:type_name -> kubeadm.Config + 7, // 6: agentpools.azure.flexvm.AgentPoolStatus.created_at:type_name -> google.protobuf.Timestamp + 7, // [7:7] is the sub-list for method output_type + 7, // [7:7] is the sub-list for method input_type + 7, // [7:7] is the sub-list for extension type_name + 7, // [7:7] is the sub-list for extension extendee + 0, // [0:7] is the sub-list for field type_name +} + +func init() { file_plugin_pkg_services_agentpools_azure_flexvm_agentpools_proto_init() } +func file_plugin_pkg_services_agentpools_azure_flexvm_agentpools_proto_init() { + if File_plugin_pkg_services_agentpools_azure_flexvm_agentpools_proto != nil { + return + } + type x struct{} + out := protoimpl.TypeBuilder{ + File: protoimpl.DescBuilder{ + GoPackagePath: reflect.TypeOf(x{}).PkgPath(), + RawDescriptor: 
unsafe.Slice(unsafe.StringData(file_plugin_pkg_services_agentpools_azure_flexvm_agentpools_proto_rawDesc), len(file_plugin_pkg_services_agentpools_azure_flexvm_agentpools_proto_rawDesc)), + NumEnums: 0, + NumMessages: 5, + NumExtensions: 0, + NumServices: 0, + }, + GoTypes: file_plugin_pkg_services_agentpools_azure_flexvm_agentpools_proto_goTypes, + DependencyIndexes: file_plugin_pkg_services_agentpools_azure_flexvm_agentpools_proto_depIdxs, + MessageInfos: file_plugin_pkg_services_agentpools_azure_flexvm_agentpools_proto_msgTypes, + }.Build() + File_plugin_pkg_services_agentpools_azure_flexvm_agentpools_proto = out.File + file_plugin_pkg_services_agentpools_azure_flexvm_agentpools_proto_goTypes = nil + file_plugin_pkg_services_agentpools_azure_flexvm_agentpools_proto_depIdxs = nil +} diff --git a/plugin/pkg/services/agentpools/azure/flexvm/agentpools.proto b/plugin/pkg/services/agentpools/azure/flexvm/agentpools.proto new file mode 100644 index 0000000..c866b97 --- /dev/null +++ b/plugin/pkg/services/agentpools/azure/flexvm/agentpools.proto @@ -0,0 +1,74 @@ +edition = "2024"; + +package agentpools.azure.flexvm; + +option go_package = "github.com/Azure/aks-flex/plugin/pkg/services/agentpools/azure/flexvm"; + +import "google/protobuf/timestamp.proto"; +import "plugin/api/api.proto"; +import "plugin/pkg/services/agentpools/api/features/kubeadm/kubeadm.proto"; + +// AgentPool models a single cross-region Azure VM. +// +// Unlike the ubuntu2404vmss service which manages a Virtual Machine Scale Set, +// flexvm creates one Microsoft.Compute/virtualMachines per AgentPool. This is +// suitable for cross-region placement and matches Karpenter's per-NodeClaim +// resource lifecycle. +// +// The plugin process is assumed to authenticate via DefaultAzureCredential and +// hold Contributor on the target subscription / resource group / subnet. 
+message AgentPool {
+  api.Metadata metadata = 1;
+
+  AgentPoolSpec spec = 2;
+
+  AgentPoolStatus status = 3;
+}
+
+message AgentPoolSpec {
+  string subscription_id = 1;
+  string resource_group = 2;
+  string location = 3;
+
+  // Full ARM resource ID of the subnet (must already exist).
+  string subnet_id = 4;
+
+  // VM SKU, e.g. "Standard_ND96isr_H200_v5".
+  string vm_size = 5;
+
+  // ImageReference selects an Azure Marketplace image.
+  // Mutually exclusive with image_id.
+  ImageReference image_reference = 6;
+
+  // SIG / community gallery image resource ID.
+  // Mutually exclusive with image_reference.
+  string image_id = 7;
+
+  // Currently only "Standard" is supported. TrustedLaunch is deferred —
+  // it has been observed to break the DSVM image during manual H200 bringup.
+  string security_type = 8;
+
+  int32 os_disk_size_gb = 9;
+
+  repeated string ssh_public_keys = 10;
+
+  bool allocate_public_ip = 11;
+
+  map<string, string> tags = 12;
+
+  kubeadm.Config kubeadm = 13;
+}
+
+message ImageReference {
+  string publisher = 1;
+  string offer = 2;
+  string sku = 3;
+  string version = 4;
+}
+
+message AgentPoolStatus {
+  // Full ARM resource ID of the created VM.
+  string vm_resource_id = 1;
+
+  google.protobuf.Timestamp created_at = 2;
+}
diff --git a/plugin/pkg/services/agentpools/azure/flexvm/instances.go b/plugin/pkg/services/agentpools/azure/flexvm/instances.go
new file mode 100644
index 0000000..94116fc
--- /dev/null
+++ b/plugin/pkg/services/agentpools/azure/flexvm/instances.go
@@ -0,0 +1,91 @@
+package flexvm
+
+import (
+	"context"
+	"strings"
+
+	"github.com/Azure/azure-sdk-for-go/sdk/azcore"
+	"github.com/Azure/azure-sdk-for-go/sdk/azcore/to"
+	"github.com/Azure/azure-sdk-for-go/sdk/azidentity"
+	"google.golang.org/grpc/codes"
+	"google.golang.org/grpc/status"
+	"google.golang.org/protobuf/types/known/anypb"
+
+	"github.com/Azure/aks-flex/plugin/api"
+	"github.com/Azure/aks-flex/plugin/pkg/db"
+	agentpools "github.com/Azure/aks-flex/plugin/pkg/services/agentpools/api"
+)
+
+var _ api.Object = (*Instance)(nil)
+
+// instancesServer implements the Instances API for flexvm agent pools.
+//
+// Each AgentPool maps to exactly one VM, so the Instance API is a thin shim
+// over the AgentPool — there is always exactly one instance, whose ID is the
+// AgentPool ID followed by "/0".
+type instancesServer struct {
+	agentpools.UnimplementedInstancesServer
+	storage db.RODB
+
+	credentials azcore.TokenCredential
+}
+
+// NewInstancesServer returns an InstancesServer that looks AgentPools up in
+// storage. It returns an error if the default Azure credential cannot be
+// constructed.
+func NewInstancesServer(storage db.RODB) (agentpools.InstancesServer, error) {
+	credentials, err := azidentity.NewDefaultAzureCredential(nil)
+	if err != nil {
+		return nil, err
+	}
+	return &instancesServer{
+		storage:     storage,
+		credentials: credentials,
+	}, nil
+}
+
+// List returns the single instance of the AgentPool identified by req's ID.
+// It returns a NotFound status if storage has no such AgentPool.
+func (srv *instancesServer) List(ctx context.Context, req *api.ListRequest) (*api.ListResponse, error) {
+	ap, ok := srv.storage.Get(req.GetId())
+	if !ok {
+		return nil, status.Errorf(codes.NotFound, "agent pool %q not found", req.GetId())
+	}
+	item, err := anypb.New(Instance_builder{
+		Metadata: api.Metadata_builder{
+			Id: to.Ptr(ap.GetMetadata().GetId() + "/0"),
+		}.Build(),
+	}.Build())
+	if err != nil {
+		return nil, err
+	}
+	return api.ListResponse_builder{
+		Items: []*anypb.Any{item},
+	}.Build(), nil
+}
+
+// Get returns the instance identified by req's ID, which must be an AgentPool
+// ID followed by "/0". Any other ID shape, or an ID whose AgentPool is not in
+// storage, yields a NotFound status.
+func (srv *instancesServer) Get(ctx context.Context, req *api.GetRequest) (*api.GetResponse, error) {
+	// strings.Cut splits at the first "/"; requiring the remainder to be exactly
+	// "0" therefore also rejects IDs containing more than one "/".
+	poolID, index, ok := strings.Cut(req.GetId(), "/")
+	if !ok || index != "0" {
+		return nil, status.Errorf(codes.NotFound, "instance %q not found", req.GetId())
+	}
+	if _, ok := srv.storage.Get(poolID); !ok {
+		return nil, status.Errorf(codes.NotFound, "agent pool %q not found", poolID)
+	}
+	item, err := anypb.New(Instance_builder{
+		Metadata: api.Metadata_builder{
+			Id: to.Ptr(req.GetId()),
+		}.Build(),
+	}.Build())
+	if err != nil {
+		return nil, err
+	}
+	return api.GetResponse_builder{
+		Item: item,
+	}.Build(), nil
+}
diff --git a/plugin/pkg/services/agentpools/azure/flexvm/instances.pb.go b/plugin/pkg/services/agentpools/azure/flexvm/instances.pb.go
new file mode 100644
index 0000000..6b213fd
--- /dev/null
+++ b/plugin/pkg/services/agentpools/azure/flexvm/instances.pb.go
@@ -0,0 +1,282 @@
+// Code generated by protoc-gen-go. DO NOT EDIT.
+// versions: +// protoc-gen-go v1.36.10 +// protoc v6.33.0 +// source: plugin/pkg/services/agentpools/azure/flexvm/instances.proto + +package flexvm + +import ( + api "github.com/Azure/aks-flex/plugin/api" + protoreflect "google.golang.org/protobuf/reflect/protoreflect" + protoimpl "google.golang.org/protobuf/runtime/protoimpl" + reflect "reflect" + unsafe "unsafe" +) + +const ( + // Verify that this generated code is sufficiently up-to-date. + _ = protoimpl.EnforceVersion(20 - protoimpl.MinVersion) + // Verify that runtime/protoimpl is sufficiently up-to-date. + _ = protoimpl.EnforceVersion(protoimpl.MaxVersion - 20) +) + +// flexvm models one VM per AgentPool, so an Instance is just the AgentPool +// itself with id "/0". This matches the AWS ubuntu2404instance shape. +type Instance struct { + state protoimpl.MessageState `protogen:"opaque.v1"` + xxx_hidden_Metadata *api.Metadata `protobuf:"bytes,1,opt,name=metadata"` + xxx_hidden_Spec *InstanceSpec `protobuf:"bytes,2,opt,name=spec"` + xxx_hidden_Status *InstanceStatus `protobuf:"bytes,3,opt,name=status"` + unknownFields protoimpl.UnknownFields + sizeCache protoimpl.SizeCache +} + +func (x *Instance) Reset() { + *x = Instance{} + mi := &file_plugin_pkg_services_agentpools_azure_flexvm_instances_proto_msgTypes[0] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) +} + +func (x *Instance) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*Instance) ProtoMessage() {} + +func (x *Instance) ProtoReflect() protoreflect.Message { + mi := &file_plugin_pkg_services_agentpools_azure_flexvm_instances_proto_msgTypes[0] + if x != nil { + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return mi.MessageOf(x) +} + +func (x *Instance) GetMetadata() *api.Metadata { + if x != nil { + return x.xxx_hidden_Metadata + } + return nil +} + +func (x *Instance) GetSpec() *InstanceSpec { + if x != 
nil { + return x.xxx_hidden_Spec + } + return nil +} + +func (x *Instance) GetStatus() *InstanceStatus { + if x != nil { + return x.xxx_hidden_Status + } + return nil +} + +func (x *Instance) SetMetadata(v *api.Metadata) { + x.xxx_hidden_Metadata = v +} + +func (x *Instance) SetSpec(v *InstanceSpec) { + x.xxx_hidden_Spec = v +} + +func (x *Instance) SetStatus(v *InstanceStatus) { + x.xxx_hidden_Status = v +} + +func (x *Instance) HasMetadata() bool { + if x == nil { + return false + } + return x.xxx_hidden_Metadata != nil +} + +func (x *Instance) HasSpec() bool { + if x == nil { + return false + } + return x.xxx_hidden_Spec != nil +} + +func (x *Instance) HasStatus() bool { + if x == nil { + return false + } + return x.xxx_hidden_Status != nil +} + +func (x *Instance) ClearMetadata() { + x.xxx_hidden_Metadata = nil +} + +func (x *Instance) ClearSpec() { + x.xxx_hidden_Spec = nil +} + +func (x *Instance) ClearStatus() { + x.xxx_hidden_Status = nil +} + +type Instance_builder struct { + _ [0]func() // Prevents comparability and use of unkeyed literals for the builder. 
+ + Metadata *api.Metadata + Spec *InstanceSpec + Status *InstanceStatus +} + +func (b0 Instance_builder) Build() *Instance { + m0 := &Instance{} + b, x := &b0, m0 + _, _ = b, x + x.xxx_hidden_Metadata = b.Metadata + x.xxx_hidden_Spec = b.Spec + x.xxx_hidden_Status = b.Status + return m0 +} + +type InstanceSpec struct { + state protoimpl.MessageState `protogen:"opaque.v1"` + unknownFields protoimpl.UnknownFields + sizeCache protoimpl.SizeCache +} + +func (x *InstanceSpec) Reset() { + *x = InstanceSpec{} + mi := &file_plugin_pkg_services_agentpools_azure_flexvm_instances_proto_msgTypes[1] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) +} + +func (x *InstanceSpec) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*InstanceSpec) ProtoMessage() {} + +func (x *InstanceSpec) ProtoReflect() protoreflect.Message { + mi := &file_plugin_pkg_services_agentpools_azure_flexvm_instances_proto_msgTypes[1] + if x != nil { + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return mi.MessageOf(x) +} + +type InstanceSpec_builder struct { + _ [0]func() // Prevents comparability and use of unkeyed literals for the builder. 
+ +} + +func (b0 InstanceSpec_builder) Build() *InstanceSpec { + m0 := &InstanceSpec{} + b, x := &b0, m0 + _, _ = b, x + return m0 +} + +type InstanceStatus struct { + state protoimpl.MessageState `protogen:"opaque.v1"` + unknownFields protoimpl.UnknownFields + sizeCache protoimpl.SizeCache +} + +func (x *InstanceStatus) Reset() { + *x = InstanceStatus{} + mi := &file_plugin_pkg_services_agentpools_azure_flexvm_instances_proto_msgTypes[2] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) +} + +func (x *InstanceStatus) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*InstanceStatus) ProtoMessage() {} + +func (x *InstanceStatus) ProtoReflect() protoreflect.Message { + mi := &file_plugin_pkg_services_agentpools_azure_flexvm_instances_proto_msgTypes[2] + if x != nil { + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return mi.MessageOf(x) +} + +type InstanceStatus_builder struct { + _ [0]func() // Prevents comparability and use of unkeyed literals for the builder. 
+ +} + +func (b0 InstanceStatus_builder) Build() *InstanceStatus { + m0 := &InstanceStatus{} + b, x := &b0, m0 + _, _ = b, x + return m0 +} + +var File_plugin_pkg_services_agentpools_azure_flexvm_instances_proto protoreflect.FileDescriptor + +const file_plugin_pkg_services_agentpools_azure_flexvm_instances_proto_rawDesc = "" + + "\n" + + ";plugin/pkg/services/agentpools/azure/flexvm/instances.proto\x12\x17agentpools.azure.flexvm\x1a\x14plugin/api/api.proto\"\xb1\x01\n" + + "\bInstance\x12)\n" + + "\bmetadata\x18\x01 \x01(\v2\r.api.MetadataR\bmetadata\x129\n" + + "\x04spec\x18\x02 \x01(\v2%.agentpools.azure.flexvm.InstanceSpecR\x04spec\x12?\n" + + "\x06status\x18\x03 \x01(\v2'.agentpools.azure.flexvm.InstanceStatusR\x06status\"\x0e\n" + + "\fInstanceSpec\"\x10\n" + + "\x0eInstanceStatusBGZEgithub.com/Azure/aks-flex/plugin/pkg/services/agentpools/azure/flexvmb\beditionsp\xe9\a" + +var file_plugin_pkg_services_agentpools_azure_flexvm_instances_proto_msgTypes = make([]protoimpl.MessageInfo, 3) +var file_plugin_pkg_services_agentpools_azure_flexvm_instances_proto_goTypes = []any{ + (*Instance)(nil), // 0: agentpools.azure.flexvm.Instance + (*InstanceSpec)(nil), // 1: agentpools.azure.flexvm.InstanceSpec + (*InstanceStatus)(nil), // 2: agentpools.azure.flexvm.InstanceStatus + (*api.Metadata)(nil), // 3: api.Metadata +} +var file_plugin_pkg_services_agentpools_azure_flexvm_instances_proto_depIdxs = []int32{ + 3, // 0: agentpools.azure.flexvm.Instance.metadata:type_name -> api.Metadata + 1, // 1: agentpools.azure.flexvm.Instance.spec:type_name -> agentpools.azure.flexvm.InstanceSpec + 2, // 2: agentpools.azure.flexvm.Instance.status:type_name -> agentpools.azure.flexvm.InstanceStatus + 3, // [3:3] is the sub-list for method output_type + 3, // [3:3] is the sub-list for method input_type + 3, // [3:3] is the sub-list for extension type_name + 3, // [3:3] is the sub-list for extension extendee + 0, // [0:3] is the sub-list for field type_name +} + +func init() { 
file_plugin_pkg_services_agentpools_azure_flexvm_instances_proto_init() } +func file_plugin_pkg_services_agentpools_azure_flexvm_instances_proto_init() { + if File_plugin_pkg_services_agentpools_azure_flexvm_instances_proto != nil { + return + } + type x struct{} + out := protoimpl.TypeBuilder{ + File: protoimpl.DescBuilder{ + GoPackagePath: reflect.TypeOf(x{}).PkgPath(), + RawDescriptor: unsafe.Slice(unsafe.StringData(file_plugin_pkg_services_agentpools_azure_flexvm_instances_proto_rawDesc), len(file_plugin_pkg_services_agentpools_azure_flexvm_instances_proto_rawDesc)), + NumEnums: 0, + NumMessages: 3, + NumExtensions: 0, + NumServices: 0, + }, + GoTypes: file_plugin_pkg_services_agentpools_azure_flexvm_instances_proto_goTypes, + DependencyIndexes: file_plugin_pkg_services_agentpools_azure_flexvm_instances_proto_depIdxs, + MessageInfos: file_plugin_pkg_services_agentpools_azure_flexvm_instances_proto_msgTypes, + }.Build() + File_plugin_pkg_services_agentpools_azure_flexvm_instances_proto = out.File + file_plugin_pkg_services_agentpools_azure_flexvm_instances_proto_goTypes = nil + file_plugin_pkg_services_agentpools_azure_flexvm_instances_proto_depIdxs = nil +} diff --git a/plugin/pkg/services/agentpools/azure/flexvm/instances.proto b/plugin/pkg/services/agentpools/azure/flexvm/instances.proto new file mode 100644 index 0000000..eaabe07 --- /dev/null +++ b/plugin/pkg/services/agentpools/azure/flexvm/instances.proto @@ -0,0 +1,23 @@ +edition = "2024"; + +package agentpools.azure.flexvm; + +option go_package = "github.com/Azure/aks-flex/plugin/pkg/services/agentpools/azure/flexvm"; + +import "plugin/api/api.proto"; + +// flexvm models one VM per AgentPool, so an Instance is just the AgentPool +// itself with id "/0". This matches the AWS ubuntu2404instance shape. 
+message Instance { + api.Metadata metadata = 1; + + InstanceSpec spec = 2; + + InstanceStatus status = 3; +} + +message InstanceSpec { +} + +message InstanceStatus { +} diff --git a/plugin/pkg/services/agentpools/azure/flexvm/redact.go b/plugin/pkg/services/agentpools/azure/flexvm/redact.go new file mode 100644 index 0000000..fcd1f0a --- /dev/null +++ b/plugin/pkg/services/agentpools/azure/flexvm/redact.go @@ -0,0 +1,10 @@ +// Package flexvm implements the cross-region Azure VM agent pool service. +// See agentpools.go for the package overview and authentication contract. +package flexvm + +func (ap *AgentPool) Redact() { + ap.GetSpec().GetKubeadm().Redact() +} + +func (i *Instance) Redact() { +} diff --git a/plugin/pkg/services/agentpools/instances.go b/plugin/pkg/services/agentpools/instances.go index 37b39f7..3dd7651 100644 --- a/plugin/pkg/services/agentpools/instances.go +++ b/plugin/pkg/services/agentpools/instances.go @@ -11,6 +11,7 @@ import ( "github.com/Azure/aks-flex/plugin/pkg/server" "github.com/Azure/aks-flex/plugin/pkg/services/agentpools/api" "github.com/Azure/aks-flex/plugin/pkg/services/agentpools/aws/ubuntu2404instance" + "github.com/Azure/aks-flex/plugin/pkg/services/agentpools/azure/flexvm" "github.com/Azure/aks-flex/plugin/pkg/services/agentpools/azure/ubuntu2404vmss" ) @@ -32,6 +33,10 @@ func NewInstancesServer(db db.DB) api.InstancesServer { return ubuntu2404vmss.NewInstancesServer(srv.DB) }, &ubuntu2404vmss.AgentPool{}) + server.MustRegister(srv.Servers, func() (api.InstancesServer, error) { + return flexvm.NewInstancesServer(srv.DB) + }, &flexvm.AgentPool{}) + return srv } From 170a427b2ffb5f33127eca8bab8ebd49e532f0f1 Mon Sep 17 00:00:00 2001 From: chokevin <11001563+chokevin@users.noreply.github.com> Date: Wed, 22 Apr 2026 11:29:00 -0700 Subject: [PATCH 04/20] feat(karpenter): add azure cross-region cloudprovider Implements a Karpenter cloudprovider ('azure-flex') that backs the new AzureFlexNodeClass by talking to the plugin's flexvm 
service. Mirrors the nebius layout: consts/log/api stubs, an instancetype subpackage with a hardcoded Phase 1 SKU catalog (ND96isr_H200_v5, ND96amsr_A100_v4, NC40ads_H100_v5, NC24ads_A100_v4, D8s_v5), nodeclaim conversions, and the cloudprovider.go top-level CRUD. ProviderID format is azure-flex:/// (three slashes); the round-trip via providerIDToARMID is lossless and the parse rejects URLs that put anything in the host position. Drift is computed as a SHA-256 over the AzureFlexNodeClass fields that affect VM identity, with sorted tag keys for determinism. Quota errors from the plugin surface as InsufficientCapacityError so Karpenter stops thrashing on a NodePool whose SKU isn't available. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- karpenter/pkg/cloudproviders/azure/api.go | 57 ++++ .../pkg/cloudproviders/azure/cloudprovider.go | 278 ++++++++++++++++++ karpenter/pkg/cloudproviders/azure/consts.go | 21 ++ .../azure/instancetype/catalog.go | 37 +++ .../azure/instancetype/catalog_test.go | 43 +++ .../azure/instancetype/instancetype.go | 102 +++++++ .../azure/instancetype/offerings.go | 31 ++ .../azure/instancetype/provider.go | 73 +++++ karpenter/pkg/cloudproviders/azure/log.go | 12 + .../pkg/cloudproviders/azure/nodeclaim.go | 214 ++++++++++++++ .../cloudproviders/azure/nodeclaim_test.go | 95 ++++++ 11 files changed, 963 insertions(+) create mode 100644 karpenter/pkg/cloudproviders/azure/api.go create mode 100644 karpenter/pkg/cloudproviders/azure/cloudprovider.go create mode 100644 karpenter/pkg/cloudproviders/azure/consts.go create mode 100644 karpenter/pkg/cloudproviders/azure/instancetype/catalog.go create mode 100644 karpenter/pkg/cloudproviders/azure/instancetype/catalog_test.go create mode 100644 karpenter/pkg/cloudproviders/azure/instancetype/instancetype.go create mode 100644 karpenter/pkg/cloudproviders/azure/instancetype/offerings.go create mode 100644 karpenter/pkg/cloudproviders/azure/instancetype/provider.go create mode 
100644 karpenter/pkg/cloudproviders/azure/log.go
 create mode 100644 karpenter/pkg/cloudproviders/azure/nodeclaim.go
 create mode 100644 karpenter/pkg/cloudproviders/azure/nodeclaim_test.go

diff --git a/karpenter/pkg/cloudproviders/azure/api.go b/karpenter/pkg/cloudproviders/azure/api.go
new file mode 100644
index 0000000..f46b56c
--- /dev/null
+++ b/karpenter/pkg/cloudproviders/azure/api.go
@@ -0,0 +1,66 @@
+package azure
+
+import (
+	"errors"
+	"strings"
+
+	"github.com/Azure/azure-sdk-for-go/sdk/azcore"
+	"google.golang.org/grpc/codes"
+	"google.golang.org/grpc/status"
+)
+
+// IsNotFound returns true if err signals "resource does not exist". The error
+// can flow back from either the gRPC plugin (NotFound) or directly from the
+// Azure ARM SDK (HTTP 404).
+func IsNotFound(err error) bool {
+	if err == nil {
+		return false
+	}
+	if s, ok := status.FromError(err); ok && s.Code() == codes.NotFound {
+		return true
+	}
+	var rerr *azcore.ResponseError
+	if errors.As(err, &rerr) && rerr.StatusCode == 404 {
+		return true
+	}
+	return false
+}
+
+// IsQuotaError returns true if err signals an Azure quota / capacity exhaustion.
+// We classify both HTTP 429 and the well-known Azure ARM error codes.
+//
+// The same list of error codes drives both the structured check against
+// azcore.ResponseError and the substring fallback, so the two cannot drift.
+func IsQuotaError(err error) bool {
+	if err == nil {
+		return false
+	}
+	quotaCodes := [...]string{
+		"QuotaExceeded",
+		"OperationNotAllowed",
+		"SkuNotAvailable",
+		"AllocationFailed",
+		"ZonalAllocationFailed",
+		"OverconstrainedAllocationRequest",
+	}
+	var rerr *azcore.ResponseError
+	if errors.As(err, &rerr) {
+		if rerr.StatusCode == 429 {
+			return true
+		}
+		for _, code := range quotaCodes {
+			if rerr.ErrorCode == code {
+				return true
+			}
+		}
+	}
+	// Fallback substring match — covers gRPC-wrapped error strings and any
+	// codes the SDK didn't surface structurally.
+	msg := err.Error()
+	for _, code := range quotaCodes {
+		if strings.Contains(msg, code) {
+			return true
+		}
+	}
+	return false
+}
diff --git a/karpenter/pkg/cloudproviders/azure/cloudprovider.go b/karpenter/pkg/cloudproviders/azure/cloudprovider.go
new file mode 100644
index 0000000..de2f037
--- /dev/null
+++ b/karpenter/pkg/cloudproviders/azure/cloudprovider.go
@@ -0,0 +1,278 @@
+// Package azure implements the AzureFlex cross-region cloud provider for
+// Karpenter. It is distinct from the upstream Azure/karpenter-provider-azure
+// (which only supports VMs in the same Azure region as the AKS cluster) and
+// from the in-tree AKS provider wired up alongside it in cmd/controller/main.go.
+//
+// The provider talks to a colocated plugin gRPC service (flexvm) that performs
+// the actual Azure API calls. This isolates the Karpenter controller from
+// Azure SDK details and lets the plugin run with its own (Contributor-scoped)
+// managed identity.
+package azure + +import ( + "context" + "errors" + "fmt" + + karpoptions "github.com/Azure/karpenter-provider-azure/pkg/operator/options" + "github.com/Azure/karpenter-provider-azure/pkg/utils" + "github.com/awslabs/operatorpkg/status" + "google.golang.org/grpc" + "k8s.io/apimachinery/pkg/runtime/schema" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/log" + v1 "sigs.k8s.io/karpenter/pkg/apis/v1" + corecloudprovider "sigs.k8s.io/karpenter/pkg/cloudprovider" + + stretchhelper "github.com/Azure/aks-flex/plugin/pkg/helper" + stretchservices "github.com/Azure/aks-flex/plugin/pkg/services" + agentpoolsapi "github.com/Azure/aks-flex/plugin/pkg/services/agentpools/api" + flexvm "github.com/Azure/aks-flex/plugin/pkg/services/agentpools/azure/flexvm" + + "github.com/Azure/aks-flex/karpenter/pkg/apis" + "github.com/Azure/aks-flex/karpenter/pkg/apis/v1alpha1" + "github.com/Azure/aks-flex/karpenter/pkg/cloudproviders" + "github.com/Azure/aks-flex/karpenter/pkg/cloudproviders/azure/instancetype" +) + +type CloudProvider struct { + stretchPluginConn *grpc.ClientConn + stretchAgentPoolsClient agentpoolsapi.AgentPoolsClient + + kubeClient client.Client + + // clusterCA is captured once at startup. Per the rubber-duck note, the + // AKS bootstrap secret lookup (kubeadm.FromAKS) is "exactly one secret per + // call", so anything cluster-wide we need is captured at struct init — + // we do not re-fetch on every Create. + clusterCA []byte + + instanceTypeProvider *instancetype.Provider +} + +func newCloudProvider( + stretchPluginConn *grpc.ClientConn, + kubeClient client.Client, + clusterCA []byte, +) *CloudProvider { + return &CloudProvider{ + stretchPluginConn: stretchPluginConn, + stretchAgentPoolsClient: agentpoolsapi.NewAgentPoolsClient(stretchPluginConn), + kubeClient: kubeClient, + clusterCA: clusterCA, + instanceTypeProvider: instancetype.NewProvider(), + } +} + +// Register installs the AzureFlex provider into the multiplexing hub. 
+func Register( + ctx context.Context, + hub *cloudproviders.CloudProvidersHub, + kubeClient client.Client, + clusterCA []byte, +) error { + stretchPluginConn, err := stretchservices.NewConnection() + if err != nil { + return fmt.Errorf("creating stretch plugin connection: %w", err) + } + cp := newCloudProvider(stretchPluginConn, kubeClient, clusterCA) + hub.Register(cp, GroupKind, ProviderIDScheme) + return nil +} + +var _ corecloudprovider.CloudProvider = (*CloudProvider)(nil) + +func (c *CloudProvider) getNodeClass( + ctx context.Context, + ref *v1.NodeClassReference, +) (*v1alpha1.AzureFlexNodeClass, error) { + if ref == nil { + return nil, errors.New("nodeClaim must reference a node class") + } + if ref.Group != apis.Group { + return nil, fmt.Errorf("nodeClassRef %s in group %q, expected %q", ref.Name, ref.Group, apis.Group) + } + + rv := &v1alpha1.AzureFlexNodeClass{} + if err := c.kubeClient.Get(ctx, client.ObjectKey{Name: ref.Name}, rv); err != nil { + return nil, fmt.Errorf("getting AzureFlexNodeClass %s: %w", ref.Name, err) + } + if !rv.DeletionTimestamp.IsZero() { + return nil, utils.NewTerminatingResourceError( + schema.GroupResource{Group: apis.Group, Resource: "azureflexnodeclasses"}, + rv.Name, + ) + } + return rv, nil +} + +func (c *CloudProvider) instanceTypeKey(nc *v1alpha1.AzureFlexNodeClass) instancetype.NodeClassKey { + osDisk := int64(128) + if nc.Spec.OSDiskSizeGB != nil { + osDisk = int64(*nc.Spec.OSDiskSizeGB) + } + pods := instancetype.DefaultPerNodePodsCount + if nc.Spec.MaxPodsPerNode != nil { + pods = *nc.Spec.MaxPodsPerNode + } + return instancetype.NodeClassKey{ + Region: nc.Spec.Location, + OSDiskSizeGiB: osDisk, + PerNodePodsCount: pods, + } +} + +func (c *CloudProvider) Create(ctx context.Context, nodeClaim *v1.NodeClaim) (*v1.NodeClaim, error) { + logger := log.FromContext(ctx).WithValues("nodeClaim", nodeClaim.Name) + logger.Info("creating azure-flex VM for nodeClaim") + + nodeClass, err := c.getNodeClass(ctx, 
nodeClaim.Spec.NodeClassRef) + if err != nil { + return nil, err + } + + key := c.instanceTypeKey(nodeClass) + it, err := c.instanceTypeProvider.ResolveFromNodeClaim(key, nodeClaim.Spec.Requirements) + if err != nil { + // Schedule-time error: the requested SKU isn't in the Phase 1 catalog. + return nil, corecloudprovider.NewInsufficientCapacityError(err) + } + logger.Info("resolved instance type", "instanceType", it.Name) + + agentPool := nodeClaimToAgentPool( + karpoptions.FromContext(ctx), + c.clusterCA, + nodeClass, + nodeClaim, + it, + ) + created, err := stretchhelper.CreateOrUpdate( + c.stretchAgentPoolsClient.CreateOrUpdate, + ctx, agentPool, + ) + if err != nil { + if IsQuotaError(err) { + return nil, corecloudprovider.NewInsufficientCapacityError(err) + } + return nil, fmt.Errorf("creating azure-flex agent pool: %w", err) + } + + return agentPoolToNodeClaim(created, it), nil +} + +func (c *CloudProvider) Delete(ctx context.Context, nodeClaim *v1.NodeClaim) error { + logger := log.FromContext(ctx).WithValues("nodeClaim", nodeClaim.Name) + if nodeClaim.Status.ProviderID == "" { + logger.V(5).Info("nodeClaim has no providerID, skipping deletion") + return nil + } + + // Per CloudProvider.Delete contract: signal NodeClaimNotFoundError if the + // remote resource is already gone (so karpenter knows it's safe to drop). + if _, err := stretchhelper.Get[*flexvm.AgentPool]( + c.stretchAgentPoolsClient.Get, + ctx, nodeClaim.Name, + ); err != nil { + if IsNotFound(err) { + return corecloudprovider.NewNodeClaimNotFoundError(err) + } + // Non-NotFound get failure: log and proceed with delete in best effort. 
+ logger.V(5).Error(err, "getting agent pool for nodeClaim, proceeding to delete") + } + + if err := stretchhelper.Delete( + c.stretchAgentPoolsClient.Delete, + ctx, nodeClaim.Name, + ); err != nil { + return fmt.Errorf("deleting azure-flex agent pool: %w", err) + } + logger.Info("deleted azure-flex agent pool", "nodeClaim", nodeClaim.Name) + return nil +} + +func (c *CloudProvider) Get(ctx context.Context, providerID string) (*v1.NodeClaim, error) { + name, err := providerIDToVMName(providerID) + if err != nil { + return nil, err + } + ap, err := stretchhelper.Get[*flexvm.AgentPool]( + c.stretchAgentPoolsClient.Get, + ctx, name, + ) + if err != nil { + if IsNotFound(err) { + return nil, corecloudprovider.NewNodeClaimNotFoundError(err) + } + return nil, err + } + // We don't have the NodeClass here (Get is called by reconcilers that may + // not have a class on hand) — pass nil instanceType and accept missing + // well-known labels. They'll be repopulated by the next Create-flow Get. + return agentPoolToNodeClaim(ap, nil), nil +} + +func (c *CloudProvider) List(ctx context.Context) ([]*v1.NodeClaim, error) { + aps, err := stretchhelper.List[*flexvm.AgentPool]( + c.stretchAgentPoolsClient.List, + ctx, "", + ) + if err != nil { + return nil, err + } + out := make([]*v1.NodeClaim, 0, len(aps)) + for _, ap := range aps { + out = append(out, agentPoolToNodeClaim(ap, nil)) + } + return out, nil +} + +func (c *CloudProvider) GetInstanceTypes(ctx context.Context, nodePool *v1.NodePool) ([]*corecloudprovider.InstanceType, error) { + logger := loggerFromContext(ctx).WithValues("nodePool", nodePool.Name) + + nodeClass, err := c.getNodeClass(ctx, nodePool.Spec.Template.Spec.NodeClassRef) + if err != nil { + return nil, fmt.Errorf("getting node class for node pool: %w", err) + } + + its := c.instanceTypeProvider.GetInstanceTypes(c.instanceTypeKey(nodeClass)) + logger.V(5).Info("listed instance types", "count", len(its)) + return its, nil +} + +func (c *CloudProvider) 
GetSupportedNodeClasses() []status.Object { + return []status.Object{ + &v1alpha1.AzureFlexNodeClass{}, + } +} + +func (c *CloudProvider) IsDrifted(ctx context.Context, nodeClaim *v1.NodeClaim) (corecloudprovider.DriftReason, error) { + if nodeClaim.Spec.NodeClassRef == nil { + return "", nil + } + nc := &v1alpha1.AzureFlexNodeClass{} + if err := c.kubeClient.Get(ctx, client.ObjectKey{Name: nodeClaim.Spec.NodeClassRef.Name}, nc); err != nil { + return "", client.IgnoreNotFound(err) + } + + current := driftHash(nc.Spec) + prior := nodeClaim.Annotations[v1alpha1.AzureFlexNodeClassHashAnnotation] + if prior != "" && prior != current { + return corecloudprovider.DriftReason("AzureFlexNodeClassChanged"), nil + } + return "", nil +} + +func (c *CloudProvider) Name() string { + return ProviderIDScheme +} + +func (c *CloudProvider) RepairPolicies() []corecloudprovider.RepairPolicy { + return []corecloudprovider.RepairPolicy{} +} + +func (c *CloudProvider) Close(context.Context) error { + if c.stretchPluginConn != nil { + return c.stretchPluginConn.Close() + } + return nil +} diff --git a/karpenter/pkg/cloudproviders/azure/consts.go b/karpenter/pkg/cloudproviders/azure/consts.go new file mode 100644 index 0000000..91edac1 --- /dev/null +++ b/karpenter/pkg/cloudproviders/azure/consts.go @@ -0,0 +1,21 @@ +package azure + +import ( + "k8s.io/apimachinery/pkg/runtime/schema" + + "github.com/Azure/aks-flex/karpenter/pkg/apis" +) + +const ( + // ProviderIDScheme is the URL scheme used in NodeClaim.Status.ProviderID + // for instances managed by the Azure cross-region (flex) cloud provider. + // + // Distinct from "azure" (which the AKS in-region provider uses) so the + // Karpenter cloud-provider hub can multiplex correctly. 
+ ProviderIDScheme = "azure-flex" +) + +var GroupKind = schema.GroupKind{ + Group: apis.Group, + Kind: "AzureFlexNodeClass", +} diff --git a/karpenter/pkg/cloudproviders/azure/instancetype/catalog.go b/karpenter/pkg/cloudproviders/azure/instancetype/catalog.go new file mode 100644 index 0000000..8c36bb5 --- /dev/null +++ b/karpenter/pkg/cloudproviders/azure/instancetype/catalog.go @@ -0,0 +1,37 @@ +package instancetype + +// CatalogEntry is a Phase 1 hardcoded SKU description. We deliberately do NOT +// call Azure's SKU/quota APIs from the karpenter controller — letting ARM +// fail and classifying the error is simpler and correct for Phase 1 (issue #63). +// +// Future work: replace this with a dynamic provider backed by armcompute's +// resourceSkus client (and per-region offering refresh). +type CatalogEntry struct { + // Name is the Azure VM size, used as both the karpenter instance type + // name and the Azure VMSize when creating the VM. + Name string + + VCPU int64 + MemoryGB int64 + GPU int64 +} + +// Catalog is the hardcoded allowlist of SKUs that AzureFlexNodeClass +// NodePools may schedule onto in Phase 1. +var Catalog = []CatalogEntry{ + {Name: "Standard_ND96isr_H200_v5", VCPU: 96, MemoryGB: 1900, GPU: 8}, + {Name: "Standard_ND96amsr_A100_v4", VCPU: 96, MemoryGB: 1900, GPU: 8}, + {Name: "Standard_NC40ads_H100_v5", VCPU: 40, MemoryGB: 320, GPU: 1}, + {Name: "Standard_NC24ads_A100_v4", VCPU: 24, MemoryGB: 220, GPU: 1}, + {Name: "Standard_D8s_v5", VCPU: 8, MemoryGB: 32, GPU: 0}, +} + +// Get returns the CatalogEntry for name, or nil if name is not in the catalog. 
+func Get(name string) *CatalogEntry { + for i := range Catalog { + if Catalog[i].Name == name { + return &Catalog[i] + } + } + return nil +} diff --git a/karpenter/pkg/cloudproviders/azure/instancetype/catalog_test.go b/karpenter/pkg/cloudproviders/azure/instancetype/catalog_test.go new file mode 100644 index 0000000..afcbf17 --- /dev/null +++ b/karpenter/pkg/cloudproviders/azure/instancetype/catalog_test.go @@ -0,0 +1,43 @@ +package instancetype + +import "testing" + +func TestCatalogContainsRequiredSKUs(t *testing.T) { + required := []string{ + "Standard_ND96isr_H200_v5", + "Standard_ND96amsr_A100_v4", + "Standard_NC40ads_H100_v5", + "Standard_NC24ads_A100_v4", + "Standard_D8s_v5", + } + for _, name := range required { + if Get(name) == nil { + t.Errorf("catalog missing required SKU %q", name) + } + } +} + +func TestCatalogGetUnknown(t *testing.T) { + if Get("Standard_DoesNotExist_v1") != nil { + t.Fatalf("Get must return nil for unknown SKU") + } +} + +func TestProviderResolveUnknownSKU(t *testing.T) { + // Bypass NodeClaim machinery: an instance type name not in the catalog + // must come back as a clean error from the provider. We exercise this + // path indirectly via GetByName since ResolveFromNodeClaim requires + // scheduling fixtures. 
+ p := NewProvider() + if p.GetByName(NodeClassKey{Region: "eastus2"}, "Standard_DoesNotExist_v1") != nil { + t.Fatalf("GetByName must return nil for unknown SKU") + } +} + +func TestProviderListCount(t *testing.T) { + p := NewProvider() + its := p.GetInstanceTypes(NodeClassKey{Region: "eastus2", OSDiskSizeGiB: 128, PerNodePodsCount: 110}) + if len(its) != len(Catalog) { + t.Fatalf("GetInstanceTypes returned %d entries, want %d", len(its), len(Catalog)) + } +} diff --git a/karpenter/pkg/cloudproviders/azure/instancetype/instancetype.go b/karpenter/pkg/cloudproviders/azure/instancetype/instancetype.go new file mode 100644 index 0000000..b1f4d84 --- /dev/null +++ b/karpenter/pkg/cloudproviders/azure/instancetype/instancetype.go @@ -0,0 +1,102 @@ +package instancetype + +import ( + "fmt" + + azurev1beta1 "github.com/Azure/karpenter-provider-azure/pkg/apis/v1beta1" + azinstancetype "github.com/Azure/karpenter-provider-azure/pkg/providers/instancetype" + corev1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/api/resource" + karpv1 "sigs.k8s.io/karpenter/pkg/apis/v1" + karpcloudprovider "sigs.k8s.io/karpenter/pkg/cloudprovider" + "sigs.k8s.io/karpenter/pkg/scheduling" +) + +const ( + // Architecture is amd64 only for Phase 1 (all five catalog SKUs are amd64). + Architecture = "amd64" + + // DefaultPerNodePodsCount is used when AzureFlexNodeClass.Spec.MaxPodsPerNode is unset. + DefaultPerNodePodsCount int32 = 110 +) + +// NodeClassKey holds the AzureFlexNodeClass fields that affect instance-type +// shape. We use it as a value-type so it's hashable for any future caching. +type NodeClassKey struct { + Region string + OSDiskSizeGiB int64 + PerNodePodsCount int32 +} + +// New builds a Karpenter InstanceType from a CatalogEntry, NodeClass key, and +// pre-built offerings. We follow the nebius shape so labels and overhead match. 
+func New( + key NodeClassKey, + entry *CatalogEntry, + offerings karpcloudprovider.Offerings, +) *karpcloudprovider.InstanceType { + return &karpcloudprovider.InstanceType{ + Name: entry.Name, + Requirements: requirements(key.Region, entry, offerings), + Offerings: offerings, + Capacity: capacity(key, entry), + Overhead: overhead(entry), + } +} + +func requirements( + region string, + e *CatalogEntry, + offerings karpcloudprovider.Offerings, +) scheduling.Requirements { + // Single zone (empty) for Phase 1: cross-region zonal placement is deferred. + zones := []string{""} + for _, o := range offerings { + if zoneReq := o.Requirements.Get(corev1.LabelTopologyZone); zoneReq != nil { + zones = zoneReq.Values() + } + } + capacityTypes := []string{karpv1.CapacityTypeOnDemand} + + vCPU := fmt.Sprint(e.VCPU) + memMiB := fmt.Sprint(e.MemoryGB * 1024) + gpu := fmt.Sprint(e.GPU) + + return scheduling.NewRequirements( + scheduling.NewRequirement(corev1.LabelInstanceTypeStable, corev1.NodeSelectorOpIn, e.Name), + scheduling.NewRequirement(corev1.LabelTopologyZone, corev1.NodeSelectorOpIn, zones...), + scheduling.NewRequirement(corev1.LabelTopologyRegion, corev1.NodeSelectorOpIn, region), + scheduling.NewRequirement(corev1.LabelOSStable, corev1.NodeSelectorOpIn, string(corev1.Linux)), + scheduling.NewRequirement(corev1.LabelArchStable, corev1.NodeSelectorOpIn, Architecture), + scheduling.NewRequirement(karpv1.CapacityTypeLabelKey, corev1.NodeSelectorOpIn, capacityTypes...), + // Azure-domain labels (mirrors karpenter-provider-azure conventions). 
+ scheduling.NewRequirement(azurev1beta1.LabelSKUCPU, corev1.NodeSelectorOpIn, vCPU), + scheduling.NewRequirement(azurev1beta1.LabelSKUMemory, corev1.NodeSelectorOpIn, memMiB), + scheduling.NewRequirement(azurev1beta1.AKSLabelCPU, corev1.NodeSelectorOpIn, vCPU), + scheduling.NewRequirement(azurev1beta1.AKSLabelMemory, corev1.NodeSelectorOpIn, memMiB), + scheduling.NewRequirement(azurev1beta1.LabelSKUGPUCount, corev1.NodeSelectorOpIn, gpu), + ) +} + +func capacity(key NodeClassKey, e *CatalogEntry) corev1.ResourceList { + osDisk := *resource.NewScaledQuantity(key.OSDiskSizeGiB, resource.Giga) + pods := resource.MustParse(fmt.Sprintf("%d", key.PerNodePodsCount)) + mem := resource.NewScaledQuantity(e.MemoryGB, resource.Giga) + cpu := resource.NewQuantity(e.VCPU, resource.DecimalSI) + gpu := resource.NewQuantity(e.GPU, resource.DecimalSI) + return corev1.ResourceList{ + corev1.ResourceCPU: *cpu, + corev1.ResourceMemory: *mem, + corev1.ResourceEphemeralStorage: osDisk, + corev1.ResourcePods: pods, + corev1.ResourceName("nvidia.com/gpu"): *gpu, + } +} + +func overhead(e *CatalogEntry) *karpcloudprovider.InstanceTypeOverhead { + return &karpcloudprovider.InstanceTypeOverhead{ + KubeReserved: azinstancetype.KubeReservedResources(e.VCPU, float64(e.MemoryGB)), + SystemReserved: azinstancetype.SystemReservedResources(), + EvictionThreshold: azinstancetype.EvictionThreshold(), + } +} diff --git a/karpenter/pkg/cloudproviders/azure/instancetype/offerings.go b/karpenter/pkg/cloudproviders/azure/instancetype/offerings.go new file mode 100644 index 0000000..6b01985 --- /dev/null +++ b/karpenter/pkg/cloudproviders/azure/instancetype/offerings.go @@ -0,0 +1,31 @@ +package instancetype + +import ( + corev1 "k8s.io/api/core/v1" + karpv1 "sigs.k8s.io/karpenter/pkg/apis/v1" + karpcloudprovider "sigs.k8s.io/karpenter/pkg/cloudprovider" + "sigs.k8s.io/karpenter/pkg/scheduling" +) + +// defaultPrice is the placeholder price used for all Phase 1 catalog SKUs. 
The +// real price is irrelevant because consolidation is not enabled across cross- +// region NodePools — but Karpenter requires a non-zero price for ordering. +const defaultPrice = 1.0 + +// Offerings returns the cross-region offerings for a single SKU. Phase 1: +// - on-demand only (no spot) +// - empty zone (cross-region zonal placement is deferred) +func Offerings(region string) karpcloudprovider.Offerings { + return karpcloudprovider.Offerings{ + { + Price: defaultPrice, + Available: true, + Requirements: scheduling.NewRequirements( + scheduling.NewRequirement(karpv1.CapacityTypeLabelKey, corev1.NodeSelectorOpIn, karpv1.CapacityTypeOnDemand), + // Empty zone string — Phase 1 is region-only. Karpenter requires + // the zone requirement to exist on every offering. + scheduling.NewRequirement(corev1.LabelTopologyZone, corev1.NodeSelectorOpIn, ""), + ), + }, + } +} diff --git a/karpenter/pkg/cloudproviders/azure/instancetype/provider.go b/karpenter/pkg/cloudproviders/azure/instancetype/provider.go new file mode 100644 index 0000000..20d49f0 --- /dev/null +++ b/karpenter/pkg/cloudproviders/azure/instancetype/provider.go @@ -0,0 +1,73 @@ +package instancetype + +import ( + "fmt" + + "github.com/samber/lo" + corev1 "k8s.io/api/core/v1" + karpv1 "sigs.k8s.io/karpenter/pkg/apis/v1" + karpcloudprovider "sigs.k8s.io/karpenter/pkg/cloudprovider" + "sigs.k8s.io/karpenter/pkg/scheduling" +) + +// Provider returns InstanceTypes for a given AzureFlexNodeClass key. There's +// no caching or background refresh because the Phase 1 catalog is hardcoded. +type Provider struct{} + +func NewProvider() *Provider { return &Provider{} } + +// GetInstanceTypes returns one InstanceType per catalog entry, all rooted at +// the NodeClass's region. Order matches Catalog (stable across calls). 
+func (p *Provider) GetInstanceTypes(key NodeClassKey) []*karpcloudprovider.InstanceType {
+	offerings := Offerings(key.Region)
+	out := make([]*karpcloudprovider.InstanceType, 0, len(Catalog))
+	for i := range Catalog {
+		out = append(out, New(key, &Catalog[i], offerings))
+	}
+	return out
+}
+
+// GetByName returns a single InstanceType by SKU name, or nil if the SKU is
+// not in the Phase 1 catalog.
+func (p *Provider) GetByName(key NodeClassKey, name string) *karpcloudprovider.InstanceType {
+	entry := Get(name)
+	if entry == nil {
+		return nil
+	}
+	return New(key, entry, Offerings(key.Region))
+}
+
+// ResolveFromNodeClaim picks the catalog SKU that matches the NodeClaim's
+// requirements. Phase 1 chooses the first matching catalog entry by stable
+// catalog order (price is uniform). Returns an error if no catalog SKU
+// satisfies the requirements.
+func (p *Provider) ResolveFromNodeClaim(
+	key NodeClassKey,
+	requirements []karpv1.NodeSelectorRequirementWithMinValues,
+) (*karpcloudprovider.InstanceType, error) {
+	reqs := scheduling.NewNodeSelectorRequirementsWithMinValues(requirements...)
+	// Pre-filter only on an explicit In list: for NotIn the values are exclusions.
+	requested := map[string]struct{}{}
+	if itReq := reqs.Get(corev1.LabelInstanceTypeStable); itReq != nil && itReq.Operator() == corev1.NodeSelectorOpIn {
+		for _, v := range itReq.Values() {
+			requested[v] = struct{}{}
+		}
+	}
+
+	for i := range Catalog {
+		e := &Catalog[i]
+		if len(requested) > 0 {
+			if _, ok := requested[e.Name]; !ok {
+				continue
+			}
+		}
+		it := New(key, e, Offerings(key.Region))
+		if !reqs.IsCompatible(it.Requirements, scheduling.AllowUndefinedWellKnownLabels) {
+			continue
+		}
+		return it, nil
+	}
+
+	return nil, fmt.Errorf("no AzureFlex catalog SKU matches NodeClaim requirements (requested=%v)",
+		lo.Keys(requested))
+}
diff --git a/karpenter/pkg/cloudproviders/azure/log.go b/karpenter/pkg/cloudproviders/azure/log.go
new file mode 100644
index 0000000..b0bcbdd
--- /dev/null
+++ b/karpenter/pkg/cloudproviders/azure/log.go
@@ -0,0 +1,12 @@
+package azure
+
+import (
+	"context"
+
+	"github.com/go-logr/logr"
+	"sigs.k8s.io/controller-runtime/pkg/log"
+)
+
+func loggerFromContext(ctx context.Context) logr.Logger {
+	return log.FromContext(ctx).WithName(ProviderIDScheme)
+}
diff --git a/karpenter/pkg/cloudproviders/azure/nodeclaim.go b/karpenter/pkg/cloudproviders/azure/nodeclaim.go
new file mode 100644
index 0000000..f72afc7
--- /dev/null
+++ b/karpenter/pkg/cloudproviders/azure/nodeclaim.go
@@ -0,0 +1,222 @@
+package azure
+
+import (
+	"crypto/sha256"
+	"encoding/hex"
+	"fmt"
+	"net/url"
+	"sort"
+	"strings"
+
+	"github.com/Azure/karpenter-provider-azure/pkg/operator/options"
+	labelspkg "github.com/Azure/karpenter-provider-azure/pkg/providers/labels"
+	"github.com/samber/lo"
+	corev1 "k8s.io/api/core/v1"
+	"k8s.io/apimachinery/pkg/api/resource"
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	v1 "sigs.k8s.io/karpenter/pkg/apis/v1"
+	"sigs.k8s.io/karpenter/pkg/cloudprovider"
+	"sigs.k8s.io/karpenter/pkg/utils/resources"
+
+	stretchapi "github.com/Azure/aks-flex/plugin/api"
+	"github.com/Azure/aks-flex/plugin/pkg/services/agentpools/api/features/kubeadm"
+	flexvm "github.com/Azure/aks-flex/plugin/pkg/services/agentpools/azure/flexvm"
+	"github.com/Azure/aks-flex/plugin/pkg/topology"
+
+	"github.com/Azure/aks-flex/karpenter/pkg/apis/v1alpha1"
+	"github.com/Azure/aks-flex/karpenter/pkg/cloudproviders"
+)
+
+// providerID format:
+//
+//	azure-flex:///subscriptions/<sub>/resourceGroups/<rg>/providers/Microsoft.Compute/virtualMachines/<name>
+//
+// Three slashes after the scheme: empty host, then the canonical ARM resource
+// id (which starts with a slash). Round-trip via [providerIDToARMID] /
+// [armIDToProviderID] is lossless.
+
+// armIDToProviderID prefixes armID with the azure-flex scheme, adding the
+// leading slash first when armID lacks one.
+func armIDToProviderID(armID string) string {
+	if !strings.HasPrefix(armID, "/") {
+		armID = "/" + armID
+	}
+	return ProviderIDScheme + "://" + armID
+}
+
+// providerIDToARMID checks that providerID uses the azure-flex scheme with an
+// empty host and returns its path component, i.e. the ARM resource ID.
+func providerIDToARMID(providerID string) (string, error) {
+	u, err := url.Parse(providerID)
+	if err != nil {
+		return "", fmt.Errorf("parsing providerID %q: %w", providerID, err)
+	}
+	if u.Scheme != ProviderIDScheme {
+		return "", fmt.Errorf("unexpected providerID scheme %q, expected %q", u.Scheme, ProviderIDScheme)
+	}
+	if u.Host != "" {
+		// Canonical form has empty host. If there's anything in the host
+		// position the providerID was constructed wrong.
+		return "", fmt.Errorf("providerID %q has unexpected host %q", providerID, u.Host)
+	}
+	if u.Path == "" {
+		return "", fmt.Errorf("providerID %q has empty ARM path", providerID)
+	}
+	return u.Path, nil
+}
+
+// providerIDToVMName extracts the VM name (last path segment) from the providerID.
+// It fails when that segment is empty, e.g. for "azure-flex:///" or a trailing slash.
+func providerIDToVMName(providerID string) (string, error) {
+	armID, err := providerIDToARMID(providerID)
+	if err != nil {
+		return "", err
+	}
+	// strings.Split never yields an empty slice, so a length check cannot catch
+	// a missing name; inspect the final segment itself.
+	name := armID[strings.LastIndex(armID, "/")+1:]
+	if name == "" {
+		return "", fmt.Errorf("providerID %q has no name segment", providerID)
+	}
+	return name, nil
+}
+
+// agentPoolToNodeClaim rebuilds a karpenter NodeClaim from a flexvm AgentPool
+// returned by the plugin. The instanceType supplies the well-known scheduling
+// labels and capacity.
+func agentPoolToNodeClaim(
+	ap *flexvm.AgentPool,
+	instanceType *cloudprovider.InstanceType,
+) *v1.NodeClaim {
+	rv := &v1.NodeClaim{
+		ObjectMeta: metav1.ObjectMeta{
+			Name:              ap.GetMetadata().GetId(),
+			Labels:            map[string]string{},
+			Annotations:       map[string]string{},
+			CreationTimestamp: metav1.NewTime(ap.GetStatus().GetCreatedAt().AsTime()),
+		},
+		Spec: v1.NodeClaimSpec{},
+		Status: v1.NodeClaimStatus{
+			ProviderID: armIDToProviderID(ap.GetStatus().GetVmResourceId()),
+		},
+	}
+
+	if instanceType != nil {
+		// Merge instead of replacing so rv.Labels stays non-nil for the writes below.
+		rv.Labels = lo.Assign(rv.Labels, labelspkg.GetAllSingleValuedRequirementLabels(instanceType.Requirements))
+		rv.Status.Capacity = lo.PickBy(instanceType.Capacity, filterNonZero)
+		rv.Status.Allocatable = lo.PickBy(instanceType.Allocatable(), filterNonZero)
+	}
+
+	// Phase 1: on-demand only, region-only.
+	rv.Labels[v1.CapacityTypeLabelKey] = v1.CapacityTypeOnDemand
+	rv.Labels[corev1.LabelTopologyRegion] = ap.GetSpec().GetLocation()
+
+	return rv
+}
+
+// nodeClaimToAgentPool builds the plugin AgentPool message from the karpenter
+// NodeClass + NodeClaim + resolved instance type. Resource names are entirely
+// deterministic from the NodeClaim name (for retry idempotency).
+func nodeClaimToAgentPool( + karpOpts *options.Options, + clusterCA []byte, + nodeClass *v1alpha1.AzureFlexNodeClass, + nodeClaim *v1.NodeClaim, + instanceType *cloudprovider.InstanceType, +) *flexvm.AgentPool { + mdBuilder := stretchapi.Metadata_builder{ + Id: lo.ToPtr(nodeClaim.Name), + } + + osDiskSize := lo.FromPtrOr(nodeClass.Spec.OSDiskSizeGB, 128) + securityType := lo.FromPtrOr(nodeClass.Spec.SecurityType, "Standard") + + kubeadmConfig := kubeadm.Config_builder{ + Server: lo.ToPtr(karpOpts.ClusterEndpoint), + CertificateAuthorityData: clusterCA, + Token: lo.ToPtr(karpOpts.KubeletClientTLSBootstrapToken), + NodeLabels: map[string]string{ + cloudproviders.NodeClaimLabelKey: nodeClaim.Name, + topology.NodeLabelKeyCloudProviderManaged: "false", + topology.NodeLabelKeyCloudProviderCluster: karpOpts.NodeResourceGroup, + topology.NodeLabelKeyStretchManaged: "true", + }, + }.Build() + kubeadmConfig.AddNodeLabels(map[string]string{ + corev1.LabelInstanceTypeStable: instanceType.Name, + corev1.LabelTopologyRegion: nodeClass.Spec.Location, + // Empty zone — region-only Phase 1. 
+ corev1.LabelTopologyZone: "", + v1.CapacityTypeLabelKey: v1.CapacityTypeOnDemand, + "kubernetes.azure.com/mode": "user", + }) + kubeadmConfig.AddK8SRegisterTaints(v1.UnregisteredNoExecuteTaint) + + specBuilder := flexvm.AgentPoolSpec_builder{ + SubscriptionId: lo.ToPtr(nodeClass.Spec.SubscriptionID), + ResourceGroup: lo.ToPtr(nodeClass.Spec.ResourceGroup), + Location: lo.ToPtr(nodeClass.Spec.Location), + SubnetId: lo.ToPtr(nodeClass.Spec.SubnetID), + VmSize: lo.ToPtr(instanceType.Name), + SecurityType: lo.ToPtr(securityType), + OsDiskSizeGb: lo.ToPtr(int32(osDiskSize)), + SshPublicKeys: nodeClass.Spec.SSHPublicKeys, + AllocatePublicIp: lo.ToPtr(lo.FromPtrOr(nodeClass.Spec.AllocateNodePublicIP, false)), + Tags: nodeClass.Spec.Tags, + Kubeadm: kubeadmConfig, + } + if ref := nodeClass.Spec.ImageReference; ref != nil { + specBuilder.ImageReference = flexvm.ImageReference_builder{ + Publisher: lo.ToPtr(ref.Publisher), + Offer: lo.ToPtr(ref.Offer), + Sku: lo.ToPtr(ref.SKU), + Version: lo.ToPtr(ref.Version), + }.Build() + } + if id := lo.FromPtrOr(nodeClass.Spec.ImageID, ""); id != "" { + specBuilder.ImageId = lo.ToPtr(id) + } + + return flexvm.AgentPool_builder{ + Metadata: mdBuilder.Build(), + Spec: specBuilder.Build(), + }.Build() +} + +// driftHash returns a deterministic hex digest over the AzureFlexNodeClass +// fields whose change must trigger node drift. Mirrors the nebius "rebuild +// from spec" pattern. 
+func driftHash(spec v1alpha1.AzureFlexNodeClassSpec) string { + h := sha256.New() + write := func(s string) { _, _ = h.Write([]byte(s)); _, _ = h.Write([]byte{0}) } + + write(spec.SubscriptionID) + write(spec.Location) + write(spec.ResourceGroup) + write(spec.SubnetID) + write(lo.FromPtrOr(spec.SecurityType, "Standard")) + write(fmt.Sprintf("%d", lo.FromPtrOr(spec.OSDiskSizeGB, 128))) + if ref := spec.ImageReference; ref != nil { + write("imgref:" + ref.Publisher + "|" + ref.Offer + "|" + ref.SKU + "|" + ref.Version) + } else { + write("imgref:") + } + write("imgid:" + lo.FromPtrOr(spec.ImageID, "")) + // Tags affect downstream observability/billing but not the VM identity. + // They DO contribute to drift so an operator-driven tag rotation forces + // nodes to reconcile. + keys := make([]string, 0, len(spec.Tags)) + for k := range spec.Tags { + keys = append(keys, k) + } + sort.Strings(keys) + for _, k := range keys { + write("tag:" + k + "=" + spec.Tags[k]) + } + return hex.EncodeToString(h.Sum(nil)) +} + +func filterNonZero(_ corev1.ResourceName, q resource.Quantity) bool { + return !resources.IsZero(q) +} diff --git a/karpenter/pkg/cloudproviders/azure/nodeclaim_test.go b/karpenter/pkg/cloudproviders/azure/nodeclaim_test.go new file mode 100644 index 0000000..3e3f37a --- /dev/null +++ b/karpenter/pkg/cloudproviders/azure/nodeclaim_test.go @@ -0,0 +1,95 @@ +package azure + +import ( + "strings" + "testing" + + "github.com/Azure/aks-flex/karpenter/pkg/apis/v1alpha1" +) + +func TestProviderIDRoundTrip(t *testing.T) { + armID := "/subscriptions/00000000-0000-0000-0000-000000000000/resourceGroups/my-rg/providers/Microsoft.Compute/virtualMachines/nodeclaim-abc" + pid := armIDToProviderID(armID) + + if !strings.HasPrefix(pid, "azure-flex:///subscriptions/") { + t.Fatalf("providerID %q must have azure-flex:/// prefix and a slash before subscriptions", pid) + } + + got, err := providerIDToARMID(pid) + if err != nil { + t.Fatalf("providerIDToARMID: %v", err) + } + if 
got != armID { + t.Fatalf("round-trip mismatch:\n in: %s\n out: %s", armID, got) + } + + name, err := providerIDToVMName(pid) + if err != nil { + t.Fatalf("providerIDToVMName: %v", err) + } + if name != "nodeclaim-abc" { + t.Fatalf("expected name nodeclaim-abc, got %s", name) + } +} + +func TestProviderIDInvalidScheme(t *testing.T) { + cases := []string{ + "aks-nebius://abc", + "https://example.com/foo", + "azure:///subscriptions/x/y", + "not-a-url", + } + for _, c := range cases { + t.Run(c, func(t *testing.T) { + if _, err := providerIDToARMID(c); err == nil { + t.Fatalf("expected error parsing providerID %q", c) + } + }) + } +} + +func TestProviderIDRejectsHost(t *testing.T) { + // Three slashes are required: azure-flex:///. Two slashes followed + // by something puts that something in the URL host, which we reject. + bad := "azure-flex://hostname/subscriptions/x/y" + if _, err := providerIDToARMID(bad); err == nil { + t.Fatalf("expected error for providerID with host segment") + } +} + +func TestDriftHashDeterministic(t *testing.T) { + mk := func() v1alpha1.AzureFlexNodeClassSpec { + size := int32(256) + sec := "Standard" + return v1alpha1.AzureFlexNodeClassSpec{ + SubscriptionID: "sub", + Location: "eastus2", + ResourceGroup: "rg", + SubnetID: "/subscriptions/sub/resourceGroups/rg/providers/Microsoft.Network/virtualNetworks/v/subnets/s", + SecurityType: &sec, + OSDiskSizeGB: &size, + Tags: map[string]string{"a": "1", "b": "2"}, + } + } + a := driftHash(mk()) + b := driftHash(mk()) + if a != b { + t.Fatalf("hash should be deterministic: %s != %s", a, b) + } + + // Tags in different insertion order should yield the same hash because + // driftHash sorts tag keys. Map iteration order is non-deterministic, so + // build with the same content and just verify equality is preserved. 
+ c := mk() + c.Tags = map[string]string{"b": "2", "a": "1"} + if driftHash(c) != a { + t.Fatalf("hash must not depend on map insertion order") + } + + // Different subnet → different hash. + d := mk() + d.SubnetID = "/subscriptions/sub/resourceGroups/rg/providers/Microsoft.Network/virtualNetworks/v/subnets/other" + if driftHash(d) == a { + t.Fatalf("different subnet should produce different hash") + } +} From 43f1b52aade7686da37835b2aabee94f22162405 Mon Sep 17 00:00:00 2001 From: chokevin <11001563+chokevin@users.noreply.github.com> Date: Wed, 22 Apr 2026 11:29:13 -0700 Subject: [PATCH 05/20] feat(karpenter): add azure nodeclass status+termination controllers Adds the two AzureFlexNodeClass reconcilers (status and termination), mirrored almost line-for-line from the nebius equivalents. Status controller adds a finalizer and sets the ValidationSucceeded condition based on cheap shape checks (required fields, subnet ARM-ID prefix, imageReference vs imageID mutual exclusion). Termination controller blocks NodeClass deletion until all owning NodeClaims are gone, re-emitting a WaitingOnNodeClaimTermination event every 10m. 
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 .../pkg/controllers/azure/nodeclass_status.go | 127 +++++++++++++
 .../azure/nodeclass_status_test.go            |  40 +++++
 .../azure/nodeclass_termination.go            | 146 ++++++++++++++++++
 3 files changed, 313 insertions(+)
 create mode 100644 karpenter/pkg/controllers/azure/nodeclass_status.go
 create mode 100644 karpenter/pkg/controllers/azure/nodeclass_status_test.go
 create mode 100644 karpenter/pkg/controllers/azure/nodeclass_termination.go

diff --git a/karpenter/pkg/controllers/azure/nodeclass_status.go b/karpenter/pkg/controllers/azure/nodeclass_status.go
new file mode 100644
index 0000000..61792a9
--- /dev/null
+++ b/karpenter/pkg/controllers/azure/nodeclass_status.go
@@ -0,0 +1,127 @@
+package azure
+
+import (
+	"context"
+	"fmt"
+	"strings"
+
+	opcontroller "github.com/awslabs/operatorpkg/controller"
+	"github.com/awslabs/operatorpkg/reasonable"
+	"k8s.io/apimachinery/pkg/api/equality"
+	controllerruntime "sigs.k8s.io/controller-runtime"
+	"sigs.k8s.io/controller-runtime/pkg/client"
+	"sigs.k8s.io/controller-runtime/pkg/controller"
+	"sigs.k8s.io/controller-runtime/pkg/controller/controllerutil"
+	"sigs.k8s.io/controller-runtime/pkg/manager"
+	"sigs.k8s.io/controller-runtime/pkg/reconcile"
+	"sigs.k8s.io/karpenter/pkg/operator/injection"
+
+	"github.com/Azure/aks-flex/karpenter/pkg/apis/v1alpha1"
+)
+
+const controllerNameStatus = "azureflex_nodeclass.status"
+
+// NodeClassStatusController reconciles AzureFlexNodeClass objects: it adds the
+// termination finalizer and records the outcome of validateSpec in the
+// ValidationSucceeded status condition.
+type NodeClassStatusController struct {
+	kubeClient client.Client
+}
+
+var (
+	_ opcontroller.Controller                                  = (*NodeClassStatusController)(nil)
+	_ reconcile.ObjectReconciler[*v1alpha1.AzureFlexNodeClass] = (*NodeClassStatusController)(nil)
+)
+
+// NewNodeClassStatusController returns a status controller backed by kubeClient.
+func NewNodeClassStatusController(kubeClient client.Client) *NodeClassStatusController {
+	return &NodeClassStatusController{kubeClient: kubeClient}
+}
+
+// Register wires the controller into mgr, watching AzureFlexNodeClass objects.
+func (c *NodeClassStatusController) Register(_ context.Context, mgr manager.Manager) error {
+	return controllerruntime.NewControllerManagedBy(mgr).
+		Named(controllerNameStatus).
+		For(&v1alpha1.AzureFlexNodeClass{}).
+		WithOptions(controller.Options{
+			RateLimiter:             reasonable.RateLimiter(),
+			MaxConcurrentReconciles: 10,
+		}).
+		Complete(reconcile.AsReconciler(mgr.GetClient(), c))
+}
+
+// Reconcile ensures the termination finalizer is present, validates the spec,
+// and patches the status subresource when the resulting object differs.
+func (c *NodeClassStatusController) Reconcile(
+	ctx context.Context,
+	nodeClass *v1alpha1.AzureFlexNodeClass,
+) (reconcile.Result, error) {
+	ctx = injection.WithControllerName(ctx, controllerNameStatus)
+
+	existing := nodeClass
+	future := nodeClass.DeepCopy()
+
+	if err := c.ensureFinalizer(ctx, future); err != nil {
+		return reconcile.Result{}, err
+	}
+
+	if err := validateSpec(future.Spec); err != nil {
+		future.StatusConditions().SetFalse(
+			v1alpha1.ConditionTypeValidationSucceeded, "InvalidSpec", err.Error(),
+		)
+	} else {
+		future.StatusConditions().SetTrue(v1alpha1.ConditionTypeValidationSucceeded)
+	}
+
+	if !equality.Semantic.DeepEqual(existing, future) {
+		if err := c.kubeClient.Status().Patch(ctx, future, client.MergeFrom(existing)); err != nil {
+			return reconcile.Result{}, err
+		}
+	}
+
+	return reconcile.Result{}, nil
+}
+
+// ensureFinalizer adds the termination finalizer to nodeClass when it is
+// missing. Objects that are already being deleted are left untouched, since
+// the API server rejects new finalizers once deletionTimestamp is set.
+func (c *NodeClassStatusController) ensureFinalizer(
+	ctx context.Context,
+	nodeClass *v1alpha1.AzureFlexNodeClass,
+) error {
+	if !nodeClass.DeletionTimestamp.IsZero() {
+		return nil
+	}
+	if controllerutil.ContainsFinalizer(nodeClass, v1alpha1.TerminationFinalizer) {
+		return nil
+	}
+	// Snapshot before mutating: a merge patch computed against the already
+	// mutated object is empty, so the finalizer would never be persisted.
+	base := nodeClass.DeepCopy()
+	controllerutil.AddFinalizer(nodeClass, v1alpha1.TerminationFinalizer)
+	if err := c.kubeClient.Patch(ctx, nodeClass, client.MergeFrom(base)); err != nil {
+		return fmt.Errorf("patch finalizer: %w", err)
+	}
+	return nil
+}
+
+// validateSpec performs cheap shape checks. Anything Azure-side (subnet
+// existence, RG existence, identity perms) is detected on Create.
+func validateSpec(spec v1alpha1.AzureFlexNodeClassSpec) error { + if strings.TrimSpace(spec.SubscriptionID) == "" { + return fmt.Errorf("subscriptionID is required") + } + if strings.TrimSpace(spec.Location) == "" { + return fmt.Errorf("location is required") + } + if strings.TrimSpace(spec.ResourceGroup) == "" { + return fmt.Errorf("resourceGroup is required") + } + if !strings.HasPrefix(spec.SubnetID, "/subscriptions/") { + return fmt.Errorf("subnetID %q must be a full ARM resource ID", spec.SubnetID) + } + if spec.ImageReference != nil && spec.ImageID != nil && *spec.ImageID != "" { + return fmt.Errorf("imageReference and imageID are mutually exclusive") + } + return nil +} diff --git a/karpenter/pkg/controllers/azure/nodeclass_status_test.go b/karpenter/pkg/controllers/azure/nodeclass_status_test.go new file mode 100644 index 0000000..760d928 --- /dev/null +++ b/karpenter/pkg/controllers/azure/nodeclass_status_test.go @@ -0,0 +1,40 @@ +package azure + +import ( + "testing" + + "github.com/Azure/aks-flex/karpenter/pkg/apis/v1alpha1" +) + +func TestValidateSpec(t *testing.T) { + good := v1alpha1.AzureFlexNodeClassSpec{ + SubscriptionID: "sub", + Location: "eastus2", + ResourceGroup: "rg", + SubnetID: "/subscriptions/sub/resourceGroups/rg/providers/Microsoft.Network/virtualNetworks/v/subnets/s", + } + if err := validateSpec(good); err != nil { + t.Fatalf("expected good spec to pass: %v", err) + } + + bad := []v1alpha1.AzureFlexNodeClassSpec{ + {Location: "eastus2", ResourceGroup: "rg", SubnetID: "/subscriptions/x"}, // missing sub + {SubscriptionID: "sub", ResourceGroup: "rg", SubnetID: "/subscriptions/x"}, // missing loc + {SubscriptionID: "sub", Location: "eastus2", SubnetID: "/subscriptions/x"}, // missing rg + {SubscriptionID: "sub", Location: "eastus2", ResourceGroup: "rg", SubnetID: "not-an-arm-id"}, + } + for i, s := range bad { + if err := validateSpec(s); err == nil { + t.Errorf("case %d: expected validateSpec to fail", i) + } + } + + // Mutually 
exclusive fields. + id := "/subscriptions/sub/.../images/x" + mut := good + mut.ImageReference = &v1alpha1.AzureFlexImageReference{Publisher: "p", Offer: "o", SKU: "s"} + mut.ImageID = &id + if err := validateSpec(mut); err == nil { + t.Fatalf("imageReference + imageID must be mutually exclusive") + } +} diff --git a/karpenter/pkg/controllers/azure/nodeclass_termination.go b/karpenter/pkg/controllers/azure/nodeclass_termination.go new file mode 100644 index 0000000..4a04b01 --- /dev/null +++ b/karpenter/pkg/controllers/azure/nodeclass_termination.go @@ -0,0 +1,146 @@ +package azure + +import ( + "context" + "fmt" + "time" + + "github.com/Azure/karpenter-provider-azure/pkg/utils" + opcontroller "github.com/awslabs/operatorpkg/controller" + "github.com/awslabs/operatorpkg/reasonable" + "github.com/samber/lo" + corev1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/api/equality" + "k8s.io/apimachinery/pkg/api/errors" + "k8s.io/apimachinery/pkg/runtime" + "k8s.io/apimachinery/pkg/types" + controllerruntime "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/builder" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/controller" + "sigs.k8s.io/controller-runtime/pkg/controller/controllerutil" + "sigs.k8s.io/controller-runtime/pkg/event" + "sigs.k8s.io/controller-runtime/pkg/handler" + "sigs.k8s.io/controller-runtime/pkg/manager" + "sigs.k8s.io/controller-runtime/pkg/predicate" + "sigs.k8s.io/controller-runtime/pkg/reconcile" + karpv1 "sigs.k8s.io/karpenter/pkg/apis/v1" + "sigs.k8s.io/karpenter/pkg/events" + "sigs.k8s.io/karpenter/pkg/operator/injection" + + "github.com/Azure/aks-flex/karpenter/pkg/apis" + "github.com/Azure/aks-flex/karpenter/pkg/apis/v1alpha1" +) + +const ( + controllerNameTermination = "azureflex_nodeclass.termination" + nodeClassKind = "AzureFlexNodeClass" +) + +type NodeClassTerminationController struct { + kubeClient client.Client + recorder events.Recorder +} + +var ( + _ opcontroller.Controller = 
(*NodeClassTerminationController)(nil) + _ reconcile.ObjectReconciler[*v1alpha1.AzureFlexNodeClass] = (*NodeClassTerminationController)(nil) +) + +func NewNodeClassTerminationController( + kubeClient client.Client, + recorder events.Recorder, +) *NodeClassTerminationController { + return &NodeClassTerminationController{kubeClient: kubeClient, recorder: recorder} +} + +func (c *NodeClassTerminationController) Register(_ context.Context, mgr manager.Manager) error { + return controllerruntime.NewControllerManagedBy(mgr). + Named(controllerNameTermination). + For(&v1alpha1.AzureFlexNodeClass{}). + Watches( + &karpv1.NodeClaim{}, + handler.EnqueueRequestsFromMapFunc(func(_ context.Context, o client.Object) []reconcile.Request { + nc := o.(*karpv1.NodeClaim) + if nc.Spec.NodeClassRef == nil { + return nil + } + if nc.Spec.NodeClassRef.Group != apis.Group { + return nil + } + if nc.Spec.NodeClassRef.Kind != nodeClassKind { + return nil + } + return []reconcile.Request{{NamespacedName: types.NamespacedName{Name: nc.Spec.NodeClassRef.Name}}} + }), + builder.WithPredicates(predicate.Funcs{ + CreateFunc: func(_ event.CreateEvent) bool { return false }, + UpdateFunc: func(_ event.UpdateEvent) bool { return false }, + DeleteFunc: func(_ event.DeleteEvent) bool { return true }, + }), + ). + WithOptions(controller.Options{ + RateLimiter: reasonable.RateLimiter(), + MaxConcurrentReconciles: 10, + }). 
+ Complete(reconcile.AsReconciler(mgr.GetClient(), c)) +} + +func (c *NodeClassTerminationController) Reconcile( + ctx context.Context, + nodeClass *v1alpha1.AzureFlexNodeClass, +) (reconcile.Result, error) { + ctx = injection.WithControllerName(ctx, controllerNameTermination) + if nodeClass.GetDeletionTimestamp().IsZero() { + return reconcile.Result{}, nil + } + return c.finalize(ctx, nodeClass) +} + +func (c *NodeClassTerminationController) finalize( + ctx context.Context, + nodeClass *v1alpha1.AzureFlexNodeClass, +) (reconcile.Result, error) { + if !controllerutil.ContainsFinalizer(nodeClass, v1alpha1.TerminationFinalizer) { + return reconcile.Result{}, nil + } + + stored := nodeClass.DeepCopy() + + nodeClaimList := &karpv1.NodeClaimList{} + if err := c.kubeClient.List(ctx, nodeClaimList, client.MatchingFields{"spec.nodeClassRef.name": nodeClass.Name}); err != nil { + return reconcile.Result{}, fmt.Errorf("listing nodeclaims using nodeclass: %w", err) + } + if len(nodeClaimList.Items) > 0 { + c.recorder.Publish(WaitingOnNodeClaimTerminationEvent(nodeClass, + lo.Map(nodeClaimList.Items, func(nc karpv1.NodeClaim, _ int) string { return nc.Name }))) + return reconcile.Result{RequeueAfter: 10 * time.Minute}, nil + } + + controllerutil.RemoveFinalizer(nodeClass, v1alpha1.TerminationFinalizer) + if !equality.Semantic.DeepEqual(stored, nodeClass) { + if err := c.kubeClient.Patch(ctx, nodeClass, + client.MergeFromWithOptions(stored, client.MergeFromWithOptimisticLock{})); err != nil { + if errors.IsConflict(err) { + return reconcile.Result{Requeue: true}, nil + } + return reconcile.Result{}, client.IgnoreNotFound(fmt.Errorf("removing termination finalizer: %w", err)) + } + } + return reconcile.Result{}, nil +} + +type RuntimeObjectWithUID interface { + runtime.Object + GetUID() types.UID +} + +func WaitingOnNodeClaimTerminationEvent(nodeClass RuntimeObjectWithUID, names []string) events.Event { + return events.Event{ + InvolvedObject: nodeClass, + Type: 
corev1.EventTypeNormal, + Reason: "WaitingOnNodeClaimTermination", + Message: fmt.Sprintf("Waiting on NodeClaim termination for %s", utils.PrettySlice(names, 5)), + DedupeValues: []string{string(nodeClass.GetUID())}, + } +} From 100ca7fd3bcb69d6936b021a8b628be55900749d Mon Sep 17 00:00:00 2001 From: chokevin <11001563+chokevin@users.noreply.github.com> Date: Wed, 22 Apr 2026 11:29:13 -0700 Subject: [PATCH 06/20] feat(karpenter): wire azure cloudprovider into controller main Registers the new azure cloudprovider in the cloudproviders hub alongside aks/nebius/kaito, adds AzureFlexNodeClass to the WaitForCRDs list, and registers the two AzureFlex controllers in flexcontrollers.NewControllers. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- karpenter/cmd/controller/main.go | 13 +++++++++++++ karpenter/pkg/controllers/controllers.go | 4 ++++ 2 files changed, 17 insertions(+) diff --git a/karpenter/cmd/controller/main.go b/karpenter/cmd/controller/main.go index dca0fbf..d08e5a9 100644 --- a/karpenter/cmd/controller/main.go +++ b/karpenter/cmd/controller/main.go @@ -26,6 +26,7 @@ import ( kaitov1alpha1 "github.com/Azure/aks-flex/karpenter/pkg/apis/kaito/v1alpha1" "github.com/Azure/aks-flex/karpenter/pkg/apis/v1alpha1" flexcloudproviders "github.com/Azure/aks-flex/karpenter/pkg/cloudproviders" + azureflex "github.com/Azure/aks-flex/karpenter/pkg/cloudproviders/azure" "github.com/Azure/aks-flex/karpenter/pkg/cloudproviders/kaito" "github.com/Azure/aks-flex/karpenter/pkg/cloudproviders/nebius" flexcontrollers "github.com/Azure/aks-flex/karpenter/pkg/controllers" @@ -47,6 +48,7 @@ func main() { operator.WaitForCRDs( ctx, 2*time.Minute, ctrl.GetConfigOrDie(), logger, &v1alpha1.NebiusNodeClass{}, + &v1alpha1.AzureFlexNodeClass{}, &kaitov1alpha1.KaitoNodeClass{}, ), "failed waiting for CRDs", @@ -119,6 +121,17 @@ func main() { lo.Must0(err, "registering kaito cloud provider") } + // azure-flex (cross-region single-VM Azure cloud provider) + { + err := 
azureflex.Register( + ctx, + hubCloudProvider, + op.GetClient(), + clusterCA, + ) + lo.Must0(err, "registering azure-flex cloud provider") + } + overlayUndecoratedCloudProvider := metrics.Decorate(hubCloudProvider) cloudProvider := overlay.Decorate(overlayUndecoratedCloudProvider, op.GetClient(), op.InstanceTypeStore) clusterState := state.NewCluster(op.Clock, op.GetClient(), cloudProvider) diff --git a/karpenter/pkg/controllers/controllers.go b/karpenter/pkg/controllers/controllers.go index 29f7172..a3d04f6 100644 --- a/karpenter/pkg/controllers/controllers.go +++ b/karpenter/pkg/controllers/controllers.go @@ -7,6 +7,7 @@ import ( "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/karpenter/pkg/events" + "github.com/Azure/aks-flex/karpenter/pkg/controllers/azure" "github.com/Azure/aks-flex/karpenter/pkg/controllers/nebius" "github.com/Azure/aks-flex/karpenter/pkg/controllers/nodes" ) @@ -21,6 +22,9 @@ func NewControllers( nebius.NewNodeClassStatusController(kubeClient), nebius.NewNodeClassTerminationController(kubeClient, recorder), + azure.NewNodeClassStatusController(kubeClient), + azure.NewNodeClassTerminationController(kubeClient, recorder), + nodes.NewSetProviderIDController(kubeClient), } } From e0267824b543d018b19df126103092adcdaa29bb Mon Sep 17 00:00:00 2001 From: chokevin <11001563+chokevin@users.noreply.github.com> Date: Wed, 22 Apr 2026 11:29:13 -0700 Subject: [PATCH 07/20] docs(karpenter): add azure example NodeClass+NodePool Adds a worked example for an H200 NodePool in eastus2 backed by an AzureFlexNodeClass, with a 64-GPU limit and consolidation enabled. 
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../azureflexnodeclass-h200-eastus2.yaml | 21 ++++++++++++++ karpenter/examples/azure/nodepool-h200.yaml | 29 +++++++++++++++++++ 2 files changed, 50 insertions(+) create mode 100644 karpenter/examples/azure/azureflexnodeclass-h200-eastus2.yaml create mode 100644 karpenter/examples/azure/nodepool-h200.yaml diff --git a/karpenter/examples/azure/azureflexnodeclass-h200-eastus2.yaml b/karpenter/examples/azure/azureflexnodeclass-h200-eastus2.yaml new file mode 100644 index 0000000..2f335f1 --- /dev/null +++ b/karpenter/examples/azure/azureflexnodeclass-h200-eastus2.yaml @@ -0,0 +1,21 @@ +apiVersion: flex.aks.azure.com/v1alpha1 +kind: AzureFlexNodeClass +metadata: + name: h200-eastus2 +spec: + subscriptionID: 00000000-0000-0000-0000-000000000000 + location: eastus2 + resourceGroup: my-flex-rg + subnetID: /subscriptions/00000000-0000-0000-0000-000000000000/resourceGroups/my-flex-rg/providers/Microsoft.Network/virtualNetworks/flex-vnet/subnets/nodes + imageReference: + publisher: microsoft-dsvm + offer: ubuntu-hpc + sku: "2204" + version: latest + securityType: Standard + osDiskSizeGB: 256 + allocateNodePublicIP: false + maxPodsPerNode: 110 + tags: + purpose: karpenter-flex-h200 + managed-by: aks-flex-karpenter diff --git a/karpenter/examples/azure/nodepool-h200.yaml b/karpenter/examples/azure/nodepool-h200.yaml new file mode 100644 index 0000000..5a5f4b4 --- /dev/null +++ b/karpenter/examples/azure/nodepool-h200.yaml @@ -0,0 +1,29 @@ +apiVersion: karpenter.sh/v1 +kind: NodePool +metadata: + name: h200 +spec: + template: + spec: + nodeClassRef: + group: flex.aks.azure.com + kind: AzureFlexNodeClass + name: h200-eastus2 + requirements: + - key: node.kubernetes.io/instance-type + operator: In + values: + - Standard_ND96isr_H200_v5 + - key: karpenter.sh/capacity-type + operator: In + values: + - on-demand + taints: + - key: nvidia.com/gpu + value: "true" + effect: NoSchedule + limits: + nvidia.com/gpu: 
64 + disruption: + consolidationPolicy: WhenEmpty + consolidateAfter: 30s From c4b32b6811cdc156c7032cdd4dbe6c2c4219efc5 Mon Sep 17 00:00:00 2001 From: Kevin Cho Date: Wed, 22 Apr 2026 12:25:58 -0700 Subject: [PATCH 08/20] fix(azure): address P0/P1 review findings - nodeclass_status: capture base before AddFinalizer; client.MergeFrom on the mutated object produces an empty patch and silently drops the finalizer, allowing nodeclass deletion to bypass termination cleanup. - flexvm/agentpools Create: clean up NIC on PollUntilDone failure (NIC was created but VM never reached a state where DeleteOption cascades), and guard against nil vmResp.ID before deref. - flexvm/agentpools Delete: NIC cleanup uses fresh background context so it still runs if caller cancels mid-VM-delete. - nodeclaim: armIDToProviderID("") returns empty rather than the invalid 'azure-flex:///' URL when status is not yet populated. --- .../pkg/cloudproviders/azure/nodeclaim.go | 7 +++++++ .../pkg/controllers/azure/nodeclass_status.go | 3 ++- .../agentpools/azure/flexvm/agentpools.go | 19 ++++++++++++++++--- 3 files changed, 25 insertions(+), 4 deletions(-) diff --git a/karpenter/pkg/cloudproviders/azure/nodeclaim.go b/karpenter/pkg/cloudproviders/azure/nodeclaim.go index f72afc7..2ffbfeb 100644 --- a/karpenter/pkg/cloudproviders/azure/nodeclaim.go +++ b/karpenter/pkg/cloudproviders/azure/nodeclaim.go @@ -36,6 +36,13 @@ import ( // [armIDToProviderID] is lossless. func armIDToProviderID(armID string) string { + // Empty ARM ID (e.g. status not yet populated by plugin) → empty providerID. + // Karpenter treats empty providerID as "node not yet bound" and will retry, + // rather than producing an invalid `azure-flex:///` URL that breaks + // downstream parsers. 
+ if armID == "" { + return "" + } if !strings.HasPrefix(armID, "/") { armID = "/" + armID } diff --git a/karpenter/pkg/controllers/azure/nodeclass_status.go b/karpenter/pkg/controllers/azure/nodeclass_status.go index 61792a9..9c2f33b 100644 --- a/karpenter/pkg/controllers/azure/nodeclass_status.go +++ b/karpenter/pkg/controllers/azure/nodeclass_status.go @@ -82,8 +82,9 @@ func (c *NodeClassStatusController) ensureFinalizer( if controllerutil.ContainsFinalizer(nodeClass, v1alpha1.TerminationFinalizer) { return nil } + base := nodeClass.DeepCopy() controllerutil.AddFinalizer(nodeClass, v1alpha1.TerminationFinalizer) - if err := c.kubeClient.Patch(ctx, nodeClass, client.MergeFrom(nodeClass)); err != nil { + if err := c.kubeClient.Patch(ctx, nodeClass, client.MergeFrom(base)); err != nil { return fmt.Errorf("patch finalizer: %w", err) } return nil diff --git a/plugin/pkg/services/agentpools/azure/flexvm/agentpools.go b/plugin/pkg/services/agentpools/azure/flexvm/agentpools.go index 5b793e5..ad9251d 100644 --- a/plugin/pkg/services/agentpools/azure/flexvm/agentpools.go +++ b/plugin/pkg/services/agentpools/azure/flexvm/agentpools.go @@ -27,6 +27,7 @@ import ( "errors" "fmt" "strings" + "time" "github.com/Azure/azure-sdk-for-go/sdk/azcore" "github.com/Azure/azure-sdk-for-go/sdk/azcore/arm" @@ -221,8 +222,17 @@ func (srv *agentpoolsServer) CreateOrUpdate(ctx context.Context, req *api.Create } vmResp, err := vmPoller.PollUntilDone(ctx, nil) if err != nil { + // VM provisioning failed mid-flight: NIC was created but VM never + // reached a state where DeleteOption=Delete would cascade. Best-effort + // cleanup with a fresh context so it still runs if ctx was cancelled. 
+ cleanupCtx, cancel := context.WithTimeout(context.Background(), 2*time.Minute) + _, _ = nicsClient.BeginDelete(cleanupCtx, spec.GetResourceGroup(), nicName, nil) + cancel() return nil, fmt.Errorf("polling VM creation %q: %w", vmName, err) } + if vmResp.ID == nil { + return nil, fmt.Errorf("VM %q created but Azure returned nil resource ID", vmName) + } ap.SetStatus(AgentPoolStatus_builder{ VmResourceId: vmResp.ID, @@ -272,17 +282,20 @@ func (srv *agentpoolsServer) Delete(ctx context.Context, req *api.DeleteRequest) } // Best-effort NIC delete in case the VM never made it to a state where - // DeleteOption applied (e.g. failed mid-create). Idempotent. + // DeleteOption applied (e.g. failed mid-create). Idempotent. Uses a fresh + // context so cleanup still runs if the caller's ctx was cancelled mid-Delete. + nicCtx, nicCancel := context.WithTimeout(context.Background(), 2*time.Minute) + defer nicCancel() nicsClient, err := armnetwork.NewInterfacesClient(spec.GetSubscriptionId(), srv.credentials, nil) if err != nil { return nil, fmt.Errorf("creating NIC client: %w", err) } - nicPoller, err := nicsClient.BeginDelete(ctx, spec.GetResourceGroup(), nicName, nil) + nicPoller, err := nicsClient.BeginDelete(nicCtx, spec.GetResourceGroup(), nicName, nil) if err != nil && !isNotFound(err) { return nil, fmt.Errorf("starting NIC delete %q: %w", nicName, err) } if nicPoller != nil { - if _, err := nicPoller.PollUntilDone(ctx, nil); err != nil && !isNotFound(err) { + if _, err := nicPoller.PollUntilDone(nicCtx, nil); err != nil && !isNotFound(err) { return nil, fmt.Errorf("polling NIC delete %q: %w", nicName, err) } } From 6e7b5b9225b031d6fdbc1d9d159d250519227aaa Mon Sep 17 00:00:00 2001 From: Kevin Cho Date: Wed, 22 Apr 2026 12:42:18 -0700 Subject: [PATCH 09/20] fix(charts): grant rbac for azureflexnodeclasses The new CRD shares the flex.aks.azure.com api group with nebius, but the controller needs explicit verb grants on the resource name. 
Without this the nodeclass controllers would 403 on every reconcile in a real deployment. --- karpenter/charts/karpenter/templates/clusterrole.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/karpenter/charts/karpenter/templates/clusterrole.yaml b/karpenter/charts/karpenter/templates/clusterrole.yaml index 3e6a978..12aebfb 100644 --- a/karpenter/charts/karpenter/templates/clusterrole.yaml +++ b/karpenter/charts/karpenter/templates/clusterrole.yaml @@ -33,7 +33,7 @@ rules: resources: ["aksnodeclasses"] verbs: ["get", "list", "watch"] - apiGroups: ["flex.aks.azure.com"] - resources: ["nebiusnodeclasses"] + resources: ["nebiusnodeclasses", "azureflexnodeclasses"] verbs: ["get", "list", "watch"] - apiGroups: ["kaito.sh"] resources: ["kaitonodeclasses"] @@ -43,7 +43,7 @@ rules: resources: ["aksnodeclasses", "aksnodeclasses/status"] verbs: ["patch", "update"] - apiGroups: ["flex.aks.azure.com"] - resources: ["nebiusnodeclasses", "nebiusnodeclasses/status"] + resources: ["nebiusnodeclasses", "nebiusnodeclasses/status", "azureflexnodeclasses", "azureflexnodeclasses/status"] verbs: ["patch", "update"] - apiGroups: ["kaito.sh"] resources: ["kaitonodeclasses", "kaitonodeclasses/status"] From d17174b7c9466ad6d220748b05516ede425256d9 Mon Sep 17 00:00:00 2001 From: Kevin Cho Date: Wed, 22 Apr 2026 12:50:11 -0700 Subject: [PATCH 10/20] chore(karpenter): go mod tidy azcore is a direct import in the new azure cloudprovider. 
--- karpenter/go.mod | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/karpenter/go.mod b/karpenter/go.mod index 16481eb..1476ff3 100644 --- a/karpenter/go.mod +++ b/karpenter/go.mod @@ -4,6 +4,7 @@ go 1.26.0 require ( github.com/Azure/aks-flex/plugin v0.0.0-00010101000000-000000000000 + github.com/Azure/azure-sdk-for-go/sdk/azcore v1.21.0 github.com/Azure/karpenter-provider-azure v1.7.1 github.com/awslabs/operatorpkg v0.0.0-20250909182303-e8e550b6f339 github.com/go-logr/logr v1.4.3 @@ -27,7 +28,6 @@ require ( github.com/Azure/azure-kusto-go v0.16.1 // indirect github.com/Azure/azure-sdk-for-go v68.0.0+incompatible // indirect github.com/Azure/azure-sdk-for-go-extensions v0.5.1 // indirect - github.com/Azure/azure-sdk-for-go/sdk/azcore v1.21.0 // indirect github.com/Azure/azure-sdk-for-go/sdk/azidentity v1.13.1 // indirect github.com/Azure/azure-sdk-for-go/sdk/internal v1.11.2 // indirect github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/authorization/armauthorization/v2 v2.2.0 // indirect From a4d892be643391fc9ff5b52602e50ca0fcb90bde Mon Sep 17 00:00:00 2001 From: Kevin Cho Date: Wed, 22 Apr 2026 13:20:28 -0700 Subject: [PATCH 11/20] feat(catalog): add Standard_ND96isr_H100_v5 (8x H100 SKU) --- karpenter/pkg/cloudproviders/azure/instancetype/catalog.go | 1 + 1 file changed, 1 insertion(+) diff --git a/karpenter/pkg/cloudproviders/azure/instancetype/catalog.go b/karpenter/pkg/cloudproviders/azure/instancetype/catalog.go index 8c36bb5..ea4c5f3 100644 --- a/karpenter/pkg/cloudproviders/azure/instancetype/catalog.go +++ b/karpenter/pkg/cloudproviders/azure/instancetype/catalog.go @@ -20,6 +20,7 @@ type CatalogEntry struct { // NodePools may schedule onto in Phase 1. 
var Catalog = []CatalogEntry{ {Name: "Standard_ND96isr_H200_v5", VCPU: 96, MemoryGB: 1900, GPU: 8}, + {Name: "Standard_ND96isr_H100_v5", VCPU: 96, MemoryGB: 1900, GPU: 8}, {Name: "Standard_ND96amsr_A100_v4", VCPU: 96, MemoryGB: 1900, GPU: 8}, {Name: "Standard_NC40ads_H100_v5", VCPU: 40, MemoryGB: 320, GPU: 1}, {Name: "Standard_NC24ads_A100_v4", VCPU: 24, MemoryGB: 220, GPU: 1}, From 0f6173fb5f4d96fc766956e9509b2f87b14bdffe Mon Sep 17 00:00:00 2001 From: Kevin Cho Date: Wed, 22 Apr 2026 14:01:15 -0700 Subject: [PATCH 12/20] userdata: regenerate containerd v2 config before aks-flex-node apply AKSFlexNode v0.0.18 ships a containerd v2 binary but its template writes a v1-schema config (only 'imports' and 'oom_score'), leaving CRI non-functional and kubeadm join hanging at step 7/7. Manual nodes were repaired by hand; Karpenter-provisioned nodes hit registration timeout and churn. Regenerate /etc/containerd/config.toml via 'containerd config default' (which produces the v3 schema v2 expects) and restart containerd before invoking aks-flex-node apply. Mirrors the workaround applied to manual nodes. --- .../agentpools/userdata/flex/assets/bootstrap.sh.tmpl | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/plugin/pkg/services/agentpools/userdata/flex/assets/bootstrap.sh.tmpl b/plugin/pkg/services/agentpools/userdata/flex/assets/bootstrap.sh.tmpl index 98dfc80..31f93ae 100644 --- a/plugin/pkg/services/agentpools/userdata/flex/assets/bootstrap.sh.tmpl +++ b/plugin/pkg/services/agentpools/userdata/flex/assets/bootstrap.sh.tmpl @@ -3,5 +3,12 @@ curl -L -o /tmp/flex/aks-flex-node-linux-{{ .Arch }}.tar.gz https://github.com/A tar -xzf /tmp/flex/aks-flex-node-linux-{{ .Arch }}.tar.gz -C /tmp/flex mv /tmp/flex/aks-flex-node-linux-{{ .Arch }} /tmp/flex/aks-flex-node chmod +x /tmp/flex/aks-flex-node +# Workaround for AKSFlexNode v0.0.x gap: ships containerd v2 binary but writes +# a v1-schema config. 
Regenerate the default config (which produces the v3 +# schema containerd v2 expects) and restart before running apply, otherwise +# kubelet/kubeadm join hangs because CRI is non-functional. +mkdir -p /etc/containerd +containerd config default | sed -e '/SystemdCgroup/ s/false/true/' >/etc/containerd/config.toml +systemctl restart containerd /tmp/flex/aks-flex-node apply -f /tmp/flex-config.json rm -rf /tmp/flex \ No newline at end of file From 244839e034b6688456ca9cae7cb2426a124d4ec5 Mon Sep 17 00:00:00 2001 From: Kevin Cho Date: Wed, 22 Apr 2026 15:00:53 -0700 Subject: [PATCH 13/20] userdata: write v3-schema containerd config AFTER aks-flex-node apply Previous attempt regenerated containerd config BEFORE aks-flex-node, but aks-flex-node clobbered it during apply. Result: a v1-schema CNI config (io.containerd.grpc.v1.cri.cni) which containerd 2.x silently ignores in favor of the v3 schema (io.containerd.cri.v1.runtime.cni). bin_dir/conf_dir end up empty, every Pod fails: 'failed to find plugin cilium-cni in path []'. Write the canonical v3-schema config (mirrored from a working manual node) after apply, then restart containerd. This is the same hand-fix that was applied to manual nodes and unblocks Pod sandbox creation. 
--- .../userdata/flex/assets/bootstrap.sh.tmpl | 51 ++++++++++++++++--- 1 file changed, 44 insertions(+), 7 deletions(-) diff --git a/plugin/pkg/services/agentpools/userdata/flex/assets/bootstrap.sh.tmpl b/plugin/pkg/services/agentpools/userdata/flex/assets/bootstrap.sh.tmpl index 31f93ae..3a3e248 100644 --- a/plugin/pkg/services/agentpools/userdata/flex/assets/bootstrap.sh.tmpl +++ b/plugin/pkg/services/agentpools/userdata/flex/assets/bootstrap.sh.tmpl @@ -3,12 +3,49 @@ curl -L -o /tmp/flex/aks-flex-node-linux-{{ .Arch }}.tar.gz https://github.com/A tar -xzf /tmp/flex/aks-flex-node-linux-{{ .Arch }}.tar.gz -C /tmp/flex mv /tmp/flex/aks-flex-node-linux-{{ .Arch }} /tmp/flex/aks-flex-node chmod +x /tmp/flex/aks-flex-node -# Workaround for AKSFlexNode v0.0.x gap: ships containerd v2 binary but writes -# a v1-schema config. Regenerate the default config (which produces the v3 -# schema containerd v2 expects) and restart before running apply, otherwise -# kubelet/kubeadm join hangs because CRI is non-functional. -mkdir -p /etc/containerd -containerd config default | sed -e '/SystemdCgroup/ s/false/true/' >/etc/containerd/config.toml -systemctl restart containerd /tmp/flex/aks-flex-node apply -f /tmp/flex-config.json +# Workaround for AKSFlexNode v0.0.x gap: aks-flex-node apply writes a +# v1-schema containerd config (e.g. [plugins."io.containerd.grpc.v1.cri".cni]) +# but ships a containerd v2 binary which only honors v3-schema sections +# (e.g. [plugins."io.containerd.cri.v1.runtime".cni]). Result: CRI starts but +# CNI bin_dir/conf_dir are empty, every Pod fails with "failed to find plugin +# cilium-cni in path []". Overwrite with a v3-schema config and restart so +# Pod sandbox creation works. Mirror of the manual repair on existing nodes. 
+cat >/etc/containerd/config.toml <<'CONTAINERD_V3_EOF' +imports = ["/etc/containerd/conf.d/*.toml"] +oom_score = 0 +version = 3 + +[metrics] + address = "0.0.0.0:10257" + +[plugins] + + [plugins."io.containerd.cri.v1.images"] + sandbox_image = "mcr.microsoft.com/oss/kubernetes/pause:3.9" + + [plugins."io.containerd.cri.v1.images".registry] + config_path = "/etc/containerd/certs.d" + + [plugins."io.containerd.cri.v1.images".registry.headers] + X-Meta-Source-Client = ["azure/aks"] + + [plugins."io.containerd.cri.v1.runtime"] + + [plugins."io.containerd.cri.v1.runtime".cni] + bin_dir = "/opt/cni/bin" + conf_dir = "/etc/cni/net.d" + + [plugins."io.containerd.cri.v1.runtime".containerd] + + [plugins."io.containerd.cri.v1.runtime".containerd.runtimes] + + [plugins."io.containerd.cri.v1.runtime".containerd.runtimes.runc] + runtime_type = "io.containerd.runc.v2" + + [plugins."io.containerd.cri.v1.runtime".containerd.runtimes.runc.options] + BinaryName = "/usr/local/bin/runc" + SystemdCgroup = true +CONTAINERD_V3_EOF +systemctl restart containerd rm -rf /tmp/flex \ No newline at end of file From 16e0483ab0d396a852b5b9b5a12f2b1bbb2533d0 Mon Sep 17 00:00:00 2001 From: Kevin Cho Date: Wed, 22 Apr 2026 16:16:45 -0700 Subject: [PATCH 14/20] userdata: fix conf.d/99-nvidia.toml bin_dir override gpu-operator drops /etc/containerd/conf.d/99-nvidia.toml with bin_dir = "" and bin_dirs = ["/opt/cni/bin"]. containerd 2.0.4 only honors bin_dir, so the empty string blanks our main-config bin_dir and CNI plugin lookup fails with 'failed to find plugin cilium-cni in path []'. sed-rewrite the import after aks-flex-node apply. 
--- .../agentpools/userdata/flex/assets/bootstrap.sh.tmpl | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/plugin/pkg/services/agentpools/userdata/flex/assets/bootstrap.sh.tmpl b/plugin/pkg/services/agentpools/userdata/flex/assets/bootstrap.sh.tmpl index 3a3e248..617a3a2 100644 --- a/plugin/pkg/services/agentpools/userdata/flex/assets/bootstrap.sh.tmpl +++ b/plugin/pkg/services/agentpools/userdata/flex/assets/bootstrap.sh.tmpl @@ -47,5 +47,15 @@ version = 3 BinaryName = "/usr/local/bin/runc" SystemdCgroup = true CONTAINERD_V3_EOF +# Workaround for AKSFlexNode v0.0.x gap (continued): the gpu-operator/aks-flex +# bake drops /etc/containerd/conf.d/99-nvidia.toml that sets +# bin_dir = "" +# bin_dirs = ["/opt/cni/bin"] +# in [plugins."io.containerd.cri.v1.runtime".cni]. containerd 2.0.4 only honors +# bin_dir (singular) — the empty string blanks out our main config. Patch the +# import to set bin_dir to /opt/cni/bin so CNI plugin discovery works. +if [ -f /etc/containerd/conf.d/99-nvidia.toml ]; then + sed -i 's|bin_dir = ""|bin_dir = "/opt/cni/bin"|' /etc/containerd/conf.d/99-nvidia.toml +fi systemctl restart containerd rm -rf /tmp/flex \ No newline at end of file From 9a8fdf5bb1ac4497479bd7ccc7163ce38c702216 Mon Sep 17 00:00:00 2001 From: Kevin Cho Date: Wed, 22 Apr 2026 20:20:55 -0700 Subject: [PATCH 15/20] flexvm: garbage-collect orphan NICs after failed VM creation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Each Karpenter VM-create failure (commonly 409 quota) was leaving the NIC behind, exhausting the subnet (~250 orphans / 8h of churn). Root cause: the inline best-effort cleanup at line 220 reused the gRPC ctx, which Karpenter has often already cancelled by the time the VM PUT returns. A cancelled context causes BeginDelete to return immediately without sending the HTTP DELETE. 
Naive fix (cleanup with a fresh context inside the gRPC handler) does not work either: ARM reserves the NIC for its target VM for 180s after *any* CreateOrUpdate attempt — successful or not. Synchronous delete during that window returns 400 NicReservedForAnotherVm. Blocking the gRPC handler for 3+ minutes is also unacceptable since Karpenter expects fast failure to back off. Fix: spawn a detached goroutine on the failure path that sleeps out the 180s ARM reservation window plus slack, then deletes the NIC with retries on its own background context. The gRPC handler returns immediately to Karpenter so its retry loop is unaffected. Caveats documented in code: - Cleanup is best-effort. Pod restart abandons in-flight goroutines; a periodic reconciler is left as future work. - Sleeping-goroutine count is bounded by retry rate * cleanup window (observed ~7/min * 4min = ~30 max). Validated on voice-agent-flex Sweden cluster: forced 50+ quota-blocked retries, all NICs reaped automatically (43 cleanup-success log lines, zero failures) and orphan count returned to zero. 
--- .../agentpools/azure/flexvm/agentpools.go | 95 +++++++++++++++++-- 1 file changed, 86 insertions(+), 9 deletions(-) diff --git a/plugin/pkg/services/agentpools/azure/flexvm/agentpools.go b/plugin/pkg/services/agentpools/azure/flexvm/agentpools.go index ad9251d..8161861 100644 --- a/plugin/pkg/services/agentpools/azure/flexvm/agentpools.go +++ b/plugin/pkg/services/agentpools/azure/flexvm/agentpools.go @@ -26,6 +26,7 @@ import ( "encoding/base64" "errors" "fmt" + "log/slog" "strings" "time" @@ -80,7 +81,7 @@ func NewAgentPoolsServer(storage db.RODB) (agentpools.AgentPoolsServer, error) { }, nil } -func (srv *agentpoolsServer) CreateOrUpdate(ctx context.Context, req *api.CreateOrUpdateRequest) (*api.CreateOrUpdateResponse, error) { +func (srv *agentpoolsServer) CreateOrUpdate(ctx context.Context, req *api.CreateOrUpdateRequest) (resp *api.CreateOrUpdateResponse, err error) { ap, err := helper.AnyTo[*AgentPool](req.GetItem()) if err != nil { return nil, err @@ -151,6 +152,38 @@ func (srv *agentpoolsServer) CreateOrUpdate(ctx context.Context, req *api.Create } nicID := *nicResp.ID + // Best-effort NIC cleanup if anything between here and the successful VM + // creation fails. Without this, every quota-rejected / ARM-rejected VM + // create on a Karpenter retry loop leaks one NIC, which exhausts the + // subnet (observed: ~250 orphan NICs accumulating per hour during + // quota-blocked H100 churn). Uses a fresh background context because the + // gRPC ctx is often already cancelled by the time we land here on retry. + // Best-effort NIC cleanup if anything between here and the successful VM + // creation fails. + // + // Azure platform quirk: after *any* VM CreateOrUpdate attempt (even one + // that fails synchronously with 409 quota), ARM reserves the referenced + // NIC for the target VM name for 180 seconds. Delete attempts during + // that window return 400 NicReservedForAnotherVm. 
We therefore cannot + // clean up synchronously inside the gRPC handler — Karpenter expects a + // fast error response so it can back off and retry. We spawn a detached + // goroutine that waits out the reservation and retries with backoff. + // + // Best-effort contract: + // - Cleanup survives only while the plugin process is alive. On pod + // restart, any in-flight orphan NICs need manual sweep or a periodic + // reconciler (future work). + // - Under sustained quota exhaustion, the number of sleeping + // cleanup goroutines is bounded by the retry rate (observed ~7/min) + // times the cleanup window (~4 min) — a few dozen max. + nicCleanedUp := false + defer func() { + if err == nil || nicCleanedUp { + return + } + go cleanupReservedNIC(nicsClient, spec.GetResourceGroup(), nicName) + }() + // 2. VM. NIC + OS disk both set DeleteOption=Delete so a single VM // delete cascades — this is critical for Karpenter retry idempotency. vmsClient, err := armcompute.NewVirtualMachinesClient(spec.GetSubscriptionId(), srv.credentials, nil) @@ -216,23 +249,18 @@ func (srv *agentpoolsServer) CreateOrUpdate(ctx context.Context, req *api.Create vmPoller, err := vmsClient.BeginCreateOrUpdate(ctx, spec.GetResourceGroup(), vmName, vmParams, nil) if err != nil { - // Best-effort NIC cleanup if VM create kicked back synchronously. - _, _ = nicsClient.BeginDelete(ctx, spec.GetResourceGroup(), nicName, nil) return nil, fmt.Errorf("creating VM %q: %w", vmName, err) } vmResp, err := vmPoller.PollUntilDone(ctx, nil) if err != nil { - // VM provisioning failed mid-flight: NIC was created but VM never - // reached a state where DeleteOption=Delete would cascade. Best-effort - // cleanup with a fresh context so it still runs if ctx was cancelled. 
- cleanupCtx, cancel := context.WithTimeout(context.Background(), 2*time.Minute) - _, _ = nicsClient.BeginDelete(cleanupCtx, spec.GetResourceGroup(), nicName, nil) - cancel() return nil, fmt.Errorf("polling VM creation %q: %w", vmName, err) } if vmResp.ID == nil { return nil, fmt.Errorf("VM %q created but Azure returned nil resource ID", vmName) } + // VM is up and owns the NIC via DeleteOption=Delete; suppress the deferred + // NIC cleanup so a downstream marshal failure doesn't tear down the node. + nicCleanedUp = true ap.SetStatus(AgentPoolStatus_builder{ VmResourceId: vmResp.ID, @@ -394,3 +422,52 @@ func isNotFound(err error) bool { } return false } + +// cleanupReservedNIC deletes an orphan NIC after the 180s ARM reservation +// window expires. Runs detached (its own goroutine); intended only for the +// post-VM-create-failure path where the NIC is guaranteed to outlive its +// caller's request context. All errors are best-effort logged; under +// sustained ARM turbulence this may leave orphans that a human or periodic +// reconciler will need to sweep. +func cleanupReservedNIC(nicsClient *armnetwork.InterfacesClient, resourceGroup, nicName string) { + // Wait out the ARM 180s NIC reservation window, plus slack for clock + // skew and any in-flight VM-create retry that might re-reserve the NIC + // on the same name (Karpenter retries DO use new nodeclaim names, so + // this is belt-and-suspenders). 
+ time.Sleep(3*time.Minute + 30*time.Second) + + ctx, cancel := context.WithTimeout(context.Background(), 5*time.Minute) + defer cancel() + + for attempt := 0; attempt < 5; attempt++ { + delPoller, delErr := nicsClient.BeginDelete(ctx, resourceGroup, nicName, nil) + if delErr == nil { + if _, pollErr := delPoller.PollUntilDone(ctx, nil); pollErr == nil { + slog.Info("flexvm orphan NIC cleanup succeeded", + "nic", nicName, "attempt", attempt) + return + } else { + slog.Warn("flexvm orphan NIC poll failed", + "nic", nicName, "attempt", attempt, "err", pollErr) + } + } else { + // 404 = already gone (raced with someone else). Treat as success. + if isNotFound(delErr) { + slog.Info("flexvm orphan NIC already gone", + "nic", nicName, "attempt", attempt) + return + } + slog.Warn("flexvm orphan NIC BeginDelete failed", + "nic", nicName, "attempt", attempt, "err", delErr) + } + + select { + case <-time.After(30 * time.Second): + case <-ctx.Done(): + slog.Error("flexvm orphan NIC cleanup timed out", + "nic", nicName, "attempts", attempt+1) + return + } + } + slog.Error("flexvm orphan NIC cleanup exhausted retries", "nic", nicName) +} From 0a0bfb251dd9c8489cfbdbca19578a6333fe53ed Mon Sep 17 00:00:00 2001 From: chokevin <11001563+chokevin@users.noreply.github.com> Date: Wed, 22 Apr 2026 21:52:17 -0700 Subject: [PATCH 16/20] address copilot review comments - flexvm: reject allocate_public_ip=true in validateSpec (was silently ignored; previously fell through to a private-only NIC) - flexvm: reject nil kubeadm spec in validateSpec (would nil-panic in CreateOrUpdate when calling AddNodeLabels) - karpenter cloudprovider: stamp AzureFlexNodeClassHashAnnotation on NodeClaim in Create(); without this IsDrifted never triggered - karpenter nodeclass_status: validate subnetID with arm.ParseResourceID instead of a prefix check (was letting malformed IDs through to VM create) --- .../pkg/cloudproviders/azure/cloudprovider.go | 10 +++++++++- .../pkg/controllers/azure/nodeclass_status.go | 
4 ++++ .../agentpools/azure/flexvm/agentpools.go | 17 +++++++++++++---- 3 files changed, 26 insertions(+), 5 deletions(-) diff --git a/karpenter/pkg/cloudproviders/azure/cloudprovider.go b/karpenter/pkg/cloudproviders/azure/cloudprovider.go index de2f037..fe1938d 100644 --- a/karpenter/pkg/cloudproviders/azure/cloudprovider.go +++ b/karpenter/pkg/cloudproviders/azure/cloudprovider.go @@ -157,7 +157,15 @@ func (c *CloudProvider) Create(ctx context.Context, nodeClaim *v1.NodeClaim) (*v return nil, fmt.Errorf("creating azure-flex agent pool: %w", err) } - return agentPoolToNodeClaim(created, it), nil + // Stamp the NodeClass drift hash onto the returned NodeClaim so that + // IsDrifted can detect spec changes later. Without this annotation the + // drift check silently no-ops. + out := agentPoolToNodeClaim(created, it) + if out.Annotations == nil { + out.Annotations = map[string]string{} + } + out.Annotations[v1alpha1.AzureFlexNodeClassHashAnnotation] = driftHash(nodeClass.Spec) + return out, nil } func (c *CloudProvider) Delete(ctx context.Context, nodeClaim *v1.NodeClaim) error { diff --git a/karpenter/pkg/controllers/azure/nodeclass_status.go b/karpenter/pkg/controllers/azure/nodeclass_status.go index 9c2f33b..998e8aa 100644 --- a/karpenter/pkg/controllers/azure/nodeclass_status.go +++ b/karpenter/pkg/controllers/azure/nodeclass_status.go @@ -5,6 +5,7 @@ import ( "fmt" "strings" + "github.com/Azure/azure-sdk-for-go/sdk/azcore/arm" opcontroller "github.com/awslabs/operatorpkg/controller" "github.com/awslabs/operatorpkg/reasonable" "k8s.io/apimachinery/pkg/api/equality" @@ -105,6 +106,9 @@ func validateSpec(spec v1alpha1.AzureFlexNodeClassSpec) error { if !strings.HasPrefix(spec.SubnetID, "/subscriptions/") { return fmt.Errorf("subnetID %q must be a full ARM resource ID", spec.SubnetID) } + if _, err := arm.ParseResourceID(spec.SubnetID); err != nil { + return fmt.Errorf("subnetID %q is not a valid ARM resource ID: %w", spec.SubnetID, err) + } if spec.ImageReference 
!= nil && spec.ImageID != nil && *spec.ImageID != "" { return fmt.Errorf("imageReference and imageID are mutually exclusive") } diff --git a/plugin/pkg/services/agentpools/azure/flexvm/agentpools.go b/plugin/pkg/services/agentpools/azure/flexvm/agentpools.go index 8161861..4acf206 100644 --- a/plugin/pkg/services/agentpools/azure/flexvm/agentpools.go +++ b/plugin/pkg/services/agentpools/azure/flexvm/agentpools.go @@ -137,10 +137,9 @@ func (srv *agentpoolsServer) CreateOrUpdate(ctx context.Context, req *api.Create }, } if spec.GetAllocatePublicIp() { - // Phase 1: skip explicit PIP creation — leave a TODO. Per-NodeClass - // public IP is deferred; documented in CRD. - // (Falls through to private-only NIC.) - _ = nicParams // satisfy linter + // validateSpec rejects this; the branch is kept as a compile-time + // reminder for when Phase 2 adds PIP support. + return nil, errors.New("allocate_public_ip=true is not supported in Phase 1") } nicPoller, err := nicsClient.BeginCreateOrUpdate(ctx, spec.GetResourceGroup(), nicName, nicParams, nil) if err != nil { @@ -357,6 +356,16 @@ func validateSpec(spec *AgentPoolSpec) error { if st := spec.GetSecurityType(); st != "" && st != "Standard" { return fmt.Errorf("unsupported security_type %q (only Standard is supported in Phase 1)", st) } + // Public IP per NIC is not implemented in Phase 1. Reject instead of + // silently creating a private-only NIC when callers expect a public one. + if spec.GetAllocatePublicIp() { + return errors.New("allocate_public_ip=true is not supported in Phase 1") + } + // kubeadm config carries the AKS bootstrap token + CA and is used to + // render userdata; a nil value here would panic in CreateOrUpdate. 
+ if spec.GetKubeadm() == nil { + return errors.New("kubeadm is required") + } return nil } From e68df8599d883eb9f0e837150cee975c92b34d45 Mon Sep 17 00:00:00 2001 From: chokevin Date: Fri, 24 Apr 2026 11:44:19 -0700 Subject: [PATCH 17/20] fix(azure-flex): handle mismatched agentpool types in GC paths --- karpenter/pkg/cloudproviders/azure/api.go | 10 +++ .../pkg/cloudproviders/azure/api_test.go | 55 ++++++++++++ .../pkg/cloudproviders/azure/cloudprovider.go | 42 ++++++--- .../azure/cloudprovider_test.go | 89 +++++++++++++++++++ 4 files changed, 186 insertions(+), 10 deletions(-) create mode 100644 karpenter/pkg/cloudproviders/azure/api_test.go create mode 100644 karpenter/pkg/cloudproviders/azure/cloudprovider_test.go diff --git a/karpenter/pkg/cloudproviders/azure/api.go b/karpenter/pkg/cloudproviders/azure/api.go index f46b56c..216c037 100644 --- a/karpenter/pkg/cloudproviders/azure/api.go +++ b/karpenter/pkg/cloudproviders/azure/api.go @@ -26,6 +26,16 @@ func IsNotFound(err error) bool { return false } +// IsTypeMismatch returns true if err indicates the plugin returned an object of +// a different concrete protobuf type than the caller expected. +func IsTypeMismatch(err error) bool { + if err == nil { + return false + } + s, ok := status.FromError(err) + return ok && s.Code() == codes.InvalidArgument && strings.Contains(s.Message(), "type mismatch") +} + // IsQuotaError returns true if err signals an Azure quota / capacity exhaustion. // We classify both HTTP 429 and the well-known Azure ARM error codes. 
func IsQuotaError(err error) bool { diff --git a/karpenter/pkg/cloudproviders/azure/api_test.go b/karpenter/pkg/cloudproviders/azure/api_test.go new file mode 100644 index 0000000..c684818 --- /dev/null +++ b/karpenter/pkg/cloudproviders/azure/api_test.go @@ -0,0 +1,55 @@ +package azure + +import ( + "fmt" + "testing" + + "google.golang.org/grpc/codes" + "google.golang.org/grpc/status" +) + +func TestIsTypeMismatch(t *testing.T) { + t.Parallel() + + tests := []struct { + name string + err error + want bool + }{ + { + name: "exact type mismatch status", + err: status.Error(codes.InvalidArgument, "type mismatch"), + want: true, + }, + { + name: "wrapped type mismatch status", + err: fmt.Errorf("wrap: %w", status.Error(codes.InvalidArgument, "type mismatch")), + want: true, + }, + { + name: "other invalid argument", + err: status.Error(codes.InvalidArgument, "bad request"), + want: false, + }, + { + name: "not found", + err: status.Error(codes.NotFound, "not found"), + want: false, + }, + { + name: "nil", + err: nil, + want: false, + }, + } + + for _, tc := range tests { + tc := tc + t.Run(tc.name, func(t *testing.T) { + t.Parallel() + if got := IsTypeMismatch(tc.err); got != tc.want { + t.Fatalf("IsTypeMismatch(%v) = %v, want %v", tc.err, got, tc.want) + } + }) + } +} diff --git a/karpenter/pkg/cloudproviders/azure/cloudprovider.go b/karpenter/pkg/cloudproviders/azure/cloudprovider.go index fe1938d..ef7f724 100644 --- a/karpenter/pkg/cloudproviders/azure/cloudprovider.go +++ b/karpenter/pkg/cloudproviders/azure/cloudprovider.go @@ -18,12 +18,15 @@ import ( "github.com/Azure/karpenter-provider-azure/pkg/utils" "github.com/awslabs/operatorpkg/status" "google.golang.org/grpc" + "google.golang.org/grpc/codes" + grpcstatus "google.golang.org/grpc/status" "k8s.io/apimachinery/pkg/runtime/schema" "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/log" v1 "sigs.k8s.io/karpenter/pkg/apis/v1" corecloudprovider 
"sigs.k8s.io/karpenter/pkg/cloudprovider" + pluginapi "github.com/Azure/aks-flex/plugin/api" stretchhelper "github.com/Azure/aks-flex/plugin/pkg/helper" stretchservices "github.com/Azure/aks-flex/plugin/pkg/services" agentpoolsapi "github.com/Azure/aks-flex/plugin/pkg/services/agentpools/api" @@ -50,6 +53,8 @@ type CloudProvider struct { instanceTypeProvider *instancetype.Provider } +var flexAgentPoolTypeURL = "type.googleapis.com/" + string((&flexvm.AgentPool{}).ProtoReflect().Descriptor().FullName()) + func newCloudProvider( stretchPluginConn *grpc.ClientConn, kubeClient client.Client, @@ -177,11 +182,8 @@ func (c *CloudProvider) Delete(ctx context.Context, nodeClaim *v1.NodeClaim) err // Per CloudProvider.Delete contract: signal NodeClaimNotFoundError if the // remote resource is already gone (so karpenter knows it's safe to drop). - if _, err := stretchhelper.Get[*flexvm.AgentPool]( - c.stretchAgentPoolsClient.Get, - ctx, nodeClaim.Name, - ); err != nil { - if IsNotFound(err) { + if _, err := c.getFlexAgentPool(ctx, nodeClaim.Name); err != nil { + if IsNotFound(err) || IsTypeMismatch(err) { return corecloudprovider.NewNodeClaimNotFoundError(err) } // Non-NotFound get failure: log and proceed with delete in best effort. 
@@ -192,6 +194,9 @@ func (c *CloudProvider) Delete(ctx context.Context, nodeClaim *v1.NodeClaim) err c.stretchAgentPoolsClient.Delete, ctx, nodeClaim.Name, ); err != nil { + if IsNotFound(err) || IsTypeMismatch(err) { + return corecloudprovider.NewNodeClaimNotFoundError(err) + } return fmt.Errorf("deleting azure-flex agent pool: %w", err) } logger.Info("deleted azure-flex agent pool", "nodeClaim", nodeClaim.Name) @@ -203,12 +208,9 @@ func (c *CloudProvider) Get(ctx context.Context, providerID string) (*v1.NodeCla if err != nil { return nil, err } - ap, err := stretchhelper.Get[*flexvm.AgentPool]( - c.stretchAgentPoolsClient.Get, - ctx, name, - ) + ap, err := c.getFlexAgentPool(ctx, name) if err != nil { - if IsNotFound(err) { + if IsNotFound(err) || IsTypeMismatch(err) { return nil, corecloudprovider.NewNodeClaimNotFoundError(err) } return nil, err @@ -219,6 +221,26 @@ func (c *CloudProvider) Get(ctx context.Context, providerID string) (*v1.NodeCla return agentPoolToNodeClaim(ap, nil), nil } +func (c *CloudProvider) getFlexAgentPool(ctx context.Context, id string) (*flexvm.AgentPool, error) { + req := &pluginapi.GetRequest{} + req.SetId(id) + resp, err := c.stretchAgentPoolsClient.Get(ctx, req) + if err != nil { + return nil, err + } + return flexAgentPoolFromGetResponse(resp) +} + +func flexAgentPoolFromGetResponse(resp *pluginapi.GetResponse) (*flexvm.AgentPool, error) { + if resp == nil || resp.GetItem() == nil { + return nil, grpcstatus.Error(codes.NotFound, "") + } + if resp.GetItem().GetTypeUrl() != flexAgentPoolTypeURL { + return nil, grpcstatus.Error(codes.NotFound, "") + } + return stretchhelper.AnyTo[*flexvm.AgentPool](resp.GetItem()) +} + func (c *CloudProvider) List(ctx context.Context) ([]*v1.NodeClaim, error) { aps, err := stretchhelper.List[*flexvm.AgentPool]( c.stretchAgentPoolsClient.List, diff --git a/karpenter/pkg/cloudproviders/azure/cloudprovider_test.go b/karpenter/pkg/cloudproviders/azure/cloudprovider_test.go new file mode 100644 index 
0000000..1d08fd7 --- /dev/null +++ b/karpenter/pkg/cloudproviders/azure/cloudprovider_test.go @@ -0,0 +1,89 @@ +package azure + +import ( + "testing" + + pluginapi "github.com/Azure/aks-flex/plugin/api" + "github.com/Azure/aks-flex/plugin/pkg/services/agentpools/azure/flexvm" + "github.com/Azure/aks-flex/plugin/pkg/services/agentpools/azure/ubuntu2404vmss" + "google.golang.org/protobuf/proto" + "google.golang.org/protobuf/types/known/anypb" +) + +func TestFlexAgentPoolFromGetResponse(t *testing.T) { + t.Parallel() + + mkMeta := func(id string) *pluginapi.Metadata { + return pluginapi.Metadata_builder{Id: proto.String(id)}.Build() + } + mkFlexResp := func(id string) *pluginapi.GetResponse { + item, err := anypb.New(flexvm.AgentPool_builder{ + Metadata: mkMeta(id), + }.Build()) + if err != nil { + t.Fatalf("building flex anypb: %v", err) + } + return pluginapi.GetResponse_builder{Item: item}.Build() + } + mkVMSSResp := func(id string) *pluginapi.GetResponse { + item, err := anypb.New(ubuntu2404vmss.AgentPool_builder{ + Metadata: mkMeta(id), + }.Build()) + if err != nil { + t.Fatalf("building vmss anypb: %v", err) + } + return pluginapi.GetResponse_builder{Item: item}.Build() + } + + tests := []struct { + name string + resp *pluginapi.GetResponse + wantID string + wantErr bool + }{ + { + name: "nil response is not found", + resp: nil, + wantErr: true, + }, + { + name: "nil item is not found", + resp: pluginapi.GetResponse_builder{}.Build(), + wantErr: true, + }, + { + name: "wrong item type is not found", + resp: mkVMSSResp("node-1"), + wantErr: true, + }, + { + name: "flex agentpool item returns parsed object", + resp: mkFlexResp("node-2"), + wantID: "node-2", + }, + } + + for _, tc := range tests { + tc := tc + t.Run(tc.name, func(t *testing.T) { + t.Parallel() + + got, err := flexAgentPoolFromGetResponse(tc.resp) + if tc.wantErr { + if err == nil { + t.Fatalf("expected error, got nil") + } + if !IsNotFound(err) { + t.Fatalf("expected NotFound-style error, got: %v", 
err) + } + return + } + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + if got.GetMetadata().GetId() != tc.wantID { + t.Fatalf("got id %q, want %q", got.GetMetadata().GetId(), tc.wantID) + } + }) + } +} From 460b21b9371ce300bff12bd77349ea181df8f311 Mon Sep 17 00:00:00 2001 From: chokevin Date: Fri, 24 Apr 2026 12:40:52 -0700 Subject: [PATCH 18/20] chore(karpenter): tidy protobuf module classification --- karpenter/go.mod | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/karpenter/go.mod b/karpenter/go.mod index 1476ff3..b7319ee 100644 --- a/karpenter/go.mod +++ b/karpenter/go.mod @@ -13,6 +13,7 @@ require ( github.com/samber/lo v1.52.0 golang.org/x/sync v0.19.0 google.golang.org/grpc v1.79.1 + google.golang.org/protobuf v1.36.11 k8s.io/api v0.35.1 k8s.io/apimachinery v0.35.1 k8s.io/client-go v0.35.1 @@ -174,7 +175,6 @@ require ( golang.org/x/time v0.14.0 // indirect gomodules.xyz/jsonpatch/v2 v2.5.0 // indirect google.golang.org/genproto/googleapis/rpc v0.0.0-20251202230838-ff82c1b0f217 // indirect - google.golang.org/protobuf v1.36.11 // indirect gopkg.in/dnaeon/go-vcr.v3 v3.2.0 // indirect gopkg.in/evanphx/json-patch.v4 v4.13.0 // indirect gopkg.in/inf.v0 v0.9.1 // indirect From 3ec4f79a29eb40882b39bdd84cbcf66634927ed5 Mon Sep 17 00:00:00 2001 From: chokevin <11001563+chokevin@users.noreply.github.com> Date: Sat, 16 May 2026 13:43:39 -0700 Subject: [PATCH 19/20] fix(karpenter): harden azure h200 provisioning Persist incomplete AzureFlex agent pools after NIC creation, add typed list filtering for mixed agent pool records, and wire H200 NodeClaims through Karpenter with region-specific examples. 
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- cli/internal/config/karpenter/karpenter.go | 4 +- docs/usages/karpenter.md | 27 +- .../karpenter/templates/clusterrole-core.yaml | 2 +- karpenter/cmd/controller/main.go | 65 ++++- .../azureflexnodeclass-h200-eastus2euap.yaml | 21 ++ karpenter/examples/azure/h200_deployment.yaml | 44 +++ .../azure/nodepool-h200-eastus2euap.yaml | 33 +++ karpenter/examples/azure/nodepool-h200.yaml | 4 + karpenter/go.mod | 15 +- karpenter/go.sum | 26 +- .../pkg/cloudproviders/azure/cloudprovider.go | 66 ++++- .../pkg/cloudproviders/azure/nodeclaim.go | 14 +- .../cloudproviders/azure/nodeclaim_test.go | 253 ++++++++++++++++++ .../pkg/cloudproviders/kaito/cloudprovider.go | 2 +- .../cloudproviders/nebius/cloudprovider.go | 2 +- plugin/pkg/helper/helper.go | 59 ++++ plugin/pkg/helper/helper_test.go | 70 +++++ .../agentpools/azure/flexvm/agentpools.go | 86 +++--- .../azure/flexvm/agentpools_test.go | 92 +++++++ 19 files changed, 807 insertions(+), 78 deletions(-) create mode 100644 karpenter/examples/azure/azureflexnodeclass-h200-eastus2euap.yaml create mode 100644 karpenter/examples/azure/h200_deployment.yaml create mode 100644 karpenter/examples/azure/nodepool-h200-eastus2euap.yaml create mode 100644 plugin/pkg/helper/helper_test.go create mode 100644 plugin/pkg/services/agentpools/azure/flexvm/agentpools_test.go diff --git a/cli/internal/config/karpenter/karpenter.go b/cli/internal/config/karpenter/karpenter.go index 16fe804..75ebe0a 100644 --- a/cli/internal/config/karpenter/karpenter.go +++ b/cli/internal/config/karpenter/karpenter.go @@ -44,9 +44,11 @@ podLabels: controller: nebiusCredentials: - enabled: true {{- if .NebiusCredentialsJSON }} + enabled: true credentialsJSON: {{ .NebiusCredentialsJSON }} +{{- else }} + enabled: false {{- end }} image: digest: "" diff --git a/docs/usages/karpenter.md b/docs/usages/karpenter.md index 2e79c01..5585d1a 100644 --- a/docs/usages/karpenter.md +++ 
b/docs/usages/karpenter.md @@ -46,7 +46,9 @@ $ kubectl create namespace karpenter ### 2. Locate your Nebius credentials file -The karpenter controller needs Nebius API credentials to provision VMs. The credentials file is a JSON file generated by the Nebius console (see the [Nebius authorized keys documentation](https://docs.nebius.com/iam/service-accounts/authorized-keys)). +This step is only needed if you plan to provision Nebius nodes. Azure and Azure Flex H200 nodes do not require Nebius credentials. + +For Nebius, the karpenter controller needs Nebius API credentials to provision VMs. The credentials file is a JSON file generated by the Nebius console (see the [Nebius authorized keys documentation](https://docs.nebius.com/iam/service-accounts/authorized-keys)). Note the local path to this file — you will pass it to the CLI in step 4 via `--nebius-credentials-file`. The chart will create the `nebius-credentials` Secret in the `karpenter` namespace automatically during `helm upgrade --install`; no separate `kubectl create secret` step is needed. @@ -65,15 +67,16 @@ The template also creates a **federated identity credential** that pairs the man ### 4. Generate the Helm values file and install -Use the CLI to generate a `karpenter_values.yaml` file with all required values pre-populated. Pass `--nebius-credentials-file` to have the chart create the `nebius-credentials` Secret automatically, and `--ssh-public-key-file` to embed the SSH public key used when bootstrapping provisioned nodes: +Use the CLI to generate a `karpenter_values.yaml` file with all required values pre-populated. 
Pass `--ssh-public-key-file` to embed the SSH public key used when bootstrapping provisioned nodes: ```bash $ aks-flex-cli config karpenter helm \ - --nebius-credentials-file ~/.nebius/credentials.json \ --ssh-public-key-file ~/.ssh/id_ed25519.pub ``` -The command reads both files, embeds their contents into `karpenter_values.yaml`, and prints the install command to stdout: +If you also use Nebius, add `--nebius-credentials-file ~/.nebius/credentials.json` so the chart creates and mounts the Nebius credentials Secret. For Azure-only H200 clusters, omit it; the generated values will keep `controller.nebiusCredentials.enabled: false`. + +The command reads the files, embeds their contents into `karpenter_values.yaml`, and prints the install command to stdout: ``` helm upgrade --install karpenter charts/karpenter \ @@ -101,7 +104,7 @@ podLabels: controller: nebiusCredentials: - enabled: true + enabled: false image: digest: "" env: @@ -246,6 +249,20 @@ azure-cpu-nodepool-6rhlk aks-azure-cpu-nodepo > aks-flex-cli aks deploy --nvidia-dra-driver --skip-arm > ``` +### Creating an Azure Flex H200 NodePool + +For cross-region Azure H200 nodes, apply one Azure Flex NodeClass and NodePool per Azure region, then deploy a GPU workload with a matching toleration and node affinity: + +```bash +$ kubectl apply -f examples/azure/azureflexnodeclass-h200-eastus2.yaml +$ kubectl apply -f examples/azure/nodepool-h200.yaml +$ kubectl apply -f examples/azure/azureflexnodeclass-h200-eastus2euap.yaml +$ kubectl apply -f examples/azure/nodepool-h200-eastus2euap.yaml +$ kubectl apply -f examples/azure/h200_deployment.yaml +``` + +Each H200 NodePool references exactly one `AzureFlexNodeClass`, so use separate NodePools when trying the same `Standard_ND96isr_H200_v5` SKU in both `eastus2` and `eastus2euap`. The H200 NodePools must have a non-zero `limits.nvidia.com/gpu` value. Set each one to `8` for one node in that region, or a higher multiple of 8 for more nodes. 
+ ## Creating Nodes on Nebius via Karpenter With the karpenter controller running, you can define a `NebiusNodeClass` and `NodePool` to tell Karpenter how and when to provision Nebius nodes. diff --git a/karpenter/charts/karpenter/templates/clusterrole-core.yaml b/karpenter/charts/karpenter/templates/clusterrole-core.yaml index 0ae9f1f..fe8673e 100644 --- a/karpenter/charts/karpenter/templates/clusterrole-core.yaml +++ b/karpenter/charts/karpenter/templates/clusterrole-core.yaml @@ -71,7 +71,7 @@ rules: {{- if .Values.webhook.enabled }} - apiGroups: ["apiextensions.k8s.io"] resources: ["customresourcedefinitions/status"] - resourceNames: ["aksnodeclasses.karpenter.azure.com", "nodepools.karpenter.sh", "nodeclaims.karpenter.sh", "nebiusnodeclasses.flex.aks.azure.com"] + resourceNames: ["aksnodeclasses.karpenter.azure.com", "nodepools.karpenter.sh", "nodeclaims.karpenter.sh", "nebiusnodeclasses.flex.aks.azure.com", "azureflexnodeclasses.flex.aks.azure.com", "kaitonodeclasses.kaito.sh"] verbs: ["patch"] - apiGroups: ["apiextensions.k8s.io"] resources: ["customresourcedefinitions"] diff --git a/karpenter/cmd/controller/main.go b/karpenter/cmd/controller/main.go index d08e5a9..6108b48 100644 --- a/karpenter/cmd/controller/main.go +++ b/karpenter/cmd/controller/main.go @@ -2,6 +2,7 @@ package main import ( "context" + "fmt" "time" "github.com/Azure/karpenter-provider-azure/pkg/apis" @@ -9,11 +10,17 @@ import ( "github.com/Azure/karpenter-provider-azure/pkg/controllers" "github.com/Azure/karpenter-provider-azure/pkg/operator" "github.com/Azure/karpenter-provider-azure/pkg/operator/options" + "github.com/go-logr/logr" "github.com/go-logr/zapr" "github.com/samber/lo" + "k8s.io/apimachinery/pkg/api/meta" + "k8s.io/apimachinery/pkg/runtime" "k8s.io/apimachinery/pkg/runtime/schema" + "k8s.io/apimachinery/pkg/util/wait" "k8s.io/client-go/kubernetes/scheme" + "k8s.io/client-go/rest" ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/client/apiutil" 
"sigs.k8s.io/karpenter/pkg/cloudprovider/metrics" "sigs.k8s.io/karpenter/pkg/cloudprovider/overlay" corecontrollers "sigs.k8s.io/karpenter/pkg/controllers" @@ -45,13 +52,17 @@ func main() { ctx := injection.WithOptionsOrDie(context.Background(), coreoptions.Injectables...) logger := zapr.NewLogger(logging.NewLogger(ctx, "controller")) lo.Must0( - operator.WaitForCRDs( + operator.WaitForCRDs(ctx, 2*time.Minute, ctrl.GetConfigOrDie(), logger), + "failed waiting for CRDs", + ) + lo.Must0( + waitForCRDs( ctx, 2*time.Minute, ctrl.GetConfigOrDie(), logger, &v1alpha1.NebiusNodeClass{}, &v1alpha1.AzureFlexNodeClass{}, &kaitov1alpha1.KaitoNodeClass{}, ), - "failed waiting for CRDs", + "failed waiting for flex CRDs", ) ctx, op := operator.NewOperator(coreoperator.NewOperator()) @@ -159,8 +170,11 @@ func main() { // TODO: still need to refactor ImageProvider side of things. op.KubernetesVersionProvider, op.ImageProvider, + op.InstanceTypesProvider, op.InClusterKubernetesInterface, op.AZClient.SubnetsClient(), + op.AZClient.DiskEncryptionSetsClient(), + options.FromContext(ctx).ParsedDiskEncryptionSetID, )...). WithControllers(ctx, flexcontrollers.NewControllers( ctx, @@ -169,3 +183,50 @@ func main() { )...). 
Start(ctx) } + +func waitForCRDs(ctx context.Context, timeout time.Duration, config *rest.Config, logger logr.Logger, objs ...runtime.Object) error { + client, err := rest.HTTPClientFor(config) + if err != nil { + return fmt.Errorf("creating kubernetes client: %w", err) + } + restMapper, err := apiutil.NewDynamicRESTMapper(config, client) + if err != nil { + return fmt.Errorf("creating dynamic rest mapper: %w", err) + } + + requiredGVKs := make([]schema.GroupVersionKind, 0, len(objs)) + for _, obj := range objs { + gvk, err := apiutil.GVKForObject(obj, scheme.Scheme) + if err != nil { + return fmt.Errorf("getting GVK for %T: %w", obj, err) + } + requiredGVKs = append(requiredGVKs, gvk) + } + + logger.Info("waiting for flex CRDs to be available", "gvks", requiredGVKs, "timeout", timeout) + ctx, cancel := context.WithTimeout(ctx, timeout) + defer cancel() + + for _, gvk := range requiredGVKs { + err := wait.PollUntilContextCancel(ctx, 10*time.Second, true, func(ctx context.Context) (bool, error) { + if _, err := restMapper.RESTMapping(gvk.GroupKind(), gvk.Version); err != nil { + if meta.IsNoMatchError(err) { + logger.V(1).Info("waiting for flex CRD to be available", "gvk", gvk) + return false, nil + } + return false, err + } + logger.V(1).Info("flex CRD is available", "gvk", gvk) + return true, nil + }) + if err != nil { + if ctx.Err() == context.DeadlineExceeded { + return fmt.Errorf("timed out waiting for CRD %s to be available", gvk) + } + return fmt.Errorf("failed to wait for CRD %s: %w", gvk, err) + } + } + + logger.Info("all flex CRDs are available") + return nil +} diff --git a/karpenter/examples/azure/azureflexnodeclass-h200-eastus2euap.yaml b/karpenter/examples/azure/azureflexnodeclass-h200-eastus2euap.yaml new file mode 100644 index 0000000..1c353d9 --- /dev/null +++ b/karpenter/examples/azure/azureflexnodeclass-h200-eastus2euap.yaml @@ -0,0 +1,21 @@ +apiVersion: flex.aks.azure.com/v1alpha1 +kind: AzureFlexNodeClass +metadata: + name: h200-eastus2euap 
+spec: + subscriptionID: 00000000-0000-0000-0000-000000000000 + location: eastus2euap + resourceGroup: my-flex-rg + subnetID: /subscriptions/00000000-0000-0000-0000-000000000000/resourceGroups/my-flex-rg/providers/Microsoft.Network/virtualNetworks/flex-vnet-eastus2euap/subnets/nodes + imageReference: + publisher: microsoft-dsvm + offer: ubuntu-hpc + sku: "2204" + version: latest + securityType: Standard + osDiskSizeGB: 256 + allocateNodePublicIP: false + maxPodsPerNode: 110 + tags: + purpose: karpenter-flex-h200 + managed-by: aks-flex-karpenter diff --git a/karpenter/examples/azure/h200_deployment.yaml b/karpenter/examples/azure/h200_deployment.yaml new file mode 100644 index 0000000..a51270f --- /dev/null +++ b/karpenter/examples/azure/h200_deployment.yaml @@ -0,0 +1,44 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: azure-sample-h200-app +spec: + replicas: 1 + selector: + matchLabels: + app: azure-sample-h200-app + template: + metadata: + labels: + app: azure-sample-h200-app + spec: + tolerations: + - key: nvidia.com/gpu + operator: Exists + effect: NoSchedule + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: gpu + operator: In + values: + - h200 + - key: node.kubernetes.io/instance-type + operator: In + values: + - Standard_ND96isr_H200_v5 + containers: + - name: gpu-container + image: nvidia/cuda:12.4.0-base-ubuntu22.04 + command: ["nvidia-smi", "-l", "60"] + resources: + requests: + memory: "512Mi" + cpu: "250m" + nvidia.com/gpu: "1" + limits: + memory: "1Gi" + cpu: "500m" + nvidia.com/gpu: "1" diff --git a/karpenter/examples/azure/nodepool-h200-eastus2euap.yaml b/karpenter/examples/azure/nodepool-h200-eastus2euap.yaml new file mode 100644 index 0000000..fec0651 --- /dev/null +++ b/karpenter/examples/azure/nodepool-h200-eastus2euap.yaml @@ -0,0 +1,33 @@ +apiVersion: karpenter.sh/v1 +kind: NodePool +metadata: + name: h200-eastus2euap +spec: + template: + metadata: + 
labels: + gpu: h200 + nvidia.com/gpu.present: "true" + spec: + nodeClassRef: + group: flex.aks.azure.com + kind: AzureFlexNodeClass + name: h200-eastus2euap + requirements: + - key: node.kubernetes.io/instance-type + operator: In + values: + - Standard_ND96isr_H200_v5 + - key: karpenter.sh/capacity-type + operator: In + values: + - on-demand + taints: + - key: nvidia.com/gpu + value: "true" + effect: NoSchedule + limits: + nvidia.com/gpu: 64 + disruption: + consolidationPolicy: WhenEmpty + consolidateAfter: 30s diff --git a/karpenter/examples/azure/nodepool-h200.yaml b/karpenter/examples/azure/nodepool-h200.yaml index 5a5f4b4..fa3e06b 100644 --- a/karpenter/examples/azure/nodepool-h200.yaml +++ b/karpenter/examples/azure/nodepool-h200.yaml @@ -4,6 +4,10 @@ metadata: name: h200 spec: template: + metadata: + labels: + gpu: h200 + nvidia.com/gpu.present: "true" spec: nodeClassRef: group: flex.aks.azure.com diff --git a/karpenter/go.mod b/karpenter/go.mod index b7319ee..24afd07 100644 --- a/karpenter/go.mod +++ b/karpenter/go.mod @@ -1,25 +1,25 @@ module github.com/Azure/aks-flex/karpenter -go 1.26.0 +go 1.26.1 require ( github.com/Azure/aks-flex/plugin v0.0.0-00010101000000-000000000000 github.com/Azure/azure-sdk-for-go/sdk/azcore v1.21.0 - github.com/Azure/karpenter-provider-azure v1.7.1 - github.com/awslabs/operatorpkg v0.0.0-20250909182303-e8e550b6f339 + github.com/Azure/karpenter-provider-azure v1.10.2 + github.com/awslabs/operatorpkg v0.0.0-20251222193911-34e9a1898737 github.com/go-logr/logr v1.4.3 github.com/go-logr/zapr v1.3.0 github.com/nebius/gosdk v0.0.0-20260218100913-7fb27c45819a github.com/samber/lo v1.52.0 golang.org/x/sync v0.19.0 - google.golang.org/grpc v1.79.1 + google.golang.org/grpc v1.79.3 google.golang.org/protobuf v1.36.11 k8s.io/api v0.35.1 k8s.io/apimachinery v0.35.1 k8s.io/client-go v0.35.1 k8s.io/utils v0.0.0-20260108192941-914a6e750570 sigs.k8s.io/controller-runtime v0.23.1 - sigs.k8s.io/karpenter v1.7.1 + sigs.k8s.io/karpenter v1.10.0 ) 
require ( @@ -28,7 +28,7 @@ require ( github.com/Azure/aks-middleware v0.0.42 // indirect github.com/Azure/azure-kusto-go v0.16.1 // indirect github.com/Azure/azure-sdk-for-go v68.0.0+incompatible // indirect - github.com/Azure/azure-sdk-for-go-extensions v0.5.1 // indirect + github.com/Azure/azure-sdk-for-go-extensions v0.6.0 // indirect github.com/Azure/azure-sdk-for-go/sdk/azidentity v1.13.1 // indirect github.com/Azure/azure-sdk-for-go/sdk/internal v1.11.2 // indirect github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/authorization/armauthorization/v2 v2.2.0 // indirect @@ -36,6 +36,7 @@ require ( github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/containerregistry/armcontainerregistry v1.2.0 // indirect github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/containerservice/armcontainerservice/v6 v6.6.0 // indirect github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/containerservice/armcontainerservice/v8 v8.3.0-beta.1 // indirect + github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/containerservice/armcontainerservice/v9 v9.1.0-beta.1 // indirect github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/keyvault/armkeyvault v1.5.0 // indirect github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/msi/armmsi v1.3.0 // indirect github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/network/armnetwork v1.1.0 // indirect @@ -171,7 +172,7 @@ require ( golang.org/x/oauth2 v0.34.0 // indirect golang.org/x/sys v0.40.0 // indirect golang.org/x/term v0.39.0 // indirect - golang.org/x/text v0.33.0 // indirect + golang.org/x/text v0.34.0 // indirect golang.org/x/time v0.14.0 // indirect gomodules.xyz/jsonpatch/v2 v2.5.0 // indirect google.golang.org/genproto/googleapis/rpc v0.0.0-20251202230838-ff82c1b0f217 // indirect diff --git a/karpenter/go.sum b/karpenter/go.sum index 90086c9..45361f3 100644 --- a/karpenter/go.sum +++ b/karpenter/go.sum @@ -10,8 +10,8 @@ github.com/Azure/azure-kusto-go v0.16.1 h1:vCBWcQghmC1qIErUUgVNWHxGhZVStu1U/hki6 
github.com/Azure/azure-kusto-go v0.16.1/go.mod h1:9F2zvXH8B6eWzgI1S4k1ZXAIufnBZ1bv1cW1kB1n3D0= github.com/Azure/azure-sdk-for-go v68.0.0+incompatible h1:fcYLmCpyNYRnvJbPerq7U0hS+6+I79yEDJBqVNcqUzU= github.com/Azure/azure-sdk-for-go v68.0.0+incompatible/go.mod h1:9XXNKU+eRnpl9moKnB4QOLf1HestfXbmab5FXxiDBjc= -github.com/Azure/azure-sdk-for-go-extensions v0.5.1 h1:kV3u4tAWoFd+0wipN7QKSWckDkAHR06mZ3LglDuYSVM= -github.com/Azure/azure-sdk-for-go-extensions v0.5.1/go.mod h1:adhNwBpL1vnUS6yvTCbu0tVB/b6SdmmQhU9SpwYtjjY= +github.com/Azure/azure-sdk-for-go-extensions v0.6.0 h1:LzJ4iAk3ZBZ0Y27uUm66XBQntbgMr3QXn2KIDb4Mx04= +github.com/Azure/azure-sdk-for-go-extensions v0.6.0/go.mod h1:f/wRrqvvh197V5r4jGADV7528UdO/zfL+/Ud92BMSag= github.com/Azure/azure-sdk-for-go/sdk/azcore v1.21.0 h1:fou+2+WFTib47nS+nz/ozhEBnvU96bKHy6LjRsY4E28= github.com/Azure/azure-sdk-for-go/sdk/azcore v1.21.0/go.mod h1:t76Ruy8AHvUAC8GfMWJMa0ElSbuIcO03NLpynfbgsPA= github.com/Azure/azure-sdk-for-go/sdk/azidentity v1.13.1 h1:Hk5QBxZQC1jb2Fwj6mpzme37xbCDdNTxU7O9eb5+LB4= @@ -32,6 +32,8 @@ github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/containerservice/armcontai github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/containerservice/armcontainerservice/v6 v6.6.0/go.mod h1:OWKfCmX4X3Vp2w7GSx1LZn8566tOHJBA6K0IAUVNYx0= github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/containerservice/armcontainerservice/v8 v8.3.0-beta.1 h1:qUFuc6UySPwwCCWYhuiay5/ef50JEDl8jZ7UWNUx4nA= github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/containerservice/armcontainerservice/v8 v8.3.0-beta.1/go.mod h1:DTMSChgVxhpEYIPzaE0nfUsdAHcGO1wJtdQ4MDX4VbM= +github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/containerservice/armcontainerservice/v9 v9.1.0-beta.1 h1:jSeRQBf6dETTFquS18l7PaecEWLrxuU4f7N2dYSkREw= +github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/containerservice/armcontainerservice/v9 v9.1.0-beta.1/go.mod h1:1NzwJtdlA4Qwki8NuLFN2wVS24bOUiWRcNWpmcmHNiU= github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/internal 
v1.0.0 h1:lMW1lD/17LUA5z1XTURo7LcVG2ICBPlyMHjIUrcFZNQ= github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/internal v1.0.0/go.mod h1:ceIuwmxDWptoW3eCqSXlnPsZFKh4X+R38dWPv7GS9Vs= github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/internal/v2 v2.0.0 h1:PTFGRSlMKCQelWwxUyYVEUqseBJVemLyqWJjvMyt0do= @@ -88,8 +90,8 @@ github.com/Azure/go-autorest/logger v0.2.1 h1:IG7i4p/mDa2Ce4TRyAO8IHnVhAVF3RFU+Z github.com/Azure/go-autorest/logger v0.2.1/go.mod h1:T9E3cAhj2VqvPOtCYAvby9aBXkZmbF5NWuPV8+WeEW8= github.com/Azure/go-autorest/tracing v0.6.0 h1:TYi4+3m5t6K48TGI9AUdb+IzbnSxvnvUMfuitfgcfuo= github.com/Azure/go-autorest/tracing v0.6.0/go.mod h1:+vhtPC754Xsa23ID7GlGsrdKBpUA79WCAKPPZVC2DeU= -github.com/Azure/karpenter-provider-azure v1.7.1 h1:dhjDn8T1YKlfNpQsfWQdk6V8ed4lYTxjJgNvoa1DqdE= -github.com/Azure/karpenter-provider-azure v1.7.1/go.mod h1:VjyqZVV/fDwirfYl+DkhrzqQmOJTJoeI/ZUVFcSM0vI= +github.com/Azure/karpenter-provider-azure v1.10.2 h1:zI5gTIHm+x5qHwmpCq1n8kOFyr47bqy3BO5N1CPAZEE= +github.com/Azure/karpenter-provider-azure v1.10.2/go.mod h1:BEdfQh3u1k1IAvEfcPzbZo48I6fFeCHi7LRitxGJF80= github.com/Azure/msi-dataplane v0.4.3 h1:dWPWzY4b54tLIR9T1Q014Xxd/1DxOsMIp6EjRFAJlQY= github.com/Azure/msi-dataplane v0.4.3/go.mod h1:yAfxdJyvcnvSDfSyOFV9qm4fReEQDl+nZLGeH2ZWSmw= github.com/Azure/skewer v0.0.21 h1:6Yew9XAlJ1ltjJxh1m68X6weXc1ihm9oY3++qY5JWnM= @@ -140,8 +142,8 @@ github.com/aws/aws-sdk-go-v2/service/sts v1.41.6 h1:5fFjR/ToSOzB2OQ/XqWpZBmNvmP/ github.com/aws/aws-sdk-go-v2/service/sts v1.41.6/go.mod h1:qgFDZQSD/Kys7nJnVqYlWKnh0SSdMjAi0uSwON4wgYQ= github.com/aws/smithy-go v1.24.1 h1:VbyeNfmYkWoxMVpGUAbQumkODcYmfMRfZ8yQiH30SK0= github.com/aws/smithy-go v1.24.1/go.mod h1:LEj2LM3rBRQJxPZTB4KuzZkaZYnZPnvgIhb4pu07mx0= -github.com/awslabs/operatorpkg v0.0.0-20250909182303-e8e550b6f339 h1:p4oSlQ9IaT7/DHfgcrs9zdNhdIp37VIMujZLuxSgECk= -github.com/awslabs/operatorpkg v0.0.0-20250909182303-e8e550b6f339/go.mod h1:tNmCf0qIjaGbODGbm3DM8GIKBUvvxM7iW3KHbpSnVgw= +github.com/awslabs/operatorpkg 
v0.0.0-20251222193911-34e9a1898737 h1:hF8FFDPnboX/ABn1r8oS77t8tG4TVS8i99iPXMaL8Jk= +github.com/awslabs/operatorpkg v0.0.0-20251222193911-34e9a1898737/go.mod h1:reUhRkYche5Vkz+ACdxho8smFwdAspzr8rpA2dNqsVQ= github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM= github.com/beorn7/perks v1.0.1/go.mod h1:G2ZrVWU2WbWT9wwq4/hrbKbnv/1ERSJQ0ibhJ6rlkpw= github.com/blang/semver/v4 v4.0.0 h1:1PFHFE6yCCTv8C1TeyNNarDzntLi7wMI5i/pzqYIsAM= @@ -450,8 +452,8 @@ golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ= golang.org/x/text v0.7.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8= golang.org/x/text v0.9.0/go.mod h1:e1OnstbJyHTd6l/uOt8jFFHp6TRDWZR/bV3emEE/zU8= golang.org/x/text v0.14.0/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU= -golang.org/x/text v0.33.0 h1:B3njUFyqtHDUI5jMn1YIr5B0IE2U0qck04r6d4KPAxE= -golang.org/x/text v0.33.0/go.mod h1:LuMebE6+rBincTi9+xWTY8TztLzKHc/9C1uBCG27+q8= +golang.org/x/text v0.34.0 h1:oL/Qq0Kdaqxa1KbNeMKwQq0reLCCaFtqu2eNuSeNHbk= +golang.org/x/text v0.34.0/go.mod h1:homfLqTYRFyVYemLBFl5GgL/DWEiH5wcsQ5gSh1yziA= golang.org/x/time v0.14.0 h1:MRx4UaLrDotUKUdCIqzPC48t1Y9hANFKIRpNx+Te8PI= golang.org/x/time v0.14.0/go.mod h1:eL/Oa2bBBK0TkX57Fyni+NgnyQQN4LitPmob2Hjnqw4= golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= @@ -467,8 +469,8 @@ gonum.org/v1/gonum v0.16.0 h1:5+ul4Swaf3ESvrOnidPp4GZbzf0mxVQpDCYUQE7OJfk= gonum.org/v1/gonum v0.16.0/go.mod h1:fef3am4MQ93R2HHpKnLk4/Tbh/s0+wqD5nfa6Pnwy4E= google.golang.org/genproto/googleapis/rpc v0.0.0-20251202230838-ff82c1b0f217 h1:gRkg/vSppuSQoDjxyiGfN4Upv/h/DQmIR10ZU8dh4Ww= google.golang.org/genproto/googleapis/rpc v0.0.0-20251202230838-ff82c1b0f217/go.mod h1:7i2o+ce6H/6BluujYR+kqX3GKH+dChPTQU19wjRPiGk= -google.golang.org/grpc v1.79.1 h1:zGhSi45ODB9/p3VAawt9a+O/MULLl9dpizzNNpq7flY= -google.golang.org/grpc v1.79.1/go.mod h1:KmT0Kjez+0dde/v2j9vzwoAScgEPx/Bw1CYChhHLrHQ= +google.golang.org/grpc 
v1.79.3 h1:sybAEdRIEtvcD68Gx7dmnwjZKlyfuc61Dyo9pGXXkKE= +google.golang.org/grpc v1.79.3/go.mod h1:KmT0Kjez+0dde/v2j9vzwoAScgEPx/Bw1CYChhHLrHQ= google.golang.org/protobuf v1.36.11 h1:fV6ZwhNocDyBLK0dj+fg8ektcVegBBuEolpbTQyBNVE= google.golang.org/protobuf v1.36.11/go.mod h1:HTf+CrKn2C3g5S8VImy6tdcUvCska2kB7j23XfzDpco= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= @@ -515,8 +517,8 @@ sigs.k8s.io/controller-runtime v0.23.1 h1:TjJSM80Nf43Mg21+RCy3J70aj/W6KyvDtOlpKf sigs.k8s.io/controller-runtime v0.23.1/go.mod h1:B6COOxKptp+YaUT5q4l6LqUJTRpizbgf9KSRNdQGns0= sigs.k8s.io/json v0.0.0-20250730193827-2d320260d730 h1:IpInykpT6ceI+QxKBbEflcR5EXP7sU1kvOlxwZh5txg= sigs.k8s.io/json v0.0.0-20250730193827-2d320260d730/go.mod h1:mdzfpAEoE6DHQEN0uh9ZbOCuHbLK5wOm7dK4ctXE9Tg= -sigs.k8s.io/karpenter v1.7.1 h1:KoAWzWG4dEjaW33KtXKnGySAPDnIMwRrLZUndaeAjoY= -sigs.k8s.io/karpenter v1.7.1/go.mod h1:fqk7MeJYRNfMPcOZGv/BtsPR/Hq170J4D2GoU3IVHYA= +sigs.k8s.io/karpenter v1.10.0 h1:F8cupDXyn5c7TQDgTSj86nPmUJxFaV0wxu5HIdp+TJc= +sigs.k8s.io/karpenter v1.10.0/go.mod h1:XQtYAxoCysLHjytci7Fx5zw2txgcW2Vxc+qq6DDiFX8= sigs.k8s.io/randfill v1.0.0 h1:JfjMILfT8A6RbawdsK2JXGBR5AQVfd+9TbzrlneTyrU= sigs.k8s.io/randfill v1.0.0/go.mod h1:XeLlZ/jmk4i1HRopwe7/aU3H5n1zNUcX6TM94b3QxOY= sigs.k8s.io/structured-merge-diff/v6 v6.3.2-0.20260122202528-d9cc6641c482 h1:2WOzJpHUBVrrkDjU4KBT8n5LDcj824eX0I5UKcgeRUs= diff --git a/karpenter/pkg/cloudproviders/azure/cloudprovider.go b/karpenter/pkg/cloudproviders/azure/cloudprovider.go index ef7f724..99fc013 100644 --- a/karpenter/pkg/cloudproviders/azure/cloudprovider.go +++ b/karpenter/pkg/cloudproviders/azure/cloudprovider.go @@ -13,6 +13,8 @@ import ( "context" "errors" "fmt" + "sync" + "time" karpoptions "github.com/Azure/karpenter-provider-azure/pkg/operator/options" "github.com/Azure/karpenter-provider-azure/pkg/utils" @@ -38,6 +40,8 @@ import ( 
"github.com/Azure/aks-flex/karpenter/pkg/cloudproviders/azure/instancetype" ) +const incompleteAgentPoolCleanupDelay = 30 * time.Minute + type CloudProvider struct { stretchPluginConn *grpc.ClientConn stretchAgentPoolsClient agentpoolsapi.AgentPoolsClient @@ -51,6 +55,8 @@ type CloudProvider struct { clusterCA []byte instanceTypeProvider *instancetype.Provider + + cleanupInFlight sync.Map } var flexAgentPoolTypeURL = "type.googleapis.com/" + string((&flexvm.AgentPool{}).ProtoReflect().Descriptor().FullName()) @@ -157,6 +163,7 @@ func (c *CloudProvider) Create(ctx context.Context, nodeClaim *v1.NodeClaim) (*v ) if err != nil { if IsQuotaError(err) { + c.cleanupAgentPoolInBackground(ctx, nodeClaim.Name, "quota/capacity create failure") return nil, corecloudprovider.NewInsufficientCapacityError(err) } return nil, fmt.Errorf("creating azure-flex agent pool: %w", err) @@ -174,15 +181,14 @@ func (c *CloudProvider) Create(ctx context.Context, nodeClaim *v1.NodeClaim) (*v } func (c *CloudProvider) Delete(ctx context.Context, nodeClaim *v1.NodeClaim) error { - logger := log.FromContext(ctx).WithValues("nodeClaim", nodeClaim.Name) - if nodeClaim.Status.ProviderID == "" { - logger.V(5).Info("nodeClaim has no providerID, skipping deletion") - return nil - } + return c.deleteAgentPool(ctx, nodeClaim.Name) +} +func (c *CloudProvider) deleteAgentPool(ctx context.Context, name string) error { + logger := log.FromContext(ctx).WithValues("agentPool", name) // Per CloudProvider.Delete contract: signal NodeClaimNotFoundError if the // remote resource is already gone (so karpenter knows it's safe to drop). 
- if _, err := c.getFlexAgentPool(ctx, nodeClaim.Name); err != nil { + if _, err := c.getFlexAgentPool(ctx, name); err != nil { if IsNotFound(err) || IsTypeMismatch(err) { return corecloudprovider.NewNodeClaimNotFoundError(err) } @@ -192,14 +198,14 @@ func (c *CloudProvider) Delete(ctx context.Context, nodeClaim *v1.NodeClaim) err if err := stretchhelper.Delete( c.stretchAgentPoolsClient.Delete, - ctx, nodeClaim.Name, + ctx, name, ); err != nil { if IsNotFound(err) || IsTypeMismatch(err) { return corecloudprovider.NewNodeClaimNotFoundError(err) } return fmt.Errorf("deleting azure-flex agent pool: %w", err) } - logger.Info("deleted azure-flex agent pool", "nodeClaim", nodeClaim.Name) + logger.Info("deleted azure-flex agent pool") return nil } @@ -242,7 +248,7 @@ func flexAgentPoolFromGetResponse(resp *pluginapi.GetResponse) (*flexvm.AgentPoo } func (c *CloudProvider) List(ctx context.Context) ([]*v1.NodeClaim, error) { - aps, err := stretchhelper.List[*flexvm.AgentPool]( + aps, err := stretchhelper.ListByType[*flexvm.AgentPool]( c.stretchAgentPoolsClient.List, ctx, "", ) @@ -250,12 +256,54 @@ func (c *CloudProvider) List(ctx context.Context) ([]*v1.NodeClaim, error) { return nil, err } out := make([]*v1.NodeClaim, 0, len(aps)) + now := time.Now() for _, ap := range aps { + if ap.GetStatus().GetVmResourceId() == "" { + if shouldCleanupIncompleteAgentPool(ap, now) { + c.cleanupAgentPoolInBackground(ctx, ap.GetMetadata().GetId(), "stale incomplete agent pool") + } + continue + } out = append(out, agentPoolToNodeClaim(ap, nil)) } return out, nil } +func shouldCleanupIncompleteAgentPool(ap *flexvm.AgentPool, now time.Time) bool { + if ap.GetStatus().GetVmResourceId() != "" { + return false + } + createdAt := ap.GetStatus().GetCreatedAt() + if createdAt == nil { + return true + } + return !createdAt.AsTime().Add(incompleteAgentPoolCleanupDelay).After(now) +} + +func (c *CloudProvider) cleanupAgentPoolInBackground(ctx context.Context, name, reason string) { + if name == "" 
{ + return + } + if _, loaded := c.cleanupInFlight.LoadOrStore(name, struct{}{}); loaded { + return + } + + logger := log.FromContext(ctx).WithValues("agentPool", name, "reason", reason) + logger.Info("starting azure-flex agent pool cleanup") + go func() { + defer c.cleanupInFlight.Delete(name) + + cleanupCtx, cancel := context.WithTimeout(context.Background(), 8*time.Minute) + defer cancel() + + if err := c.deleteAgentPool(cleanupCtx, name); err != nil && !corecloudprovider.IsNodeClaimNotFoundError(err) { + logger.Error(err, "cleaning up azure-flex agent pool") + return + } + logger.Info("cleaned up azure-flex agent pool") + }() +} + func (c *CloudProvider) GetInstanceTypes(ctx context.Context, nodePool *v1.NodePool) ([]*corecloudprovider.InstanceType, error) { logger := loggerFromContext(ctx).WithValues("nodePool", nodePool.Name) diff --git a/karpenter/pkg/cloudproviders/azure/nodeclaim.go b/karpenter/pkg/cloudproviders/azure/nodeclaim.go index 2ffbfeb..b3da4ee 100644 --- a/karpenter/pkg/cloudproviders/azure/nodeclaim.go +++ b/karpenter/pkg/cloudproviders/azure/nodeclaim.go @@ -91,7 +91,7 @@ func agentPoolToNodeClaim( rv := &v1.NodeClaim{ ObjectMeta: metav1.ObjectMeta{ Name: ap.GetMetadata().GetId(), - Labels: map[string]string{}, + Labels: lo.Assign(map[string]string{}, ap.GetSpec().GetKubeadm().GetNodeLabels()), Annotations: map[string]string{}, CreationTimestamp: metav1.NewTime(ap.GetStatus().GetCreatedAt().AsTime()), }, @@ -102,7 +102,7 @@ func agentPoolToNodeClaim( } if instanceType != nil { - rv.Labels = labelspkg.GetAllSingleValuedRequirementLabels(instanceType.Requirements) + rv.Labels = lo.Assign(rv.Labels, labelspkg.GetAllSingleValuedRequirementLabels(instanceType.Requirements)) rv.Status.Capacity = lo.PickBy(instanceType.Capacity, filterNonZero) rv.Status.Allocatable = lo.PickBy(instanceType.Allocatable(), filterNonZero) } @@ -135,21 +135,23 @@ func nodeClaimToAgentPool( Server: lo.ToPtr(karpOpts.ClusterEndpoint), CertificateAuthorityData: clusterCA, 
Token: lo.ToPtr(karpOpts.KubeletClientTLSBootstrapToken), - NodeLabels: map[string]string{ + NodeLabels: lo.Assign(map[string]string{}, nodeClaim.Labels, map[string]string{ cloudproviders.NodeClaimLabelKey: nodeClaim.Name, topology.NodeLabelKeyCloudProviderManaged: "false", topology.NodeLabelKeyCloudProviderCluster: karpOpts.NodeResourceGroup, topology.NodeLabelKeyStretchManaged: "true", - }, + }), }.Build() kubeadmConfig.AddNodeLabels(map[string]string{ corev1.LabelInstanceTypeStable: instanceType.Name, corev1.LabelTopologyRegion: nodeClass.Spec.Location, // Empty zone — region-only Phase 1. - corev1.LabelTopologyZone: "", - v1.CapacityTypeLabelKey: v1.CapacityTypeOnDemand, + corev1.LabelTopologyZone: "", + v1.CapacityTypeLabelKey: v1.CapacityTypeOnDemand, "kubernetes.azure.com/mode": "user", }) + kubeadmConfig.AddK8SRegisterTaints(nodeClaim.Spec.Taints...) + kubeadmConfig.AddK8SRegisterTaints(nodeClaim.Spec.StartupTaints...) kubeadmConfig.AddK8SRegisterTaints(v1.UnregisteredNoExecuteTaint) specBuilder := flexvm.AgentPoolSpec_builder{ diff --git a/karpenter/pkg/cloudproviders/azure/nodeclaim_test.go b/karpenter/pkg/cloudproviders/azure/nodeclaim_test.go index 3e3f37a..6a258f3 100644 --- a/karpenter/pkg/cloudproviders/azure/nodeclaim_test.go +++ b/karpenter/pkg/cloudproviders/azure/nodeclaim_test.go @@ -1,10 +1,28 @@ package azure import ( + "context" "strings" "testing" + "time" + + karpoptions "github.com/Azure/karpenter-provider-azure/pkg/operator/options" + "google.golang.org/grpc" + "google.golang.org/grpc/codes" + "google.golang.org/grpc/status" + "google.golang.org/protobuf/proto" + "google.golang.org/protobuf/types/known/anypb" + "google.golang.org/protobuf/types/known/timestamppb" + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + v1 "sigs.k8s.io/karpenter/pkg/apis/v1" + corecloudprovider "sigs.k8s.io/karpenter/pkg/cloudprovider" "github.com/Azure/aks-flex/karpenter/pkg/apis/v1alpha1" + stretchapi 
"github.com/Azure/aks-flex/plugin/api" + agentpoolsapi "github.com/Azure/aks-flex/plugin/pkg/services/agentpools/api" + flexvm "github.com/Azure/aks-flex/plugin/pkg/services/agentpools/azure/flexvm" + nebiusinstance "github.com/Azure/aks-flex/plugin/pkg/services/agentpools/nebius/instance" ) func TestProviderIDRoundTrip(t *testing.T) { @@ -93,3 +111,238 @@ func TestDriftHashDeterministic(t *testing.T) { t.Fatalf("different subnet should produce different hash") } } + +func TestDeleteCleansAgentPoolWithoutProviderID(t *testing.T) { + fake := newFakeAgentPoolsClient(testFlexVMAgentPool("nodeclaim-1", "/subscriptions/sub/resourceGroups/rg/providers/Microsoft.Compute/virtualMachines/nodeclaim-1")) + cp := &CloudProvider{stretchAgentPoolsClient: fake} + + err := cp.Delete(context.Background(), &v1.NodeClaim{ + ObjectMeta: metav1.ObjectMeta{Name: "nodeclaim-1"}, + }) + if err != nil { + t.Fatalf("Delete: %v", err) + } + if got := <-fake.deleted; got != "nodeclaim-1" { + t.Fatalf("expected delete for nodeclaim-1, got %q", got) + } +} + +func TestListSkipsAndCleansIncompleteAgentPools(t *testing.T) { + fake := newFakeAgentPoolsClient(testIncompleteFlexVMAgentPool("nodeclaim-1", time.Now().Add(-incompleteAgentPoolCleanupDelay-time.Minute))) + cp := &CloudProvider{stretchAgentPoolsClient: fake} + + nodeClaims, err := cp.List(context.Background()) + if err != nil { + t.Fatalf("List: %v", err) + } + if len(nodeClaims) != 0 { + t.Fatalf("expected incomplete agent pool to be skipped, got %d nodeclaims", len(nodeClaims)) + } + + select { + case got := <-fake.deleted: + if got != "nodeclaim-1" { + t.Fatalf("expected cleanup delete for nodeclaim-1, got %q", got) + } + case <-time.After(time.Second): + t.Fatal("timed out waiting for incomplete agent pool cleanup") + } +} + +func TestListDefersFreshIncompleteAgentPoolCleanup(t *testing.T) { + fake := newFakeAgentPoolsClient(testIncompleteFlexVMAgentPool("nodeclaim-1", time.Now())) + cp := &CloudProvider{stretchAgentPoolsClient: fake} 
+ + nodeClaims, err := cp.List(context.Background()) + if err != nil { + t.Fatalf("List: %v", err) + } + if len(nodeClaims) != 0 { + t.Fatalf("expected incomplete agent pool to be skipped, got %d nodeclaims", len(nodeClaims)) + } + + select { + case got := <-fake.deleted: + t.Fatalf("fresh incomplete agent pool should not be cleaned up yet, deleted %q", got) + case <-time.After(100 * time.Millisecond): + } +} + +func TestListIgnoresNonAzureFlexAgentPools(t *testing.T) { + fake := newFakeAgentPoolsClient(testFlexVMAgentPool("nodeclaim-1", "/subscriptions/sub/resourceGroups/rg/providers/Microsoft.Compute/virtualMachines/nodeclaim-1")) + other, err := anypb.New(nebiusinstance.AgentPool_builder{}.Build()) + if err != nil { + t.Fatalf("building nebius agent pool Any: %v", err) + } + fake.rawItems = append(fake.rawItems, other) + cp := &CloudProvider{stretchAgentPoolsClient: fake} + + nodeClaims, err := cp.List(context.Background()) + if err != nil { + t.Fatalf("List: %v", err) + } + if len(nodeClaims) != 1 { + t.Fatalf("expected only the azure-flex agent pool, got %d nodeclaims", len(nodeClaims)) + } + if nodeClaims[0].Name != "nodeclaim-1" { + t.Fatalf("expected nodeclaim-1, got %q", nodeClaims[0].Name) + } +} + +func TestNodeClaimToAgentPoolPropagatesH200LabelsAndTaints(t *testing.T) { + osDiskSize := int32(256) + nodeClass := &v1alpha1.AzureFlexNodeClass{ + ObjectMeta: metav1.ObjectMeta{Name: "h200-eastus2"}, + Spec: v1alpha1.AzureFlexNodeClassSpec{ + SubscriptionID: "sub", + Location: "eastus2", + ResourceGroup: "rg", + SubnetID: "/subscriptions/sub/resourceGroups/rg/providers/Microsoft.Network/virtualNetworks/vnet/subnets/nodes", + OSDiskSizeGB: &osDiskSize, + }, + } + nodeClaim := &v1.NodeClaim{ + ObjectMeta: metav1.ObjectMeta{ + Name: "flex-h200-abcde", + Labels: map[string]string{ + "gpu": "h200", + "nvidia.com/gpu.present": "true", + "rune.ai/gpu-family": "h200", + }, + }, + Spec: v1.NodeClaimSpec{ + Taints: []corev1.Taint{ + {Key: "nvidia.com/gpu", Value: 
"present", Effect: corev1.TaintEffectNoSchedule}, + }, + StartupTaints: []corev1.Taint{ + {Key: "nvidia.com/gpu.init", Value: "true", Effect: corev1.TaintEffectNoSchedule}, + }, + }, + } + + ap := nodeClaimToAgentPool( + &karpoptions.Options{ + ClusterEndpoint: "https://cluster.example:443", + KubeletClientTLSBootstrapToken: "token", + NodeResourceGroup: "MC_rg_cluster_eastus2", + }, + []byte("ca"), + nodeClass, + nodeClaim, + &corecloudprovider.InstanceType{Name: "Standard_ND96isr_H200_v5"}, + ) + + labels := ap.GetSpec().GetKubeadm().GetNodeLabels() + if labels["gpu"] != "h200" { + t.Fatalf("expected gpu=h200 label, got %q", labels["gpu"]) + } + if labels["nvidia.com/gpu.present"] != "true" { + t.Fatalf("expected nvidia.com/gpu.present=true label, got %q", labels["nvidia.com/gpu.present"]) + } + if labels["node.kubernetes.io/instance-type"] != "Standard_ND96isr_H200_v5" { + t.Fatalf("expected stable instance-type label, got %q", labels["node.kubernetes.io/instance-type"]) + } + + taints := ap.GetSpec().GetKubeadm().GetK8SRegisterTaints() + assertHasTaint(t, taints, corev1.Taint{Key: "nvidia.com/gpu", Value: "present", Effect: corev1.TaintEffectNoSchedule}) + assertHasTaint(t, taints, corev1.Taint{Key: "nvidia.com/gpu.init", Value: "true", Effect: corev1.TaintEffectNoSchedule}) + assertHasTaint(t, taints, v1.UnregisteredNoExecuteTaint) + + ap.SetStatus(flexvm.AgentPoolStatus_builder{ + VmResourceId: proto.String("/subscriptions/sub/resourceGroups/rg/providers/Microsoft.Compute/virtualMachines/flex-h200-abcde"), + CreatedAt: timestamppb.Now(), + }.Build()) + listed := agentPoolToNodeClaim(ap, nil) + if listed.Labels["gpu"] != "h200" { + t.Fatalf("expected listed nodeclaim to retain gpu=h200 label, got %q", listed.Labels["gpu"]) + } + if listed.Labels["rune.ai/gpu-family"] != "h200" { + t.Fatalf("expected listed nodeclaim to retain rune.ai/gpu-family=h200 label, got %q", listed.Labels["rune.ai/gpu-family"]) + } +} + +func assertHasTaint(t *testing.T, taints 
[]corev1.Taint, want corev1.Taint) { + t.Helper() + for i := range taints { + if want.MatchTaint(&taints[i]) { + return + } + } + t.Fatalf("expected taint %+v in %+v", want, taints) +} + +type fakeAgentPoolsClient struct { + items map[string]*flexvm.AgentPool + rawItems []*anypb.Any + deleted chan string +} + +func newFakeAgentPoolsClient(items ...*flexvm.AgentPool) *fakeAgentPoolsClient { + f := &fakeAgentPoolsClient{ + items: map[string]*flexvm.AgentPool{}, + deleted: make(chan string, len(items)+1), + } + for _, item := range items { + f.items[item.GetMetadata().GetId()] = item + } + return f +} + +var _ agentpoolsapi.AgentPoolsClient = (*fakeAgentPoolsClient)(nil) + +func (f *fakeAgentPoolsClient) CreateOrUpdate(context.Context, *stretchapi.CreateOrUpdateRequest, ...grpc.CallOption) (*stretchapi.CreateOrUpdateResponse, error) { + return nil, status.Error(codes.Unimplemented, "not implemented") +} + +func (f *fakeAgentPoolsClient) List(context.Context, *stretchapi.ListRequest, ...grpc.CallOption) (*stretchapi.ListResponse, error) { + items := make([]*anypb.Any, 0, len(f.items)+len(f.rawItems)) + for _, item := range f.items { + anyItem, err := anypb.New(item) + if err != nil { + return nil, err + } + items = append(items, anyItem) + } + items = append(items, f.rawItems...) 
+ return stretchapi.ListResponse_builder{Items: items}.Build(), nil +} + +func (f *fakeAgentPoolsClient) Get(_ context.Context, req *stretchapi.GetRequest, _ ...grpc.CallOption) (*stretchapi.GetResponse, error) { + item, ok := f.items[req.GetId()] + if !ok { + return nil, status.Error(codes.NotFound, "") + } + anyItem, err := anypb.New(item) + if err != nil { + return nil, err + } + return stretchapi.GetResponse_builder{Item: anyItem}.Build(), nil +} + +func (f *fakeAgentPoolsClient) Delete(_ context.Context, req *stretchapi.DeleteRequest, _ ...grpc.CallOption) (*stretchapi.DeleteResponse, error) { + delete(f.items, req.GetId()) + f.deleted <- req.GetId() + return stretchapi.DeleteResponse_builder{}.Build(), nil +} + +func testFlexVMAgentPool(id, vmResourceID string) *flexvm.AgentPool { + return flexvm.AgentPool_builder{ + Metadata: stretchapi.Metadata_builder{ + Id: proto.String(id), + }.Build(), + Status: flexvm.AgentPoolStatus_builder{ + VmResourceId: proto.String(vmResourceID), + }.Build(), + }.Build() +} + +func testIncompleteFlexVMAgentPool(id string, createdAt time.Time) *flexvm.AgentPool { + return flexvm.AgentPool_builder{ + Metadata: stretchapi.Metadata_builder{ + Id: proto.String(id), + }.Build(), + Status: flexvm.AgentPoolStatus_builder{ + CreatedAt: timestamppb.New(createdAt), + }.Build(), + }.Build() +} diff --git a/karpenter/pkg/cloudproviders/kaito/cloudprovider.go b/karpenter/pkg/cloudproviders/kaito/cloudprovider.go index 01d62fc..3a12c4f 100644 --- a/karpenter/pkg/cloudproviders/kaito/cloudprovider.go +++ b/karpenter/pkg/cloudproviders/kaito/cloudprovider.go @@ -205,7 +205,7 @@ func (c *CloudProvider) GetInstanceTypes(ctx context.Context, nodePool *v1.NodeP } func (c *CloudProvider) List(ctx context.Context) ([]*v1.NodeClaim, error) { - agentPools, err := stretchhelper.List[*nebiusinstance.AgentPool]( + agentPools, err := stretchhelper.ListByType[*nebiusinstance.AgentPool]( c.stretchAgentPoolsClient.List, ctx, "", ) diff --git 
a/karpenter/pkg/cloudproviders/nebius/cloudprovider.go b/karpenter/pkg/cloudproviders/nebius/cloudprovider.go index 9a0db20..5dc56d2 100644 --- a/karpenter/pkg/cloudproviders/nebius/cloudprovider.go +++ b/karpenter/pkg/cloudproviders/nebius/cloudprovider.go @@ -296,7 +296,7 @@ func (c *CloudProvider) Get(ctx context.Context, providerID string) (*v1.NodeCla } func (c *CloudProvider) List(ctx context.Context) ([]*v1.NodeClaim, error) { - agentPools, err := stretchhelper.List[*nebiusinstance.AgentPool]( + agentPools, err := stretchhelper.ListByType[*nebiusinstance.AgentPool]( c.stretchAgentPoolsClient.List, ctx, "", ) diff --git a/plugin/pkg/helper/helper.go b/plugin/pkg/helper/helper.go index 11bf9a9..62a2b9e 100644 --- a/plugin/pkg/helper/helper.go +++ b/plugin/pkg/helper/helper.go @@ -2,6 +2,8 @@ package helper import ( "context" + "fmt" + "reflect" "google.golang.org/grpc" "google.golang.org/grpc/codes" @@ -98,6 +100,63 @@ func List[M proto.Message, REQT any, REQ id[REQT], RESP items](list func(context return ms, nil } +// ListByType filters parent list responses to the concrete protobuf type M. +// It still fails if a matching Any payload cannot be decoded. +func ListByType[M proto.Message, REQT any, REQ id[REQT], RESP items](list func(context.Context, REQ, ...grpc.CallOption) (RESP, error), ctx context.Context, id string, opts ...grpc.CallOption) ([]M, error) { + req := REQ(new(REQT)) + req.SetId(id) + + resp, err := list(ctx, req, opts...) 
+ if err != nil { + return nil, err + } + + typeURL, err := typeURLFor[M]() + if err != nil { + return nil, err + } + + var ms []M + for _, item := range resp.GetItems() { + if item.GetTypeUrl() != typeURL { + continue + } + + m, err := AnyTo[M](item) + if err != nil { + return nil, err + } + + ms = append(ms, m) + } + + return ms, nil +} + +func typeURLFor[M proto.Message]() (string, error) { + msg, err := newProtoMessage[M]() + if err != nil { + return "", err + } + return "type.googleapis.com/" + string(msg.ProtoReflect().Descriptor().FullName()), nil +} + +func newProtoMessage[M proto.Message]() (M, error) { + var zero M + t := reflect.TypeOf(zero) + if t == nil { + return zero, fmt.Errorf("proto message type has no concrete type") + } + if t.Kind() != reflect.Pointer { + return zero, fmt.Errorf("proto message type %T must be a pointer", zero) + } + msg, ok := reflect.New(t.Elem()).Interface().(M) + if !ok { + return zero, fmt.Errorf("proto message type %T cannot be constructed", zero) + } + return msg, nil +} + func AnyTo[M proto.Message](o *anypb.Any) (M, error) { m, err := o.UnmarshalNew() if err != nil { diff --git a/plugin/pkg/helper/helper_test.go b/plugin/pkg/helper/helper_test.go new file mode 100644 index 0000000..41582a5 --- /dev/null +++ b/plugin/pkg/helper/helper_test.go @@ -0,0 +1,70 @@ +package helper_test + +import ( + "context" + "testing" + + "google.golang.org/grpc" + "google.golang.org/protobuf/proto" + "google.golang.org/protobuf/types/known/anypb" + + stretchapi "github.com/Azure/aks-flex/plugin/api" + "github.com/Azure/aks-flex/plugin/pkg/helper" + flexvm "github.com/Azure/aks-flex/plugin/pkg/services/agentpools/azure/flexvm" + nebiusinstance "github.com/Azure/aks-flex/plugin/pkg/services/agentpools/nebius/instance" +) + +func TestListByTypeFiltersDifferentAnyTypes(t *testing.T) { + flexAny := mustAny(t, flexvm.AgentPool_builder{ + Metadata: stretchapi.Metadata_builder{Id: proto.String("flex")}.Build(), + }.Build()) + nebiusAny := 
mustAny(t, nebiusinstance.AgentPool_builder{ + Metadata: stretchapi.Metadata_builder{Id: proto.String("nebius")}.Build(), + }.Build()) + + var gotID string + list := func(_ context.Context, req *stretchapi.ListRequest, _ ...grpc.CallOption) (*stretchapi.ListResponse, error) { + gotID = req.GetId() + return stretchapi.ListResponse_builder{Items: []*anypb.Any{flexAny, nebiusAny}}.Build(), nil + } + + got, err := helper.ListByType[*flexvm.AgentPool](list, context.Background(), "nodeclaims") + if err != nil { + t.Fatalf("ListByType: %v", err) + } + if gotID != "nodeclaims" { + t.Fatalf("expected List request id nodeclaims, got %q", gotID) + } + if len(got) != 1 { + t.Fatalf("expected one flexvm agent pool, got %d", len(got)) + } + if got[0].GetMetadata().GetId() != "flex" { + t.Fatalf("expected flex agent pool, got %q", got[0].GetMetadata().GetId()) + } +} + +func TestListByTypeReturnsErrorsForInvalidMatchingPayloads(t *testing.T) { + flexAny := mustAny(t, flexvm.AgentPool_builder{}.Build()) + corruptFlexAny := &anypb.Any{ + TypeUrl: flexAny.GetTypeUrl(), + Value: []byte{0xff}, + } + nebiusAny := mustAny(t, nebiusinstance.AgentPool_builder{}.Build()) + + list := func(_ context.Context, _ *stretchapi.ListRequest, _ ...grpc.CallOption) (*stretchapi.ListResponse, error) { + return stretchapi.ListResponse_builder{Items: []*anypb.Any{nebiusAny, corruptFlexAny}}.Build(), nil + } + + if _, err := helper.ListByType[*flexvm.AgentPool](list, context.Background(), ""); err == nil { + t.Fatal("expected matching corrupt flexvm payload to return an error") + } +} + +func mustAny(t *testing.T, msg proto.Message) *anypb.Any { + t.Helper() + item, err := anypb.New(msg) + if err != nil { + t.Fatalf("anypb.New: %v", err) + } + return item +} diff --git a/plugin/pkg/services/agentpools/azure/flexvm/agentpools.go b/plugin/pkg/services/agentpools/azure/flexvm/agentpools.go index 4acf206..86017ae 100644 --- a/plugin/pkg/services/agentpools/azure/flexvm/agentpools.go +++ 
b/plugin/pkg/services/agentpools/azure/flexvm/agentpools.go @@ -64,12 +64,12 @@ const ( type agentpoolsServer struct { agentpools.UnimplementedAgentPoolsServer - storage db.RODB + storage db.DB credentials azcore.TokenCredential } -func NewAgentPoolsServer(storage db.RODB) (agentpools.AgentPoolsServer, error) { +func NewAgentPoolsServer(storage db.DB) (agentpools.AgentPoolsServer, error) { credentials, err := azidentity.NewDefaultAzureCredential(nil) if err != nil { return nil, err @@ -150,37 +150,14 @@ func (srv *agentpoolsServer) CreateOrUpdate(ctx context.Context, req *api.Create return nil, fmt.Errorf("polling NIC creation %q: %w", nicName, err) } nicID := *nicResp.ID + persistIncompleteAgentPool(srv.storage, ap) - // Best-effort NIC cleanup if anything between here and the successful VM - // creation fails. Without this, every quota-rejected / ARM-rejected VM - // create on a Karpenter retry loop leaks one NIC, which exhausts the - // subnet (observed: ~250 orphan NICs accumulating per hour during - // quota-blocked H100 churn). Uses a fresh background context because the - // gRPC ctx is often already cancelled by the time we land here on retry. - // Best-effort NIC cleanup if anything between here and the successful VM - // creation fails. - // - // Azure platform quirk: after *any* VM CreateOrUpdate attempt (even one - // that fails synchronously with 409 quota), ARM reserves the referenced - // NIC for the target VM name for 180 seconds. Delete attempts during - // that window return 400 NicReservedForAnotherVm. We therefore cannot - // clean up synchronously inside the gRPC handler — Karpenter expects a - // fast error response so it can back off and retry. We spawn a detached - // goroutine that waits out the reservation and retries with backoff. - // - // Best-effort contract: - // - Cleanup survives only while the plugin process is alive. On pod - // restart, any in-flight orphan NICs need manual sweep or a periodic - // reconciler (future work). 
- // - Under sustained quota exhaustion, the number of sleeping - // cleanup goroutines is bounded by the retry rate (observed ~7/min) - // times the cleanup window (~4 min) — a few dozen max. nicCleanedUp := false defer func() { if err == nil || nicCleanedUp { return } - go cleanupReservedNIC(nicsClient, spec.GetResourceGroup(), nicName) + go cleanupReservedNIC(nicsClient, spec.GetResourceGroup(), nicName, srv.storage, vmName) }() // 2. VM. NIC + OS disk both set DeleteOption=Delete so a single VM @@ -432,13 +409,54 @@ func isNotFound(err error) bool { return false } +func persistIncompleteAgentPool(storage db.DB, ap *AgentPool) { + if storage == nil || ap == nil || ap.GetMetadata().GetId() == "" { + return + } + var createdAt *timestamppb.Timestamp + if obj, ok := storage.Get(ap.GetMetadata().GetId()); ok { + existing, err := helper.To[*AgentPool](obj) + if err == nil && existing.GetStatus().GetVmResourceId() != "" { + return + } + if err == nil { + createdAt = existing.GetStatus().GetCreatedAt() + } + } + if ap.GetStatus().GetCreatedAt() != nil { + createdAt = ap.GetStatus().GetCreatedAt() + } + if ap.GetStatus().GetVmResourceId() == "" && createdAt == nil { + createdAt = timestamppb.Now() + } + if ap.GetStatus().GetVmResourceId() == "" { + ap.SetStatus(AgentPoolStatus_builder{ + CreatedAt: createdAt, + }.Build()) + } + storage.CreateOrUpdate(ap) +} + +func deleteIncompleteAgentPool(storage db.DB, id string) { + if storage == nil || id == "" { + return + } + obj, ok := storage.Get(id) + if !ok { + return + } + ap, err := helper.To[*AgentPool](obj) + if err != nil || ap.GetStatus().GetVmResourceId() != "" { + return + } + storage.Delete(id) +} + // cleanupReservedNIC deletes an orphan NIC after the 180s ARM reservation -// window expires. Runs detached (its own goroutine); intended only for the -// post-VM-create-failure path where the NIC is guaranteed to outlive its -// caller's request context. 
All errors are best-effort logged; under -// sustained ARM turbulence this may leave orphans that a human or periodic -// reconciler will need to sweep. -func cleanupReservedNIC(nicsClient *armnetwork.InterfacesClient, resourceGroup, nicName string) { +// window expires. The failed AgentPool is persisted before this starts so a +// plugin restart can still rediscover and delete the orphan through Karpenter's +// normal List/Delete path. +func cleanupReservedNIC(nicsClient *armnetwork.InterfacesClient, resourceGroup, nicName string, storage db.DB, agentPoolID string) { // Wait out the ARM 180s NIC reservation window, plus slack for clock // skew and any in-flight VM-create retry that might re-reserve the NIC // on the same name (Karpenter retries DO use new nodeclaim names, so @@ -452,6 +470,7 @@ func cleanupReservedNIC(nicsClient *armnetwork.InterfacesClient, resourceGroup, delPoller, delErr := nicsClient.BeginDelete(ctx, resourceGroup, nicName, nil) if delErr == nil { if _, pollErr := delPoller.PollUntilDone(ctx, nil); pollErr == nil { + deleteIncompleteAgentPool(storage, agentPoolID) slog.Info("flexvm orphan NIC cleanup succeeded", "nic", nicName, "attempt", attempt) return @@ -462,6 +481,7 @@ func cleanupReservedNIC(nicsClient *armnetwork.InterfacesClient, resourceGroup, } else { // 404 = already gone (raced with someone else). Treat as success. 
if isNotFound(delErr) { + deleteIncompleteAgentPool(storage, agentPoolID) slog.Info("flexvm orphan NIC already gone", "nic", nicName, "attempt", attempt) return diff --git a/plugin/pkg/services/agentpools/azure/flexvm/agentpools_test.go b/plugin/pkg/services/agentpools/azure/flexvm/agentpools_test.go new file mode 100644 index 0000000..7bc55a1 --- /dev/null +++ b/plugin/pkg/services/agentpools/azure/flexvm/agentpools_test.go @@ -0,0 +1,92 @@ +package flexvm + +import ( + "path/filepath" + "testing" + "time" + + "github.com/stretchr/testify/require" + "google.golang.org/protobuf/proto" + "google.golang.org/protobuf/types/known/timestamppb" + + "github.com/Azure/aks-flex/plugin/api" + "github.com/Azure/aks-flex/plugin/pkg/db" + "github.com/Azure/aks-flex/plugin/pkg/helper" +) + +func tempDB(t *testing.T) *db.StupidDB { + t.Helper() + store := db.NewStupidDB(filepath.Join(t.TempDir(), "agentpools.db")) + t.Cleanup(store.Close) + return store +} + +func testAgentPool(id string, status *AgentPoolStatus) *AgentPool { + return AgentPool_builder{ + Metadata: api.Metadata_builder{ + Id: proto.String(id), + }.Build(), + Status: status, + }.Build() +} + +func TestPersistIncompleteAgentPoolStoresFailedCreate(t *testing.T) { + store := tempDB(t) + ap := testAgentPool("nodeclaim-1", nil) + + persistIncompleteAgentPool(store, ap) + + got, ok := store.Get("nodeclaim-1") + require.True(t, ok) + require.Equal(t, "nodeclaim-1", got.GetMetadata().GetId()) + gotAP, err := helper.To[*AgentPool](got) + require.NoError(t, err) + require.NotNil(t, gotAP.GetStatus().GetCreatedAt()) +} + +func TestPersistIncompleteAgentPoolDoesNotOverwriteCompletedStatus(t *testing.T) { + store := tempDB(t) + store.CreateOrUpdate(testAgentPool("nodeclaim-1", AgentPoolStatus_builder{ + VmResourceId: proto.String("/subscriptions/sub/resourceGroups/rg/providers/Microsoft.Compute/virtualMachines/nodeclaim-1"), + }.Build())) + + persistIncompleteAgentPool(store, testAgentPool("nodeclaim-1", nil)) + + gotObj, ok 
:= store.Get("nodeclaim-1") + require.True(t, ok) + got, err := helper.To[*AgentPool](gotObj) + require.NoError(t, err) + require.NotEmpty(t, got.GetStatus().GetVmResourceId()) +} + +func TestPersistIncompleteAgentPoolPreservesPendingCreatedAt(t *testing.T) { + store := tempDB(t) + createdAt := timestamppb.New(time.Now().Add(-time.Hour)) + store.CreateOrUpdate(testAgentPool("nodeclaim-1", AgentPoolStatus_builder{ + CreatedAt: createdAt, + }.Build())) + + persistIncompleteAgentPool(store, testAgentPool("nodeclaim-1", nil)) + + gotObj, ok := store.Get("nodeclaim-1") + require.True(t, ok) + got, err := helper.To[*AgentPool](gotObj) + require.NoError(t, err) + require.Equal(t, createdAt.AsTime(), got.GetStatus().GetCreatedAt().AsTime()) +} + +func TestDeleteIncompleteAgentPoolOnlyDeletesPendingRecords(t *testing.T) { + store := tempDB(t) + store.CreateOrUpdate(testAgentPool("pending", nil)) + store.CreateOrUpdate(testAgentPool("complete", AgentPoolStatus_builder{ + VmResourceId: proto.String("/subscriptions/sub/resourceGroups/rg/providers/Microsoft.Compute/virtualMachines/complete"), + }.Build())) + + deleteIncompleteAgentPool(store, "pending") + deleteIncompleteAgentPool(store, "complete") + + _, ok := store.Get("pending") + require.False(t, ok) + _, ok = store.Get("complete") + require.True(t, ok) +} From 85ca16f2d58018769cc92c9524848d629d86dc48 Mon Sep 17 00:00:00 2001 From: chokevin <11001563+chokevin@users.noreply.github.com> Date: Sat, 16 May 2026 13:55:52 -0700 Subject: [PATCH 20/20] chore(karpenter): drop obsolete provider patches The updated Karpenter/Azure provider dependencies already include these patched changes, so keeping the old patch files breaks make vendor-patch in CI. 
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- ...-provider-azure-aks-node-class-filter.diff | 50 ------------------- ...er-provider-azure-wait-for-extra-crds.diff | 29 ----------- karpenter/patches/004-karpenter-taints.diff | 21 -------- 3 files changed, 100 deletions(-) delete mode 100644 karpenter/patches/001-karpenter-provider-azure-aks-node-class-filter.diff delete mode 100644 karpenter/patches/003-karpenter-provider-azure-wait-for-extra-crds.diff delete mode 100644 karpenter/patches/004-karpenter-taints.diff diff --git a/karpenter/patches/001-karpenter-provider-azure-aks-node-class-filter.diff b/karpenter/patches/001-karpenter-provider-azure-aks-node-class-filter.diff deleted file mode 100644 index 9fe079f..0000000 --- a/karpenter/patches/001-karpenter-provider-azure-aks-node-class-filter.diff +++ /dev/null @@ -1,50 +0,0 @@ -diff --git a/karpenter/vendor/github.com/Azure/karpenter-provider-azure/pkg/controllers/nodeclaim/inplaceupdate/controller.go b/karpenter/vendor/github.com/Azure/karpenter-provider-azure/pkg/controllers/nodeclaim/inplaceupdate/controller.go -index fb46b1a..08cd5e4 100644 ---- a/karpenter/vendor/github.com/Azure/karpenter-provider-azure/pkg/controllers/nodeclaim/inplaceupdate/controller.go -+++ b/karpenter/vendor/github.com/Azure/karpenter-provider-azure/pkg/controllers/nodeclaim/inplaceupdate/controller.go -@@ -252,6 +252,7 @@ func (c *Controller) Register(_ context.Context, m manager.Manager) error { - For( - &karpv1.NodeClaim{}, - builder.WithPredicates( -+ nodeclaimutils.UsingAKSNodeClassPredicate(), - predicate.Or( - predicate.GenerationChangedPredicate{}, // Note that this will trigger on pod restart for all Machines. 
- ), -diff --git a/karpenter/vendor/github.com/Azure/karpenter-provider-azure/pkg/utils/nodeclaim/nodeclaim.go b/karpenter/vendor/github.com/Azure/karpenter-provider-azure/pkg/utils/nodeclaim/nodeclaim.go -index 6b3719b..60b4631 100644 ---- a/karpenter/vendor/github.com/Azure/karpenter-provider-azure/pkg/utils/nodeclaim/nodeclaim.go -+++ b/karpenter/vendor/github.com/Azure/karpenter-provider-azure/pkg/utils/nodeclaim/nodeclaim.go -@@ -24,6 +24,7 @@ import ( - "k8s.io/apimachinery/pkg/runtime/schema" - "k8s.io/apimachinery/pkg/types" - "sigs.k8s.io/controller-runtime/pkg/client" -+ "sigs.k8s.io/controller-runtime/pkg/predicate" - karpv1 "sigs.k8s.io/karpenter/pkg/apis/v1" - - armcompute "github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/compute/armcompute/v7" -@@ -33,6 +34,25 @@ import ( - "github.com/Azure/karpenter-provider-azure/pkg/utils" - ) - -+// UsingAKSNodeClass tells if the giving node claim referencing AKS node class. -+func UsingAKSNodeClass(nodeClaim *karpv1.NodeClaim) bool { -+ if nodeClaim.Spec.NodeClassRef == nil { -+ return false -+ } -+ return nodeClaim.Spec.NodeClassRef.Kind == "AKSNodeClass" -+} -+ -+// UsingAKSNodeClassPredicate creates a predicate to filter node claim using AKS node class. -+func UsingAKSNodeClassPredicate() predicate.Funcs { -+ return predicate.NewPredicateFuncs(func(object client.Object) bool { -+ nodeClaim, ok := object.(*karpv1.NodeClaim) -+ if !ok { -+ return false -+ } -+ return UsingAKSNodeClass(nodeClaim) -+ }) -+} -+ - // GetAKSNodeClass resolves the AKSNodeClass from the NodeClaim's NodeClassRef. - // If the NodeClass for the nodeClaim has DeletionTimestamp set, an error is returned. 
- func GetAKSNodeClass(ctx context.Context, kubeClient client.Client, nodeClaim *karpv1.NodeClaim) (*v1beta1.AKSNodeClass, error) { diff --git a/karpenter/patches/003-karpenter-provider-azure-wait-for-extra-crds.diff b/karpenter/patches/003-karpenter-provider-azure-wait-for-extra-crds.diff deleted file mode 100644 index 51f780e..0000000 --- a/karpenter/patches/003-karpenter-provider-azure-wait-for-extra-crds.diff +++ /dev/null @@ -1,29 +0,0 @@ -diff --git a/karpenter/vendor/github.com/Azure/karpenter-provider-azure/pkg/operator/operator.go b/karpenter/vendor/github.com/Azure/karpenter-provider-azure/pkg/operator/operator.go -index 397760b..143d58b 100644 ---- a/karpenter/vendor/github.com/Azure/karpenter-provider-azure/pkg/operator/operator.go -+++ b/karpenter/vendor/github.com/Azure/karpenter-provider-azure/pkg/operator/operator.go -@@ -314,7 +314,13 @@ func getVnetGUID(ctx context.Context, creds azcore.TokenCredential, cfg *auth.Co - } - - // WaitForCRDs waits for the required CRDs to be available with a timeout --func WaitForCRDs(ctx context.Context, timeout time.Duration, config *rest.Config, log logr.Logger) error { -+func WaitForCRDs( -+ ctx context.Context, -+ timeout time.Duration, -+ config *rest.Config, -+ log logr.Logger, -+ otherObjs ...runtime.Object, -+) error { - gvk := func(obj runtime.Object) schema.GroupVersionKind { - return lo.Must(apiutil.GVKForObject(obj, scheme.Scheme)) - } -@@ -323,6 +329,9 @@ func WaitForCRDs(ctx context.Context, timeout time.Duration, config *rest.Config - gvk(&karpv1.NodeClaim{}), - gvk(&v1beta1.AKSNodeClass{}), - } -+ for _, o := range otherObjs { -+ requiredGVKs = append(requiredGVKs, gvk(o)) -+ } - - client, err := rest.HTTPClientFor(config) - if err != nil { diff --git a/karpenter/patches/004-karpenter-taints.diff b/karpenter/patches/004-karpenter-taints.diff deleted file mode 100644 index 528902b..0000000 --- a/karpenter/patches/004-karpenter-taints.diff +++ /dev/null @@ -1,21 +0,0 @@ -diff --git 
a/karpenter/vendor/sigs.k8s.io/karpenter/pkg/scheduling/taints.go b/karpenter/vendor/sigs.k8s.io/karpenter/pkg/scheduling/taints.go -index d19c17b..ea1d167 100644 ---- a/karpenter/vendor/sigs.k8s.io/karpenter/pkg/scheduling/taints.go -+++ b/karpenter/vendor/sigs.k8s.io/karpenter/pkg/scheduling/taints.go -@@ -25,6 +25,7 @@ import ( - corev1 "k8s.io/api/core/v1" - cloudproviderapi "k8s.io/cloud-provider/api" - -+ "sigs.k8s.io/karpenter/pkg/operator/logging" - "sigs.k8s.io/karpenter/pkg/utils/pretty" - - v1 "sigs.k8s.io/karpenter/pkg/apis/v1" -@@ -55,7 +56,7 @@ func (ts Taints) Tolerates(tolerations []corev1.Toleration) (errs error) { - taint := ts[i] - tolerates := false - for _, t := range tolerations { -- tolerates = tolerates || t.ToleratesTaint(&taint) -+ tolerates = tolerates || t.ToleratesTaint(logging.NopLogger, &taint, true) - } - if !tolerates { - errs = multierr.Append(errs, serrors.Wrap(fmt.Errorf("did not tolerate taint"), "taint", pretty.Taint(taint)))