diff --git a/charts/kubeflow-trainer/README.md b/charts/kubeflow-trainer/README.md index 36d40c19af..50bc9791d9 100644 --- a/charts/kubeflow-trainer/README.md +++ b/charts/kubeflow-trainer/README.md @@ -136,7 +136,10 @@ manager: | manager.volumeMounts | list | `[]` | Volume mounts for manager containers. | | manager.resources | object | `{}` | Pod resource requests and limits for manager containers. | | manager.securityContext | object | `{"allowPrivilegeEscalation":false,"capabilities":{"drop":["ALL"]},"runAsNonRoot":true,"seccompProfile":{"type":"RuntimeDefault"}}` | Security context for manager containers. | -| manager.config | object | `{"certManagement":{"enable":true,"webhookSecretName":"","webhookServiceName":""},"controller":{"groupKindConcurrency":{"clusterTrainingRuntime":1,"trainJob":5,"trainingRuntime":1}},"featureGates":{},"health":{"healthProbeBindAddress":":8081","livenessEndpointName":"healthz","readinessEndpointName":"readyz"},"leaderElection":{"leaderElect":true,"leaseDuration":"15s","renewDeadline":"10s","resourceName":"trainer.kubeflow.org","resourceNamespace":"","retryPeriod":"2s"},"metrics":{"bindAddress":":8443","secureServing":true},"statusServer":{"burst":10,"port":10443,"qps":5},"webhook":{"host":"","port":9443}}` | Controller manager configuration. This configuration is used to generate the ConfigMap for the controller manager. 
| +| manager.config | object | `{"certManagement":{"enable":true,"secretName":"","serviceName":""},"controller":{"groupKindConcurrency":{"clusterTrainingRuntime":1,"trainJob":5,"trainingRuntime":1}},"featureGates":{},"health":{"healthProbeBindAddress":":8081","livenessEndpointName":"healthz","readinessEndpointName":"readyz"},"leaderElection":{"leaderElect":true,"leaseDuration":"15s","renewDeadline":"10s","resourceName":"trainer.kubeflow.org","resourceNamespace":"","retryPeriod":"2s"},"metrics":{"bindAddress":":8443","secureServing":true},"statusServer":{"burst":10,"port":10443,"qps":5},"webhook":{"host":"","port":9443}}` | Controller manager configuration. This configuration is used to generate the ConfigMap for the controller manager. | +| manager.config.certManagement.enable | bool | `true` | Enable automatic TLS certificate management. | +| manager.config.certManagement.serviceName | string | `""` | Service name for TLS certificate DNS SAN. Defaults to controller manager service name if empty. | +| manager.config.certManagement.secretName | string | `""` | Secret name for storing TLS certificates. Defaults to "kubeflow-trainer-webhook-cert" if empty. | | manager.config.statusServer.port | int | `10443` | Port that the TrainJob status server serves on. 
| | manager.config.statusServer.qps | int | `5` | QPS rate limit for the TrainJob Status Server api client | | manager.config.statusServer.burst | int | `10` | Burst rate limit for the TrainJob Status Server api client | diff --git a/charts/kubeflow-trainer/templates/manager/configmap.yaml b/charts/kubeflow-trainer/templates/manager/configmap.yaml index a9f76c1ff1..0eb4476374 100644 --- a/charts/kubeflow-trainer/templates/manager/configmap.yaml +++ b/charts/kubeflow-trainer/templates/manager/configmap.yaml @@ -60,8 +60,8 @@ data: # Certificate management configuration certManagement: enable: {{ .Values.manager.config.certManagement.enable }} - webhookServiceName: {{ if .Values.manager.config.certManagement.webhookServiceName }}{{ .Values.manager.config.certManagement.webhookServiceName }}{{ else }}{{ include "trainer.webhook.service.name" . }}{{ end }} - webhookSecretName: {{ if .Values.manager.config.certManagement.webhookSecretName }}{{ .Values.manager.config.certManagement.webhookSecretName }}{{ else }}{{ include "trainer.webhook.secret.name" . }}{{ end }} + serviceName: {{ if .Values.manager.config.certManagement.serviceName }}{{ .Values.manager.config.certManagement.serviceName }}{{ else }}{{ include "trainer.webhook.service.name" . }}{{ end }} + secretName: {{ if .Values.manager.config.certManagement.secretName }}{{ .Values.manager.config.certManagement.secretName }}{{ else }}{{ include "trainer.webhook.secret.name" . 
}}{{ end }} statusServer: port: {{ .Values.manager.config.statusServer.port }} diff --git a/charts/kubeflow-trainer/tests/manager/configmap_test.yaml b/charts/kubeflow-trainer/tests/manager/configmap_test.yaml index b5143b3a63..ceff044925 100644 --- a/charts/kubeflow-trainer/tests/manager/configmap_test.yaml +++ b/charts/kubeflow-trainer/tests/manager/configmap_test.yaml @@ -28,29 +28,29 @@ tests: path: data["controller_manager_config.yaml"] pattern: "port: 9443" - - it: should use custom webhook service and secret names from config + - it: should use custom service and secret names from config set: manager: config: certManagement: - webhookServiceName: "custom-webhook-service" - webhookSecretName: "custom-webhook-secret" + serviceName: "custom-webhook-service" + secretName: "custom-webhook-secret" asserts: - matchRegex: path: data["controller_manager_config.yaml"] - pattern: "webhookServiceName: custom-webhook-service" + pattern: "serviceName: custom-webhook-service" - matchRegex: path: data["controller_manager_config.yaml"] - pattern: "webhookSecretName: custom-webhook-secret" + pattern: "secretName: custom-webhook-secret" - - it: should auto-generate webhook service and secret names when not provided + - it: should auto-generate service and secret names when not provided asserts: - matchRegex: path: data["controller_manager_config.yaml"] - pattern: "webhookServiceName: kubeflow-trainer-controller-manager" + pattern: "serviceName: kubeflow-trainer-controller-manager" - matchRegex: path: data["controller_manager_config.yaml"] - pattern: "webhookSecretName: kubeflow-trainer-webhook-cert" + pattern: "secretName: kubeflow-trainer-webhook-cert" - it: should enable leader election by default asserts: diff --git a/charts/kubeflow-trainer/values.yaml b/charts/kubeflow-trainer/values.yaml index 6d32e47214..c192aecf4f 100644 --- a/charts/kubeflow-trainer/values.yaml +++ b/charts/kubeflow-trainer/values.yaml @@ -134,10 +134,12 @@ manager: trainingRuntime: 1 
clusterTrainingRuntime: 1 certManagement: + # -- Enable automatic TLS certificate management. enable: true - # webhookServiceName and webhookSecretName are auto-generated if not specified - webhookServiceName: "" - webhookSecretName: "" + # -- Service name for TLS certificate DNS SAN. Defaults to controller manager service name if empty. + serviceName: "" + # -- Secret name for storing TLS certificates. Defaults to "kubeflow-trainer-webhook-cert" if empty. + secretName: "" statusServer: # -- Port that the TrainJob status server serves on. port: 10443 diff --git a/cmd/trainer-controller-manager/main.go b/cmd/trainer-controller-manager/main.go index 3871dc84d2..71be15b34f 100644 --- a/cmd/trainer-controller-manager/main.go +++ b/cmd/trainer-controller-manager/main.go @@ -129,8 +129,8 @@ func main() { if config.IsCertManagementEnabled(&cfg) { setupLog.Info("Setting up certificate management") if err = cert.ManageCerts(mgr, cert.Config{ - WebhookSecretName: cfg.CertManagement.WebhookSecretName, - WebhookServiceName: cfg.CertManagement.WebhookServiceName, + SecretName: cfg.CertManagement.SecretName, + ServiceName: cfg.CertManagement.ServiceName, ValidatingWebhookConfigurationName: validatingWebhookConfigurationName, MutatingWebhookConfigurationName: mutatingWebhookConfigurationName, }, certsReady); err != nil { diff --git a/manifests/base/manager/controller_manager_config.yaml b/manifests/base/manager/controller_manager_config.yaml index 397b2a6106..ecd63697a8 100644 --- a/manifests/base/manager/controller_manager_config.yaml +++ b/manifests/base/manager/controller_manager_config.yaml @@ -35,8 +35,8 @@ controller: # Certificate management configuration certManagement: enable: true - webhookServiceName: kubeflow-trainer-controller-manager - webhookSecretName: kubeflow-trainer-webhook-cert + serviceName: kubeflow-trainer-controller-manager + secretName: kubeflow-trainer-webhook-cert # Client connection configuration clientConnection: diff --git 
a/pkg/apis/config/v1alpha1/configuration_types.go b/pkg/apis/config/v1alpha1/configuration_types.go index 9be1961be3..b66d683742 100644 --- a/pkg/apis/config/v1alpha1/configuration_types.go +++ b/pkg/apis/config/v1alpha1/configuration_types.go @@ -51,7 +51,8 @@ type Configuration struct { // +optional Controller *ControllerConfigurationSpec `json:"controller,omitempty"` - // certManagement is configuration for certificate management used by the webhook server. + // certManagement is configuration for TLS certificate management. + // The certificate is used by webhooks, metrics server, and status server. // +optional CertManagement *CertManagement `json:"certManagement,omitempty"` @@ -157,7 +158,8 @@ type ControllerConfigurationSpec struct { GroupKindConcurrency map[string]int32 `json:"groupKindConcurrency,omitempty"` } -// CertManagement holds configuration related to webhook server certificate generation. +// CertManagement holds configuration related to TLS certificate generation for the controller manager. +// The certificate is used by multiple components: admission webhooks, metrics server, and status server. type CertManagement struct { // enable controls whether the cert management is enabled. // If disabled, certificates must be provided externally. @@ -166,22 +168,22 @@ type CertManagement struct { // +kubebuilder:default=true Enable *bool `json:"enable,omitempty"` - // webhookServiceName is the name of the Service used as part of the DNSName - // when generating the webhook server certificate. + // serviceName is the name of the Service used as part of the DNSName + // when generating the TLS certificate. // Defaults to "kubeflow-trainer-controller-manager". 
// +optional // +kubebuilder:default="kubeflow-trainer-controller-manager" // +kubebuilder:validation:MinLength=1 // +kubebuilder:validation:MaxLength=253 - WebhookServiceName string `json:"webhookServiceName,omitempty"` + ServiceName string `json:"serviceName,omitempty"` - // webhookSecretName is the name of the Secret used to store the CA and server certificates. + // secretName is the name of the Secret used to store the CA and server certificates. // Defaults to "kubeflow-trainer-webhook-cert". // +optional // +kubebuilder:default="kubeflow-trainer-webhook-cert" // +kubebuilder:validation:MinLength=1 // +kubebuilder:validation:MaxLength=253 - WebhookSecretName string `json:"webhookSecretName,omitempty"` + SecretName string `json:"secretName,omitempty"` } // ClientConnection provides additional configuration options for Kubernetes diff --git a/pkg/apis/config/v1alpha1/defaults.go b/pkg/apis/config/v1alpha1/defaults.go index 19a3a3beee..9b2f34fc8e 100644 --- a/pkg/apis/config/v1alpha1/defaults.go +++ b/pkg/apis/config/v1alpha1/defaults.go @@ -46,11 +46,11 @@ func SetDefaults_Configuration(cfg *Configuration) { if cfg.CertManagement.Enable == nil { cfg.CertManagement.Enable = ptr.To(true) } - if cfg.CertManagement.WebhookServiceName == "" { - cfg.CertManagement.WebhookServiceName = "kubeflow-trainer-controller-manager" + if cfg.CertManagement.ServiceName == "" { + cfg.CertManagement.ServiceName = "kubeflow-trainer-controller-manager" } - if cfg.CertManagement.WebhookSecretName == "" { - cfg.CertManagement.WebhookSecretName = "kubeflow-trainer-webhook-cert" + if cfg.CertManagement.SecretName == "" { + cfg.CertManagement.SecretName = "kubeflow-trainer-webhook-cert" } if cfg.ClientConnection == nil { cfg.ClientConnection = &ClientConnection{} diff --git a/pkg/config/config_test.go b/pkg/config/config_test.go index b043230ae2..be2d8a6262 100644 --- a/pkg/config/config_test.go +++ b/pkg/config/config_test.go @@ -76,8 +76,8 @@ apiVersion: 
config.trainer.kubeflow.org/v1alpha1 kind: Configuration certManagement: enable: true - webhookServiceName: custom-webhook-service - webhookSecretName: custom-webhook-secret + serviceName: custom-webhook-service + secretName: custom-webhook-secret `), os.FileMode(0600)); err != nil { t.Fatal(err) } @@ -195,8 +195,8 @@ controller: TrainingRuntime.trainer.kubeflow.org: 1 certManagement: enable: true - webhookServiceName: kubeflow-trainer-controller-manager - webhookSecretName: kubeflow-trainer-webhook-cert + serviceName: kubeflow-trainer-controller-manager + secretName: kubeflow-trainer-webhook-cert clientConnection: qps: 50 burst: 100 @@ -270,9 +270,9 @@ this is not: valid: yaml: content } defaultCertManagement := &configapi.CertManagement{ - Enable: ptr.To(true), - WebhookServiceName: "kubeflow-trainer-controller-manager", - WebhookSecretName: "kubeflow-trainer-webhook-cert", + Enable: ptr.To(true), + ServiceName: "kubeflow-trainer-controller-manager", + SecretName: "kubeflow-trainer-webhook-cert", } defaultClientConnection := &configapi.ClientConnection{ @@ -500,9 +500,9 @@ this is not: valid: yaml: content ClientConnection: defaultClientConnection, StatusServer: defaultStatusServer, CertManagement: &configapi.CertManagement{ - Enable: ptr.To(true), - WebhookServiceName: "custom-webhook-service", - WebhookSecretName: "custom-webhook-secret", + Enable: ptr.To(true), + ServiceName: "custom-webhook-service", + SecretName: "custom-webhook-secret", }, }, wantOptions: defaultOptions, @@ -518,9 +518,9 @@ this is not: valid: yaml: content ClientConnection: defaultClientConnection, StatusServer: defaultStatusServer, CertManagement: &configapi.CertManagement{ - Enable: ptr.To(false), - WebhookServiceName: "kubeflow-trainer-controller-manager", - WebhookSecretName: "kubeflow-trainer-webhook-cert", + Enable: ptr.To(false), + ServiceName: "kubeflow-trainer-controller-manager", + SecretName: "kubeflow-trainer-webhook-cert", }, }, wantOptions: defaultOptions, diff --git 
a/pkg/runtime/framework/plugins/trainjobstatus/trainjobstatus.go b/pkg/runtime/framework/plugins/trainjobstatus/trainjobstatus.go index d7722d978d..93f2149b65 100644 --- a/pkg/runtime/framework/plugins/trainjobstatus/trainjobstatus.go +++ b/pkg/runtime/framework/plugins/trainjobstatus/trainjobstatus.go @@ -115,8 +115,7 @@ func (p *Status) createEnvVars(trainJob *trainer.TrainJob) ([]corev1ac.EnvVarApp if p.cfg.StatusServer.Port == nil { return nil, fmt.Errorf("missing status server port") } - // TODO: consider renaming the CertManagement.WebhookServiceName name? - svc := fmt.Sprintf("https://%s.%s.svc:%d", p.cfg.CertManagement.WebhookServiceName, cert.GetOperatorNamespace(), *p.cfg.StatusServer.Port) + svc := fmt.Sprintf("https://%s.%s.svc:%d", p.cfg.CertManagement.ServiceName, cert.GetOperatorNamespace(), *p.cfg.StatusServer.Port) path := statusserver.StatusUrl(trainJob.Namespace, trainJob.Name) statusURL := svc + path @@ -169,15 +168,15 @@ func createTokenVolume(trainJob *trainer.TrainJob) corev1ac.VolumeApplyConfigura ) } -// buildStatusServerCaCrtConfigMap creates a ConfigMap that will copy the ca.crt from the webhook secret +// buildStatusServerCaCrtConfigMap creates a ConfigMap that will copy the ca.crt from the TLS secret func (p *Status) buildStatusServerCaCrtConfigMap(ctx context.Context, trainJob *trainer.TrainJob) (*corev1ac.ConfigMapApplyConfiguration, error) { configMapName := fmt.Sprintf("%s-tls-config", trainJob.Name) - // Get the CA cert from the webhook secret + // Get the CA cert from the TLS secret secret := &corev1.Secret{} secretKey := client.ObjectKey{ Namespace: cert.GetOperatorNamespace(), - Name: p.cfg.CertManagement.WebhookSecretName, + Name: p.cfg.CertManagement.SecretName, } var caCertData string diff --git a/pkg/runtime/framework/plugins/trainjobstatus/trainjobstatus_test.go b/pkg/runtime/framework/plugins/trainjobstatus/trainjobstatus_test.go index a8d2a8802e..9ff5b5662b 100644 --- 
a/pkg/runtime/framework/plugins/trainjobstatus/trainjobstatus_test.go +++ b/pkg/runtime/framework/plugins/trainjobstatus/trainjobstatus_test.go @@ -269,8 +269,8 @@ func TestEnforceMLPolicy(t *testing.T) { cli := utiltesting.NewClientBuilder().Build() cfg := &configapi.Configuration{ CertManagement: &configapi.CertManagement{ - WebhookServiceName: "kubeflow-trainer-controller-manager", - WebhookSecretName: "kubeflow-trainer-webhook-cert", + ServiceName: "kubeflow-trainer-controller-manager", + SecretName: "kubeflow-trainer-webhook-cert", }, StatusServer: &configapi.StatusServer{ Port: ptr.To[int32](10443), @@ -461,8 +461,8 @@ func TestBuild(t *testing.T) { cfg := &configapi.Configuration{ CertManagement: &configapi.CertManagement{ - WebhookServiceName: "kubeflow-trainer-controller-manager", - WebhookSecretName: "kubeflow-trainer-webhook-cert", + ServiceName: "kubeflow-trainer-controller-manager", + SecretName: "kubeflow-trainer-webhook-cert", }, StatusServer: &configapi.StatusServer{ Port: ptr.To[int32](10443), diff --git a/pkg/util/cert/cert.go b/pkg/util/cert/cert.go index b436fc105f..c7e952eb4f 100644 --- a/pkg/util/cert/cert.go +++ b/pkg/util/cert/cert.go @@ -45,8 +45,8 @@ func GetOperatorNamespace() string { } type Config struct { - WebhookServiceName string - WebhookSecretName string + ServiceName string + SecretName string ValidatingWebhookConfigurationName string MutatingWebhookConfigurationName string } @@ -60,12 +60,12 @@ func ManageCerts(mgr ctrl.Manager, cfg Config, setupFinished chan struct{}) erro ns := GetOperatorNamespace() // DNSName is <service name>.<namespace>.svc - dnsName := fmt.Sprintf("%s.%s.svc", cfg.WebhookServiceName, ns) + dnsName := fmt.Sprintf("%s.%s.svc", cfg.ServiceName, ns) return cert.AddRotator(mgr, &cert.CertRotator{ SecretKey: types.NamespacedName{ Namespace: ns, - Name: cfg.WebhookSecretName, + Name: cfg.SecretName, }, CertDir: certDir, CAName: caName,