From 1da6eeb7bc05316bce9255e2620033db020f1dd3 Mon Sep 17 00:00:00 2001 From: Zane Bitter Date: Tue, 14 Apr 2026 17:10:34 +1200 Subject: [PATCH 1/4] Allow explicit ID for agent-gather Add a -i flag to agent-gather that sets the gather ID, which determines the output filename. A timestamp-based ID is generated by default. This allows callers to know in advance where to find the output file, following the same pattern used by installer-gather.sh. Assisted-by: Claude Code --- .../data/agent/files/usr/local/bin/agent-gather | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/data/data/agent/files/usr/local/bin/agent-gather b/data/data/agent/files/usr/local/bin/agent-gather index 444c760712d..81e48f9bef7 100644 --- a/data/data/agent/files/usr/local/bin/agent-gather +++ b/data/data/agent/files/usr/local/bin/agent-gather @@ -123,7 +123,7 @@ function gather_bootstrap_status() { ( >&2 echo -n ".") if [ -f /usr/local/bin/installer-gather.sh ]; then local bs_gather="${ARTIFACTS_DIR}/bootstrap-gather.tar.gz" - TAR_FILE="${bs_gather}" /usr/local/bin/installer-gather.sh --id "$(date '+%Y%m%d%H%M%S')" >/dev/null + TAR_FILE="${bs_gather}" /usr/local/bin/installer-gather.sh --id "${GATHER_ID}" >/dev/null ( >&2 echo -n ".") [ -f "${bs_gather}" ] && gunzip "${bs_gather}" fi @@ -134,19 +134,24 @@ function Help() { echo "Gathers the necessary data for troubleshooting OpenShift's agent based installation" echo - echo "Syntax: agent-gather [-h|-v]" + echo "Syntax: agent-gather [-h|-i ID|-v|-O]" echo "options:" - echo "-h Print this help" - echo "-O Output the compressed content to stdout" + echo "-h Print this help" + echo "-i Set the gather ID (determines output filename)" + echo "-O Output the compressed content to stdout" echo "-v Set verbose mode" echo } -while getopts ":hvO" option; do +GATHER_ID="$(date '+%Y%m%d%H%M%S')" + +while getopts ":hvOi:" option; do case $option in h) Help exit;; + i) + GATHER_ID="$OPTARG";; v) set -xv;; O) @@ -179,7 +184,7 @@ gather_bootstrap_status find "$ARTIFACTS_DIR" -type d -exec chmod a+rwx "{}" \; find "$ARTIFACTS_DIR" -type f -exec chmod a+rw "{}" \; -OUTPUT_FILE="./agent-gather-$(date +%Y%m%d-%H%M%S%Z).tar.xz" +OUTPUT_FILE="./agent-gather-${GATHER_ID}.tar.xz" ( >&2 echo "Compressing gathered data to $OUTPUT_FILE" ) if [[ "$STDOUT" == "1" ]]; then From 2ba77a476c4db54ae71adb0d246e4cbcb68170b3 Mon Sep 17 00:00:00 2001 From: Zane Bitter Date: Thu, 2 Jul 2026 22:19:19 +1200 Subject: [PATCH 2/4] Drop unused SSH key from rendezvous host lookup The SSH key loaded from InstallConfig and returned from FindRendezvousIPAndSSHKeyFromAssetStore was stored in NodeZeroRestClient.NodeSSHKey but never read by any code since 82c7aa3bf9609961e8c415982eed39213ca26bd1. Remove the dead code to reduce confusion. Assisted-by: Claude Code --- cmd/openshift-install/agent/waitfor.go | 8 +++--- pkg/agent/cluster.go | 4 +-- pkg/agent/rest.go | 36 +++++++------------------- pkg/nodejoiner/monitoraddnodes.go | 5 +--- 4 files changed, 16 insertions(+), 37 deletions(-) diff --git a/cmd/openshift-install/agent/waitfor.go b/cmd/openshift-install/agent/waitfor.go index 05c128d2886..dc0119ff16e 100644 --- a/cmd/openshift-install/agent/waitfor.go +++ b/cmd/openshift-install/agent/waitfor.go @@ -67,13 +67,13 @@ func newWaitForBootstrapCompleteCmd() *cobra.Command { logrus.Fatal(err) } - rendezvousIP, sshKey, err := agentpkg.FindRendezvousIPAndSSHKeyFromAssetStore(assetStore) + rendezvousIP, err := agentpkg.FindRendezvousIPFromAssetStore(assetStore) if err != nil { logrus.Fatal(err) } ctx := context.Background() - cluster, err := agentpkg.NewCluster(ctx, assetStore, rendezvousIP, kubeconfigPath, sshKey, workflow.AgentWorkflowTypeInstall) + cluster, err := agentpkg.NewCluster(ctx, assetStore, rendezvousIP, kubeconfigPath, workflow.AgentWorkflowTypeInstall) if err != nil { logrus.Exit(command.ExitCodeBootstrapFailed) } @@ -107,13 +107,13 @@ func newWaitForInstallCompleteCmd() *cobra.Command { logrus.Fatal(err) } - rendezvousIP, sshKey, err := agentpkg.FindRendezvousIPAndSSHKeyFromAssetStore(assetStore) + rendezvousIP, err := agentpkg.FindRendezvousIPFromAssetStore(assetStore) if err != nil { logrus.Fatal(err) } ctx := context.Background() - cluster, err := agentpkg.NewCluster(ctx, assetStore, rendezvousIP, kubeconfigPath, sshKey, workflow.AgentWorkflowTypeInstall) + cluster, err := agentpkg.NewCluster(ctx, assetStore, rendezvousIP, kubeconfigPath, workflow.AgentWorkflowTypeInstall) if err != nil { logrus.Exit(command.ExitCodeBootstrapFailed) } diff --git a/pkg/agent/cluster.go b/pkg/agent/cluster.go index 0148a2e81e8..5758606659e 100644 --- a/pkg/agent/cluster.go +++ b/pkg/agent/cluster.go @@ -59,7 +59,7 @@ type clusterInstallStatusHistory struct { } // NewCluster initializes a Cluster object -func NewCluster(ctx context.Context, assetStore asset.Store, rendezvousIP, kubeconfigPath, sshKey string, workflowType workflow.AgentWorkflowType) (*Cluster, error) { +func NewCluster(ctx context.Context, assetStore asset.Store, rendezvousIP, kubeconfigPath string, workflowType workflow.AgentWorkflowType) (*Cluster, error) { czero := &Cluster{} capi := &clientSet{} @@ -81,7 +81,7 @@ func NewCluster(ctx context.Context, assetStore asset.Store, rendezvousIP, kubec return nil, fmt.Errorf("AgentWorkflowType value not supported: %s", workflowType) } - restclient := NewNodeZeroRestClient(ctx, rendezvousIP, sshKey, watcherAuthToken) + restclient := NewNodeZeroRestClient(ctx, rendezvousIP, watcherAuthToken) kubeclient, err := NewClusterKubeAPIClient(ctx, kubeconfigPath) if err != nil { diff --git a/pkg/agent/rest.go b/pkg/agent/rest.go index 277474e1426..535be614e3e 100644 --- a/pkg/agent/rest.go +++ b/pkg/agent/rest.go @@ -19,7 +19,6 @@ import ( "github.com/openshift/installer/pkg/asset/agent/gencrypto" "github.com/openshift/installer/pkg/asset/agent/image" "github.com/openshift/installer/pkg/asset/agent/manifests" - "github.com/openshift/installer/pkg/asset/installconfig" "github.com/openshift/installer/pkg/types/agent" ) @@ -29,18 +28,12 @@ type NodeZeroRestClient struct { ctx context.Context config client.Config NodeZeroIP string - NodeSSHKey []string } // NewNodeZeroRestClient Initialize a new rest client to interact with the Agent Rest API on node zero. -func NewNodeZeroRestClient(ctx context.Context, rendezvousIP, sshKey, watcherAuthToken string) *NodeZeroRestClient { +func NewNodeZeroRestClient(ctx context.Context, rendezvousIP, watcherAuthToken string) *NodeZeroRestClient { restClient := &NodeZeroRestClient{} - // Get SSH Keys which can be used to determine if Rest API failures are due to network connectivity issues - if sshKey != "" { - restClient.NodeSSHKey = append(restClient.NodeSSHKey, sshKey) - } - config := client.Config{} config.URL = &url.URL{ Scheme: "http", @@ -60,16 +53,14 @@ func NewNodeZeroRestClient(ctx context.Context, rendezvousIP, sshKey, watcherAut return restClient } -// FindRendezvousIPAndSSHKeyFromAssetStore returns the rendezvousIP and public ssh key. -func FindRendezvousIPAndSSHKeyFromAssetStore(assetStore asset.Store) (string, string, error) { +// FindRendezvousIPFromAssetStore returns the rendezvous IP of the agent cluster. +func FindRendezvousIPFromAssetStore(assetStore asset.Store) (string, error) { agentConfigAsset := &agentconfig.AgentConfig{} agentManifestsAsset := &manifests.AgentManifests{} - installConfigAsset := &installconfig.InstallConfig{} agentHostsAsset := &agentconfig.AgentHosts{} agentConfig, agentConfigError := assetStore.Load(agentConfigAsset) agentManifests, manifestError := assetStore.Load(agentManifestsAsset) - installConfig, installConfigError := assetStore.Load(installConfigAsset) agentHosts, agentHostsError := assetStore.Load(agentHostsAsset) if agentConfigError != nil { @@ -78,14 +69,11 @@ func FindRendezvousIPAndSSHKeyFromAssetStore(assetStore asset.Store) (string, st if manifestError != nil { logrus.Debug(errors.Wrapf(manifestError, "failed to load %s", agentManifestsAsset.Name())) } - if installConfigError != nil { - logrus.Debug(errors.Wrapf(installConfigError, "failed to load %s", installConfigAsset.Name())) - } if agentHostsError != nil { - logrus.Debug(errors.Wrapf(agentConfigError, "failed to load %s", agentHostsAsset.Name())) + logrus.Debug(errors.Wrapf(agentHostsError, "failed to load %s", agentHostsAsset.Name())) } - if agentConfigError != nil || manifestError != nil || installConfigError != nil || agentHostsError != nil { - return "", "", errors.New("failed to load AgentConfig, NMStateConfig, InstallConfig, or AgentHosts") + if agentConfigError != nil || manifestError != nil || agentHostsError != nil { + return "", errors.New("failed to load AgentConfig, NMStateConfig, or AgentHosts") } var rendezvousIP string @@ -99,19 +87,13 @@ func FindRendezvousIPAndSSHKeyFromAssetStore(assetStore asset.Store) (string, st } else if agentConfig != nil && agentManifests == nil { rendezvousIP, rendezvousIPError = image.RetrieveRendezvousIP(agentConfig.(*agentconfig.AgentConfig).Config, agentHosts.(*agentconfig.AgentHosts).Hosts, emptyNMStateConfigs) } else { - return "", "", errors.New("both AgentConfig and NMStateConfig are empty") + return "", errors.New("both AgentConfig and NMStateConfig are empty") } if rendezvousIPError != nil { - return "", "", rendezvousIPError - } - - var sshKey string - // Get SSH Keys which can be used to determine if Rest API failures are due to network connectivity issues - if installConfig != nil { - sshKey = installConfig.(*installconfig.InstallConfig).Config.SSHKey + return "", rendezvousIPError } - return rendezvousIP, sshKey, nil + return rendezvousIP, nil } // FindAuthTokenFromAssetStore returns the auth token from asset store. diff --git a/pkg/nodejoiner/monitoraddnodes.go b/pkg/nodejoiner/monitoraddnodes.go index 2ff28457091..116edcaaff2 100644 --- a/pkg/nodejoiner/monitoraddnodes.go +++ b/pkg/nodejoiner/monitoraddnodes.go @@ -20,13 +20,10 @@ func NewMonitorAddNodesCommand(directory, kubeconfigPath string, ips []string) e return err } - // sshKey is not required parameter for monitor-add-nodes - sshKey := "" - clusters := []*agentpkg.Cluster{} ctx := context.Background() for _, ip := range ips { - cluster, err := agentpkg.NewCluster(ctx, assetStore, ip, kubeconfigPath, sshKey, workflow.AgentWorkflowTypeAddNodes) + cluster, err := agentpkg.NewCluster(ctx, assetStore, ip, kubeconfigPath, workflow.AgentWorkflowTypeAddNodes) if err != nil { return err } From 3c87d4dbb656f3e23cd1650b1509404f618de7bb Mon Sep 17 00:00:00 2001 From: Zane Bitter Date: Thu, 2 Jul 2026 22:00:31 +1200 Subject: [PATCH 3/4] Add BootstrapSSHKeyPair public key to agent ignition Add the generated bootstrap SSH public key to the core user's SSHAuthorizedKeys in the agent ignition, alongside the user-provided key from the InfraEnv spec. This enables the installer to SSH to the rendezvous host using the bootstrap key without requiring the user to provide a key. This key does not persist into the cluster, it is only active while the agent ISO is booted. Assisted-by: Claude Code --- pkg/asset/agent/image/ignition.go | 5 ++++- pkg/asset/agent/image/ignition_test.go | 6 ++++++ 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/pkg/asset/agent/image/ignition.go b/pkg/asset/agent/image/ignition.go index 1727b672636..19660515fa3 100644 --- a/pkg/asset/agent/image/ignition.go +++ b/pkg/asset/agent/image/ignition.go @@ -108,6 +108,7 @@ func (a *Ignition) Dependencies() []asset.Asset { &tls.KubeAPIServerLocalhostSignerCertKey{}, &tls.KubeAPIServerServiceNetworkSignerCertKey{}, &tls.AdminKubeConfigSignerCertKey{}, + &tls.BootstrapSSHKeyPair{}, &password.KubeadminPassword{}, &agentconfig.AgentConfig{}, &agentconfig.AgentHosts{}, @@ -136,7 +137,8 @@ func (a *Ignition) Generate(ctx context.Context, dependencies asset.Parents) err } pwd := &password.KubeadminPassword{} - dependencies.Get(pwd) + bootstrapSSHKeyPair := &tls.BootstrapSSHKeyPair{} + dependencies.Get(pwd, bootstrapSSHKeyPair) pwdHash := string(pwd.PasswordHash) infraEnv := agentManifests.InfraEnv @@ -151,6 +153,7 @@ func (a *Ignition) Generate(ctx context.Context, dependencies asset.Parents) err Name: "core", SSHAuthorizedKeys: []igntypes.SSHAuthorizedKey{ igntypes.SSHAuthorizedKey(infraEnv.Spec.SSHAuthorizedKey), + igntypes.SSHAuthorizedKey(string(bootstrapSSHKeyPair.Public())), }, PasswordHash: &pwdHash, }, diff --git a/pkg/asset/agent/image/ignition_test.go b/pkg/asset/agent/image/ignition_test.go index a6534245d09..eb1b1ab5cde 100644 --- a/pkg/asset/agent/image/ignition_test.go +++ b/pkg/asset/agent/image/ignition_test.go @@ -654,6 +654,11 @@ logdir /var/log/chrony`, assertExpectedFiles(t, ignitionAsset.Config, tc.expectedFiles, tc.expectedFileContent) assertServiceEnabled(t, ignitionAsset.Config, tc.serviceEnabledMap) + + assert.Len(t, ignitionAsset.Config.Passwd.Users, 1) + assert.Equal(t, "core", ignitionAsset.Config.Passwd.Users[0].Name) + assert.Contains(t, ignitionAsset.Config.Passwd.Users[0].SSHAuthorizedKeys, igntypes.SSHAuthorizedKey("my-ssh-key")) + assert.Contains(t, ignitionAsset.Config.Passwd.Users[0].SSHAuthorizedKeys, igntypes.SSHAuthorizedKey("test-bootstrap-ssh-key\n")) } }) } @@ -801,6 +806,7 @@ func buildIgnitionAssetDefaultDependencies(t *testing.T) []asset.Asset { &tls.KubeAPIServerServiceNetworkSignerCertKey{}, &tls.AdminKubeConfigSignerCertKey{}, &tls.AdminKubeConfigClientCertKey{}, + &tls.BootstrapSSHKeyPair{Pub: []byte("test-bootstrap-ssh-key\n")}, &gencrypto.AuthConfig{}, &common.InfraEnvID{}, &agentcommon.OptionalInstallConfig{}, From f175ad352de5dbc7fff164b72c308099838f02fb Mon Sep 17 00:00:00 2001 From: Zane Bitter Date: Tue, 14 Apr 2026 17:11:10 +1200 Subject: [PATCH 4/4] AGENT-580: Add agent gather bootstrap command Add "openshift-install agent gather bootstrap" command to collect debugging data from the rendezvous host during agent-based installations. The command determines the rendezvous IP from the asset store, SSHs to the host, runs agent-gather, and pulls the resulting archive locally. As with "openshift-install gather bootstrap", the bootstrap SSH key pair is loaded automatically from the asset store, additional keys can be specified with the --key flag, and keys from the user's SSH agent or ~/.ssh/ are always included as well. Assisted-by: Claude Code --- cmd/openshift-install/agent.go | 1 + cmd/openshift-install/agent/gather.go | 104 ++++++++++++++++++++++++++ pkg/agent/gather.go | 47 ++++++++++++ 3 files changed, 152 insertions(+) create mode 100644 cmd/openshift-install/agent/gather.go create mode 100644 pkg/agent/gather.go diff --git a/cmd/openshift-install/agent.go b/cmd/openshift-install/agent.go index fb19102dd2b..5d8a3ee09bc 100644 --- a/cmd/openshift-install/agent.go +++ b/cmd/openshift-install/agent.go @@ -29,6 +29,7 @@ func newAgentCmd(ctx context.Context) *cobra.Command { agentCmd.AddCommand(newAgentCreateCmd(ctx)) agentCmd.AddCommand(agent.NewWaitForCmd()) agentCmd.AddCommand(newAgentGraphCmd()) + agentCmd.AddCommand(agent.NewGatherCmd()) return agentCmd } diff --git a/cmd/openshift-install/agent/gather.go b/cmd/openshift-install/agent/gather.go new file mode 100644 index 00000000000..feaf3c13a0f --- /dev/null +++ b/cmd/openshift-install/agent/gather.go @@ -0,0 +1,104 @@ +package agent + +import ( + "context" + "fmt" + "os" + "time" + + "github.com/sirupsen/logrus" + "github.com/spf13/cobra" + + "github.com/openshift/installer/cmd/openshift-install/command" + agentpkg "github.com/openshift/installer/pkg/agent" + assetstore "github.com/openshift/installer/pkg/asset/store" + "github.com/openshift/installer/pkg/asset/tls" +) + +var agentGatherOpts struct { + sshKeys []string +} + +// NewGatherCmd creates the commands for gathering debug data from an agent-based installation. +func NewGatherCmd() *cobra.Command { + cmd := &cobra.Command{ + Use: "gather", + Short: "Gather debugging data for a failed agent-based installation", + Long: `Gather debugging data for a failed agent-based installation. + +When an agent-based installation fails, this command collects debugging +data from the rendezvous host to help diagnose the issue.`, + RunE: func(cmd *cobra.Command, args []string) error { + return cmd.Help() + }, + } + + cmd.AddCommand(newAgentGatherCmd()) + return cmd +} + +func newAgentGatherCmd() *cobra.Command { + cmd := &cobra.Command{ + Use: "bootstrap", + Short: "Gather debugging data from the rendezvous host", + Args: cobra.ExactArgs(0), + Run: func(_ *cobra.Command, _ []string) { + cleanup := command.SetupFileHook(command.RootOpts.Dir) + defer cleanup() + + bundlePath, err := runAgentGatherCmd(command.RootOpts.Dir) + if err != nil { + logrus.Fatal(err) + } + logrus.Infof("Agent gather logs captured here %q", bundlePath) + }, + } + + cmd.PersistentFlags().StringArrayVar(&agentGatherOpts.sshKeys, "key", []string{}, + "Path to SSH private keys that should be used for authentication. "+ + "If no key was provided, SSH private keys from user's environment will be used") + return cmd +} + +func runAgentGatherCmd(directory string) (string, error) { + ctx := context.TODO() + + store, err := assetstore.NewStore(directory) + if err != nil { + return "", fmt.Errorf("failed to create asset store: %w", err) + } + + rendezvousIP, err := agentpkg.FindRendezvousIPFromAssetStore(store) + if err != nil { + return "", fmt.Errorf("failed to determine rendezvous host: %w", err) + } + logrus.Infof("Rendezvous host IP: %s", rendezvousIP) + + // add the bootstrap SSH key pair to the sshKeys list automatically + bootstrapSSHKeyPair := &tls.BootstrapSSHKeyPair{} + if err := store.Fetch(ctx, bootstrapSSHKeyPair); err != nil { + logrus.Debugf("Failed to fetch bootstrap SSH key pair: %v", err) + } else { + tmpfile, err := os.CreateTemp("", "bootstrap-ssh") + if err != nil { + return "", err + } + defer os.Remove(tmpfile.Name()) + if _, err := tmpfile.Write(bootstrapSSHKeyPair.Private()); err != nil { + return "", err + } + if err := tmpfile.Close(); err != nil { + return "", err + } + agentGatherOpts.sshKeys = append(agentGatherOpts.sshKeys, tmpfile.Name()) + } + + gatherID := time.Now().Format("20060102150405") + + bundlePath, err := agentpkg.PullAgentGatherArchive(rendezvousIP, agentGatherOpts.sshKeys, directory, gatherID) + if err != nil { + return "", fmt.Errorf("failed to gather data from rendezvous host: %w", err) + } + + return bundlePath, nil +} diff --git a/pkg/agent/gather.go b/pkg/agent/gather.go new file mode 100644 index 00000000000..c3be1649259 --- /dev/null +++ b/pkg/agent/gather.go @@ -0,0 +1,47 @@ +package agent + +import ( + "fmt" + "net" + "path" + "path/filepath" + "strconv" + + "github.com/sirupsen/logrus" + + gatherssh "github.com/openshift/installer/pkg/gather/ssh" +) + +// PullAgentGatherArchive SSHs to the rendezvous host and runs the +// agent-gather script, pulling the resulting tar.xz archive to the +// local directory. +func PullAgentGatherArchive(rendezvousIP string, sshKeys []string, directory, gatherID string) (string, error) { + logrus.Info("Pulling agent-gather data from the rendezvous host") + + address := net.JoinHostPort(rendezvousIP, strconv.Itoa(22)) + client, err := gatherssh.NewClient("core", address, sshKeys) + if err != nil { + return "", fmt.Errorf("failed to create SSH client for rendezvous host %s: %w", rendezvousIP, err) + } + + // Run agent-gather with -i so it writes to a predictable path + cmd := fmt.Sprintf("sudo /usr/local/bin/agent-gather -i %s", gatherID) + if err := gatherssh.Run(client, cmd); err != nil { + return "", fmt.Errorf("failed to run agent-gather on rendezvous host %s: %w", rendezvousIP, err) + } + + archiveName := fmt.Sprintf("agent-gather-%s.tar.xz", gatherID) + remoteFile := path.Join("/home/core", archiveName) + localFile := filepath.Join(directory, archiveName) + if err := gatherssh.PullFileTo(client, remoteFile, localFile); err != nil { + return "", fmt.Errorf("failed to pull agent-gather archive: %w", err) + } + + absPath, err := filepath.Abs(localFile) + if err != nil { + return "", fmt.Errorf("failed to get absolute path: %w", err) + } + + logrus.Info("Successfully pulled agent-gather data") + return absPath, nil +}