diff --git a/cmd/openshift-install/agent.go b/cmd/openshift-install/agent.go index fb19102dd2b..5d8a3ee09bc 100644 --- a/cmd/openshift-install/agent.go +++ b/cmd/openshift-install/agent.go @@ -29,6 +29,7 @@ func newAgentCmd(ctx context.Context) *cobra.Command { agentCmd.AddCommand(newAgentCreateCmd(ctx)) agentCmd.AddCommand(agent.NewWaitForCmd()) agentCmd.AddCommand(newAgentGraphCmd()) + agentCmd.AddCommand(agent.NewGatherCmd()) return agentCmd } diff --git a/cmd/openshift-install/agent/gather.go b/cmd/openshift-install/agent/gather.go new file mode 100644 index 00000000000..feaf3c13a0f --- /dev/null +++ b/cmd/openshift-install/agent/gather.go @@ -0,0 +1,104 @@ +package agent + +import ( + "context" + "fmt" + "os" + "time" + + "github.com/sirupsen/logrus" + "github.com/spf13/cobra" + + "github.com/openshift/installer/cmd/openshift-install/command" + agentpkg "github.com/openshift/installer/pkg/agent" + assetstore "github.com/openshift/installer/pkg/asset/store" + "github.com/openshift/installer/pkg/asset/tls" +) + +var agentGatherOpts struct { + sshKeys []string +} + +// NewGatherCmd creates the commands for gathering debug data from an agent-based installation. +func NewGatherCmd() *cobra.Command { + cmd := &cobra.Command{ + Use: "gather", + Short: "Gather debugging data for a failed agent-based installation", + Long: `Gather debugging data for a failed agent-based installation. + +When an agent-based installation fails, this command collects debugging +data from the rendezvous host to help diagnose the issue.`, + RunE: func(cmd *cobra.Command, args []string) error { + return cmd.Help() + }, + } + + cmd.AddCommand(newAgentGatherCmd()) + return cmd +} + +func newAgentGatherCmd() *cobra.Command { + cmd := &cobra.Command{ + Use: "bootstrap", + Short: "Gather debugging data from the rendezvous host", + Args: cobra.ExactArgs(0), + Run: func(_ *cobra.Command, _ []string) { + cleanup := command.SetupFileHook(command.RootOpts.Dir) + defer cleanup() + + bundlePath, err := runAgentGatherCmd(command.RootOpts.Dir) + if err != nil { + logrus.Fatal(err) + } + logrus.Infof("Agent gather logs captured here %q", bundlePath) + }, + } + + cmd.PersistentFlags().StringArrayVar(&agentGatherOpts.sshKeys, "key", []string{}, + "Path to SSH private keys that should be used for authentication. "+ + "If no key was provided, SSH private keys from user's environment will be used") + return cmd +} + +func runAgentGatherCmd(directory string) (string, error) { + ctx := context.TODO() + + store, err := assetstore.NewStore(directory) + if err != nil { + return "", fmt.Errorf("failed to create asset store: %w", err) + } + + rendezvousIP, err := agentpkg.FindRendezvousIPFromAssetStore(store) + if err != nil { + return "", fmt.Errorf("failed to determine rendezvous host: %w", err) + } + logrus.Infof("Rendezvous host IP: %s", rendezvousIP) + + // add the bootstrap SSH key pair to the sshKeys list automatically + bootstrapSSHKeyPair := &tls.BootstrapSSHKeyPair{} + if err := store.Fetch(ctx, bootstrapSSHKeyPair); err != nil { + logrus.Debugf("Failed to fetch bootstrap SSH key pair: %v", err) + } else { + tmpfile, err := os.CreateTemp("", "bootstrap-ssh") + if err != nil { + return "", err + } + defer os.Remove(tmpfile.Name()) + if _, err := tmpfile.Write(bootstrapSSHKeyPair.Private()); err != nil { + return "", err + } + if err := tmpfile.Close(); err != nil { + return "", err + } + agentGatherOpts.sshKeys = append(agentGatherOpts.sshKeys, tmpfile.Name()) + } + + gatherID := time.Now().Format("20060102150405") + + bundlePath, err := agentpkg.PullAgentGatherArchive(rendezvousIP, agentGatherOpts.sshKeys, directory, gatherID) + if err != nil { + return "", fmt.Errorf("failed to gather data from rendezvous host: %w", err) + } + + return bundlePath, nil +} diff --git a/cmd/openshift-install/agent/waitfor.go b/cmd/openshift-install/agent/waitfor.go index 05c128d2886..dc0119ff16e 100644 --- a/cmd/openshift-install/agent/waitfor.go +++ b/cmd/openshift-install/agent/waitfor.go @@ -67,13 +67,13 @@ func newWaitForBootstrapCompleteCmd() *cobra.Command { logrus.Fatal(err) } - rendezvousIP, sshKey, err := agentpkg.FindRendezvousIPAndSSHKeyFromAssetStore(assetStore) + rendezvousIP, err := agentpkg.FindRendezvousIPFromAssetStore(assetStore) if err != nil { logrus.Fatal(err) } ctx := context.Background() - cluster, err := agentpkg.NewCluster(ctx, assetStore, rendezvousIP, kubeconfigPath, sshKey, workflow.AgentWorkflowTypeInstall) + cluster, err := agentpkg.NewCluster(ctx, assetStore, rendezvousIP, kubeconfigPath, workflow.AgentWorkflowTypeInstall) if err != nil { logrus.Exit(command.ExitCodeBootstrapFailed) } @@ -107,13 +107,13 @@ func newWaitForInstallCompleteCmd() *cobra.Command { logrus.Fatal(err) } - rendezvousIP, sshKey, err := agentpkg.FindRendezvousIPAndSSHKeyFromAssetStore(assetStore) + rendezvousIP, err := agentpkg.FindRendezvousIPFromAssetStore(assetStore) if err != nil { logrus.Fatal(err) } ctx := context.Background() - cluster, err := agentpkg.NewCluster(ctx, assetStore, rendezvousIP, kubeconfigPath, sshKey, workflow.AgentWorkflowTypeInstall) + cluster, err := agentpkg.NewCluster(ctx, assetStore, rendezvousIP, kubeconfigPath, workflow.AgentWorkflowTypeInstall) if err != nil { logrus.Exit(command.ExitCodeBootstrapFailed) } diff --git a/data/data/agent/files/usr/local/bin/agent-gather b/data/data/agent/files/usr/local/bin/agent-gather index 444c760712d..81e48f9bef7 100644 --- a/data/data/agent/files/usr/local/bin/agent-gather +++ b/data/data/agent/files/usr/local/bin/agent-gather @@ -123,7 +123,7 @@ function gather_bootstrap_status() { ( >&2 echo -n ".") if [ -f /usr/local/bin/installer-gather.sh ]; then local bs_gather="${ARTIFACTS_DIR}/bootstrap-gather.tar.gz" - TAR_FILE="${bs_gather}" /usr/local/bin/installer-gather.sh --id "$(date '+%Y%m%d%H%M%S')" >/dev/null + TAR_FILE="${bs_gather}" /usr/local/bin/installer-gather.sh --id "${GATHER_ID}" >/dev/null ( >&2 echo -n ".") [ -f "${bs_gather}" ] && gunzip "${bs_gather}" fi @@ -134,19 +134,24 @@ function Help() { echo "Gathers the necessary data for troubleshooting OpenShift's agent based installation" echo - echo "Syntax: agent-gather [-h|-v]" + echo "Syntax: agent-gather [-h|-i ID|-v|-O]" echo "options:" - echo "-h Print this help" - echo "-O Output the compressed content to stdout" + echo "-h Print this help" + echo "-i Set the gather ID (determines output filename)" + echo "-O Output the compressed content to stdout" echo "-v Set verbose mode" echo } -while getopts ":hvO" option; do +GATHER_ID="$(date '+%Y%m%d%H%M%S')" + +while getopts ":hvOi:" option; do case $option in h) Help exit;; + i) + GATHER_ID="$OPTARG";; v) set -xv;; O) @@ -179,7 +184,7 @@ gather_bootstrap_status find "$ARTIFACTS_DIR" -type d -exec chmod a+rwx "{}" \; find "$ARTIFACTS_DIR" -type f -exec chmod a+rw "{}" \; -OUTPUT_FILE="./agent-gather-$(date +%Y%m%d-%H%M%S%Z).tar.xz" +OUTPUT_FILE="./agent-gather-${GATHER_ID}.tar.xz" ( >&2 echo "Compressing gathered data to $OUTPUT_FILE" ) if [[ "$STDOUT" == "1" ]]; then diff --git a/pkg/agent/cluster.go b/pkg/agent/cluster.go index 0148a2e81e8..5758606659e 100644 --- a/pkg/agent/cluster.go +++ b/pkg/agent/cluster.go @@ -59,7 +59,7 @@ type clusterInstallStatusHistory struct { } // NewCluster initializes a Cluster object -func NewCluster(ctx context.Context, assetStore asset.Store, rendezvousIP, kubeconfigPath, sshKey string, workflowType workflow.AgentWorkflowType) (*Cluster, error) { +func NewCluster(ctx context.Context, assetStore asset.Store, rendezvousIP, kubeconfigPath string, workflowType workflow.AgentWorkflowType) (*Cluster, error) { czero := &Cluster{} capi := &clientSet{} @@ -81,7 +81,7 @@ func NewCluster(ctx context.Context, assetStore asset.Store, rendezvousIP, kubec return nil, fmt.Errorf("AgentWorkflowType value not supported: %s", workflowType) } - restclient := NewNodeZeroRestClient(ctx, rendezvousIP, sshKey, watcherAuthToken) + restclient := NewNodeZeroRestClient(ctx, rendezvousIP, watcherAuthToken) kubeclient, err := NewClusterKubeAPIClient(ctx, kubeconfigPath) if err != nil { diff --git a/pkg/agent/gather.go b/pkg/agent/gather.go new file mode 100644 index 00000000000..c3be1649259 --- /dev/null +++ b/pkg/agent/gather.go @@ -0,0 +1,47 @@ +package agent + +import ( + "fmt" + "net" + "path" + "path/filepath" + "strconv" + + "github.com/sirupsen/logrus" + + gatherssh "github.com/openshift/installer/pkg/gather/ssh" +) + +// PullAgentGatherArchive SSHs to the rendezvous host and runs the +// agent-gather script, pulling the resulting tar.xz archive to the +// local directory. +func PullAgentGatherArchive(rendezvousIP string, sshKeys []string, directory, gatherID string) (string, error) { + logrus.Info("Pulling agent-gather data from the rendezvous host") + + address := net.JoinHostPort(rendezvousIP, strconv.Itoa(22)) + client, err := gatherssh.NewClient("core", address, sshKeys) + if err != nil { + return "", fmt.Errorf("failed to create SSH client for rendezvous host %s: %w", rendezvousIP, err) + } + + // Run agent-gather with -i so it writes to a predictable path + cmd := fmt.Sprintf("sudo /usr/local/bin/agent-gather -i %s", gatherID) + if err := gatherssh.Run(client, cmd); err != nil { + return "", fmt.Errorf("failed to run agent-gather on rendezvous host %s: %w", rendezvousIP, err) + } + + archiveName := fmt.Sprintf("agent-gather-%s.tar.xz", gatherID) + remoteFile := path.Join("/home/core", archiveName) + localFile := filepath.Join(directory, archiveName) + if err := gatherssh.PullFileTo(client, remoteFile, localFile); err != nil { + return "", fmt.Errorf("failed to pull agent-gather archive: %w", err) + } + + absPath, err := filepath.Abs(localFile) + if err != nil { + return "", fmt.Errorf("failed to get absolute path: %w", err) + } + + logrus.Info("Successfully pulled agent-gather data") + return absPath, nil +} diff --git a/pkg/agent/rest.go b/pkg/agent/rest.go index 277474e1426..535be614e3e 100644 --- a/pkg/agent/rest.go +++ b/pkg/agent/rest.go @@ -19,7 +19,6 @@ import ( "github.com/openshift/installer/pkg/asset/agent/gencrypto" "github.com/openshift/installer/pkg/asset/agent/image" "github.com/openshift/installer/pkg/asset/agent/manifests" - "github.com/openshift/installer/pkg/asset/installconfig" "github.com/openshift/installer/pkg/types/agent" ) @@ -29,18 +28,12 @@ type NodeZeroRestClient struct { ctx context.Context config client.Config NodeZeroIP string - NodeSSHKey []string } // NewNodeZeroRestClient Initialize a new rest client to interact with the Agent Rest API on node zero. -func NewNodeZeroRestClient(ctx context.Context, rendezvousIP, sshKey, watcherAuthToken string) *NodeZeroRestClient { +func NewNodeZeroRestClient(ctx context.Context, rendezvousIP, watcherAuthToken string) *NodeZeroRestClient { restClient := &NodeZeroRestClient{} - // Get SSH Keys which can be used to determine if Rest API failures are due to network connectivity issues - if sshKey != "" { - restClient.NodeSSHKey = append(restClient.NodeSSHKey, sshKey) - } - config := client.Config{} config.URL = &url.URL{ Scheme: "http", @@ -60,16 +53,14 @@ func NewNodeZeroRestClient(ctx context.Context, rendezvousIP, sshKey, watcherAut return restClient } -// FindRendezvousIPAndSSHKeyFromAssetStore returns the rendezvousIP and public ssh key. -func FindRendezvousIPAndSSHKeyFromAssetStore(assetStore asset.Store) (string, string, error) { +// FindRendezvousIPFromAssetStore returns the rendezvous IP of the agent cluster. +func FindRendezvousIPFromAssetStore(assetStore asset.Store) (string, error) { agentConfigAsset := &agentconfig.AgentConfig{} agentManifestsAsset := &manifests.AgentManifests{} - installConfigAsset := &installconfig.InstallConfig{} agentHostsAsset := &agentconfig.AgentHosts{} agentConfig, agentConfigError := assetStore.Load(agentConfigAsset) agentManifests, manifestError := assetStore.Load(agentManifestsAsset) - installConfig, installConfigError := assetStore.Load(installConfigAsset) agentHosts, agentHostsError := assetStore.Load(agentHostsAsset) if agentConfigError != nil { @@ -78,14 +69,11 @@ func FindRendezvousIPAndSSHKeyFromAssetStore(assetStore asset.Store) (string, st if manifestError != nil { logrus.Debug(errors.Wrapf(manifestError, "failed to load %s", agentManifestsAsset.Name())) } - if installConfigError != nil { - logrus.Debug(errors.Wrapf(installConfigError, "failed to load %s", installConfigAsset.Name())) - } if agentHostsError != nil { - logrus.Debug(errors.Wrapf(agentConfigError, "failed to load %s", agentHostsAsset.Name())) + logrus.Debug(errors.Wrapf(agentHostsError, "failed to load %s", agentHostsAsset.Name())) } - if agentConfigError != nil || manifestError != nil || installConfigError != nil || agentHostsError != nil { - return "", "", errors.New("failed to load AgentConfig, NMStateConfig, InstallConfig, or AgentHosts") + if agentConfigError != nil || manifestError != nil || agentHostsError != nil { + return "", errors.New("failed to load AgentConfig, NMStateConfig, or AgentHosts") } var rendezvousIP string @@ -99,19 +87,13 @@ func FindRendezvousIPAndSSHKeyFromAssetStore(assetStore asset.Store) (string, st } else if agentConfig != nil && agentManifests == nil { rendezvousIP, rendezvousIPError = image.RetrieveRendezvousIP(agentConfig.(*agentconfig.AgentConfig).Config, agentHosts.(*agentconfig.AgentHosts).Hosts, emptyNMStateConfigs) } else { - return "", "", errors.New("both AgentConfig and NMStateConfig are empty") + return "", errors.New("both AgentConfig and NMStateConfig are empty") } if rendezvousIPError != nil { - return "", "", rendezvousIPError - } - - var sshKey string - // Get SSH Keys which can be used to determine if Rest API failures are due to network connectivity issues - if installConfig != nil { - sshKey = installConfig.(*installconfig.InstallConfig).Config.SSHKey + return "", rendezvousIPError } - return rendezvousIP, sshKey, nil + return rendezvousIP, nil } // FindAuthTokenFromAssetStore returns the auth token from asset store. diff --git a/pkg/asset/agent/image/ignition.go b/pkg/asset/agent/image/ignition.go index 1727b672636..19660515fa3 100644 --- a/pkg/asset/agent/image/ignition.go +++ b/pkg/asset/agent/image/ignition.go @@ -108,6 +108,7 @@ func (a *Ignition) Dependencies() []asset.Asset { &tls.KubeAPIServerLocalhostSignerCertKey{}, &tls.KubeAPIServerServiceNetworkSignerCertKey{}, &tls.AdminKubeConfigSignerCertKey{}, + &tls.BootstrapSSHKeyPair{}, &password.KubeadminPassword{}, &agentconfig.AgentConfig{}, &agentconfig.AgentHosts{}, @@ -136,7 +137,8 @@ func (a *Ignition) Generate(ctx context.Context, dependencies asset.Parents) err } pwd := &password.KubeadminPassword{} - dependencies.Get(pwd) + bootstrapSSHKeyPair := &tls.BootstrapSSHKeyPair{} + dependencies.Get(pwd, bootstrapSSHKeyPair) pwdHash := string(pwd.PasswordHash) infraEnv := agentManifests.InfraEnv @@ -151,6 +153,7 @@ func (a *Ignition) Generate(ctx context.Context, dependencies asset.Parents) err Name: "core", SSHAuthorizedKeys: []igntypes.SSHAuthorizedKey{ igntypes.SSHAuthorizedKey(infraEnv.Spec.SSHAuthorizedKey), + igntypes.SSHAuthorizedKey(string(bootstrapSSHKeyPair.Public())), }, PasswordHash: &pwdHash, }, diff --git a/pkg/asset/agent/image/ignition_test.go b/pkg/asset/agent/image/ignition_test.go index a6534245d09..eb1b1ab5cde 100644 --- a/pkg/asset/agent/image/ignition_test.go +++ b/pkg/asset/agent/image/ignition_test.go @@ -654,6 +654,11 @@ logdir /var/log/chrony`, assertExpectedFiles(t, ignitionAsset.Config, tc.expectedFiles, tc.expectedFileContent) assertServiceEnabled(t, ignitionAsset.Config, tc.serviceEnabledMap) + + assert.Len(t, ignitionAsset.Config.Passwd.Users, 1) + assert.Equal(t, "core", ignitionAsset.Config.Passwd.Users[0].Name) + assert.Contains(t, ignitionAsset.Config.Passwd.Users[0].SSHAuthorizedKeys, igntypes.SSHAuthorizedKey("my-ssh-key")) + assert.Contains(t, ignitionAsset.Config.Passwd.Users[0].SSHAuthorizedKeys, igntypes.SSHAuthorizedKey("test-bootstrap-ssh-key\n")) } }) } @@ -801,6 +806,7 @@ func buildIgnitionAssetDefaultDependencies(t *testing.T) []asset.Asset { &tls.KubeAPIServerServiceNetworkSignerCertKey{}, &tls.AdminKubeConfigSignerCertKey{}, &tls.AdminKubeConfigClientCertKey{}, + &tls.BootstrapSSHKeyPair{Pub: []byte("test-bootstrap-ssh-key\n")}, &gencrypto.AuthConfig{}, &common.InfraEnvID{}, &agentcommon.OptionalInstallConfig{}, diff --git a/pkg/nodejoiner/monitoraddnodes.go b/pkg/nodejoiner/monitoraddnodes.go index 2ff28457091..116edcaaff2 100644 --- a/pkg/nodejoiner/monitoraddnodes.go +++ b/pkg/nodejoiner/monitoraddnodes.go @@ -20,13 +20,10 @@ func NewMonitorAddNodesCommand(directory, kubeconfigPath string, ips []string) e return err } - // sshKey is not required parameter for monitor-add-nodes - sshKey := "" - clusters := []*agentpkg.Cluster{} ctx := context.Background() for _, ip := range ips { - cluster, err := agentpkg.NewCluster(ctx, assetStore, ip, kubeconfigPath, sshKey, workflow.AgentWorkflowTypeAddNodes) + cluster, err := agentpkg.NewCluster(ctx, assetStore, ip, kubeconfigPath, workflow.AgentWorkflowTypeAddNodes) if err != nil { return err }