diff --git a/pkg/httpserver/server.go b/pkg/httpserver/server.go index 395521021..79b5da891 100644 --- a/pkg/httpserver/server.go +++ b/pkg/httpserver/server.go @@ -22,7 +22,7 @@ import ( const ( _defaultReadTimeout = 15 * time.Second - _defaultWriteTimeout = 15 * time.Second + _defaultWriteTimeout = 60 * time.Second _defaultAddr = ":80" _defaultShutdownTimeout = 3 * time.Second diff --git a/redfish/internal/controller/http/v1/handler/systems.go b/redfish/internal/controller/http/v1/handler/systems.go index cc4e1cb83..6142a6c3f 100644 --- a/redfish/internal/controller/http/v1/handler/systems.go +++ b/redfish/internal/controller/http/v1/handler/systems.go @@ -100,9 +100,7 @@ func (s *RedfishServer) handleGetSystemError(c *gin.Context, err error, systemID NotFoundError(c, "System", systemID) default: if s.Logger != nil { - s.Logger.Error("Failed to retrieve computer system", - "systemID", systemID, - "error", err) + s.Logger.Error("Failed to retrieve computer system: systemID=%s error=%v", systemID, err) } InternalServerError(c, err) @@ -118,7 +116,7 @@ func (s *RedfishServer) GetRedfishV1Systems(c *gin.Context) { systemIDs, err := s.ComputerSystemUC.GetAll(ctx) if err != nil { if s.Logger != nil { - s.Logger.Error("Failed to retrieve computer systems collection", "error", err) + s.Logger.Error("Failed to retrieve computer systems collection: error=%v", err) } InternalServerError(c, err) diff --git a/redfish/internal/controller/http/v1/handler/systems_test.go b/redfish/internal/controller/http/v1/handler/systems_test.go index 1e5765f4d..17f6382a2 100644 --- a/redfish/internal/controller/http/v1/handler/systems_test.go +++ b/redfish/internal/controller/http/v1/handler/systems_test.go @@ -943,7 +943,7 @@ func TestSystemsHandler_GetSystemsCollection_WithLogger(t *testing.T) { assert.Equal(t, http.StatusInternalServerError, w.Code) // Verify logger was called assert.Len(t, testLogger.ErrorCalls, 1) - assert.Equal(t, "Failed to retrieve computer systems collection", testLogger.ErrorCalls[0][0]) + assert.Equal(t, "Failed to retrieve computer systems collection: error=%v", testLogger.ErrorCalls[0][0]) } // TestSystemsHandler_GetSystemByID_WithLogger tests logging paths @@ -973,7 +973,7 @@ func TestSystemsHandler_GetSystemByID_WithLogger(t *testing.T) { assert.Equal(t, http.StatusInternalServerError, w.Code) // Verify logger was called assert.Len(t, testLogger.ErrorCalls, 1) - assert.Equal(t, "Failed to retrieve computer system", testLogger.ErrorCalls[0][0]) + assert.Equal(t, "Failed to retrieve computer system: systemID=%s error=%v", testLogger.ErrorCalls[0][0]) } // createTestSystemEntityDataWithMemory creates a test system entity with MemorySummary diff --git a/redfish/internal/usecase/computer_system.go b/redfish/internal/usecase/computer_system.go index 9ceb4314e..f5ab6550a 100644 --- a/redfish/internal/usecase/computer_system.go +++ b/redfish/internal/usecase/computer_system.go @@ -58,6 +58,9 @@ const ( // Default system type. DefaultSystemType = "Physical" + + // WSMAN boot settings call timeout for system GET responses. + bootSettingsTimeout = 2 * time.Second ) // Resource Health constants. @@ -124,6 +127,8 @@ func (uc *ComputerSystemUseCase) GetAll(ctx context.Context) ([]string, error) { } // GetComputerSystem retrieves a ComputerSystem by its systemID and converts it to the generated API type. +// +//nolint:funlen // This method assembles many optional Redfish fields and handles best-effort fallbacks in one flow. func (uc *ComputerSystemUseCase) GetComputerSystem(ctx context.Context, systemID string) (*generated.ComputerSystemComputerSystem, error) { // Get device information from repository - this gives us basic device data system, err := uc.Repo.GetByID(ctx, systemID) @@ -186,11 +191,17 @@ func (uc *ComputerSystemUseCase) GetComputerSystem(ctx context.Context, systemID } } - // Fetch boot settings - boot, err := uc.Repo.GetBootSettings(ctx, systemID) - if err != nil { - // Log error but don't fail the entire request - boot settings may not be available - boot = nil + var boot *generated.ComputerSystemBoot + + if system.PowerState == redfishv1.PowerStateOn { + // Fetch boot settings from WSMAN with a bounded timeout to avoid long-tail latency. + bootCtx, bootCancel := context.WithTimeout(ctx, bootSettingsTimeout) + defer bootCancel() + + boot, err = uc.Repo.GetBootSettings(bootCtx, systemID) + if err != nil { + boot = nil + } } // Create Actions for this system using the generated Actions type actions := uc.createActionsStruct(systemID) diff --git a/redfish/internal/usecase/wsman_repo.go b/redfish/internal/usecase/wsman_repo.go index 1c31df442..f50506117 100644 --- a/redfish/internal/usecase/wsman_repo.go +++ b/redfish/internal/usecase/wsman_repo.go @@ -3,11 +3,14 @@ package usecase import ( "context" + "encoding/json" "errors" "fmt" "reflect" "strconv" "strings" + "sync" + "time" amtBoot "github.com/device-management-toolkit/go-wsman-messages/v2/pkg/wsman/amt/boot" "github.com/device-management-toolkit/go-wsman-messages/v2/pkg/wsman/amt/setupandconfiguration" @@ -109,6 +112,12 @@ const ( // KVM consent code constraints. consentCodeDigits = 6 + + // Parallel call and timeout constants for system aggregation. + parallelCallCount = 2 + wsmanCallTimeout = 8 * time.Second + consoleBudgetMinimum = 1500 * time.Millisecond + controlModeTimeout = 2 * time.Second ) var ( @@ -304,7 +313,9 @@ func (r *WsmanComputerSystemRepo) getCIMProperties(ctx context.Context, systemID // Get hardware info only once to avoid multiple WSMAN calls hwInfo, err := r.usecase.GetHardwareInfo(ctx, systemID) if err != nil { - r.log.Error("Failed to get hardware info", "systemID", systemID, "error", err) + if !isContextTimeoutOrCancelError(err) { + r.log.Error("Failed to get hardware info: systemID=%s error=%v", systemID, err) + } return results, hwInfo, err } @@ -876,8 +887,10 @@ func (r *WsmanComputerSystemRepo) GetAll(ctx context.Context) ([]string, error) } // GetByID retrieves a computer system by its ID from the WSMAN backend. +// +//nolint:gocognit,gocyclo,cyclop,funlen // Parallel WSMAN fan-out with partial-response fallback requires branching for resilience. func (r *WsmanComputerSystemRepo) GetByID(ctx context.Context, systemID string) (*redfishv1.ComputerSystem, error) { - // Verify device exists first + // Verify device exists first (DB lookup — fast, no WSMAN) device, err := r.usecase.GetByID(ctx, systemID, "", true) if r.isDeviceNotFoundError(err) { return nil, ErrSystemNotFound @@ -891,38 +904,117 @@ func (r *WsmanComputerSystemRepo) GetByID(ctx context.Context, systemID string) return nil, ErrSystemNotFound } - // Get power state from devices use case - powerState, err := r.usecase.GetPowerState(ctx, systemID) - if r.isDeviceNotFoundError(err) { + // Bound each WSMAN sub-call to keep end-to-end GET latency below the API deadline. + // wsmanCallTimeout allows for connection establishment + WS-MAN operation + network latency. + + var ( + redfishPowerState = redfishv1.PowerStateOff + cimData map[string]interface{} + hwInfo dto.HardwareInfo + powerErr error + cimErr error + ) + + var wg sync.WaitGroup + wg.Add(parallelCallCount) + + go func() { + defer wg.Done() + + powerCtx, powerCancel := context.WithTimeout(ctx, wsmanCallTimeout) + defer powerCancel() + + powerState, err := r.usecase.GetPowerState(powerCtx, systemID) + if err != nil { + powerErr = err + + return + } + + redfishPowerState = r.mapCIMPowerStateToRedfish(powerState.PowerState) + }() + + go func() { + defer wg.Done() + + cimCtx, cimCancel := context.WithTimeout(ctx, wsmanCallTimeout) + defer cimCancel() + + cimData, hwInfo, cimErr = r.getCIMProperties(cimCtx, systemID, allCIMConfigs) + }() + + wg.Wait() + + if r.isDeviceNotFoundError(powerErr) || r.isDeviceNotFoundError(cimErr) { return nil, ErrSystemNotFound } - if err != nil { - return nil, err + if powerErr != nil { + if !isContextTimeoutOrCancelError(powerErr) { + r.log.Warn("Failed to get power state, assuming off: systemID=%s error=%v", systemID, powerErr) + } } - // Map the integer power state to Redfish PowerState - redfishPowerState := r.mapCIMPowerStateToRedfish(powerState.PowerState) + if cimErr != nil { + // Build with empty CIM data to ensure we return a valid ComputerSystem + // instead of nil, providing graceful degradation + if ctx.Err() == nil && !isContextTimeoutOrCancelError(cimErr) { + r.log.Warn("Failed to retrieve CIM properties, building with partial data: systemID=%s error=%v", systemID, cimErr) + } - // Extract CIM data using the global configuration with static transformers - cimData, hwInfo, err := r.getCIMProperties(ctx, systemID, allCIMConfigs) - if err != nil { - return nil, err + cimData = make(map[string]interface{}) } - // Build and return the complete ComputerSystem using CIM data and hardware info + // Build and return the ComputerSystem using available CIM data and hardware info + // If CIM failed, cimData is empty but buildComputerSystemFromCIMData handles this gracefully system := r.buildComputerSystemFromCIMData(systemID, redfishPowerState, cimData, hwInfo) - // Fetch GraphicalConsole data best-effort, but avoid returning guessed values - // when feature retrieval fails. - _, featuresV2, err := r.usecase.GetFeatures(ctx, systemID) - if err != nil { - r.log.Warn("Failed to retrieve KVM features for GraphicalConsole", "systemID", systemID, "error", err) - + // Optional console enrichment should not run if the request is close to timing out. + if !hasSufficientTimeBudget(ctx, consoleBudgetMinimum) { return system, nil } - controlMode := r.getAMTControlMode(ctx, systemID) + var ( + featuresV2 dtov2.Features + featuresErr error + controlMode string + ) + + wg = sync.WaitGroup{} + wg.Add(parallelCallCount) + + go func() { + defer wg.Done() + + featCtx, featCancel := context.WithTimeout(ctx, wsmanCallTimeout) + defer featCancel() + + _, featuresV2, featuresErr = r.usecase.GetFeatures(featCtx, systemID) + }() + + go func() { + defer wg.Done() + + modeCtx, modeCancel := context.WithTimeout(ctx, controlModeTimeout) + defer modeCancel() + + controlMode = r.getAMTControlMode(modeCtx, systemID) + }() + + wg.Wait() + + if featuresErr != nil { + // Fall back to DB-persisted features from last sync + if cached, ok := getCachedFeaturesFromDevice(device); ok { + featuresV2 = cached + } else if ctx.Err() == nil && !isContextTimeoutOrCancelError(featuresErr) { + r.log.Warn("Failed to retrieve KVM features for GraphicalConsole: systemID=%s error=%v", systemID, featuresErr) + } + } + + if controlMode == "" { + controlMode = getCachedControlModeFromDevice(device) + } system.GraphicalConsole = r.buildGraphicalConsole(device.UseTLS, featuresV2, controlMode) system.SerialConsole = r.buildSerialConsole(systemID, featuresV2, controlMode) @@ -930,6 +1022,89 @@ func (r *WsmanComputerSystemRepo) GetByID(ctx context.Context, systemID string) return system, nil } +func getCachedFeaturesFromDevice(device *dto.Device) (dtov2.Features, bool) { + if device == nil || device.DeviceInfo == nil { + return dtov2.Features{}, false + } + + raw := strings.TrimSpace(device.DeviceInfo.Features) + if raw == "" { + return dtov2.Features{}, false + } + + var featuresV2 dtov2.Features + if err := json.Unmarshal([]byte(raw), &featuresV2); err == nil { + return featuresV2, true + } + + var featuresV1 dto.Features + if err := json.Unmarshal([]byte(raw), &featuresV1); err != nil { + return dtov2.Features{}, false + } + + featuresV2 = dtov2.Features{ + UserConsent: featuresV1.UserConsent, + EnableSOL: featuresV1.EnableSOL, + EnableIDER: featuresV1.EnableIDER, + EnableKVM: featuresV1.EnableKVM, + Redirection: featuresV1.Redirection, + OptInState: featuresV1.OptInState, + KVMAvailable: featuresV1.KVMAvailable, + OCR: featuresV1.OCR, + HTTPSBootSupported: featuresV1.HTTPSBootSupported, + WinREBootSupported: featuresV1.WinREBootSupported, + LocalPBABootSupported: featuresV1.LocalPBABootSupported, + } + + return featuresV2, true +} + +func getCachedControlModeFromDevice(device *dto.Device) string { + if device == nil || device.DeviceInfo == nil { + return "" + } + + mode := strings.TrimSpace(device.DeviceInfo.CurrentMode) + if mode == "" { + return "" + } + + if strings.Contains(strings.ToLower(mode), "admin") { + return controlModeACM + } + + if strings.Contains(strings.ToLower(mode), "client") { + return controlModeCCM + } + + if strings.EqualFold(mode, controlModeACM) || strings.EqualFold(mode, controlModeCCM) { + return strings.ToUpper(mode) + } + + return "" +} + +func isContextTimeoutOrCancelError(err error) bool { + if err == nil { + return false + } + + errText := strings.ToLower(err.Error()) + + return strings.Contains(errText, "context deadline exceeded") || + strings.Contains(errText, "context canceled") || + strings.Contains(errText, "ctx.done") +} + +func hasSufficientTimeBudget(ctx context.Context, minBudget time.Duration) bool { + deadline, ok := ctx.Deadline() + if !ok { + return true + } + + return time.Until(deadline) > minBudget +} + func (r *WsmanComputerSystemRepo) buildGraphicalConsole(useTLS bool, featuresV2 dtov2.Features, controlMode string) *redfishv1.ComputerSystemHostGraphicalConsole { serviceEnabled := featuresV2.EnableKVM @@ -1117,7 +1292,11 @@ func determineSOLStatus(enableSOL, solAvailable bool, userConsent string, optInS func (r *WsmanComputerSystemRepo) getAMTControlMode(ctx context.Context, systemID string) string { version, _, err := r.usecase.GetVersion(ctx, systemID) if err != nil { - r.log.Warn("Failed to retrieve AMT version for control mode", "systemID", systemID, "error", err) + if ctx.Err() != nil { + return "" + } + + r.log.Warn("Failed to retrieve AMT version for control mode: systemID=%s error=%v", systemID, err) return "" } @@ -1261,21 +1440,6 @@ func (r *WsmanComputerSystemRepo) CancelKVMConsent(ctx context.Context, systemID // UpdatePowerState sends a power action command to the specified system via WSMAN. func (r *WsmanComputerSystemRepo) UpdatePowerState(ctx context.Context, systemID string, resetType redfishv1.PowerState) error { - // Get the current power state for logging and validation - currentSystem, err := r.GetByID(ctx, systemID) - if err != nil { - return err - } - - // For certain reset types like PowerCycle and ForceRestart, we don't check current state - // because they are valid operations regardless of current power state - if resetType != redfishv1.ResetTypePowerCycle && resetType != redfishv1.ResetTypeForceRestart { - // Check if the requested state matches the current state - if currentSystem.PowerState == resetType { - return ErrPowerStateConflict - } - } - // Map Redfish reset type to WSMAN action action, err := r.mapRedfishPowerStateToAction(resetType) if err != nil { @@ -1300,7 +1464,9 @@ func (r *WsmanComputerSystemRepo) GetBootSettings(ctx context.Context, systemID return nil, ErrSystemNotFound } - r.log.Warn("Failed to get boot data from device", "systemID", systemID, "error", err) + if ctx.Err() == nil { + r.log.Warn("Failed to get boot data from device: systemID=%s error=%v", systemID, err) + } return nil, ErrBootSettingsNotAvailable }