Skip to content

Commit 886e908

Browse files
refactor: enhance engine recovery logic by introducing conditional checks for health status before attempting recovery
1 parent c87e179 commit 886e908

2 files changed

Lines changed: 33 additions & 2 deletions

File tree

dashboard/lib/providers/engine_provider.dart

Lines changed: 18 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -308,7 +308,24 @@ class EngineProvider extends ChangeNotifier {
308308
DateTime.now().difference(lastKill) < _bundledEngineKillCooldown) {
309309
return;
310310
}
311-
unawaited(_recoverStalledLiveChannel());
311+
unawaited(_maybeRecoverStalledLiveChannel());
312+
}
313+
314+
/// Runs stalled-live-channel recovery only when HTTP health reports the engine
/// as up.
///
/// Missing live frames count as a hung engine only while the health endpoint
/// still says the engine is running. When the process was killed externally,
/// health is down and the normal reconnect path should handle it instead of
/// [recoverOwnedAfterStall], which uses [forceRestart] and can burn through
/// the launcher's restart budget.
Future<void> _maybeRecoverStalledLiveChannel() async {
  // A recovery is already in flight; do not start another.
  if (_recoveringLive) return;

  try {
    final health = await _service.getHealth();
    final engineUp = health != null && health['engine'] == true;
    if (!engineUp) return;
  } catch (_) {
    // Best-effort probe: any failure is treated the same as "engine down",
    // so stall recovery is skipped.
    return;
  }

  await _recoverStalledLiveChannel();
}
313330

314331
/// HTTP/WS up but no live telemetry (hung orchestrator): restart bundled engine

dashboard/lib/services/engine_bundled_launcher.dart

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -93,7 +93,9 @@ class EngineBundledLauncher {
9393
// installer, or a different launch path). If HTTP health succeeds, align disk and
9494
// connect — do not spawn a second engine. Stall recovery uses [forceRestart] and
9595
// skips this path.
96-
if (!forceRestart && await _strictHealth(cfg.host, cfg.port)) {
96+
final httpHealthy =
97+
!forceRestart && await _strictHealth(cfg.host, cfg.port);
98+
if (httpHealthy) {
9799
final dirty = cfg.status != EngineStatus.running ||
98100
cfg.lastError.isNotEmpty;
99101
if (dirty) {
@@ -114,6 +116,18 @@ class EngineBundledLauncher {
114116
);
115117
}
116118

119+
// Disk still says "running" (e.g. Task Manager kill — Python never rewrote the
120+
// file). HTTP is down; clear stale state so we skip the long "wait for existing"
121+
// window on a dead PID.
122+
if (!forceRestart && !httpHealthy && cfg.status == EngineStatus.running) {
123+
cfg = cfg.copyWith(
124+
status: EngineStatus.stopped,
125+
lastError: '',
126+
pid: 0,
127+
);
128+
await EngineConfigStore.writeAtomic(cfg);
129+
}
130+
117131
if (cfg.status == EngineStatus.failed && !userRetry) {
118132
return EngineBootstrapOutcome(
119133
success: false,

0 commit comments

Comments
 (0)