diff --git a/electron/ipc/server.ts b/electron/ipc/server.ts
index 6c31c0f..57e0ba8 100644
--- a/electron/ipc/server.ts
+++ b/electron/ipc/server.ts
@@ -109,18 +109,8 @@ export function registerServerIpc(): void {
     ]
 
     // Parent-process watchdog: the Python server force-exits if this Electron
-    // process disappears. On Windows we also pass our process-creation time
-    // so the watchdog can defeat PID recycling — without it, a freshly spawned
-    // OS process inheriting our PID would look "alive" to the server even
-    // though Biome itself is long gone.
-    const parentStartTimeSec = Date.now() / 1000 - process.uptime()
-    const serverArgs = [
-      ...baseServerArgs,
-      '--parent-pid',
-      String(process.pid),
-      '--parent-start-time',
-      parentStartTimeSec.toFixed(3)
-    ]
+    // process disappears.
+    const serverArgs = [...baseServerArgs, '--parent-pid', String(process.pid)]
 
     // Spawn the server
     const child = spawn(uvBinary, serverArgs, {
diff --git a/server-components/main.py b/server-components/main.py
index c9032fb..a3906e6 100644
--- a/server-components/main.py
+++ b/server-components/main.py
@@ -40,10 +40,6 @@ class StartupConfig:
     before the lifespan body itself runs."""
 
     parent_pid: int | None = None
-    # Epoch seconds for the parent's process-creation time, as reported by
-    # the launcher. Paired with `parent_pid` to defend against PID recycling
-    # on Windows: a recycled PID matches, but the creation time won't.
-    parent_start_time: float | None = None
     # True when the launching Biome instance is in standalone mode and
     # owns this process's lifecycle — set via `--launched-from-standalone`
     # so the renderer can refuse to point itself at a server it would
@@ -55,48 +51,73 @@ class StartupConfig:
 # If launched with --parent-pid, poll the parent and exit if it dies.
 # Linux's prctl(PR_SET_PDEATHSIG) is the kernel-level fallback we'd ideally
 # use, but Python doesn't expose it portably; the polling watchdog covers
-# both Linux and Windows. On Windows the bare `os.kill(pid, 0)` check is
-# unreliable in the face of PID recycling — kernel reuses PIDs aggressively
-# and the new occupant can be running as a different user — so when the
-# launcher passes `--parent-start-time` we additionally compare the parent's
-# process-creation time via psutil.
-
-
-# Tolerance for clock skew between the launcher's clock and the kernel's
-# recorded process-creation timestamp. Process-creation times sit at
-# millisecond precision; one second is generous enough to absorb the
-# stringification round-trip and any small drift, while still being well
-# below the gap any recycled PID would exhibit.
+# both Linux and Windows. A bare `os.kill(pid, 0)` check is unreliable in
+# the face of PID recycling — the kernel reuses PIDs aggressively and the
+# new occupant can be a different process — so the watchdog also pins the
+# parent's psutil `create_time()`, captured as a baseline at startup, and
+# treats a changed creation timestamp as "parent gone".
+
+
+# Tolerance for the PID-recycling guard. The baseline and every poll both
+# read the parent's creation time from the same source (psutil), and a
+# process's create_time is immutable for its lifetime, so a live parent
+# compares exactly equal — this tolerance only absorbs float round-trip
+# noise. A recycled PID belongs to a process created seconds-to-hours
+# later, so it lands far outside this window.
 _PARENT_START_TIME_TOLERANCE_SEC = 1.0
 
 
 class ParentWatchdog:
     """Monitors a parent process and force-exits this process if the
     parent dies. Constructed in `__main__` (one-shot startup check),
-    then run as an asyncio task by the lifespan (continuous polling)."""
+    then run as an asyncio task by the lifespan (continuous polling).
 
-    def __init__(self, parent_pid: int, parent_start_time: float | None = None) -> None:
+    The recycling guard compares the parent's *current* kernel
+    creation timestamp against a baseline captured here via psutil at
+    construction."""
+
+    def __init__(self, parent_pid: int) -> None:
         self.parent_pid = parent_pid
-        self.parent_start_time = parent_start_time
+        # True when the parent was verifiably absent at construction. Any
+        # process that later appears under this PID is a recycled occupant,
+        # so liveness stays False instead of falling back to pid-only.
+        self.parent_gone_at_start = False
+        # Baseline from the kernel, captured now. None if the parent is
+        # already gone (flagged above) or its metadata isn't readable
+        # (→ pid-only liveness).
+        self.baseline_create_time: float | None = None
+        try:
+            self.baseline_create_time = self._read_create_time()
+        except psutil.NoSuchProcess:
+            self.parent_gone_at_start = True
 
-    def _parent_alive(self) -> bool:
-        """True iff a process with the parent's PID exists and (when
-        `parent_start_time` is known) has a matching creation timestamp."""
+    def _read_create_time(self) -> float | None:
+        """Return the parent's creation time, or None if it is unreadable.
+
+        Raises psutil.NoSuchProcess when no process has the parent's PID."""
         try:
-            proc = psutil.Process(self.parent_pid)
-        except psutil.NoSuchProcess:
-            return False
-        if self.parent_start_time is None:
-            return True
+            return psutil.Process(self.parent_pid).create_time()
+        except psutil.AccessDenied:
+            return None
+
+    def _parent_alive(self) -> bool:
+        """True if a process with the parent's PID exists and its
+        creation timestamp still matches the baseline (guards against
+        PID recycling)."""
+        if self.parent_gone_at_start:
+            return False
         try:
-            create_time = proc.create_time()
+            current = psutil.Process(self.parent_pid).create_time()
         except psutil.NoSuchProcess:
             return False
         except psutil.AccessDenied:
             # Process exists but we can't read its metadata. Treat as alive
             # rather than self-exit on a transient permission glitch.
             return True
-        return abs(create_time - self.parent_start_time) <= _PARENT_START_TIME_TOLERANCE_SEC
+        if self.baseline_create_time is None:
+            # No baseline (the parent's metadata was unreadable at
+            # construction) → fall back to pid-only liveness.
+            return True
+        return abs(current - self.baseline_create_time) <= _PARENT_START_TIME_TOLERANCE_SEC
 
     def check_alive_or_exit(self) -> None:
         """Synchronous one-shot check at startup, in case the parent
@@ -199,7 +220,7 @@ async def lifespan(app: FastAPI):
     cfg: StartupConfig = app.state.startup_config
     watchdog_task = None
     if cfg.parent_pid is not None:
-        watchdog_task = asyncio.create_task(ParentWatchdog(cfg.parent_pid, cfg.parent_start_time).run())
+        watchdog_task = asyncio.create_task(ParentWatchdog(cfg.parent_pid).run())
 
     yield
 
@@ -229,15 +250,6 @@ async def lifespan(app: FastAPI):
     parser.add_argument(
         "--parent-pid", type=int, default=None, help="PID of parent process; server exits if parent dies"
     )
-    parser.add_argument(
-        "--parent-start-time",
-        type=float,
-        default=None,
-        help=(
-            "Epoch-seconds creation timestamp of the parent process. When paired with --parent-pid, used to guard"
-            " against PID recycling: a recycled PID won't match the original parent's creation time."
-        ),
-    )
     parser.add_argument(
         "--launched-from-standalone",
         action="store_true",
@@ -250,12 +262,17 @@ async def lifespan(app: FastAPI):
 
     app.state.startup_config = StartupConfig(
         parent_pid=args.parent_pid,
-        parent_start_time=args.parent_start_time,
         launched_from_standalone=args.launched_from_standalone,
     )
     if args.parent_pid is not None:
-        logger.info("Monitoring parent process", parent_pid=args.parent_pid, parent_start_time=args.parent_start_time)
-        ParentWatchdog(args.parent_pid, args.parent_start_time).check_alive_or_exit()
+        # The watchdog self-reads the parent's create_time as its recycling baseline.
+        watchdog = ParentWatchdog(args.parent_pid)
+        logger.info(
+            "Monitoring parent process",
+            parent_pid=args.parent_pid,
+            parent_create_time=watchdog.baseline_create_time,
+        )
+        watchdog.check_alive_or_exit()
 
     # Construct the uvicorn Server explicitly (rather than via `uvicorn.run`)
     # so the `/shutdown` route can flip `should_exit` on the live instance,
diff --git a/server-components/server/routes.py b/server-components/server/routes.py
index b808cac..ef887de 100755
--- a/server-components/server/routes.py
+++ b/server-components/server/routes.py
@@ -489,7 +489,14 @@ async def _fetch_waypoint_ids(cache: TtlCache[str, list[str]]) -> list[str]:
 
     async def fetcher() -> list[str]:
         def _fetch() -> list[str]:
-            collection = get_collection(WAYPOINT_COLLECTION_SLUG)
+            # `token=False` forces an anonymous request. The collection is
+            # public curated metadata, so listing it must not depend on
+            # whatever HF token the host happens to have cached — a
+            # fine-grained/restricted token (e.g. one scoped only for
+            # uploads) authenticates but lacks "read collections" and gets
+            # a 403, whereas anonymous access succeeds. Decoupling from the
+            # ambient token keeps the picker populated for every user.
+            collection = get_collection(WAYPOINT_COLLECTION_SLUG, token=False)
             return [item.item_id for item in collection.items if item.item_type == "model"]
 
         fetched = await asyncio.to_thread(_fetch)