Skip to content
This repository was archived by the owner on Jan 23, 2026. It is now read-only.
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion packages/jumpstarter-cli/jumpstarter_cli/shell.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ def _run_shell_with_lease(lease, exporter_logs, config, command):
def launch_remote_shell(path: str) -> int:
return launch_shell(
path, lease.exporter_name, config.drivers.allow, config.drivers.unsafe,
config.shell.use_profiles, command=command
config.shell.use_profiles, command=command, lease=lease
)

with lease.serve_unix() as path:
Expand Down
15 changes: 13 additions & 2 deletions packages/jumpstarter/jumpstarter/client/lease.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import logging
import os
import sys
from collections.abc import AsyncGenerator, Generator
from collections.abc import AsyncGenerator, Callable, Generator
from contextlib import (
ExitStack,
asynccontextmanager,
Expand Down Expand Up @@ -54,6 +54,9 @@ class Lease(ContextManagerMixin, AsyncContextManagerMixin):
grpc_options: dict[str, Any] = field(default_factory=dict)
acquisition_timeout: int = field(default=7200) # Timeout in seconds for lease acquisition, polled in 5s intervals
exporter_name: str = field(default="remote", init=False) # Populated during acquisition
lease_ending_callback: Callable[[Self, timedelta], None] | None = field(
default=None, init=False
) # Called when lease is ending

def __post_init__(self):
if hasattr(super(), "__post_init__"):
Expand Down Expand Up @@ -208,11 +211,14 @@ async def __asynccontextmanager__(self) -> AsyncGenerator[Self]:
yield value
finally:
if self.release and self.name:
logger.info("Releasing Lease %s", self.name)
# Shield cleanup from cancellation to ensure it completes
with CancelScope(shield=True):
try:
with fail_after(30):
# skip the message if the lease is already expired
lease = await self.get()
if not lease.effective_end_time:
logger.info("Releasing Lease %s", self.name)
await self.svc.DeleteLease(
name=self.name,
)
Expand Down Expand Up @@ -280,6 +286,8 @@ async def _monitor():
if remain < timedelta(0):
# lease already expired, stopping monitor
logger.info("Lease {} ended at {}".format(self.name, end_time))
if self.lease_ending_callback is not None:
self.lease_ending_callback(self, timedelta(0))
Comment thread
mangelajo marked this conversation as resolved.
break
# Log once when entering the threshold window
if threshold - timedelta(seconds=check_interval) <= remain < threshold:
Expand All @@ -288,6 +296,9 @@ async def _monitor():
self.name, int((remain.total_seconds() + 30) // 60), end_time
)
)
# Notify callback about approaching expiration
if self.lease_ending_callback is not None:
self.lease_ending_callback(self, remain)
Comment on lines +299 to +301
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major

Add error handling around callback invocation.

Similar to the expired case, if the callback raises an exception during the warning phase, it will crash the monitor task before the lease actually expires, preventing proper expiration handling.

Apply this diff to add error handling:

                         # Notify callback about approaching expiration
                         if self.lease_ending_callback is not None:
-                            self.lease_ending_callback(self, remain)
+                            try:
+                                self.lease_ending_callback(self, remain)
+                            except Exception as e:
+                                logger.error("Error in lease ending callback for lease %s: %s", self.name, e)
🤖 Prompt for AI Agents
In packages/jumpstarter/jumpstarter/client/lease.py around lines 299 to 301, the
lease_ending_callback is invoked without protection so any exception it raises
will crash the monitor task; wrap the callback invocation in a try/except block
that catches Exception, logs the exception (including context: that it occurred
during the warning/remaining-time callback and the lease id or relevant
identifiers), and continues without re-raising so the monitor keeps running;
follow the same error-handling pattern used for the expired-case callback to
ensure consistent behavior.

await sleep(min(remain.total_seconds(), check_interval))
else:
await sleep(1)
Expand Down
57 changes: 43 additions & 14 deletions packages/jumpstarter/jumpstarter/common/utils.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
import os
import signal
import sys
from contextlib import ExitStack, asynccontextmanager, contextmanager
from datetime import timedelta
from functools import partial
from subprocess import Popen

from anyio.from_thread import BlockingPortal, start_blocking_portal
Expand Down Expand Up @@ -46,6 +49,34 @@ def serve(root_device: Driver):
PROMPT_CWD = "\\W"


def lease_ending_handler(process: Popen, lease, remaining_time) -> None:
    """Hang up a process once its lease has no time left.

    Meant to be bound to a process (e.g. with ``functools.partial``) and
    installed as a lease's ``lease_ending_callback``.

    Args:
        process: The process to signal.
        lease: The lease instance reporting the event (not used here).
        remaining_time: Time remaining until lease expiration. Nothing is
            sent while this is still positive.
    """
    if remaining_time > timedelta(0):
        # The lease still has time left; nothing to do yet.
        return

    # SIGHUP is not defined on every platform (e.g. Windows). Fall back to
    # SIGTERM there so the handler does not die with AttributeError.
    hangup_signal = getattr(signal, "SIGHUP", signal.SIGTERM)
    try:
        process.send_signal(hangup_signal)
    except OSError:
        # Covers ProcessLookupError as well: the process is already gone.
        pass


def _run_process(
    cmd: list[str],
    env: dict[str, str],
    lease=None,
) -> int:
    """Run ``cmd`` on this process's stdin/stdout/stderr and return its exit code.

    Args:
        cmd: Program and arguments to execute.
        env: Environment variables for the child process.
        lease: Optional lease object. When given, its ``lease_ending_callback``
            is set to ``lease_ending_handler`` bound to the child for as long
            as the child is running.

    Returns:
        The child's exit code, as returned by ``Popen.wait()``.
    """
    process = Popen(cmd, stdin=sys.stdin, stdout=sys.stdout, stderr=sys.stderr, env=env)
    if lease is None:
        return process.wait()

    previous_callback = getattr(lease, "lease_ending_callback", None)
    lease.lease_ending_callback = partial(lease_ending_handler, process)
    try:
        return process.wait()
    finally:
        # Do not leave a callback bound to a finished process on the lease;
        # put back whatever was registered before this run.
        lease.lease_ending_callback = previous_callback
Comment on lines +74 to +77
Copy link
Copy Markdown
Contributor

@coderabbitai coderabbitai Bot Dec 2, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🔴 Critical

🧩 Analysis chain

🏁 Script executed:

# Search for SIGHUP, process group, and related signal handling
rg -n "SIGHUP|process.?group|setpgrp|setsid|signal|Popen" --type py -C 2

Repository: jumpstarter-dev/jumpstarter

Length of output: 198


🏁 Script executed:

# Read the full context of the file to understand the function and its purpose
wc -l packages/jumpstarter/jumpstarter/common/utils.py

Repository: jumpstarter-dev/jumpstarter

Length of output: 198


🏁 Script executed:

# Read the relevant section with more context
sed -n '60,90p' packages/jumpstarter/jumpstarter/common/utils.py

Repository: jumpstarter-dev/jumpstarter

Length of output: 198


🌐 Web query:

Python subprocess.Popen SIGHUP propagation shell child processes cleanup

💡 Result:

Short answer

  • By default signals (SIGHUP) sent to your Python process are NOT automatically propagated to subprocess children (including grandchildren created by a shell). Use process groups or a parent-death signal to get predictable cleanup. [1][2][3]

Recommended options

  1. Put the child (and its descendants) in their own process group, then signal the group
  • Start with preexec_fn=os.setsid (POSIX) so the child is leader of a new session / process group, then send os.killpg(pgid, signal.SIGHUP/SIGTERM). Works for shell=True and direct execs (shell becomes group leader if it’s the immediate child). Example pattern:
    • p = Popen(cmd, preexec_fn=os.setsid)
    • os.killpg(os.getpgid(p.pid), signal.SIGTERM).
      References: Python subprocess docs; examples/StackOverflow. [1][2]
  2. Arrange children to die when the parent dies (pdeathsig / prctl)
  • In the child before exec call prctl(PR_SET_PDEATHSIG, SIGHUP) so the kernel delivers a signal to the child when its parent exits. This requires using a small C/ctypes helper or a library (prctl/pyprctl) via preexec_fn. This makes grandchildren created later still get killed when the parent exits only if they inherit that behavior or are direct children. [2][3]
  3. Use a watchdog in the child (application-level)
  • Have the child detect parent death (getppid()==1) or open a pipe/socket to parent and exit when it closes. Useful for portable/fine‑grained control. [2]

Caveats

  • shell=True: the immediate child is the shell; signaling the Popen.pid affects the shell, not the grandchildren directly. Using a process group (setsid) and killing the group is the usual fix. [1][2]
  • Race conditions: processes spawned after you enumerate children may escape; kill-by-process-group or design children to exit on pdeathsig is more robust. [2][3]
  • Windows differs: use job objects or TerminateProcess (not covered here). [1]

Key references
[1] Python subprocess docs (Popen, preexec_fn/process groups, signals).
[2] "Process Groups / Sessions" example and discussion (PyMOTW / subprocess guides).
[3] StackOverflow threads showing patterns: preexec_fn=os.setsid + os.killpg and prctl(PR_SET_PDEATHSIG) examples.

If you want, I can paste a minimal code example for (a) setsid + killpg or (b) prctl-based pdeathsig.


🌐 Web query:

zombie processes Popen waitpid signal handling Python shell spawned processes

💡 Result:

Short answer: Zombies happen when a child has exited but its parent never reaped its exit status. With subprocess.Popen this occurs if you never call wait()/communicate() (or otherwise call waitpid()) on the child, or if your parent is blocked/ignores reaping. Fixes: explicitly reap children or let the kernel auto-reap them. Sources: [1][2][3][4].

What causes zombies (quick):

  • Child exits but parent doesn't call wait()/waitpid() → process stays as "zombie". [3][7]
  • Keeping Popen objects alive without calling wait/communicate/poll can leave zombies. [3][7]
  • Use of shell=True can add an extra shell process (sh -c) to consider; you still must reap the real child. [2]

Practical fixes (pick what's appropriate):

  1. Call wait()/communicate() or poll() and then wait when you detect termination
  • For each Popen: p = Popen(...); later p.wait() or p.communicate(); or poll() in a loop and then wait. This is the simplest and most robust. [3][4]
  2. Reap in a SIGCHLD handler (non-blocking waitpid loop)
  • Install a handler that calls os.waitpid(-1, os.WNOHANG) in a loop to reap all exited children. Careful with reentrancy and threads. Example patterns: loop on waitpid until no more children. [1]
  3. If you never need child statuses, tell the OS to discard them
  • signal.signal(signal.SIGCHLD, signal.SIG_IGN) will cause immediate reaping on many Unixes (usable only if you never need to wait for children). Note: only allowed in main thread on some platforms. [2][3]
  4. Avoid accidentally keeping references / let subprocess auto-reap behavior be understood
  • Don’t retain Popen objects you won’t wait on; if you spawn many children, track and reap them. Historically subprocess had auto-reaping behavior; rely on explicit reaping for control. [8]
  5. For detached/daemon children
  • If you want fully detached background processes, use double-fork or start the child in its own session (start_new_session=True or preexec_fn=os.setsid) and ensure stdio/fds are closed (DEVNULL, close_fds=True) so the parent won’t be left with zombies or hung pipes. Then reap the immediate child or use SIG_IGN if you don’t care about exit status. [2][3]

Caveats and gotchas

  • Popen.wait() can be affected by spurious wakeups/interrupts in some Python versions—use robust patterns and test under your target Python. [5]
  • signal.signal(SIGCHLD, SIG_IGN) is process-global and not safe in libraries that need child exit info; it may only be set in the main thread. [2][1]

Selected references

  • SIGCHLD handler / reaping example. [1]
  • StackOverflow discussion: SIG_IGN and practical Popen advice. [2][3]
  • Official subprocess behavior (Popen methods). [4]
  • Python bug discussion on waitpid/spurious wakeups. [5]

If you want, tell me your Python version and a short snippet of how you spawn processes and I’ll give a specific fix you can apply.


Reconsider reliance on shell cleanup; explicit signal handling recommended.

SIGHUP is not automatically propagated to grandchildren spawned by the shell—this depends on shell session configuration and process group setup. While process.wait() reaps the immediate shell process, any grandchildren it creates may not receive SIGHUP and could become zombies if the shell doesn't propagate the signal or if they detach.

For reliable cleanup on lease expiry, consider using preexec_fn=os.setsid to place the shell in its own process group, then signal the group explicitly when cleanup is needed. Alternatively, rely on process.wait() combined with explicit child monitoring if the current behavior has not been validated in the Shell driver's actual deployment scenarios.

Copy link
Copy Markdown
Contributor Author

@evakhoni evakhoni Dec 2, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I tried the suggested solution; however, it introduced a bug where sending SIGINT (for example, with Ctrl+C) causes a crash and leaves the terminal in a misbehaving state.
I also tested locally, in zsh under Fedora: with the current approach, SIGHUP correctly closes subprocesses, even background ones, unless they were intentionally launched with nohup. Nevertheless, I'm open to suggestions.

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@evakhoni, thank you for testing the process group approach and reporting the issue. The SIGINT problem you encountered is expected behavior—when you use os.setsid(), the shell becomes a session leader in a new session, which removes it from the terminal's foreground process group. As a result, Ctrl+C signals from the terminal are no longer delivered to the shell, breaking interactive control.

Given your testing shows SIGHUP correctly propagates to subprocesses in zsh (and likely bash), and since this implementation is specifically for handling lease expiry rather than being a general-purpose process manager, the current approach appears reasonable. The key constraints here are:

  1. Interactive shell behavior must be preserved – users need Ctrl+C to work normally
  2. Lease expiry is a specific event – you're handling a known scenario (lease ends, close session) rather than managing arbitrary process trees

Your local testing is the right validation approach. For production confidence, you might want to verify behavior across the shell types you support (bash, zsh, fish) and document any known edge cases where subprocesses might not receive SIGHUP (like when users explicitly use nohup or disown).

The current implementation balances the need for lease-based cleanup with maintaining a working interactive shell, which seems appropriate for this use case.


✏️ Learnings added
Learnt from: evakhoni
Repo: jumpstarter-dev/jumpstarter PR: 757
File: packages/jumpstarter/jumpstarter/common/utils.py:74-77
Timestamp: 2025-12-02T19:57:16.556Z
Learning: Using `preexec_fn=os.setsid` with subprocess.Popen breaks SIGINT (Ctrl+C) handling for interactive shells in jumpstarter because it moves the shell to a different session, removing it from the terminal's foreground process group.

Note: Learnings are effective only in the context of similar code segments. To apply general coding guidelines, please configure review instructions. You can manage existing learnings in the app.


🧠 Learnings used
Learnt from: michalskrivanek
Repo: jumpstarter-dev/jumpstarter PR: 608
File: packages/jumpstarter-cli/jumpstarter_cli/run.py:0-0
Timestamp: 2025-09-06T05:25:18.184Z
Learning: The Shell driver in jumpstarter can spawn multiple processes and leave them behind, creating zombie processes that need to be reaped even in non-PID 1 scenarios.

Learnt from: michalskrivanek
Repo: jumpstarter-dev/jumpstarter PR: 608
File: packages/jumpstarter-cli/jumpstarter_cli/run.py:0-0
Timestamp: 2025-09-06T05:25:18.184Z
Learning: Zombie process reaping in jumpstarter is useful beyond just PID 1 cases, as drivers can leave behind processes in various execution contexts.



def launch_shell(
host: str,
context: str,
Expand All @@ -54,6 +85,7 @@ def launch_shell(
use_profiles: bool,
*,
command: tuple[str, ...] | None = None,
lease=None,
) -> int:
"""Launch a shell with a custom prompt indicating the exporter type.

Expand All @@ -62,6 +94,12 @@ def launch_shell(
context: The context of the shell (e.g. "local" or exporter name)
allow: List of allowed drivers
unsafe: Whether to allow drivers outside of the allow list
use_profiles: Whether to load shell profile files
command: Optional command to run instead of launching an interactive shell
lease: Optional Lease object to set up lease ending callback

Returns:
The exit code of the shell or command process
"""

shell = os.environ.get("SHELL", "bash")
Expand All @@ -73,19 +111,16 @@ def launch_shell(
}

if command:
process = Popen(command, stdin=sys.stdin, stdout=sys.stdout, stderr=sys.stderr, env=common_env)
return process.wait()
return _run_process(list(command), common_env, lease)

if shell_name.endswith("bash"):
env = common_env | {
"PS1": f"{ANSI_GRAY}{PROMPT_CWD} {ANSI_YELLOW}⚡{ANSI_WHITE}{context} {ANSI_YELLOW}➤{ANSI_RESET} ",
}

cmd = [shell]
if not use_profiles:
cmd.extend(["--norc", "--noprofile"])
process = Popen(cmd, stdin=sys.stdin, stdout=sys.stdout, stderr=sys.stderr, env=env)
return process.wait()
return _run_process(cmd, env, lease)

elif shell_name == "fish":
fish_fn = (
Expand All @@ -102,26 +137,20 @@ def launch_shell(
"end"
)
cmd = [shell, "--init-command", fish_fn]
process = Popen(cmd, stdin=sys.stdin, stdout=sys.stdout, stderr=sys.stderr, env=common_env)
return process.wait()
return _run_process(cmd, common_env, lease)

elif shell_name == "zsh":
env = common_env | {
"PS1": f"%F{{8}}%1~ %F{{yellow}}⚡%F{{white}}{context} %F{{yellow}}➤%f ",
}

if "HISTFILE" not in env:
env["HISTFILE"] = os.path.join(os.path.expanduser("~"), ".zsh_history")

cmd = [shell]
if not use_profiles:
cmd.append("--no-rcs")

cmd.extend(["-o", "inc_append_history", "-o", "share_history"])

process = Popen(cmd, stdin=sys.stdin, stdout=sys.stdout, stderr=sys.stderr, env=env)
return process.wait()
return _run_process(cmd, env, lease)

else:
process = Popen([shell], stdin=sys.stdin, stdout=sys.stdout, stderr=sys.stderr, env=common_env)
return process.wait()
return _run_process([shell], common_env, lease)
Loading