Skip to content

Commit d338ab0

Browse files
author
Mark Saroufim
committed
container level timeouts
1 parent 41a6349 commit d338ab0

2 files changed

Lines changed: 14 additions & 3 deletions

File tree

src/libkernelbot/launchers/modal.py

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -30,9 +30,16 @@ async def run_submission(
3030

3131
await status.push("⏳ Waiting for Modal run to finish...")
3232

33+
# Use the task's ranked timeout (default 180s) + 60s buffer for the signal-based timeout
34+
# This catches most hangs; the container-level timeout is the fallback for hung GPUs, so signal_timeout must stay below it
35+
task_timeout = config.get("ranked_timeout", 180)
36+
signal_timeout = task_timeout + 60
37+
3338
result = await loop.run_in_executor(
3439
None,
35-
lambda: modal.Function.from_name("discord-bot-runner", func_name).remote(config=config),
40+
lambda: modal.Function.from_name("discord-bot-runner", func_name).remote(
41+
config=config, timeout_seconds=signal_timeout
42+
),
3643
)
3744

3845
await status.update("✅ Waiting for modal run to finish... Done")

src/runners/modal_runner_archs.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,12 +2,16 @@
22
# Modal apps on specific devices. We will fix this later.
33
from modal_runner import app, cuda_image, modal_run_config
44

5+
# Container-level timeout (seconds) - kills container regardless of GPU state
6+
# This is the nuclear option for hung GPUs that don't respond to signals; keep it above the launcher's signal timeout (ranked_timeout + 60s)
7+
MODAL_CONTAINER_TIMEOUT = 300
8+
59
gpus = ["T4", "L4", "L4:4", "A100-80GB", "H100!", "B200"]
610
for gpu in gpus:
711
gpu_slug = gpu.lower().split("-")[0].strip("!").replace(":", "x")
8-
app.function(gpu=gpu, image=cuda_image, name=f"run_cuda_script_{gpu_slug}", serialized=True)(
12+
app.function(gpu=gpu, image=cuda_image, name=f"run_cuda_script_{gpu_slug}", serialized=True, timeout=MODAL_CONTAINER_TIMEOUT)(
913
modal_run_config
1014
)
11-
app.function(gpu=gpu, image=cuda_image, name=f"run_pytorch_script_{gpu_slug}", serialized=True)(
15+
app.function(gpu=gpu, image=cuda_image, name=f"run_pytorch_script_{gpu_slug}", serialized=True, timeout=MODAL_CONTAINER_TIMEOUT)(
1216
modal_run_config
1317
)

0 commit comments

Comments
 (0)