Skip to content
27 changes: 26 additions & 1 deletion .github/workflows/amd-health.yml
Original file line number Diff line number Diff line change
Expand Up @@ -10,18 +10,43 @@ on:

# Single job: verify that the AMD GPU runner can run PyTorch on one GPU and
# across all visible GPUs.
jobs:
health-check:
# NOTE(review): two runs-on lines appear because this is a rendered diff;
# presumably the first is the removed label and the second its replacement.
runs-on: [amdgpu-mi300-x86-64]
runs-on: [amdgpu-mi300-8-x86-64]
# NOTE(review): scripts/test_distributed.py allows up to 60 s per round for
# 4 rounds; confirm 5 minutes also leaves room for the install step.
timeout-minutes: 5

steps:
- uses: actions/checkout@v3

- name: Setup Python
uses: actions/setup-python@v5
with:
python-version: '3.10'

# Installs numpy and the PyTorch wheel from the rocm6.3 index.
- name: Install PyTorch
run: |
pip install numpy
pip install torch --index-url https://download.pytorch.org/whl/rocm6.3

# Diagnostic output only: the rocm-smi/rocminfo commands fall back to an
# echo, so a missing tool does not fail those lines.
- name: System Information
run: |
echo "=== ROCm Version ==="
rocm-smi --version || rocminfo --version || echo "ROCm version check failed"
echo ""
echo "=== GPU Driver Info ==="
rocm-smi -a || rocminfo || echo "ROCm SMI failed"
echo ""
echo "=== PyTorch Version ==="
python -c "import torch; print(f'PyTorch: {torch.__version__}')"
python -c "import torch; print(f'CUDA/ROCm: {torch.version.cuda}')"
python -c "import torch; print(f'HIP: {torch.version.hip if hasattr(torch.version, \"hip\") else \"N/A\"}')"
echo ""
echo "=== OS Info ==="
uname -a
cat /etc/os-release | head -5

# Single-device smoke test: allocate a small random tensor on the 'cuda' device.
- name: GPU Health Check
run: python -c "import torch; torch.randn(5, device='cuda')"

# Multi-device test: print the GPU count, then run the distributed script.
- name: Distributed Health Check
run: |
python -c "import torch; print(f'Available GPUs: {torch.cuda.device_count()}')"
python scripts/test_distributed.py
93 changes: 93 additions & 0 deletions scripts/test_distributed.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,93 @@
import os
import signal
import sys
from multiprocessing import Pool

import torch
import torch.distributed as dist
import torch.multiprocessing as mp


def timeout_handler(signum, frame):
    """Report a hang and terminate the current process with exit status 1.

    Has the ``(signum, frame)`` signature required of a signal handler; both
    arguments are ignored.

    Raises:
        SystemExit: always, with code 1.
    """
    print("✗ TIMEOUT: Process hung")
    raise SystemExit(1)


def test_worker(args):
    """Run one rank of the NCCL health check.

    Args:
        args: ``(rank, world_size, master_port)`` tuple. ``rank`` is also used
            as the CUDA device index for this worker.

    Returns:
        True when the process group initialises and the all-reduce produces
        the expected sum; False when any step raises or the sum is wrong.

    A SIGALRM watchdog (``timeout_handler``) is armed around the blocking
    steps: 30 s for process-group init and 15 s for the all-reduce.
    """
    rank, world_size, master_port = args
    try:
        # Rendezvous settings consumed by the "env://" init method.
        os.environ["MASTER_ADDR"] = "127.0.0.1"
        os.environ["MASTER_PORT"] = str(master_port)
        os.environ["RANK"] = str(rank)
        os.environ["WORLD_SIZE"] = str(world_size)

        signal.signal(signal.SIGALRM, timeout_handler)
        signal.alarm(30)

        print(f"Rank {rank}: Init NCCL...")
        dist.init_process_group(
            "nccl",
            init_method="env://",
            rank=rank,
            world_size=world_size,
            device_id=torch.device(f"cuda:{rank}"),
        )
        signal.alarm(0)

        try:
            device = torch.device(f"cuda:{rank}")
            tensor = torch.ones(100, device=device) * rank

            signal.alarm(15)
            dist.all_reduce(tensor)
            # A collective on CUDA tensors may return as soon as it is
            # enqueued; reading the value back is what waits for it, so do
            # that while the watchdog is still armed.
            total = tensor[0].item()
            signal.alarm(0)

            # Every rank contributed its own index, so the default SUM
            # reduction must yield 0 + 1 + ... + (world_size - 1).
            expected = sum(range(world_size))
            if total != expected:
                print(f"✗ Rank {rank}: sum = {total}, expected {expected}")
                return False

            print(f"✓ Rank {rank}: sum = {total}")
            return True
        finally:
            # Tear the group down on failure paths as well as on success.
            dist.destroy_process_group()

    except Exception as e:
        signal.alarm(0)
        print(f"✗ Rank {rank}: {e}")
        return False


def main():
    """Run four rounds of the multi-GPU health check.

    Each round starts one pool worker per visible GPU and runs
    ``test_worker`` on every rank. The process exits with status 1 as soon as
    a round fails, hangs (no result within 60 s) or raises; otherwise a
    success summary is printed.
    """
    num_gpus = torch.cuda.device_count()
    print(f"Testing {num_gpus} GPUs - 4 rounds")

    if num_gpus < 1:
        # Pool(processes=0) would raise ValueError; fail with a clear message.
        print("✗ No GPUs detected")
        sys.exit(1)

    # The start method does not change between rounds, so set it once.
    mp.set_start_method("spawn", force=True)

    for round_num in range(4):
        print(f"=== ROUND {round_num + 1} ===")
        # Each round uses its own port (presumably to avoid colliding with a
        # socket left over from the previous round).
        master_port = 29500 + round_num

        # Prepare worker arguments
        worker_args = [(rank, num_gpus, master_port) for rank in range(num_gpus)]

        with Pool(processes=num_gpus) as pool:
            try:
                # Use map_async with timeout
                result = pool.map_async(test_worker, worker_args)
                results = result.get(timeout=60)

                # Check if all workers succeeded
                if not all(results):
                    print(f"✗ ROUND {round_num + 1} FAILED")
                    sys.exit(1)

            except mp.TimeoutError:
                print(f"✗ ROUND {round_num + 1} HUNG")
                pool.terminate()
                pool.join()
                sys.exit(1)
            except Exception as e:
                print(f"✗ ROUND {round_num + 1} ERROR: {e}")
                sys.exit(1)

        print(f"✓ ROUND {round_num + 1} PASSED")

    print("✓ ALL ROUNDS PASSED")


# Script entry point: run the health check only when this file is executed
# directly, not when the module is imported.
if __name__ == "__main__":
    main()
6 changes: 2 additions & 4 deletions src/libkernelbot/launchers/github.py
Original file line number Diff line number Diff line change
Expand Up @@ -105,6 +105,7 @@ async def run_submission( # noqa: C901
logger.info("Waiting for workflow to start...")

timeout = get_timeout(config) + TIMEOUT_BUFFER_MINUTES

logger.info(f"Waiting for workflow to complete... (timeout: {timeout} minutes)")
await run.wait_for_completion(
lambda x: self.wait_callback(x, status), timeout_minutes=timeout
Expand Down Expand Up @@ -350,7 +351,6 @@ async def wait_for_completion(
logger.error(f"Error waiting for GitHub run {self.run_id}: {e}", exc_info=e)
raise # Re-raise other exceptions


def get_artifact_index(self) -> dict[str, GitHubArtifact]:
logger.info("Creating artifact index for run %s", self.run_id)
artifacts = self.run.get_artifacts()
Expand All @@ -368,7 +368,6 @@ def get_artifact_index(self) -> dict[str, GitHubArtifact]:

return extracted


async def download_artifact(self, artifact: GitHubArtifact) -> dict:
logger.info("Attempting to download artifact '%s' for run %s", artifact.name, self.run_id)

Expand All @@ -387,6 +386,5 @@ async def download_artifact(self, artifact: GitHubArtifact) -> dict:
return artifact_dict
else:
raise RuntimeError(
f"Failed to download artifact {artifact.name}. "
f"Status code: {response.status_code}"
f"Failed to download artifact {artifact.name}. Status code: {response.status_code}"
)
2 changes: 1 addition & 1 deletion tests/test_github.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,7 @@ def github_config():

@pytest.mark.integration
@pytest.mark.asyncio
@pytest.mark.parametrize("gpu_type", [GitHubGPU.NVIDIA, GitHubGPU.MI300])
@pytest.mark.parametrize("gpu_type", [GitHubGPU.NVIDIA, GitHubGPU.MI300x8])
async def test_github_launcher_python_script(project_root: Path, github_config: GitHubConfig, gpu_type: GitHubGPU):
"""
Test GitHubLauncher with a real Python script using real GitHub Actions.
Expand Down
Loading