wip

ngc92 · ngc92 · commit 616d11ad366f · 2025-08-23T17:09:24.000+02:00
diff --git a/examples/eval.py b/examples/eval.py
@@ -1,4 +1,5 @@
 import base64
+import copy
 import dataclasses
 import multiprocessing
 import re
@@ -65,7 +66,7 @@ def get_test_cases(file_name: str, seed: Optional[int]) -> list[TestCase]:
 
     tests = []
     lines = content.splitlines()
-    match = r"\s*([a-zA-Z]+):\s*([a-zA-Z]+|[+-]?[0-9]+)\s*"
+    match = r"\s*([a-zA-Z_]+):\s*([a-zA-Z]+|[+-]?[0-9]+)\s*"
     for line in lines:
         parts = line.split(";")
         case = {}
@@ -123,18 +124,19 @@ def calculate_stats(durations: list[int]):
                  worst=float(worst))
 
 
-def _clone_data(data):
+def _clone_data(data, rank: int):
     """
-    Recursively goes through data and clones all tensors.
+    Recursively walks tuples, lists and dicts, cloning every tensor onto `cuda:{rank}`.
     """
     if isinstance(data, tuple):
-        return tuple(_clone_data(x) for x in data)
+        return tuple(_clone_data(x, rank) for x in data)
     elif isinstance(data, list):
-        return [_clone_data(x) for x in data]
+        return [_clone_data(x, rank) for x in data]
     elif isinstance(data, dict):
-        return {k: _clone_data(v) for k, v in data.items()}
+        return {k: _clone_data(v, rank) for k, v in data.items()}
     elif isinstance(data, torch.Tensor):
-        return data.clone()
+        device = f"cuda:{rank}"
+        return data.clone().to(device)
     else:
         return data
 
@@ -157,16 +159,60 @@ def _run_single_test(test: TestCase):
     from submission import custom_kernel
     data = generate_input(**test.args)
     torch.cuda.synchronize()
-    submission_output = custom_kernel(_clone_data(data))
+    submission_output = custom_kernel(_clone_data(data, 0))
     torch.cuda.synchronize()
     return wrap_check_implementation(data, submission_output)
 
 
+def _run_distributed_test(test: TestCase, rank: int):
+    """
+    Runs a single test case as rank `rank` of an NCCL process group. Do not call directly
+    """
+    from submission import custom_kernel
+    import torch.distributed as dist
+    torch.cuda.set_device(rank)  # bare synchronize() below waits on the current device only
+    os.environ["MASTER_ADDR"] = "127.0.0.1"
+    os.environ["MASTER_PORT"] = "12356"
+    dist.init_process_group("nccl", init_method="env://", rank=rank, world_size=test.args["world_size"])
+    try:
+        data = generate_input(**test.args, rank=rank)
+        torch.cuda.synchronize()
+        submission_output = custom_kernel(_clone_data(data, rank))
+        torch.cuda.synchronize()
+        return wrap_check_implementation(data, submission_output)
+    finally:
+        dist.destroy_process_group()
+
+
+def run_multi_gpu_test(pool: multiprocessing.Pool, test: TestCase, world_size: int):
+    """
+    Runs a single test on `world_size` ranks, one pool task per rank.
+    Returns (correct, error_messages): correct only if every rank passed,
+    error_messages with one "rank N: ..." line per failing rank.
+    """
+    # Rank i uses cuda:i, so more ranks than visible GPUs can never pass; fail fast
+    # instead of leaving the in-range ranks waiting for peers that cannot start.
+    if world_size > (n_gpus := torch.cuda.device_count()):
+        return False, f"world_size {world_size} exceeds the {n_gpus} available GPU(s)"
+    # world_size is a mandatory argument for multi-gpu tests
+    # Submit every rank before the first get(): the ranks wait on each other.
+    pending = [pool.apply_async(_run_distributed_test, args=(test, i)) for i in range(world_size)]
+    rets = [el.get() for el in pending]

+    correct = all(ret[0] for ret in rets)
+    error_messages = "\n".join(f"rank {rank}: {ret[1]}" for rank, ret in enumerate(rets) if not ret[0])
+    return correct, error_messages
+
+
 def run_single_test(pool: multiprocessing.Pool, test: TestCase):
     """
-    Runs a single test in another process.
+    Runs a single test in the pool: once per rank if it sets "world_size", else in one worker.
     """
-    return pool.apply(_run_single_test, (test,))
+    world_size = test.args.get("world_size", None)
+    if world_size is None:
+        return pool.apply(_run_single_test, (test,))  # _run_single_test takes only the test case
+    else:
+        return run_multi_gpu_test(pool, test, world_size)
 
 
 def run_testing(logger: PopcornOutput, pool: multiprocessing.Pool, tests: list[TestCase]):
@@ -345,14 +391,15 @@ def main():
     mode = sys.argv[1]
     seed = os.getenv("POPCORN_SEED")
     os.unsetenv("POPCORN_SEED")
+    n_gpus = int(os.getenv("POPCORN_GPUS", "1"))
     seed = int(seed) if seed else None
     set_seed(seed or 42)
     tests = get_test_cases(sys.argv[2], seed)
 
     with PopcornOutput(int(fd)) as logger:
         import multiprocessing
         mp_context = multiprocessing.get_context('spawn')
-        with mp_context.Pool(1) as pool:
+        with mp_context.Pool(n_gpus) as pool:
             if mode == "test":
                 return run_testing(logger, pool, tests)
             if mode == "benchmark":
diff --git a/examples/gather/reference.py b/examples/gather/reference.py
@@ -0,0 +1,35 @@
+import torch
+from task import input_t, output_t
+from utils import verbose_allclose
+from typing import Tuple
+
+
+def generate_input(seed: int, world_size: int, rank: int) -> input_t:
+    """
+    Builds the input of one rank: a one-element tensor holding `rank` on
+    device `cuda:{rank}`, plus `rank` and `world_size` themselves.
+    `seed` is not used; the generated data depends on `rank` only.
+    """
+    local_data = torch.tensor([rank]).to(f"cuda:{rank}")
+    return local_data, rank, world_size
+
+
+def check_implementation(data: input_t, output: output_t) -> Tuple[bool, str]:
+    """
+    Checks the gathered result of one rank.
+
+    `output` must hold exactly `world_size` tensors, each located on this
+    rank's device, with entry `i` containing the value `i`.
+    Returns `(True, '')` on success, otherwise `(False, reason)`.
+    """
+    data, rank, world_size = data
+    # A result with the wrong number of entries is reported as a mismatch
+    # instead of failing with an IndexError in the loop below.
+    if len(output) != world_size:
+        return False, f"mismatch found! rank {rank} returned {len(output)} outputs, expected {world_size}"
+    for i in range(world_size):
+        if output[i].get_device() != rank:
+            return False, f"mismatch found! output {i} of rank {rank} is on device {output[i].device}"
+        if (item := output[i].cpu().detach().item()) != i:
+            return False, f"mismatch found! custom implementation doesn't match reference: rank {rank}, entry {i} has value {item}"
+    return True, ''
diff --git a/examples/gather/submission.py b/examples/gather/submission.py
@@ -0,0 +1,12 @@
+#!POPCORN leaderboard identity_py-dev
+
+from task import input_t, output_t
+import torch
+from torch import distributed as dist
+
+
+def custom_kernel(data: input_t) -> output_t:
+    local, _rank, _world_size = data  # only the local tensor is used
+    gathered = [torch.empty_like(local) for _ in range(dist.get_world_size())]
+    dist.all_gather(gathered, local)  # fills gathered[i] with the tensor of rank i
+    return gathered
diff --git a/examples/gather/task.py b/examples/gather/task.py
@@ -0,0 +1,10 @@
+from typing import TypedDict, List, Tuple
+import torch
+
+
+input_t = Tuple[torch.Tensor, int, int]  # (local tensor, rank, world_size), as built by generate_input
+output_t = List[torch.Tensor]  # indexed by rank in check_implementation
+
+
+class TestSpec(TypedDict):
+    pass  # NOTE(review): declares no keys, while task.yml tests pass "seed" and "world_size" — confirm
diff --git a/examples/gather/task.yml b/examples/gather/task.yml
@@ -0,0 +1,28 @@
+# name: identity-py
+
+files:
+  - {"name": "submission.py", "source": "@SUBMISSION@"}
+  - {"name": "task.py", "source": "task.py"}
+  - {"name": "utils.py", "source": "../utils.py"}
+  - {"name": "reference.py", "source": "reference.py"}
+  - {"name": "eval.py", "source": "../eval.py"}
+
+lang: "py"
+multi_gpu: true
+description:
+  A simple test task - python
+
+config:
+  main: "eval.py"
+
+templates:
+  Python: "../template.py"
+
+# small test cases. should be cheap to run.
+tests:
+  - {"seed": 5, "world_size": 4}
+
+benchmarks:
+  - {"seed": 10}  # NOTE(review): no "world_size", unlike the test above — confirm the benchmark path supports this
+
+ranking_by: "geom"
diff --git a/examples/gather/wrong.py b/examples/gather/wrong.py
@@ -0,0 +1,11 @@
+#!POPCORN leaderboard identity_py-dev
+
+from task import input_t, output_t
+import torch
+from torch import distributed as dist
+
+
+def custom_kernel(data: input_t) -> output_t:
+    local, _rank, _world_size = data  # no collective is issued in this variant
+    n_ranks = dist.get_world_size()
+    return [torch.ones_like(local) for _ in range(n_ranks)]
diff --git a/src/libkernelbot/consts.py b/src/libkernelbot/consts.py
@@ -28,6 +28,8 @@ class ModalGPU(Enum):
     A100 = "A100"
     H100 = "H100"
     B200 = "B200"
+    # multi-gpu
+    L4x4 = "L4x4"
 
 
 @dataclasses.dataclass
@@ -109,7 +111,8 @@ class RankCriterion(Enum):
 
 GPU_TO_SM = {
     "T4": "75",
-    "L4": "80",
+    "L4": "89",
+    "L4x4": "89",
     "A100": "80",
     "H100": "90a",
     "B200": "100",
diff --git a/src/libkernelbot/run_eval.py b/src/libkernelbot/run_eval.py
@@ -44,6 +44,7 @@ class RunResult:
 class SystemInfo:
     # fmt: off
     gpu: str = ''           # Model name of the GPU
+    device_count: int = 1   # Number of GPUs
     cpu: str = ''           # Model name of the CPU
     platform: str = ''      # Platform string of the machine
     torch: str = ''         # Torch version
@@ -217,7 +218,7 @@ def compile_cuda_script(  # # noqa: C901
     )
 
 
-def run_program(args: list[str], seed: Optional[int], timeout: int) -> RunResult:
+def run_program(args: list[str], seed: Optional[int], timeout: int, multi_gpu: bool = False) -> RunResult:
     print("[Running]")
     # set up a pipe so the tester can communicate its verdict with us
     env = os.environ.copy()
@@ -226,6 +227,10 @@ def run_program(args: list[str], seed: Optional[int], timeout: int) -> RunResult
     if seed is not None:
         env["POPCORN_SEED"] = str(seed)
 
+    if multi_gpu:
+        import torch
+        env["POPCORN_GPUS"] = str(torch.cuda.device_count())
+
     execution_start_time = time.perf_counter()
     try:
         run_process = subprocess.run(
@@ -279,6 +284,8 @@ def run_program(args: list[str], seed: Optional[int], timeout: int) -> RunResult
 def run_single_evaluation(
     call: list[str],
     mode: str,
+    *,
+    multi_gpu: bool = False,
     tests: Optional[str] = None,
     benchmarks: Optional[str] = None,
     test_timeout: int = Timeout.TEST,
@@ -295,7 +302,7 @@ def run_single_evaluation(
         with tempfile.NamedTemporaryFile("w") as tests_file:
             tests_file.write(tests)
             tests_file.flush()
-            return run_program(call + [mode, tests_file.name], seed=seed, timeout=test_timeout)
+            return run_program(call + [mode, tests_file.name], seed=seed, timeout=test_timeout, multi_gpu=multi_gpu)
     elif mode in ["benchmark", "profile", "leaderboard"]:
         timeout = ranked_timeout if mode == "leaderboard" else benchmark_timeout
         with tempfile.NamedTemporaryFile("w") as bench_file:
@@ -304,7 +311,7 @@ def run_single_evaluation(
             else:
                 bench_file.write(benchmarks)
             bench_file.flush()
-            return run_program(call + [mode, bench_file.name], seed=seed, timeout=timeout)
+            return run_program(call + [mode, bench_file.name], seed=seed, timeout=timeout, multi_gpu=multi_gpu)
     else:
         raise ValueError(f"Invalid mode {mode}")
 
@@ -319,6 +326,7 @@ def make_system_info() -> SystemInfo:
         # https://pytorch.org/docs/stable/notes/hip.html
         if torch.cuda.is_available():
             info.gpu = torch.cuda.get_device_name()
+            info.device_count = torch.cuda.device_count()
     except ImportError:
         # get GPU info manually
         try:
@@ -551,6 +559,7 @@ def run_config(config: dict):
         "ranked_timeout": config.get("ranked_timeout", Timeout.RANKED),
         "benchmark_timeout": config.get("benchmark_timeout", Timeout.BENCHMARK),
         "test_timeout": config.get("test_timeout", Timeout.TEST),
+        "multi_gpu": config.get("multi_gpu", False),
     }
     if config["lang"] == "py":
         runner = functools.partial(
diff --git a/src/libkernelbot/task.py b/src/libkernelbot/task.py
@@ -61,6 +61,7 @@ class LeaderboardTask:
     ranked_timeout: int = 180
     ranking_by: RankCriterion = RankCriterion.LAST
     seed: Optional[int] = None
+    multi_gpu: bool = False
 
     def __post_init__(self):
         if self.lang == Language.Python and not isinstance(self.config, PythonTaskData):
@@ -75,6 +76,7 @@ def from_dict(cls, data: dict):
         criterion = RankCriterion(data.get("ranking_by", RankCriterion.LAST))
         data_["lang"] = lang
         data_["ranking_by"] = criterion
+        data_["multi_gpu"] = data.get("multi_gpu", False)
         if lang == Language.Python:
             data_["config"] = PythonTaskData(**data["config"])
         else:
@@ -176,6 +178,7 @@ def build_task_config(
         "ranked_timeout": task.ranked_timeout,
         "ranking_by": task.ranking_by.value,
         "seed": task.seed,
+        "multi_gpu": task.multi_gpu,
     }
 
     if task.lang == Language.Python:
diff --git a/src/runners/modal_runner.py b/src/runners/modal_runner.py
@@ -40,7 +40,7 @@
     )
     # other frameworks
     .pip_install(
-        "jax[cuda12]==0.5.3",   # 0.6 want's cudnn 9.8 in conflict with torch 2.7
+        "jax[cuda12]==0.5.3",  # 0.6 want's cudnn 9.8 in conflict with torch 2.7
         "jax2torch==0.0.7",
         "tinygrad~=0.10",
     )
@@ -50,8 +50,8 @@
         "nvidia-cutlass-dsl~=4.0",
         "cuda-core[cu12]~=0.3",
         "cuda-python[all]==12.8",
-        #"nvmath-python[cu12]~=0.4",
-        #"numba-cuda[cu12]~=0.15",
+        # "nvmath-python[cu12]~=0.4",
+        # "numba-cuda[cu12]~=0.15",
     )
 )
 
diff --git a/src/runners/modal_runner_archs.py b/src/runners/modal_runner_archs.py
@@ -2,9 +2,9 @@
 # Modal apps on specific devices. We will fix this later.
 from modal_runner import app, cuda_image, modal_run_config
 
-gpus = ["T4", "L4", "A100-80GB", "H100!", "B200"]
+gpus = ["T4", "L4", "L4:4", "A100-80GB", "H100!", "B200"]
 for gpu in gpus:
-    gpu_slug = gpu.lower().split("-")[0].strip("!")
+    gpu_slug = gpu.lower().split("-")[0].strip("!").replace(":", "x")
     app.function(gpu=gpu, image=cuda_image, name=f"run_cuda_script_{gpu_slug}", serialized=True)(
         modal_run_config
     )
diff --git a/tests/test_modal.py b/tests/test_modal.py

Original file line number	Diff line number	Diff line change
`@@ -40,7 +40,7 @@`
`40`	`40`	`)`
`41`	`41`	`# other frameworks`
`42`	`42`	`.pip_install(`
`43`		`- "jax[cuda12]==0.5.3", # 0.6 want's cudnn 9.8 in conflict with torch 2.7`
	`43`	`+ "jax[cuda12]==0.5.3", # 0.6 want's cudnn 9.8 in conflict with torch 2.7`
`44`	`44`	`"jax2torch==0.0.7",`
`45`	`45`	`"tinygrad~=0.10",`
`46`	`46`	`)`
`@@ -50,8 +50,8 @@`
`50`	`50`	`"nvidia-cutlass-dsl~=4.0",`
`51`	`51`	`"cuda-core[cu12]~=0.3",`
`52`	`52`	`"cuda-python[all]==12.8",`
`53`		`- #"nvmath-python[cu12]~=0.4",`
`54`		`- #"numba-cuda[cu12]~=0.15",`
	`53`	`+ # "nvmath-python[cu12]~=0.4",`
	`54`	`+ # "numba-cuda[cu12]~=0.15",`
`55`	`55`	`)`
`56`	`56`	`)`
`57`	`57`