Skip to content

Commit 616d11a

Browse files
committed
wip
1 parent 379552f commit 616d11a

12 files changed

Lines changed: 215 additions & 18 deletions

File tree

examples/eval.py

Lines changed: 56 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
import base64
2+
import copy
23
import dataclasses
34
import multiprocessing
45
import re
@@ -65,7 +66,7 @@ def get_test_cases(file_name: str, seed: Optional[int]) -> list[TestCase]:
6566

6667
tests = []
6768
lines = content.splitlines()
68-
match = r"\s*([a-zA-Z]+):\s*([a-zA-Z]+|[+-]?[0-9]+)\s*"
69+
match = r"\s*([a-zA-Z_]+):\s*([a-zA-Z]+|[+-]?[0-9]+)\s*"
6970
for line in lines:
7071
parts = line.split(";")
7172
case = {}
@@ -123,18 +124,19 @@ def calculate_stats(durations: list[int]):
123124
worst=float(worst))
124125

125126

126-
def _clone_data(data):
127+
def _clone_data(data, rank: int):
127128
"""
128129
Recursively goes through data and clones all tensors.
129130
"""
130131
if isinstance(data, tuple):
131-
return tuple(_clone_data(x) for x in data)
132+
return tuple(_clone_data(x, rank) for x in data)
132133
elif isinstance(data, list):
133-
return [_clone_data(x) for x in data]
134+
return [_clone_data(x, rank) for x in data]
134135
elif isinstance(data, dict):
135-
return {k: _clone_data(v) for k, v in data.items()}
136+
return {k: _clone_data(v, rank) for k, v in data.items()}
136137
elif isinstance(data, torch.Tensor):
137-
return data.clone()
138+
device = f"cuda:{rank}"
139+
return data.clone().to(device)
138140
else:
139141
return data
140142

@@ -157,16 +159,60 @@ def _run_single_test(test: TestCase):
157159
from submission import custom_kernel
158160
data = generate_input(**test.args)
159161
torch.cuda.synchronize()
160-
submission_output = custom_kernel(_clone_data(data))
162+
submission_output = custom_kernel(_clone_data(data, 0))
161163
torch.cuda.synchronize()
162164
return wrap_check_implementation(data, submission_output)
163165

164166

167+
def _run_distributed_test(test: TestCase, rank: int):
    """
    Runs one rank of a multi-GPU test case. Do not call directly.

    Joins an NCCL process group of `test.args["world_size"]` ranks that
    rendezvous on 127.0.0.1:12356, generates this rank's input, runs the
    submission's `custom_kernel` on a copy of it, and checks the output.
    The process group is always torn down again, even if the test raises.

    :param test: The test case; its args must contain "world_size".
    :param rank: Rank of this process; also used as its CUDA device index.
    :return: Whatever `wrap_check_implementation` returns for this rank.
    """
    from submission import custom_kernel
    import torch.distributed as dist
    world_size = test.args["world_size"]
    os.environ["MASTER_ADDR"] = "127.0.0.1"
    os.environ["MASTER_PORT"] = "12356"
    # Make this rank's GPU the current device for the duration of the test.
    # The bare torch.cuda.synchronize() calls below act on the *current*
    # device, which would otherwise be cuda:0 for every rank. The context
    # manager restores the previous device afterwards, so a reused pool
    # worker is not left pointing at a different GPU.
    with torch.cuda.device(rank):
        dist.init_process_group("nccl", init_method="env://", rank=rank, world_size=world_size)
        try:
            data = generate_input(**test.args, rank=rank)
            torch.cuda.synchronize()
            submission_output = custom_kernel(_clone_data(data, rank))
            torch.cuda.synchronize()
            return wrap_check_implementation(data, submission_output)
        finally:
            dist.destroy_process_group()
185+
186+
187+
def run_multi_gpu_test(pool: multiprocessing.Pool, test: TestCase, world_size: int):
    """
    Runs a single test on `world_size` ranks, one pool task per rank.

    :param pool: Process pool that executes the per-rank tasks.
    :param test: The test case handed to every rank.
    :param world_size: Number of ranks to launch.
    :return: Tuple `(correct, error_messages)`. `correct` is true only if
             every rank reported success; `error_messages` holds one
             "rank N: ..." line per failing rank, joined by newlines.
    """
    # Submit every rank before waiting on any result, so that all ranks can
    # be in flight at the same time.
    pending = [
        pool.apply_async(_run_distributed_test, args=(test, rank))
        for rank in range(world_size)
    ]
    results = [job.get() for job in pending]

    correct = all(result[0] for result in results)
    failures = (
        f"rank {rank}: {result[1]}"
        for rank, result in enumerate(results)
        if not result[0]
    )
    return correct, "\n".join(failures)
205+
206+
165207
def run_single_test(pool: multiprocessing.Pool, test: TestCase):
    """
    Runs a single test in another process.

    Test cases whose args contain "world_size" are dispatched as multi-GPU
    tests with one process per rank; all others run as a single task.

    :param pool: Process pool used to execute the test.
    :param test: The test case to run.
    :return: The result of the check for this test case.
    """
    world_size = test.args.get("world_size", None)
    if world_size is None:
        # _run_single_test takes the test case as its only argument.
        return pool.apply(_run_single_test, (test,))
    return run_multi_gpu_test(pool, test, world_size)
170216

171217

172218
def run_testing(logger: PopcornOutput, pool: multiprocessing.Pool, tests: list[TestCase]):
@@ -345,14 +391,15 @@ def main():
345391
mode = sys.argv[1]
346392
seed = os.getenv("POPCORN_SEED")
347393
os.unsetenv("POPCORN_SEED")
394+
n_gpus = int(os.getenv("POPCORN_GPUS", "1"))
348395
seed = int(seed) if seed else None
349396
set_seed(seed or 42)
350397
tests = get_test_cases(sys.argv[2], seed)
351398

352399
with PopcornOutput(int(fd)) as logger:
353400
import multiprocessing
354401
mp_context = multiprocessing.get_context('spawn')
355-
with mp_context.Pool(1) as pool:
402+
with mp_context.Pool(n_gpus) as pool:
356403
if mode == "test":
357404
return run_testing(logger, pool, tests)
358405
if mode == "benchmark":

examples/gather/reference.py

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
import torch
2+
from task import input_t, output_t
3+
from utils import verbose_allclose
4+
from typing import Tuple
5+
6+
7+
def generate_input(seed: int, world_size: int, rank: int) -> input_t:
    """
    Builds the input for one rank: a one-element tensor holding `rank`,
    placed on GPU `rank`, together with `rank` and `world_size`.

    `seed` is accepted but not used; the input is fully determined by `rank`.
    """
    device = f"cuda:{rank}"
    rank_tensor = torch.tensor([rank], device=device)
    return rank_tensor, rank, world_size
10+
11+
12+
def check_implementation(data: input_t, output: output_t) -> Tuple[bool, str]:
    """
    Checks the gathered output of one rank.

    For every `i` below `world_size`, `output[i]` must live on this rank's
    device and hold the value `i`.

    :param data: The `(tensor, rank, world_size)` tuple given to the kernel.
    :param output: The tensors returned by the kernel.
    :return: `(True, '')` on success, otherwise `(False, reason)`.
    """
    _, rank, world_size = data
    # Report a short result as a failed check instead of letting the loop
    # below run off the end with an IndexError.
    if len(output) < world_size:
        return False, f"mismatch found! rank {rank} returned {len(output)} outputs, expected {world_size}"
    for i in range(world_size):
        entry = output[i]
        if entry.get_device() != rank:
            return False, f"mismatch found! output {i} of rank {rank} is on device {entry.device}"
        if (item := entry.cpu().detach().item()) != i:
            return False, f"mismatch found! custom implementation doesn't match reference: rank {rank}, entry {i} has value {item}"
    return True, ''

examples/gather/submission.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
#!POPCORN leaderboard identity_py-dev
2+
3+
from task import input_t, output_t
4+
import torch
5+
from torch import distributed as dist
6+
7+
8+
def custom_kernel(data: input_t) -> output_t:
    """
    All-gathers the local tensor across the current process group.

    Returns a list with one tensor per rank, as filled in by
    `dist.all_gather`.
    """
    local, _rank, _world_size = data
    gathered = []
    for _ in range(dist.get_world_size()):
        gathered.append(torch.empty_like(local))
    dist.all_gather(gathered, local)
    return gathered

examples/gather/task.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
from typing import TypedDict, List, Tuple
2+
import torch
3+
4+
5+
input_t = Tuple[torch.Tensor, int, int]
6+
output_t = List[torch.Tensor]
7+
8+
9+
class TestSpec(TypedDict):
    """Typed dict that currently declares no keys (presumably the schema of a test case's arguments)."""

examples/gather/task.yml

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
# name: identity-py
2+
3+
files:
4+
- {"name": "submission.py", "source": "@SUBMISSION@"}
5+
- {"name": "task.py", "source": "task.py"}
6+
- {"name": "utils.py", "source": "../utils.py"}
7+
- {"name": "reference.py", "source": "reference.py"}
8+
- {"name": "eval.py", "source": "../eval.py"}
9+
10+
lang: "py"
11+
multi_gpu: true
12+
description:
13+
A simple test task - python
14+
15+
config:
16+
main: "eval.py"
17+
18+
templates:
19+
Python: "../template.py"
20+
21+
# small test cases. should be cheap to run.
22+
tests:
23+
- {"seed": 5, "world_size": 4}
24+
25+
benchmarks:
26+
- {"seed": 10}
27+
28+
ranking_by: "geom"

examples/gather/wrong.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
#!POPCORN leaderboard identity_py-dev
2+
3+
from task import input_t, output_t
4+
import torch
5+
from torch import distributed as dist
6+
7+
8+
def custom_kernel(data: input_t) -> output_t:
    """
    Deliberately incorrect submission: performs no communication and
    returns one all-ones tensor per rank of the current process group.
    """
    local, _rank, _world_size = data
    ones = []
    for _ in range(dist.get_world_size()):
        ones.append(torch.ones_like(local))
    return ones

src/libkernelbot/consts.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,8 @@ class ModalGPU(Enum):
2828
A100 = "A100"
2929
H100 = "H100"
3030
B200 = "B200"
31+
# multi-gpu
32+
L4x4 = "L4x4"
3133

3234

3335
@dataclasses.dataclass
@@ -109,7 +111,8 @@ class RankCriterion(Enum):
109111

110112
GPU_TO_SM = {
111113
"T4": "75",
112-
"L4": "80",
114+
"L4": "89",
115+
"L4x4": "89",
113116
"A100": "80",
114117
"H100": "90a",
115118
"B200": "100",

src/libkernelbot/run_eval.py

Lines changed: 12 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,7 @@ class RunResult:
4444
class SystemInfo:
4545
# fmt: off
4646
gpu: str = '' # Model name of the GPU
47+
device_count: int = 1 # Number of GPUs
4748
cpu: str = '' # Model name of the CPU
4849
platform: str = '' # Platform string of the machine
4950
torch: str = '' # Torch version
@@ -217,7 +218,7 @@ def compile_cuda_script( # # noqa: C901
217218
)
218219

219220

220-
def run_program(args: list[str], seed: Optional[int], timeout: int) -> RunResult:
221+
def run_program(args: list[str], seed: Optional[int], timeout: int, multi_gpu: bool = False) -> RunResult:
221222
print("[Running]")
222223
# set up a pipe so the tester can communicate its verdict with us
223224
env = os.environ.copy()
@@ -226,6 +227,10 @@ def run_program(args: list[str], seed: Optional[int], timeout: int) -> RunResult
226227
if seed is not None:
227228
env["POPCORN_SEED"] = str(seed)
228229

230+
if multi_gpu:
231+
import torch
232+
env["POPCORN_GPUS"] = str(torch.cuda.device_count())
233+
229234
execution_start_time = time.perf_counter()
230235
try:
231236
run_process = subprocess.run(
@@ -279,6 +284,8 @@ def run_program(args: list[str], seed: Optional[int], timeout: int) -> RunResult
279284
def run_single_evaluation(
280285
call: list[str],
281286
mode: str,
287+
*,
288+
multi_gpu: bool = False,
282289
tests: Optional[str] = None,
283290
benchmarks: Optional[str] = None,
284291
test_timeout: int = Timeout.TEST,
@@ -295,7 +302,7 @@ def run_single_evaluation(
295302
with tempfile.NamedTemporaryFile("w") as tests_file:
296303
tests_file.write(tests)
297304
tests_file.flush()
298-
return run_program(call + [mode, tests_file.name], seed=seed, timeout=test_timeout)
305+
return run_program(call + [mode, tests_file.name], seed=seed, timeout=test_timeout, multi_gpu=multi_gpu)
299306
elif mode in ["benchmark", "profile", "leaderboard"]:
300307
timeout = ranked_timeout if mode == "leaderboard" else benchmark_timeout
301308
with tempfile.NamedTemporaryFile("w") as bench_file:
@@ -304,7 +311,7 @@ def run_single_evaluation(
304311
else:
305312
bench_file.write(benchmarks)
306313
bench_file.flush()
307-
return run_program(call + [mode, bench_file.name], seed=seed, timeout=timeout)
314+
return run_program(call + [mode, bench_file.name], seed=seed, timeout=timeout, multi_gpu=multi_gpu)
308315
else:
309316
raise ValueError(f"Invalid mode {mode}")
310317

@@ -319,6 +326,7 @@ def make_system_info() -> SystemInfo:
319326
# https://pytorch.org/docs/stable/notes/hip.html
320327
if torch.cuda.is_available():
321328
info.gpu = torch.cuda.get_device_name()
329+
info.device_count = torch.cuda.device_count()
322330
except ImportError:
323331
# get GPU info manually
324332
try:
@@ -551,6 +559,7 @@ def run_config(config: dict):
551559
"ranked_timeout": config.get("ranked_timeout", Timeout.RANKED),
552560
"benchmark_timeout": config.get("benchmark_timeout", Timeout.BENCHMARK),
553561
"test_timeout": config.get("test_timeout", Timeout.TEST),
562+
"multi_gpu": config.get("multi_gpu", False),
554563
}
555564
if config["lang"] == "py":
556565
runner = functools.partial(

src/libkernelbot/task.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,7 @@ class LeaderboardTask:
6161
ranked_timeout: int = 180
6262
ranking_by: RankCriterion = RankCriterion.LAST
6363
seed: Optional[int] = None
64+
multi_gpu: bool = False
6465

6566
def __post_init__(self):
6667
if self.lang == Language.Python and not isinstance(self.config, PythonTaskData):
@@ -75,6 +76,7 @@ def from_dict(cls, data: dict):
7576
criterion = RankCriterion(data.get("ranking_by", RankCriterion.LAST))
7677
data_["lang"] = lang
7778
data_["ranking_by"] = criterion
79+
data_["multi_gpu"] = data.get("multi_gpu", False)
7880
if lang == Language.Python:
7981
data_["config"] = PythonTaskData(**data["config"])
8082
else:
@@ -176,6 +178,7 @@ def build_task_config(
176178
"ranked_timeout": task.ranked_timeout,
177179
"ranking_by": task.ranking_by.value,
178180
"seed": task.seed,
181+
"multi_gpu": task.multi_gpu,
179182
}
180183

181184
if task.lang == Language.Python:

src/runners/modal_runner.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,7 @@
4040
)
4141
# other frameworks
4242
.pip_install(
43-
"jax[cuda12]==0.5.3", # 0.6 want's cudnn 9.8 in conflict with torch 2.7
43+
"jax[cuda12]==0.5.3", # 0.6 want's cudnn 9.8 in conflict with torch 2.7
4444
"jax2torch==0.0.7",
4545
"tinygrad~=0.10",
4646
)
@@ -50,8 +50,8 @@
5050
"nvidia-cutlass-dsl~=4.0",
5151
"cuda-core[cu12]~=0.3",
5252
"cuda-python[all]==12.8",
53-
#"nvmath-python[cu12]~=0.4",
54-
#"numba-cuda[cu12]~=0.15",
53+
# "nvmath-python[cu12]~=0.4",
54+
# "numba-cuda[cu12]~=0.15",
5555
)
5656
)
5757

0 commit comments

Comments
 (0)