Reproducing Table 2 from Orion Paper

I am trying to reproduce the numbers for the `conv/bnorm` toy benchmark from the Orion paper. I saw some code provided [here](https://github.com/eth-easl/orion/tree/main/profiling/benchmarks), but did not see a script that runs bnorm and conv in parallel on different streams. I rewrote this benchmark as the following script and reran it on an H100. I got the following results and didn't see any significant speedup from running in parallel. Any advice?

```
(.venv) ubuntu@209-20-159-95:~/conv_bnorm$ python3 main.py 
------------------------------
solo_conv: 0.42575 +- 0.00234 ms
solo_bnorm: 0.14967 +- 0.00172 ms
------------------------------
conv_conv_seq 0.80283 +- 0.00713 ms
conv_conv_par 0.77428 +- 0.01028 ms
speedup: 1.04x
------------------------------
bnorm_bnorm_seq 0.25994 +- 0.00549 ms
bnorm_bnorm_par 0.25029 +- 0.00464 ms
speedup: 1.04x
------------------------------
conv_bnorm_seq 0.53533 +- 0.00751 ms
conv_bnorm_par 0.51155 +- 0.01055 ms
speedup: 1.05x
------------------------------
bnorm_conv_seq 0.53632 +- 0.00601 ms
bnorm_conv_par 0.52873 +- 0.00558 ms
speedup: 1.01x
------------------------------
```

Source:
```
import os
from platform import node
import sched
import sys
import torch
import torch.distributed as dist
import torch.multiprocessing as mp
from torchvision import models, datasets, transforms
from torch.nn.parallel import DistributedDataParallel as DDP
import torch.nn.functional as F
from torch.multiprocessing import Pool, Process, set_start_method, Manager, Value, Lock
from datetime import timedelta
import random
import numpy as np
import time
import os
import argparse
import threading

class Conv(torch.nn.Module):
    """Single 7x7, stride-2 convolution mapping 3 input channels to 64."""

    def __init__(self):
        super().__init__()
        self.conv = torch.nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3, bias=False)

    def forward(self, x):
        """Apply the convolution to ``x`` and return the result.

        The result must be returned: without it the module evaluates to
        ``None`` and callers cannot use or inspect the output.
        """
        return self.conv(x)

class Bnorm(torch.nn.Module):
    """Single 2-D batch-normalization layer over 64 channels."""

    def __init__(self):
        super().__init__()
        self.bn = torch.nn.BatchNorm2d(64)

    def forward(self, x):
        """Apply batch normalization to ``x`` and return the result.

        The result must be returned: without it the module evaluates to
        ``None`` and callers cannot use or inspect the output.
        """
        return self.bn(x)


# Benchmark fixtures: one model of each kind in eval mode on the GPU, plus
# two independent copies of each input so that paired runs do not share a
# tensor.
bs = 32

conv_model = Conv().cuda()
conv_model.eval()
bnorm_model = Bnorm().cuda()
bnorm_model.eval()

conv_data = torch.rand(bs, 3, 224, 224).cuda().contiguous()
# NOTE(review): the Conv layer (kernel 7, stride 2, padding 3) maps 224x224
# inputs to 112x112; the 122 below may be a typo for 112 -- confirm.
bnorm_data = torch.rand(bs, 64, 122, 122).cuda().contiguous()

conv_datas = [conv_data.clone(), conv_data.clone()]
bnorm_datas = [bnorm_data.clone(), bnorm_data.clone()]

def run_conv(curr_stream, data_idx=0):
    """Enqueue one forward pass of ``conv_model`` on ``curr_stream``.

    ``data_idx`` selects which entry of ``conv_datas`` is used as input.
    Gradient tracking is disabled for the call; nothing is returned.
    """
    batch = conv_datas[data_idx]
    with torch.no_grad(), torch.cuda.stream(curr_stream):
        conv_model(batch)

def run_bnorm(curr_stream, data_idx=0):
    """Enqueue one forward pass of ``bnorm_model`` on ``curr_stream``.

    ``data_idx`` selects which entry of ``bnorm_datas`` is used as input.
    Gradient tracking is disabled for the call; nothing is returned.
    """
    batch = bnorm_datas[data_idx]
    with torch.no_grad(), torch.cuda.stream(curr_stream):
        bnorm_model(batch)


# `stream` carries the timing markers; `streamA` and `streamB` carry the
# benchmarked work.
stream, streamA, streamB = (torch.cuda.Stream() for _ in range(3))

# Timing-enabled events: event1/event4 are the start/end markers recorded on
# `stream`; event2/event3 mark completion on the worker streams.
event1, event2, event3, event4 = (
    torch.cuda.Event(enable_timing=True) for _ in range(4)
)

def _timed_run(enqueue, worker_streams):
    """Time the GPU work submitted by ``enqueue`` and return milliseconds.

    The start marker (``event1``) is recorded on the timing stream *before*
    any work is enqueued, and every worker stream waits on it, so the
    measured interval brackets the whole workload. Recording the start marker
    after the launches lets the kernels begin running before the clock
    starts, which under-reports the elapsed time.

    Args:
        enqueue: Zero-argument callable that launches the work on the
            worker streams.
        worker_streams: The streams (at most two) that ``enqueue`` uses.

    Returns:
        Elapsed time between ``event1`` and ``event4`` in milliseconds.
    """
    torch.cuda.synchronize()

    # Start marker first, then gate each worker stream on it.
    event1.record(stream)
    for worker in worker_streams:
        event1.wait(stream=worker)

    enqueue()

    # One completion event per worker stream; the timing stream waits on all
    # of them before the end marker is recorded.
    done_events = (event2, event3)[:len(worker_streams)]
    for done, worker in zip(done_events, worker_streams):
        done.record(worker)
    for done in done_events:
        done.wait(stream=stream)
    event4.record(stream)

    torch.cuda.synchronize()
    return event1.elapsed_time(event4)


def solo(fun):
    """Return the time in ms of a single ``fun`` launch on ``streamA``."""
    def enqueue():
        fun(streamA)

    return _timed_run(enqueue, [streamA])


def seq(funA, funB):
    """Return the time in ms of ``funA`` then ``funB``, both on ``streamA``.

    ``funB`` is called with ``data_idx=1`` so it uses the second input copy.
    """
    def enqueue():
        funA(streamA)
        funB(streamA, data_idx=1)

    return _timed_run(enqueue, [streamA])


def par(funA, funB):
    """Return the time in ms of ``funA`` on ``streamA`` and ``funB`` on ``streamB``.

    ``funB`` is called with ``data_idx=1`` so it uses the second input copy.
    Both launches are issued from the calling thread, ``funA`` first.
    """
    def enqueue():
        funA(streamA)
        funB(streamB, data_idx=1)

    return _timed_run(enqueue, [streamA, streamB])

def timeit(fun, iters=100, warmup=10):
    """Call ``fun`` repeatedly and report the mean and spread of its results.

    Args:
        fun: Zero-argument callable returning one measurement (printed as
            milliseconds).
        iters: Total number of calls to make.
        warmup: Number of leading measurements to discard.

    Returns:
        The mean of the measurements kept after the warm-up ones are dropped.

    Raises:
        ValueError: If no measurement would remain after discarding warm-ups.
    """
    if not 0 <= warmup < iters:
        raise ValueError(
            f'need 0 <= warmup < iters, got warmup={warmup}, iters={iters}'
        )
    # Run every iteration, then drop the warm-up measurements.
    samples = [fun() for _ in range(iters)][warmup:]
    avg = np.mean(samples)
    std = np.std(samples)
    print(f'{avg:.5f} +- {std:.5f} ms')
    return avg

def speedup(seq_time, par_time):
    """Print the ratio of ``seq_time`` to ``par_time`` with two decimals."""
    ratio = seq_time / par_time
    print(f'speedup: {ratio:.2f}x')

# Report driver: time each kernel alone, then every ordered pair both
# back-to-back on one stream and side-by-side on two streams.
SEPARATOR = '------------------------------'

print(SEPARATOR)
for label, kernel in (('solo_conv', run_conv), ('solo_bnorm', run_bnorm)):
    print(f'{label}: ', end='')
    timeit(lambda: solo(kernel))
print(SEPARATOR)

kernel_pairs = (
    ('conv', run_conv, 'conv', run_conv),
    ('bnorm', run_bnorm, 'bnorm', run_bnorm),
    ('conv', run_conv, 'bnorm', run_bnorm),
    ('bnorm', run_bnorm, 'conv', run_conv),
)
for name_a, fun_a, name_b, fun_b in kernel_pairs:
    # The lambdas are invoked within the same iteration, so they see the
    # current pair.
    print(f'{name_a}_{name_b}_seq ', end='')
    seq_time = timeit(lambda: seq(fun_a, fun_b))
    print(f'{name_a}_{name_b}_par ', end='')
    par_time = timeit(lambda: par(fun_a, fun_b))
    speedup(seq_time, par_time)
    print(SEPARATOR)
```


Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Reproducing Table 2 from Orion Paper #35

Metadata

Assignees

Labels

Type

Projects

Milestone

Relationships

Development

Reproducing Table 2 from Orion Paper #35

Description

Metadata

Metadata

Assignees

Labels

Type

Projects

Milestone

Relationships

Development

Issue actions