Skip to content

Reproducing Table 2 from Orion Paper #35

@atomicapple0

Description

@atomicapple0

I am trying to reproduce the numbers for the conv/bnorm toy benchmark from the Orion paper. I saw some code provided here, but did not see a script that runs bnorm and conv in parallel on different streams. I rewrote this benchmark in the following script and reran it on an H100. I got the following results and didn't see any significant speedup from running in parallel. Any advice?

(.venv) ubuntu@209-20-159-95:~/conv_bnorm$ python3 main.py 
------------------------------
solo_conv: 0.42575 +- 0.00234 ms
solo_bnorm: 0.14967 +- 0.00172 ms
------------------------------
conv_conv_seq 0.80283 +- 0.00713 ms
conv_conv_par 0.77428 +- 0.01028 ms
speedup: 1.04x
------------------------------
bnorm_bnorm_seq 0.25994 +- 0.00549 ms
bnorm_bnorm_par 0.25029 +- 0.00464 ms
speedup: 1.04x
------------------------------
conv_bnorm_seq 0.53533 +- 0.00751 ms
conv_bnorm_par 0.51155 +- 0.01055 ms
speedup: 1.05x
------------------------------
bnorm_conv_seq 0.53632 +- 0.00601 ms
bnorm_conv_par 0.52873 +- 0.00558 ms
speedup: 1.01x
------------------------------

Source:

import os
from platform import node
import sched
import sys
import torch
import torch.distributed as dist
import torch.multiprocessing as mp
from torchvision import models, datasets, transforms
from torch.nn.parallel import DistributedDataParallel as DDP
import torch.nn.functional as F
from torch.multiprocessing import Pool, Process, set_start_method, Manager, Value, Lock
from datetime import timedelta
import random
import numpy as np
import time
import os
import argparse
import threading

class Conv(torch.nn.Module):
    """Module wrapping a single 7x7, stride-2, padding-3 convolution (3 -> 64 channels, no bias)."""

    def __init__(self):
        super().__init__()
        self.conv = torch.nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3, bias=False)

    def forward(self, x):
        """Apply the convolution to ``x`` and return the resulting tensor."""
        # The result must be returned: without it the module always yields
        # None and callers cannot use or inspect the output.
        return self.conv(x)

class Bnorm(torch.nn.Module):
    """Module wrapping a single ``BatchNorm2d`` layer over 64 channels."""

    def __init__(self):
        super().__init__()
        self.bn = torch.nn.BatchNorm2d(64)

    def forward(self, x):
        """Apply batch normalization to ``x`` and return the resulting tensor."""
        # The result must be returned: without it the module always yields
        # None and callers cannot use or inspect the output.
        return self.bn(x)


# Batch size shared by every benchmark input.
bs = 32

# Both modules live on the GPU and are switched to eval mode.
conv_model = Conv().cuda().eval()
bnorm_model = Bnorm().cuda().eval()

# Random inputs are generated on the host and then copied to the GPU.
# NOTE(review): a 224x224 input through Conv (kernel 7, stride 2, padding 3)
# produces 112x112 feature maps; the 122x122 used for the batch-norm input may
# be a typo for 112 -- confirm against the intended benchmark setup.
conv_data = torch.rand(bs, 3, 224, 224).cuda().contiguous()
bnorm_data = torch.rand(bs, 64, 122, 122).cuda().contiguous()

# Two separate copies of each input, so a pair of runs can use different
# tensors (selected by index 0 or 1).
conv_datas = [conv_data.clone(), conv_data.clone()]
bnorm_datas = [bnorm_data.clone(), bnorm_data.clone()]

def run_conv(curr_stream, data_idx=0):
    """Run ``conv_model`` once with ``curr_stream`` as the current CUDA stream.

    ``data_idx`` picks the entry of ``conv_datas`` used as input. Gradient
    tracking is disabled and the result is discarded.
    """
    batch = conv_datas[data_idx]
    with torch.no_grad(), torch.cuda.stream(curr_stream):
        _ = conv_model(batch)

def run_bnorm(curr_stream, data_idx=0):
    """Run ``bnorm_model`` once with ``curr_stream`` as the current CUDA stream.

    ``data_idx`` picks the entry of ``bnorm_datas`` used as input. Gradient
    tracking is disabled and the result is discarded.
    """
    batch = bnorm_datas[data_idx]
    with torch.no_grad(), torch.cuda.stream(curr_stream):
        _ = bnorm_model(batch)


# `stream` carries the timing events; `streamA` and `streamB` carry the
# benchmarked work.
stream, streamA, streamB = (torch.cuda.Stream() for _ in range(3))

# Timing-enabled events shared by solo(), seq() and par().
event1, event2, event3, event4 = (
    torch.cuda.Event(enable_timing=True) for _ in range(4)
)

def solo(fun):
    """Time a single call of ``fun`` on ``streamA``.

    ``fun`` is called as ``fun(streamA)``. Returns the time between
    ``event1`` and ``event4`` (both recorded on ``stream``) as reported by
    ``event1.elapsed_time(event4)``.
    """
    torch.cuda.synchronize()

    # Open the timed window BEFORE any work is enqueued and gate streamA on
    # it. Recording the start event after launching the work lets the kernel
    # begin before the window opens, so the measurement under-counts.
    event1.record(stream)
    event1.wait(stream=streamA)

    fun(streamA)
    event2.record(streamA)

    # Close the window only once the work on streamA has finished.
    event2.wait(stream=stream)
    event4.record(stream)

    torch.cuda.synchronize()
    return event1.elapsed_time(event4)

def seq(funA, funB):
    """Time ``funA`` followed by ``funB`` on the same stream (``streamA``).

    ``funA`` is called as ``funA(streamA)`` and ``funB`` as
    ``funB(streamA, data_idx=1)``. Returns the time between ``event1`` and
    ``event4`` (both recorded on ``stream``) as reported by
    ``event1.elapsed_time(event4)``.
    """
    torch.cuda.synchronize()

    # Open the timed window BEFORE any work is enqueued and gate streamA on
    # it. Recording the start event after launching the work lets the first
    # kernel begin before the window opens, so the measurement under-counts.
    event1.record(stream)
    event1.wait(stream=streamA)

    funA(streamA)
    funB(streamA, data_idx=1)
    event2.record(streamA)

    # Close the window only once all work on streamA has finished.
    event2.wait(stream=stream)
    event4.record(stream)

    torch.cuda.synchronize()
    return event1.elapsed_time(event4)

def par(funA, funB):
    """Time ``funA`` on ``streamA`` and ``funB`` on ``streamB`` issued together.

    ``funA`` is called as ``funA(streamA)`` and ``funB`` as
    ``funB(streamB, data_idx=1)``. Returns the time between ``event1`` and
    ``event4`` (both recorded on ``stream``) as reported by
    ``event1.elapsed_time(event4)``.
    """
    torch.cuda.synchronize()

    # Open the timed window BEFORE any work is enqueued and gate both worker
    # streams on it. Recording the start event after launching the work lets
    # kernels begin before the window opens, so the measurement under-counts
    # (and funA gets a head start over funB that is never timed).
    event1.record(stream)
    event1.wait(stream=streamA)
    event1.wait(stream=streamB)

    funA(streamA)
    funB(streamB, data_idx=1)
    event2.record(streamA)
    event3.record(streamB)

    # Close the window only once the work on BOTH streams has finished.
    event2.wait(stream=stream)
    event3.wait(stream=stream)
    event4.record(stream)

    torch.cuda.synchronize()
    return event1.elapsed_time(event4)

def timeit(fun, iters=100, warmup=10):
    """Call ``fun`` repeatedly and report the mean and std of its return values.

    Args:
        fun: Zero-argument callable returning one numeric sample per call.
        iters: Total number of calls to make (default 100).
        warmup: Number of leading samples to discard before computing the
            statistics (default 10).

    Returns:
        The mean of the retained samples. A line of the form
        ``<mean> +- <std> ms`` is also printed.

    Raises:
        ValueError: If ``warmup`` is negative or no samples would remain
            after discarding the warm-up ones.
    """
    if warmup < 0 or iters <= warmup:
        raise ValueError(
            f'need 0 <= warmup < iters, got warmup={warmup}, iters={iters}'
        )
    # Collect every sample, then drop the warm-up prefix.
    samples = [fun() for _ in range(iters)][warmup:]
    avg = np.mean(samples)
    std = np.std(samples)
    print(f'{avg:.5f} +- {std:.5f} ms')
    return avg

def speedup(seq_time, par_time):
    """Print how many times faster ``par_time`` is than ``seq_time``."""
    ratio = seq_time / par_time
    print(f'speedup: {ratio:.2f}x')

# Baselines: each kernel timed on its own.
print('------------------------------')
for _label, _kernel in (
    ('solo_conv: ', run_conv),
    ('solo_bnorm: ', run_bnorm),
):
    print(_label, end='')
    timeit(lambda: solo(_kernel))
print('------------------------------')

# Every ordered pair of kernels: same-stream timing first, then two-stream
# timing, followed by the ratio of the two.
for _seq_label, _par_label, _first, _second in (
    ('conv_conv_seq ', 'conv_conv_par ', run_conv, run_conv),
    ('bnorm_bnorm_seq ', 'bnorm_bnorm_par ', run_bnorm, run_bnorm),
    ('conv_bnorm_seq ', 'conv_bnorm_par ', run_conv, run_bnorm),
    ('bnorm_conv_seq ', 'bnorm_conv_par ', run_bnorm, run_conv),
):
    print(_seq_label, end='')
    seq_time = timeit(lambda: seq(_first, _second))
    print(_par_label, end='')
    par_time = timeit(lambda: par(_first, _second))
    speedup(seq_time, par_time)
    print('------------------------------')

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions