From e13a735df9b1ff22692eef24011121bb194f2c8f Mon Sep 17 00:00:00 2001 From: Panlichen Date: Mon, 19 Dec 2022 09:15:04 +0000 Subject: [PATCH 01/33] ori of run small vit ok --- .gitignore | 6 +++++- configs/vit_imagenet.py | 9 ++++++++- 2 files changed, 13 insertions(+), 2 deletions(-) diff --git a/.gitignore b/.gitignore index cd06a347d..747699a86 100644 --- a/.gitignore +++ b/.gitignore @@ -129,4 +129,8 @@ venv.bak/ dmypy.json # Pyre type checker -.pyre/ \ No newline at end of file +.pyre/ + +config/ +version.py +output/ \ No newline at end of file diff --git a/configs/vit_imagenet.py b/configs/vit_imagenet.py index 91bdc914b..f9788bd50 100644 --- a/configs/vit_imagenet.py +++ b/configs/vit_imagenet.py @@ -1,5 +1,6 @@ from libai.config import LazyCall -from .common.models.vit.vit_base_patch16_224 import model +# from .common.models.vit.vit_base_patch16_224 import model +from .common.models.vit.vit_small_patch16_224 import model from .common.models.graph import graph from .common.train import train from .common.optim import optim @@ -12,6 +13,12 @@ dataloader.train.dataset[0].root = "/path/to/imagenet" dataloader.test[0].dataset.root = "/path/to/imagenet" +import os +host = os.environ.get('HOST') +if (host == "oneflow-28"): + dataloader.train.dataset[0].root = "/ssd/dataset/ImageNet/extract" + dataloader.test[0].dataset.root = "/ssd/dataset/ImageNet/extract" + # Refine model cfg for vit training on imagenet model.cfg.num_classes = 1000 model.cfg.loss_func = SoftTargetCrossEntropy() From e44fb2fbc2b6ccf8cb65fc7c8cc8b45beb8c38d1 Mon Sep 17 00:00:00 2001 From: Panlichen Date: Thu, 22 Dec 2022 10:52:53 +0000 Subject: [PATCH 02/33] update path --- configs/vit_imagenet.py | 30 ++++++++++++++++++++++++++---- 1 file changed, 26 insertions(+), 4 deletions(-) diff --git a/configs/vit_imagenet.py b/configs/vit_imagenet.py index f9788bd50..e823437da 100644 --- a/configs/vit_imagenet.py +++ b/configs/vit_imagenet.py @@ -1,6 +1,12 @@ from libai.config import LazyCall -# from .common.models.vit.vit_base_patch16_224 import model -from .common.models.vit.vit_small_patch16_224 import model + +import os +host = os.environ.get('HOST') +if (host == "oneflow-25" or host == "oneflow-27"): + from .common.models.vit.vit_base_patch16_224 import model +else: + from .common.models.vit.vit_small_patch16_224 import model + from .common.models.graph import graph from .common.train import train from .common.optim import optim @@ -13,11 +19,27 @@ dataloader.train.dataset[0].root = "/path/to/imagenet" dataloader.test[0].dataset.root = "/path/to/imagenet" -import os -host = os.environ.get('HOST') if (host == "oneflow-28"): dataloader.train.dataset[0].root = "/ssd/dataset/ImageNet/extract" dataloader.test[0].dataset.root = "/ssd/dataset/ImageNet/extract" +elif (host == "oneflow-15"): + dataloader.train.dataset[0].root = "/home/panlichen/dataset/ImageNet/extract" + dataloader.test[0].dataset.root = "/home/panlichen/dataset/ImageNet/extract" +elif (host == "oneflow-16"): + dataloader.train.dataset[0].root = "/dataset/ImageNet/extract" + dataloader.test[0].dataset.root = "/dataset/ImageNet/extract" +elif (host == "oneflow-25"): + dataloader.train.dataset[0].root = "/data/dataset/ImageNet/extract" + dataloader.test[0].dataset.root = "/data/dataset/ImageNet/extract" +elif (host == "oneflow-26"): + dataloader.train.dataset[0].root = "/data/home/panlichen/ImageNet/extract" + dataloader.test[0].dataset.root = "/data/home/panlichen/ImageNet/extract" +elif (host == "oneflow-27"): + dataloader.train.dataset[0].root = "/data/home/panlichen/ImageNet/extract" + dataloader.test[0].dataset.root = "/data/home/panlichen/ImageNet/extract" +else: + print("NO LEGAL HOST, exit.") + exit(1) # Refine model cfg for vit training on imagenet model.cfg.num_classes = 1000 From 6c83d185b1f6dd4e2b5344b87660515c16d9eaaa Mon Sep 17 00:00:00 2001 From: Panlichen Date: Thu, 22 Dec 2022 11:53:30 +0000 Subject: [PATCH 03/33] update path --- configs/vit_imagenet.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/configs/vit_imagenet.py b/configs/vit_imagenet.py index e823437da..cc480e7d4 100644 --- a/configs/vit_imagenet.py +++ b/configs/vit_imagenet.py @@ -32,11 +32,11 @@ dataloader.train.dataset[0].root = "/data/dataset/ImageNet/extract" dataloader.test[0].dataset.root = "/data/dataset/ImageNet/extract" elif (host == "oneflow-26"): - dataloader.train.dataset[0].root = "/data/home/panlichen/ImageNet/extract" - dataloader.test[0].dataset.root = "/data/home/panlichen/ImageNet/extract" + dataloader.train.dataset[0].root = "/ssd/dataset/ImageNet/extract" + dataloader.test[0].dataset.root = "/ssd/dataset/ImageNet/extract" elif (host == "oneflow-27"): - dataloader.train.dataset[0].root = "/data/home/panlichen/ImageNet/extract" - dataloader.test[0].dataset.root = "/data/home/panlichen/ImageNet/extract" + dataloader.train.dataset[0].root = "/ssd/dataset/ImageNet/extract" + dataloader.test[0].dataset.root = "/ssd/dataset/ImageNet/extract" else: print("NO LEGAL HOST, exit.") exit(1) From 873be34092845b8f786ef16da16e458ceaae6cb0 Mon Sep 17 00:00:00 2001 From: Panlichen Date: Fri, 6 Jan 2023 08:09:25 +0000 Subject: [PATCH 04/33] update path --- configs/vit_imagenet.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/configs/vit_imagenet.py b/configs/vit_imagenet.py index cc480e7d4..1ceaa7d4a 100644 --- a/configs/vit_imagenet.py +++ b/configs/vit_imagenet.py @@ -23,8 +23,8 @@ dataloader.train.dataset[0].root = "/ssd/dataset/ImageNet/extract" dataloader.test[0].dataset.root = "/ssd/dataset/ImageNet/extract" elif (host == "oneflow-15"): - dataloader.train.dataset[0].root = "/home/panlichen/dataset/ImageNet/extract" - dataloader.test[0].dataset.root = "/home/panlichen/dataset/ImageNet/extract" + dataloader.train.dataset[0].root = "/minio/sdd/dataset/imagenet/extract" + dataloader.test[0].dataset.root = "/minio/sdd/dataset/imagenet/extract" elif (host == "oneflow-16"): dataloader.train.dataset[0].root = "/dataset/ImageNet/extract" dataloader.test[0].dataset.root = "/dataset/ImageNet/extract" From 0a7cb5d781fe8640607f74e52609c06c2066f14f Mon Sep 17 00:00:00 2001 From: Panlichen Date: Tue, 10 Jan 2023 09:57:58 +0000 Subject: [PATCH 05/33] scripts --- configs/vit_imagenet.py | 3 +- tools/train.sh | 66 +++++++++++++++++++++++++++++++++++++++-- 2 files changed, 66 insertions(+), 3 deletions(-) diff --git a/configs/vit_imagenet.py b/configs/vit_imagenet.py index 1ceaa7d4a..706e844ad 100644 --- a/configs/vit_imagenet.py +++ b/configs/vit_imagenet.py @@ -68,7 +68,8 @@ train.test_micro_batch_size = 128 train.train_epoch = 300 train.warmup_ratio = 5 / 300 -train.evaluation.eval_period = 1000 +train.evaluation.enabled = False +# train.evaluation.eval_period = 100 train.log_period = 1 # Scheduler diff --git a/tools/train.sh b/tools/train.sh index 714ac9953..87f73eee0 100755 --- a/tools/train.sh +++ b/tools/train.sh @@ -1,5 +1,7 @@ #!/usr/bin/env bash +clear + FILE=$1 CONFIG=$2 GPUS=$3 @@ -8,9 +10,69 @@ NODE_RANK=${NODE_RANK:-0} ADDR=${ADDR:-127.0.0.1} PORT=${PORT:-12345} +export GLOG_logtostderr=1 +export ONEFLOW_ACTOR_ENABLE_LIGHT_ACTOR=0 # 禁用lightweight actor + +export NCCL_PROTO=Simple +export NCCL_ALGO=Ring +# export NCCL_MAX_NCHANNELS=1 +# export NCCL_MIN_NCHANNELS=1 +# export NCCL_NTHREADS=64 + +export ONEFLOW_ENABLE_OFCCL=1 +export ONEFLOW_OFCCL_SKIP_NEGO=0 +export ONEFLOW_DEBUG_MODE=1 +export ONEFLOW_PROFILER_KERNEL_PROFILE_KERNEL_FORWARD_RANGE=1 + +export GLOG_vmodule=plan_util*=1,of_collective_actor*=1,of_collective_boxing_kernels*=1,collective_backend_ofccl*=1,hierarchical_sub_task_graph_builder_impl*=1,of_request_store*=1,request_store*=1,runtime*=1,scheduler*=1,collective_manager*=1 +# nn_graph*=1, +# export GLOG_v=1 + +echo ONEFLOW_ENABLE_OFCCL=$ONEFLOW_ENABLE_OFCCL +echo ONEFLOW_OFCCL_SKIP_NEGO=$ONEFLOW_OFCCL_SKIP_NEGO +echo ONEFLOW_ENABLE_OFCCL=$ONEFLOW_DEBUG_MODE +echo ONEFLOW_OFCCL_SKIP_NEGO=$ONEFLOW_PROFILER_KERNEL_PROFILE_KERNEL_FORWARD_RANGE +echo NCCL_PROTO=$NCCL_PROTO +echo NCCL_ALGO=$NCCL_ALGO +echo NCCL_MAX_NCHANNELS=$NCCL_MAX_NCHANNELS +echo NCCL_NTHREADS=$NCCL_NTHREADS +echo ONEFLOW_OFCCL_CHAIN=$ONEFLOW_OFCCL_CHAIN +echo GLOG_vmodule=$GLOG_vmodule +echo GLOG_v=$GLOG_v +echo GLOG_logtostderr=$GLOG_logtostderr + +export SHOW_ALL_PREPARED_COLL=1 + +export TRAVERSE_TIMES=10 +export TOLERANT_UNPROGRESSED_CNT=10000 +export BASE_CTX_SWITCH_THRESHOLD=80 +export BOUNS_SWITCH_4_PROCESSED_COLL=0 +export DEV_TRY_ROUND=10 +export DEBUG_FILE="/home/panlichen/work/oneflow/log/oneflow_cpu_rank_" + +echo TRAVERSE_TIMES=$TRAVERSE_TIMES +echo TOLERANT_UNPROGRESSED_CNT=$TOLERANT_UNPROGRESSED_CNT +echo BASE_CTX_SWITCH_THRESHOLD=$BASE_CTX_SWITCH_THRESHOLD +echo BOUNS_SWITCH_4_PROCESSED_COLL=$BOUNS_SWITCH_4_PROCESSED_COLL +echo DEV_TRY_ROUND=$DEV_TRY_ROUND +echo DEBUG_FILE=$DEBUG_FILE + +export PYTHONUNBUFFERED=1 +echo PYTHONUNBUFFERED=$PYTHONUNBUFFERED +export NCCL_LAUNCH_MODE=PARALLEL +echo NCCL_LAUNCH_MODE=$NCCL_LAUNCH_MODE +# export NCCL_DEBUG=INFO + +rm -rf /home/panlichen/work/libai/log +mkdir -p /home/panlichen/work/libai/log + +rm -rf /home/panlichen/work/oneflow/log +mkdir -p /home/panlichen/work/oneflow/log + export ONEFLOW_FUSE_OPTIMIZER_UPDATE_CAST=true python3 -m oneflow.distributed.launch \ ---nproc_per_node $GPUS --nnodes $NODE --node_rank $NODE_RANK --master_addr $ADDR --master_port $PORT \ -$FILE --config-file $CONFIG ${@:4} + --nproc_per_node $GPUS --nnodes $NODE --node_rank $NODE_RANK --master_addr $ADDR --master_port $PORT \ + $FILE --config-file $CONFIG ${@:4} \ + # > /home/panlichen/work/oneflow/log/oneflow.log 2>&1 From a6141a4cbcb1911c623cf46f68fee81ec370be0d Mon Sep 17 00:00:00 2001 From: Panlichen Date: Tue, 10 Jan 2023 10:20:53 +0000 Subject: [PATCH 06/33] scripts --- tools/train.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/train.sh b/tools/train.sh index 87f73eee0..0f38830a7 100755 --- a/tools/train.sh +++ b/tools/train.sh @@ -74,5 +74,5 @@ export ONEFLOW_FUSE_OPTIMIZER_UPDATE_CAST=true python3 -m oneflow.distributed.launch \ --nproc_per_node $GPUS --nnodes $NODE --node_rank $NODE_RANK --master_addr $ADDR --master_port $PORT \ $FILE --config-file $CONFIG ${@:4} \ - # > /home/panlichen/work/oneflow/log/oneflow.log 2>&1 + > /home/panlichen/work/oneflow/log/oneflow.log 2>&1 From ade18938992dc4b037ab73bc2e64e5d3029563b3 Mon Sep 17 00:00:00 2001 From: Panlichen Date: Tue, 10 Jan 2023 12:52:22 +0000 Subject: [PATCH 07/33] + CUDA_VISIBLE_DEVICES control --- tools/train.sh | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/tools/train.sh b/tools/train.sh index 0f38830a7..cadd966ab 100755 --- a/tools/train.sh +++ b/tools/train.sh @@ -10,6 +10,13 @@ NODE_RANK=${NODE_RANK:-0} ADDR=${ADDR:-127.0.0.1} PORT=${PORT:-12345} +if [ $GPUS = 4 ]; then + export CUDA_VISIBLE_DEVICES=0,1,4,5 +fi +if [ $GPUS = 2 ]; then + export CUDA_VISIBLE_DEVICES=4,5 +fi + export GLOG_logtostderr=1 export ONEFLOW_ACTOR_ENABLE_LIGHT_ACTOR=0 # 禁用lightweight actor @@ -74,5 +81,5 @@ export ONEFLOW_FUSE_OPTIMIZER_UPDATE_CAST=true python3 -m oneflow.distributed.launch \ --nproc_per_node $GPUS --nnodes $NODE --node_rank $NODE_RANK --master_addr $ADDR --master_port $PORT \ $FILE --config-file $CONFIG ${@:4} \ - > /home/panlichen/work/oneflow/log/oneflow.log 2>&1 + # > /home/panlichen/work/oneflow/log/oneflow.log 2>&1 From 0666d423eb2b5e45793a628c864e1788f9b548e0 Mon Sep 17 00:00:00 2001 From: Panlichen Date: Wed, 11 Jan 2023 09:56:41 +0000 Subject: [PATCH 08/33] 0 epoch; 200iter --- configs/vit_imagenet.py | 4 +++- tools/train.sh | 3 ++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/configs/vit_imagenet.py b/configs/vit_imagenet.py index 706e844ad..9e440d681 100644 --- a/configs/vit_imagenet.py +++ b/configs/vit_imagenet.py @@ -66,7 +66,9 @@ # Refine train cfg for vit model train.train_micro_batch_size = 128 train.test_micro_batch_size = 128 -train.train_epoch = 300 +# train.train_epoch = 300 +train.train_epoch = 0 +train.train_iter = 200 train.warmup_ratio = 5 / 300 train.evaluation.enabled = False # train.evaluation.eval_period = 100 diff --git a/tools/train.sh b/tools/train.sh index cadd966ab..8dfdbf5ed 100755 --- a/tools/train.sh +++ b/tools/train.sh @@ -35,6 +35,7 @@ export GLOG_vmodule=plan_util*=1,of_collective_actor*=1,of_collective_boxing_ker # nn_graph*=1, # export GLOG_v=1 +echo GPUS=$GPUS echo ONEFLOW_ENABLE_OFCCL=$ONEFLOW_ENABLE_OFCCL echo ONEFLOW_OFCCL_SKIP_NEGO=$ONEFLOW_OFCCL_SKIP_NEGO echo ONEFLOW_ENABLE_OFCCL=$ONEFLOW_DEBUG_MODE @@ -51,7 +52,7 @@ echo GLOG_logtostderr=$GLOG_logtostderr export SHOW_ALL_PREPARED_COLL=1 export TRAVERSE_TIMES=10 -export TOLERANT_UNPROGRESSED_CNT=10000 +export TOLERANT_UNPROGRESSED_CNT=100000 export BASE_CTX_SWITCH_THRESHOLD=80 export BOUNS_SWITCH_4_PROCESSED_COLL=0 export DEV_TRY_ROUND=10 From 57e54ddfd2e7b8ad8f17cfebf1b777004e770e23 Mon Sep 17 00:00:00 2001 From: Panlichen Date: Fri, 13 Jan 2023 09:17:42 +0000 Subject: [PATCH 09/33] scripts --- tools/train.sh | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tools/train.sh b/tools/train.sh index 8dfdbf5ed..54defdd95 100755 --- a/tools/train.sh +++ b/tools/train.sh @@ -52,10 +52,11 @@ echo GLOG_logtostderr=$GLOG_logtostderr export SHOW_ALL_PREPARED_COLL=1 export TRAVERSE_TIMES=10 -export TOLERANT_UNPROGRESSED_CNT=100000 +export TOLERANT_UNPROGRESSED_CNT=1000000 export BASE_CTX_SWITCH_THRESHOLD=80 export BOUNS_SWITCH_4_PROCESSED_COLL=0 export DEV_TRY_ROUND=10 +export CHECK_REMAINING_SQE_INTERVAL=10000 export DEBUG_FILE="/home/panlichen/work/oneflow/log/oneflow_cpu_rank_" echo TRAVERSE_TIMES=$TRAVERSE_TIMES @@ -63,6 +64,7 @@ echo TOLERANT_UNPROGRESSED_CNT=$TOLERANT_UNPROGRESSED_CNT echo BASE_CTX_SWITCH_THRESHOLD=$BASE_CTX_SWITCH_THRESHOLD echo BOUNS_SWITCH_4_PROCESSED_COLL=$BOUNS_SWITCH_4_PROCESSED_COLL echo DEV_TRY_ROUND=$DEV_TRY_ROUND +echo CHECK_REMAINING_SQE_INTERVAL=$CHECK_REMAINING_SQE_INTERVAL echo DEBUG_FILE=$DEBUG_FILE export PYTHONUNBUFFERED=1 @@ -82,5 +84,5 @@ export ONEFLOW_FUSE_OPTIMIZER_UPDATE_CAST=true python3 -m oneflow.distributed.launch \ --nproc_per_node $GPUS --nnodes $NODE --node_rank $NODE_RANK --master_addr $ADDR --master_port $PORT \ $FILE --config-file $CONFIG ${@:4} \ - # > /home/panlichen/work/oneflow/log/oneflow.log 2>&1 + > /home/panlichen/work/oneflow/log/oneflow.log 2>&1 From 2db72ef5336e8c3f124bb5f905265cce64015d71 Mon Sep 17 00:00:00 2001 From: Panlichen Date: Mon, 16 Jan 2023 09:51:33 +0000 Subject: [PATCH 10/33] + control enable_use_compute_stream --- configs/vit_imagenet.py | 7 ++++--- libai/models/utils/graph_base.py | 7 +++++-- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/configs/vit_imagenet.py b/configs/vit_imagenet.py index 9e440d681..a2e1cd85a 100644 --- a/configs/vit_imagenet.py +++ b/configs/vit_imagenet.py @@ -68,7 +68,8 @@ train.test_micro_batch_size = 128 # train.train_epoch = 300 train.train_epoch = 0 -train.train_iter = 200 +# train.train_iter = 200 +train.train_iter = 10 train.warmup_ratio = 5 / 300 train.evaluation.enabled = False # train.evaluation.eval_period = 100 @@ -84,6 +85,6 @@ # Distributed Settings train.dist.pipeline_num_layers = model.cfg.depth -train.dist.data_parallel_size = 1 -train.dist.tensor_parallel_size = 1 +train.dist.data_parallel_size = 2 +train.dist.tensor_parallel_size = 4 train.dist.pipeline_parallel_size = 1 diff --git a/libai/models/utils/graph_base.py b/libai/models/utils/graph_base.py index dc49a7a7a..dd349c26a 100644 --- a/libai/models/utils/graph_base.py +++ b/libai/models/utils/graph_base.py @@ -73,8 +73,11 @@ def __init__( # Enable cuda stream for computation and communication as the same stream. # This will reduce memory when using model parallelism. dist_util = dist.get_dist_util() - if dist_util.is_tensor_model_parallel() or dist_util.is_pipeline_model_parallel(): - flow.boxing.nccl.enable_use_compute_stream(True) + import os + enable_occl = os.getenv("ONEFLOW_ENABLE_OFCCL") + if enable_occl != "1": + if dist_util.is_tensor_model_parallel() or dist_util.is_pipeline_model_parallel(): + flow.boxing.nccl.enable_use_compute_stream(True) # auto_parallel if auto_parallel_conf is not None and auto_parallel_conf.enabled: From d23a706dc21d255fef867be5cda946f5a4f8c2dc Mon Sep 17 00:00:00 2001 From: Panlichen Date: Tue, 17 Jan 2023 08:53:37 +0000 Subject: [PATCH 11/33] control enable_use_compute_stream with env --- configs/vit_imagenet.py | 4 ++-- libai/models/utils/graph_base.py | 3 ++- tools/train.sh | 1 + 3 files changed, 5 insertions(+), 3 deletions(-) diff --git a/configs/vit_imagenet.py b/configs/vit_imagenet.py index a2e1cd85a..b7dd7bb10 100644 --- a/configs/vit_imagenet.py +++ b/configs/vit_imagenet.py @@ -68,8 +68,8 @@ train.test_micro_batch_size = 128 # train.train_epoch = 300 train.train_epoch = 0 -# train.train_iter = 200 -train.train_iter = 10 +train.train_iter = 200 +# train.train_iter = 10 train.warmup_ratio = 5 / 300 train.evaluation.enabled = False # train.evaluation.eval_period = 100 diff --git a/libai/models/utils/graph_base.py b/libai/models/utils/graph_base.py index dd349c26a..4d532558e 100644 --- a/libai/models/utils/graph_base.py +++ b/libai/models/utils/graph_base.py @@ -75,7 +75,8 @@ def __init__( dist_util = dist.get_dist_util() import os enable_occl = os.getenv("ONEFLOW_ENABLE_OFCCL") - if enable_occl != "1": + disable_nccl_compute_stream = os.getenv("DISABLE_NCCL_COMPUTE_STREAM") + if enable_occl != "1" and disable_nccl_compute_stream != "1": if dist_util.is_tensor_model_parallel() or dist_util.is_pipeline_model_parallel(): flow.boxing.nccl.enable_use_compute_stream(True) diff --git a/tools/train.sh b/tools/train.sh index 54defdd95..972be6b70 100755 --- a/tools/train.sh +++ b/tools/train.sh @@ -27,6 +27,7 @@ export NCCL_ALGO=Ring # export NCCL_NTHREADS=64 export ONEFLOW_ENABLE_OFCCL=1 +# export DISABLE_NCCL_COMPUTE_STREAM=1 export ONEFLOW_OFCCL_SKIP_NEGO=0 export ONEFLOW_DEBUG_MODE=1 export ONEFLOW_PROFILER_KERNEL_PROFILE_KERNEL_FORWARD_RANGE=1 From 2e862fc00928f53e7426e459e6cb7498dc5074d4 Mon Sep 17 00:00:00 2001 From: Panlichen Date: Sun, 5 Feb 2023 12:52:24 +0000 Subject: [PATCH 12/33] hyperparameters --- configs/vit_imagenet.py | 7 +++--- tools/train.sh | 52 ++++++++++++++++++++++++++++++----------- 2 files changed, 42 insertions(+), 17 deletions(-) diff --git a/configs/vit_imagenet.py b/configs/vit_imagenet.py index b7dd7bb10..aff274977 100644 --- a/configs/vit_imagenet.py +++ b/configs/vit_imagenet.py @@ -68,8 +68,7 @@ train.test_micro_batch_size = 128 # train.train_epoch = 300 train.train_epoch = 0 -train.train_iter = 200 -# train.train_iter = 10 +train.train_iter = 10 train.warmup_ratio = 5 / 300 train.evaluation.enabled = False # train.evaluation.eval_period = 100 @@ -85,6 +84,6 @@ # Distributed Settings train.dist.pipeline_num_layers = model.cfg.depth -train.dist.data_parallel_size = 2 -train.dist.tensor_parallel_size = 4 +train.dist.data_parallel_size = 1 +train.dist.tensor_parallel_size = 8 train.dist.pipeline_parallel_size = 1 diff --git a/tools/train.sh b/tools/train.sh index 972be6b70..730dd1ca2 100755 --- a/tools/train.sh +++ b/tools/train.sh @@ -10,13 +10,6 @@ NODE_RANK=${NODE_RANK:-0} ADDR=${ADDR:-127.0.0.1} PORT=${PORT:-12345} -if [ $GPUS = 4 ]; then - export CUDA_VISIBLE_DEVICES=0,1,4,5 -fi -if [ $GPUS = 2 ]; then - export CUDA_VISIBLE_DEVICES=4,5 -fi - export GLOG_logtostderr=1 export ONEFLOW_ACTOR_ENABLE_LIGHT_ACTOR=0 # 禁用lightweight actor @@ -26,8 +19,8 @@ export NCCL_ALGO=Ring # export NCCL_MIN_NCHANNELS=1 # export NCCL_NTHREADS=64 -export ONEFLOW_ENABLE_OFCCL=1 -# export DISABLE_NCCL_COMPUTE_STREAM=1 +# export ONEFLOW_ENABLE_OFCCL=1 +export DISABLE_NCCL_COMPUTE_STREAM=1 export ONEFLOW_OFCCL_SKIP_NEGO=0 export ONEFLOW_DEBUG_MODE=1 export ONEFLOW_PROFILER_KERNEL_PROFILE_KERNEL_FORWARD_RANGE=1 @@ -53,17 +46,50 @@ echo GLOG_logtostderr=$GLOG_logtostderr export SHOW_ALL_PREPARED_COLL=1 export TRAVERSE_TIMES=10 -export TOLERANT_UNPROGRESSED_CNT=1000000 -export BASE_CTX_SWITCH_THRESHOLD=80 -export BOUNS_SWITCH_4_PROCESSED_COLL=0 export DEV_TRY_ROUND=10 export CHECK_REMAINING_SQE_INTERVAL=10000 export DEBUG_FILE="/home/panlichen/work/oneflow/log/oneflow_cpu_rank_" +if [ $GPUS = 2 ]; then + export CUDA_VISIBLE_DEVICES=4,5 + + #pure dp + # export BASE_CTX_SWITCH_THRESHOLD=100 + # export TOLERANT_UNPROGRESSED_CNT=2000 + # export NUM_TRY_TASKQ_HEAD=40 + + #pure tp + export BASE_CTX_SWITCH_THRESHOLD=120 + export TOLERANT_UNPROGRESSED_CNT=10000 + export NUM_TRY_TASKQ_HEAD=100 +elif [ $GPUS = 4 ]; then + export CUDA_VISIBLE_DEVICES=0,1,4,5 + + #pure dp + # export BASE_CTX_SWITCH_THRESHOLD=80 + # export TOLERANT_UNPROGRESSED_CNT=10000 + # export NUM_TRY_TASKQ_HEAD=50 + + #pure tp + export BASE_CTX_SWITCH_THRESHOLD=120 + export TOLERANT_UNPROGRESSED_CNT=14000 + export NUM_TRY_TASKQ_HEAD=120 +elif [ $GPUS = 8 ]; then + #pure dp + # export BASE_CTX_SWITCH_THRESHOLD=120 + # export TOLERANT_UNPROGRESSED_CNT=70000 + # export NUM_TRY_TASKQ_HEAD=240 + + #pure tp + export BASE_CTX_SWITCH_THRESHOLD=120 + export TOLERANT_UNPROGRESSED_CNT=17000 + export NUM_TRY_TASKQ_HEAD=100 +fi + echo TRAVERSE_TIMES=$TRAVERSE_TIMES echo TOLERANT_UNPROGRESSED_CNT=$TOLERANT_UNPROGRESSED_CNT echo BASE_CTX_SWITCH_THRESHOLD=$BASE_CTX_SWITCH_THRESHOLD -echo BOUNS_SWITCH_4_PROCESSED_COLL=$BOUNS_SWITCH_4_PROCESSED_COLL +echo NUM_TRY_TASKQ_HEAD=$NUM_TRY_TASKQ_HEAD echo DEV_TRY_ROUND=$DEV_TRY_ROUND echo CHECK_REMAINING_SQE_INTERVAL=$CHECK_REMAINING_SQE_INTERVAL echo DEBUG_FILE=$DEBUG_FILE From c39a73f056527af4091aecf2976c7b7765e8045b Mon Sep 17 00:00:00 2001 From: Panlichen Date: Sun, 5 Feb 2023 13:34:39 +0000 Subject: [PATCH 13/33] +nsys --- tools/train.sh | 29 +++++++++++++++++++++++++++-- 1 file changed, 27 insertions(+), 2 deletions(-) diff --git a/tools/train.sh b/tools/train.sh index 730dd1ca2..b5c33b47c 100755 --- a/tools/train.sh +++ b/tools/train.sh @@ -19,7 +19,13 @@ export NCCL_ALGO=Ring # export NCCL_MIN_NCHANNELS=1 # export NCCL_NTHREADS=64 -# export ONEFLOW_ENABLE_OFCCL=1 +if [ -z $RUN_TYPE ];then + RUN_TYPE="PURE" + # RUN_TYPE="GDB" + # RUN_TYPE="NSYS" +fi + +export ONEFLOW_ENABLE_OFCCL=1 export DISABLE_NCCL_COMPUTE_STREAM=1 export ONEFLOW_OFCCL_SKIP_NEGO=0 export ONEFLOW_DEBUG_MODE=1 @@ -108,7 +114,26 @@ mkdir -p /home/panlichen/work/oneflow/log export ONEFLOW_FUSE_OPTIMIZER_UPDATE_CAST=true -python3 -m oneflow.distributed.launch \ +if [ $ONEFLOW_ENABLE_OFCCL == "1" ]; then + NSYS_FILE="ofccl_vit"_${HOST}_${GPUS}_card +else + NSYS_FILE="nccl_vit"_${HOST}_${GPUS}_card +fi + +if [ "$RUN_TYPE" == "PURE" ];then + cmd="python3 -m oneflow.distributed.launch" +elif [ "$RUN_TYPE" == "GDB" ];then + cmd="gdb -ex r --args python3 -m oneflow.distributed.launch" +elif [ "$RUN_TYPE" == "NSYS" ];then + if [ ! -d "/home/panlichen/work/oneflow/log/nsys" ];then + mkdir -p /home/panlichen/work/oneflow/log/nsys + fi + # cmd="nsys profile -f true --trace=cuda,cudnn,cublas,osrt,nvtx -o /home/panlichen/work/oneflow/log/nsys/$NSYS_FILE python3 -m oneflow.distributed.launch" + cmd="nsys profile -f true -o /home/panlichen/work/oneflow/log/nsys/$NSYS_FILE python3 -m oneflow.distributed.launch" +fi +echo cmd=$cmd + +$cmd \ --nproc_per_node $GPUS --nnodes $NODE --node_rank $NODE_RANK --master_addr $ADDR --master_port $PORT \ $FILE --config-file $CONFIG ${@:4} \ > /home/panlichen/work/oneflow/log/oneflow.log 2>&1 From aeb4550e87f35dfad463db3a7ede0792a12e78eb Mon Sep 17 00:00:00 2001 From: Panlichen Date: Mon, 6 Feb 2023 08:24:55 +0000 Subject: [PATCH 14/33] get iter from env --- configs/vit_imagenet.py | 2 +- tools/train.sh | 23 ++++++++++++++++------- 2 files changed, 17 insertions(+), 8 deletions(-) diff --git a/configs/vit_imagenet.py b/configs/vit_imagenet.py index aff274977..2cd0472c5 100644 --- a/configs/vit_imagenet.py +++ b/configs/vit_imagenet.py @@ -68,7 +68,7 @@ train.test_micro_batch_size = 128 # train.train_epoch = 300 train.train_epoch = 0 -train.train_iter = 10 +train.train_iter = int(os.getenv("NUM_ITER_ENV")) train.warmup_ratio = 5 / 300 train.evaluation.enabled = False # train.evaluation.eval_period = 100 diff --git a/tools/train.sh b/tools/train.sh index b5c33b47c..fe78678de 100755 --- a/tools/train.sh +++ b/tools/train.sh @@ -27,7 +27,6 @@ fi export ONEFLOW_ENABLE_OFCCL=1 export DISABLE_NCCL_COMPUTE_STREAM=1 -export ONEFLOW_OFCCL_SKIP_NEGO=0 export ONEFLOW_DEBUG_MODE=1 export ONEFLOW_PROFILER_KERNEL_PROFILE_KERNEL_FORWARD_RANGE=1 @@ -56,6 +55,9 @@ export DEV_TRY_ROUND=10 export CHECK_REMAINING_SQE_INTERVAL=10000 export DEBUG_FILE="/home/panlichen/work/oneflow/log/oneflow_cpu_rank_" +export NUM_ITER_ENV=20 +echo NUM_ITER_ENV=$NUM_ITER_ENV + if [ $GPUS = 2 ]; then export CUDA_VISIBLE_DEVICES=4,5 @@ -70,6 +72,7 @@ if [ $GPUS = 2 ]; then export NUM_TRY_TASKQ_HEAD=100 elif [ $GPUS = 4 ]; then export CUDA_VISIBLE_DEVICES=0,1,4,5 + export ONEFLOW_OFCCL_SKIP_NEGO=0 #pure dp # export BASE_CTX_SWITCH_THRESHOLD=80 @@ -77,18 +80,24 @@ elif [ $GPUS = 4 ]; then # export NUM_TRY_TASKQ_HEAD=50 #pure tp - export BASE_CTX_SWITCH_THRESHOLD=120 - export TOLERANT_UNPROGRESSED_CNT=14000 - export NUM_TRY_TASKQ_HEAD=120 + # export BASE_CTX_SWITCH_THRESHOLD=120 + # export TOLERANT_UNPROGRESSED_CNT=14000 + # export NUM_TRY_TASKQ_HEAD=120 + #pure tp-no nego + export BASE_CTX_SWITCH_THRESHOLD=3000 + export TOLERANT_UNPROGRESSED_CNT=16000 + export NUM_TRY_TASKQ_HEAD=200 elif [ $GPUS = 8 ]; then + export ONEFLOW_OFCCL_SKIP_NEGO=1 + #pure dp # export BASE_CTX_SWITCH_THRESHOLD=120 # export TOLERANT_UNPROGRESSED_CNT=70000 # export NUM_TRY_TASKQ_HEAD=240 - #pure tp - export BASE_CTX_SWITCH_THRESHOLD=120 - export TOLERANT_UNPROGRESSED_CNT=17000 + #pure tp no nego + export BASE_CTX_SWITCH_THRESHOLD=4000 + export TOLERANT_UNPROGRESSED_CNT=8000 export NUM_TRY_TASKQ_HEAD=100 fi From 790d4c865fa61ae759a1305f000e5e5bc8412f78 Mon Sep 17 00:00:00 2001 From: Panlichen Date: Mon, 6 Feb 2023 15:23:33 +0000 Subject: [PATCH 15/33] set cfg.num_heads = 16 --- configs/common/models/vit/vit_base_patch16_224.py | 2 +- configs/vit_imagenet.py | 2 +- tools/train.sh | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/configs/common/models/vit/vit_base_patch16_224.py b/configs/common/models/vit/vit_base_patch16_224.py index dd102b656..d66cb962c 100644 --- a/configs/common/models/vit/vit_base_patch16_224.py +++ b/configs/common/models/vit/vit_base_patch16_224.py @@ -6,6 +6,6 @@ cfg.patch_size = 16 cfg.embed_dim = 768 -cfg.num_heads = 12 +cfg.num_heads = 16 model = LazyCall(VisionTransformer)(cfg=cfg) diff --git a/configs/vit_imagenet.py b/configs/vit_imagenet.py index 2cd0472c5..4ed8b2a59 100644 --- a/configs/vit_imagenet.py +++ b/configs/vit_imagenet.py @@ -85,5 +85,5 @@ # Distributed Settings train.dist.pipeline_num_layers = model.cfg.depth train.dist.data_parallel_size = 1 -train.dist.tensor_parallel_size = 8 +train.dist.tensor_parallel_size = 4 train.dist.pipeline_parallel_size = 1 diff --git a/tools/train.sh b/tools/train.sh index fe78678de..0247476c3 100755 --- a/tools/train.sh +++ b/tools/train.sh @@ -123,7 +123,7 @@ mkdir -p /home/panlichen/work/oneflow/log export ONEFLOW_FUSE_OPTIMIZER_UPDATE_CAST=true -if [ $ONEFLOW_ENABLE_OFCCL == "1" ]; then +if [ "$ONEFLOW_ENABLE_OFCCL" == "1" ]; then NSYS_FILE="ofccl_vit"_${HOST}_${GPUS}_card else NSYS_FILE="nccl_vit"_${HOST}_${GPUS}_card From 0ae9ccaaf51c92cd2cfdbfa776a0db9b49b9ec10 Mon Sep 17 00:00:00 2001 From: Panlichen Date: Wed, 8 Feb 2023 08:39:25 +0000 Subject: [PATCH 16/33] hyperparemeters; adjust env --- tools/train.sh | 99 ++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 75 insertions(+), 24 deletions(-) diff --git a/tools/train.sh b/tools/train.sh index 0247476c3..6d932d863 100755 --- a/tools/train.sh +++ b/tools/train.sh @@ -34,23 +34,8 @@ export GLOG_vmodule=plan_util*=1,of_collective_actor*=1,of_collective_boxing_ker # nn_graph*=1, # export GLOG_v=1 -echo GPUS=$GPUS -echo ONEFLOW_ENABLE_OFCCL=$ONEFLOW_ENABLE_OFCCL -echo ONEFLOW_OFCCL_SKIP_NEGO=$ONEFLOW_OFCCL_SKIP_NEGO -echo ONEFLOW_ENABLE_OFCCL=$ONEFLOW_DEBUG_MODE -echo ONEFLOW_OFCCL_SKIP_NEGO=$ONEFLOW_PROFILER_KERNEL_PROFILE_KERNEL_FORWARD_RANGE -echo NCCL_PROTO=$NCCL_PROTO -echo NCCL_ALGO=$NCCL_ALGO -echo NCCL_MAX_NCHANNELS=$NCCL_MAX_NCHANNELS -echo NCCL_NTHREADS=$NCCL_NTHREADS -echo ONEFLOW_OFCCL_CHAIN=$ONEFLOW_OFCCL_CHAIN -echo GLOG_vmodule=$GLOG_vmodule -echo GLOG_v=$GLOG_v -echo GLOG_logtostderr=$GLOG_logtostderr - -export SHOW_ALL_PREPARED_COLL=1 +export SHOW_ALL_PREPARED_COLL=0 -export TRAVERSE_TIMES=10 export DEV_TRY_ROUND=10 export CHECK_REMAINING_SQE_INTERVAL=10000 export DEBUG_FILE="/home/panlichen/work/oneflow/log/oneflow_cpu_rank_" @@ -62,11 +47,15 @@ if [ $GPUS = 2 ]; then export CUDA_VISIBLE_DEVICES=4,5 #pure dp + # export RECV_SUCCESS_FACTOR=5 + # export RECV_SUCCESS_THRESHOLD=10000 # export BASE_CTX_SWITCH_THRESHOLD=100 # export TOLERANT_UNPROGRESSED_CNT=2000 # export NUM_TRY_TASKQ_HEAD=40 #pure tp + export RECV_SUCCESS_FACTOR=5 + export RECV_SUCCESS_THRESHOLD=10000 export BASE_CTX_SWITCH_THRESHOLD=120 export TOLERANT_UNPROGRESSED_CNT=10000 export NUM_TRY_TASKQ_HEAD=100 @@ -75,33 +64,95 @@ elif [ $GPUS = 4 ]; then export ONEFLOW_OFCCL_SKIP_NEGO=0 #pure dp + # export ONEFLOW_OFCCL_SKIP_NEGO=0 + # export RECV_SUCCESS_FACTOR=5 + # export RECV_SUCCESS_THRESHOLD=10000 # export BASE_CTX_SWITCH_THRESHOLD=80 # export TOLERANT_UNPROGRESSED_CNT=10000 # export NUM_TRY_TASKQ_HEAD=50 #pure tp - # export BASE_CTX_SWITCH_THRESHOLD=120 - # export TOLERANT_UNPROGRESSED_CNT=14000 - # export NUM_TRY_TASKQ_HEAD=120 - #pure tp-no nego + export ONEFLOW_OFCCL_SKIP_NEGO=0 + export RECV_SUCCESS_FACTOR=40 + export RECV_SUCCESS_THRESHOLD=10000 export BASE_CTX_SWITCH_THRESHOLD=3000 export TOLERANT_UNPROGRESSED_CNT=16000 export NUM_TRY_TASKQ_HEAD=200 + elif [ $GPUS = 8 ]; then - export ONEFLOW_OFCCL_SKIP_NEGO=1 #pure dp + # export ONEFLOW_OFCCL_SKIP_NEGO=0 + # export RECV_SUCCESS_FACTOR=5 + # export RECV_SUCCESS_THRESHOLD=10000 # export BASE_CTX_SWITCH_THRESHOLD=120 # export TOLERANT_UNPROGRESSED_CNT=70000 # export NUM_TRY_TASKQ_HEAD=240 - #pure tp no nego + #pure tp + export ONEFLOW_OFCCL_SKIP_NEGO=1 + export RECV_SUCCESS_FACTOR=5 + export RECV_SUCCESS_THRESHOLD=10000 export BASE_CTX_SWITCH_THRESHOLD=4000 export TOLERANT_UNPROGRESSED_CNT=8000 - export NUM_TRY_TASKQ_HEAD=100 + export NUM_TRY_TASKQ_HEAD=10 + + #3d + # export ONEFLOW_OFCCL_SKIP_NEGO=0 + # export RECV_SUCCESS_FACTOR=5 + # export RECV_SUCCESS_THRESHOLD=10000 + # export BASE_CTX_SWITCH_THRESHOLD=8000 + # export TOLERANT_UNPROGRESSED_CNT=80000 + # export NUM_TRY_TASKQ_HEAD=10 + + #2dp4pp + # export ONEFLOW_OFCCL_SKIP_NEGO=0 + # export RECV_SUCCESS_FACTOR=5 + # export RECV_SUCCESS_THRESHOLD=10000 + # export BASE_CTX_SWITCH_THRESHOLD=8000 + # export TOLERANT_UNPROGRESSED_CNT=80000 + # export NUM_TRY_TASKQ_HEAD=10 + + #2tp4pp + # export ONEFLOW_OFCCL_SKIP_NEGO=1 + # export RECV_SUCCESS_FACTOR=10 + # export RECV_SUCCESS_THRESHOLD=10000 + # export BASE_CTX_SWITCH_THRESHOLD=12000 + # export TOLERANT_UNPROGRESSED_CNT=8000 + # export NUM_TRY_TASKQ_HEAD=10 + + #4tp2pp + # export ONEFLOW_OFCCL_SKIP_NEGO=1 + # export RECV_SUCCESS_FACTOR=10 + # export RECV_SUCCESS_THRESHOLD=10000 + # export BASE_CTX_SWITCH_THRESHOLD=14000 + # export TOLERANT_UNPROGRESSED_CNT=8000 + # export NUM_TRY_TASKQ_HEAD=10 + + #4tp2dp + # export ONEFLOW_OFCCL_SKIP_NEGO=0 + # export RECV_SUCCESS_FACTOR=5 + # export RECV_SUCCESS_THRESHOLD=10000 + # export BASE_CTX_SWITCH_THRESHOLD=8000 + # export TOLERANT_UNPROGRESSED_CNT=9000 + # export NUM_TRY_TASKQ_HEAD=10 fi -echo TRAVERSE_TIMES=$TRAVERSE_TIMES +echo GPUS=$GPUS +echo ONEFLOW_ENABLE_OFCCL=$ONEFLOW_ENABLE_OFCCL +echo ONEFLOW_OFCCL_SKIP_NEGO=$ONEFLOW_OFCCL_SKIP_NEGO +echo ONEFLOW_ENABLE_OFCCL=$ONEFLOW_DEBUG_MODE +echo ONEFLOW_OFCCL_SKIP_NEGO=$ONEFLOW_PROFILER_KERNEL_PROFILE_KERNEL_FORWARD_RANGE +echo NCCL_PROTO=$NCCL_PROTO +echo NCCL_ALGO=$NCCL_ALGO +echo NCCL_MAX_NCHANNELS=$NCCL_MAX_NCHANNELS +echo NCCL_NTHREADS=$NCCL_NTHREADS +echo ONEFLOW_OFCCL_CHAIN=$ONEFLOW_OFCCL_CHAIN +echo GLOG_vmodule=$GLOG_vmodule +echo GLOG_v=$GLOG_v +echo GLOG_logtostderr=$GLOG_logtostderr + +echo RECV_SUCCESS_FACTOR=$RECV_SUCCESS_FACTOR echo TOLERANT_UNPROGRESSED_CNT=$TOLERANT_UNPROGRESSED_CNT echo BASE_CTX_SWITCH_THRESHOLD=$BASE_CTX_SWITCH_THRESHOLD echo NUM_TRY_TASKQ_HEAD=$NUM_TRY_TASKQ_HEAD From db2c433c376b46cab8d562a0988c63839996087d Mon Sep 17 00:00:00 2001 From: Panlichen Date: Wed, 8 Feb 2023 09:42:43 +0000 Subject: [PATCH 17/33] hyperparameter --- configs/vit_imagenet.py | 4 ++-- tools/train.sh | 28 ++++++++++++++-------------- 2 files changed, 16 insertions(+), 16 deletions(-) diff --git a/configs/vit_imagenet.py b/configs/vit_imagenet.py index 4ed8b2a59..d5013c9f6 100644 --- a/configs/vit_imagenet.py +++ b/configs/vit_imagenet.py @@ -84,6 +84,6 @@ # Distributed Settings train.dist.pipeline_num_layers = model.cfg.depth -train.dist.data_parallel_size = 1 -train.dist.tensor_parallel_size = 4 +train.dist.data_parallel_size = 8 +train.dist.tensor_parallel_size = 1 train.dist.pipeline_parallel_size = 1 diff --git a/tools/train.sh b/tools/train.sh index 6d932d863..d7ea3c03a 100755 --- a/tools/train.sh +++ b/tools/train.sh @@ -34,7 +34,7 @@ export GLOG_vmodule=plan_util*=1,of_collective_actor*=1,of_collective_boxing_ker # nn_graph*=1, # export GLOG_v=1 -export SHOW_ALL_PREPARED_COLL=0 +export SHOW_ALL_PREPARED_COLL=1 export DEV_TRY_ROUND=10 export CHECK_REMAINING_SQE_INTERVAL=10000 @@ -54,7 +54,7 @@ if [ $GPUS = 2 ]; then # export NUM_TRY_TASKQ_HEAD=40 #pure tp - export RECV_SUCCESS_FACTOR=5 + export RECV_SUCCESS_FACTOR=20 export RECV_SUCCESS_THRESHOLD=10000 export BASE_CTX_SWITCH_THRESHOLD=120 export TOLERANT_UNPROGRESSED_CNT=10000 @@ -82,20 +82,20 @@ elif [ $GPUS = 4 ]; then elif [ $GPUS = 8 ]; then #pure dp - # export ONEFLOW_OFCCL_SKIP_NEGO=0 - # export RECV_SUCCESS_FACTOR=5 - # export RECV_SUCCESS_THRESHOLD=10000 - # export BASE_CTX_SWITCH_THRESHOLD=120 - # export TOLERANT_UNPROGRESSED_CNT=70000 - # export NUM_TRY_TASKQ_HEAD=240 + export ONEFLOW_OFCCL_SKIP_NEGO=0 + export RECV_SUCCESS_FACTOR=10 + export RECV_SUCCESS_THRESHOLD=10000 + export BASE_CTX_SWITCH_THRESHOLD=100000 + export TOLERANT_UNPROGRESSED_CNT=88000 + export NUM_TRY_TASKQ_HEAD=240 #pure tp - export ONEFLOW_OFCCL_SKIP_NEGO=1 - export RECV_SUCCESS_FACTOR=5 - export RECV_SUCCESS_THRESHOLD=10000 - export BASE_CTX_SWITCH_THRESHOLD=4000 - export TOLERANT_UNPROGRESSED_CNT=8000 - export NUM_TRY_TASKQ_HEAD=10 + # export ONEFLOW_OFCCL_SKIP_NEGO=1 + # export RECV_SUCCESS_FACTOR=5 + # export RECV_SUCCESS_THRESHOLD=10000 + # export BASE_CTX_SWITCH_THRESHOLD=4000 + # export TOLERANT_UNPROGRESSED_CNT=8000 + # export NUM_TRY_TASKQ_HEAD=10 #3d # export ONEFLOW_OFCCL_SKIP_NEGO=0 From a6bb5ad1e4440f45fb5aa5527861389b60975dd9 Mon Sep 17 00:00:00 2001 From: Panlichen Date: Sat, 11 Feb 2023 13:33:21 +0000 Subject: [PATCH 18/33] +zero --- configs/vit_imagenet.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/configs/vit_imagenet.py b/configs/vit_imagenet.py index d5013c9f6..3a2e237f3 100644 --- a/configs/vit_imagenet.py +++ b/configs/vit_imagenet.py @@ -82,6 +82,10 @@ # Set fp16 ON train.amp.enabled = True +# zero +train.zero_optimization.enabled = False +train.zero_optimization.stage = 1 + # Distributed Settings train.dist.pipeline_num_layers = model.cfg.depth train.dist.data_parallel_size = 8 From 37131e0502acd091c4cf63f496de53e2e19c5cf4 Mon Sep 17 00:00:00 2001 From: Panlichen Date: Thu, 16 Feb 2023 09:22:28 +0000 Subject: [PATCH 19/33] +ONEFLOW_TIME_SHAPE --- configs/vit_imagenet.py | 6 +++--- tools/train.sh | 3 ++- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/configs/vit_imagenet.py b/configs/vit_imagenet.py index 3a2e237f3..7f84a6989 100644 --- a/configs/vit_imagenet.py +++ b/configs/vit_imagenet.py @@ -88,6 +88,6 @@ # Distributed Settings train.dist.pipeline_num_layers = model.cfg.depth -train.dist.data_parallel_size = 8 -train.dist.tensor_parallel_size = 1 -train.dist.pipeline_parallel_size = 1 +train.dist.data_parallel_size = 2 +train.dist.tensor_parallel_size = 2 +train.dist.pipeline_parallel_size = 2 diff --git a/tools/train.sh b/tools/train.sh index d7ea3c03a..184247b4e 100755 --- a/tools/train.sh +++ b/tools/train.sh @@ -27,10 +27,11 @@ fi export ONEFLOW_ENABLE_OFCCL=1 export DISABLE_NCCL_COMPUTE_STREAM=1 +# export ONEFLOW_TIME_SHAPE=1 export ONEFLOW_DEBUG_MODE=1 export ONEFLOW_PROFILER_KERNEL_PROFILE_KERNEL_FORWARD_RANGE=1 -export GLOG_vmodule=plan_util*=1,of_collective_actor*=1,of_collective_boxing_kernels*=1,collective_backend_ofccl*=1,hierarchical_sub_task_graph_builder_impl*=1,of_request_store*=1,request_store*=1,runtime*=1,scheduler*=1,collective_manager*=1 +export GLOG_vmodule=plan_util*=1,of_collective_actor*=1,of_collective_boxing_kernels*=1,collective_backend_ofccl*=1,hierarchical_sub_task_graph_builder_impl*=1,of_request_store*=1,request_store*=1,runtime*=1,scheduler*=1,collective_manager*=1,of_collective_boxing_sub_task_graph_builder*=1 # nn_graph*=1, # export GLOG_v=1 From 308da3eef17c60c6e0b49855603f2da4cc4fe66b Mon Sep 17 00:00:00 2001 From: Panlichen Date: Tue, 28 Feb 2023 16:27:13 +0000 Subject: [PATCH 20/33] +27 script --- tools/train.sh | 2 +- tools/train_27.sh | 202 ++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 203 insertions(+), 1 deletion(-) create mode 100755 tools/train_27.sh diff --git a/tools/train.sh b/tools/train.sh index 184247b4e..d170e7add 100755 --- a/tools/train.sh +++ b/tools/train.sh @@ -25,7 +25,7 @@ if [ -z $RUN_TYPE ];then # RUN_TYPE="NSYS" fi -export ONEFLOW_ENABLE_OFCCL=1 +# export ONEFLOW_ENABLE_OFCCL=1 export DISABLE_NCCL_COMPUTE_STREAM=1 # export ONEFLOW_TIME_SHAPE=1 export ONEFLOW_DEBUG_MODE=1 diff --git a/tools/train_27.sh b/tools/train_27.sh new file mode 100755 index 000000000..20cc9930e --- /dev/null +++ b/tools/train_27.sh @@ -0,0 +1,202 @@ +#!/usr/bin/env bash + +clear + +FILE=$1 +CONFIG=$2 +GPUS=$3 +NODE=${NODE:-1} +NODE_RANK=${NODE_RANK:-0} +ADDR=${ADDR:-127.0.0.1} +PORT=${PORT:-12345} + +export GLOG_logtostderr=1 +export ONEFLOW_ACTOR_ENABLE_LIGHT_ACTOR=0 # 禁用lightweight actor + +export NCCL_PROTO=Simple +export NCCL_ALGO=Ring +# export NCCL_MAX_NCHANNELS=1 +# export NCCL_MIN_NCHANNELS=1 +# export NCCL_NTHREADS=64 + +if [ -z $RUN_TYPE ];then + RUN_TYPE="PURE" + # RUN_TYPE="GDB" + # RUN_TYPE="NSYS" +fi + +export ONEFLOW_ENABLE_OFCCL=1 +export DISABLE_NCCL_COMPUTE_STREAM=1 +# export ONEFLOW_TIME_SHAPE=1 +export ONEFLOW_DEBUG_MODE=1 +export ONEFLOW_PROFILER_KERNEL_PROFILE_KERNEL_FORWARD_RANGE=1 + +export GLOG_vmodule=plan_util*=1,of_collective_actor*=1,of_collective_boxing_kernels*=1,collective_backend_ofccl*=1,hierarchical_sub_task_graph_builder_impl*=1,of_request_store*=1,request_store*=1,runtime*=1,scheduler*=1,collective_manager*=1,of_collective_boxing_sub_task_graph_builder*=1,collective_boxing_sub_task_graph_builder*=1 +# nn_graph*=1, +# export GLOG_v=1 + +export SHOW_ALL_PREPARED_COLL=1 + +export DEV_TRY_ROUND=10 +export CHECK_REMAINING_SQE_INTERVAL=10000 +export DEBUG_FILE="/home/panlichen/work/oneflow/log/oneflow_cpu_rank_" + +export NUM_ITER_ENV=200 +echo NUM_ITER_ENV=$NUM_ITER_ENV + +if [ $GPUS = 2 ]; then + export CUDA_VISIBLE_DEVICES=4,5 + + #pure dp + # export RECV_SUCCESS_FACTOR=5 + # export RECV_SUCCESS_THRESHOLD=10000 + # export BASE_CTX_SWITCH_THRESHOLD=100 + # export TOLERANT_UNPROGRESSED_CNT=2000 + # export NUM_TRY_TASKQ_HEAD=40 + + #pure tp + export RECV_SUCCESS_FACTOR=20 + export RECV_SUCCESS_THRESHOLD=10000 + export BASE_CTX_SWITCH_THRESHOLD=120 + export TOLERANT_UNPROGRESSED_CNT=10000 + export NUM_TRY_TASKQ_HEAD=100 +elif [ $GPUS = 4 ]; then + export CUDA_VISIBLE_DEVICES=0,1,4,5 + export ONEFLOW_OFCCL_SKIP_NEGO=0 + + #pure dp + # export ONEFLOW_OFCCL_SKIP_NEGO=0 + # export RECV_SUCCESS_FACTOR=40 + # export RECV_SUCCESS_THRESHOLD=10000 + # export BASE_CTX_SWITCH_THRESHOLD=30000 + # export TOLERANT_UNPROGRESSED_CNT=30000 + # export NUM_TRY_TASKQ_HEAD=200 + + #pure tp + export ONEFLOW_OFCCL_SKIP_NEGO=0 + export RECV_SUCCESS_FACTOR=40 + export RECV_SUCCESS_THRESHOLD=1000000000 + export BASE_CTX_SWITCH_THRESHOLD=100000 + export TOLERANT_UNPROGRESSED_CNT=16000 + export NUM_TRY_TASKQ_HEAD=200 + +elif [ $GPUS = 8 ]; then + + #pure dp + # export ONEFLOW_OFCCL_SKIP_NEGO=0 + # export RECV_SUCCESS_FACTOR=30 + # export RECV_SUCCESS_THRESHOLD=100000000 + # export BASE_CTX_SWITCH_THRESHOLD=120000 + # export TOLERANT_UNPROGRESSED_CNT=180000 + # export NUM_TRY_TASKQ_HEAD=240 + + #pure tp + # export ONEFLOW_OFCCL_SKIP_NEGO=0 + # export RECV_SUCCESS_FACTOR=10 + # export RECV_SUCCESS_THRESHOLD=1000000 + # export BASE_CTX_SWITCH_THRESHOLD=6000 + # export TOLERANT_UNPROGRESSED_CNT=8000 + # export NUM_TRY_TASKQ_HEAD=10 + + #4tp2dp + # export ONEFLOW_OFCCL_SKIP_NEGO=0 + # export RECV_SUCCESS_FACTOR=10 + # export RECV_SUCCESS_THRESHOLD=10000000 + # export BASE_CTX_SWITCH_THRESHOLD=20000 + # export TOLERANT_UNPROGRESSED_CNT=9000 + # export NUM_TRY_TASKQ_HEAD=10 + + #3d + export ONEFLOW_OFCCL_SKIP_NEGO=0 + export RECV_SUCCESS_FACTOR=5 + export RECV_SUCCESS_THRESHOLD=10000000 + export BASE_CTX_SWITCH_THRESHOLD=20000 + export TOLERANT_UNPROGRESSED_CNT=80000 + export NUM_TRY_TASKQ_HEAD=10 + + #2dp4pp + # export ONEFLOW_OFCCL_SKIP_NEGO=0 + # export RECV_SUCCESS_FACTOR=5 + # export RECV_SUCCESS_THRESHOLD=10000 + # export BASE_CTX_SWITCH_THRESHOLD=8000 + # export TOLERANT_UNPROGRESSED_CNT=80000 + # export NUM_TRY_TASKQ_HEAD=10 + + #2tp4pp + # export ONEFLOW_OFCCL_SKIP_NEGO=1 + # export RECV_SUCCESS_FACTOR=10 + # export RECV_SUCCESS_THRESHOLD=10000 + # export BASE_CTX_SWITCH_THRESHOLD=12000 + # export TOLERANT_UNPROGRESSED_CNT=8000 + # export NUM_TRY_TASKQ_HEAD=10 + + #4tp2pp + # export ONEFLOW_OFCCL_SKIP_NEGO=1 + # export RECV_SUCCESS_FACTOR=10 + # export RECV_SUCCESS_THRESHOLD=10000 + # export BASE_CTX_SWITCH_THRESHOLD=14000 + # export TOLERANT_UNPROGRESSED_CNT=8000 + # export NUM_TRY_TASKQ_HEAD=10 + +fi + +echo GPUS=$GPUS +echo ONEFLOW_ENABLE_OFCCL=$ONEFLOW_ENABLE_OFCCL +echo ONEFLOW_OFCCL_SKIP_NEGO=$ONEFLOW_OFCCL_SKIP_NEGO +echo ONEFLOW_ENABLE_OFCCL=$ONEFLOW_DEBUG_MODE +echo ONEFLOW_OFCCL_SKIP_NEGO=$ONEFLOW_PROFILER_KERNEL_PROFILE_KERNEL_FORWARD_RANGE +echo NCCL_PROTO=$NCCL_PROTO +echo NCCL_ALGO=$NCCL_ALGO +echo NCCL_MAX_NCHANNELS=$NCCL_MAX_NCHANNELS +echo NCCL_NTHREADS=$NCCL_NTHREADS +echo ONEFLOW_OFCCL_CHAIN=$ONEFLOW_OFCCL_CHAIN +echo GLOG_vmodule=$GLOG_vmodule +echo GLOG_v=$GLOG_v +echo GLOG_logtostderr=$GLOG_logtostderr + +echo RECV_SUCCESS_FACTOR=$RECV_SUCCESS_FACTOR +echo TOLERANT_UNPROGRESSED_CNT=$TOLERANT_UNPROGRESSED_CNT +echo BASE_CTX_SWITCH_THRESHOLD=$BASE_CTX_SWITCH_THRESHOLD +echo NUM_TRY_TASKQ_HEAD=$NUM_TRY_TASKQ_HEAD +echo DEV_TRY_ROUND=$DEV_TRY_ROUND +echo CHECK_REMAINING_SQE_INTERVAL=$CHECK_REMAINING_SQE_INTERVAL +echo DEBUG_FILE=$DEBUG_FILE + +export PYTHONUNBUFFERED=1 +echo PYTHONUNBUFFERED=$PYTHONUNBUFFERED +export NCCL_LAUNCH_MODE=PARALLEL +echo NCCL_LAUNCH_MODE=$NCCL_LAUNCH_MODE +# export NCCL_DEBUG=INFO + +rm -rf /home/panlichen/work/libai/log +mkdir -p /home/panlichen/work/libai/log + +rm -rf /home/panlichen/work/oneflow/log +mkdir -p /home/panlichen/work/oneflow/log + +export ONEFLOW_FUSE_OPTIMIZER_UPDATE_CAST=true + +if [ "$ONEFLOW_ENABLE_OFCCL" == "1" ]; then + NSYS_FILE="ofccl_vit"_${HOST}_${GPUS}_card +else + NSYS_FILE="nccl_vit"_${HOST}_${GPUS}_card +fi + +if [ "$RUN_TYPE" == "PURE" ];then + cmd="python3 -m oneflow.distributed.launch" +elif [ "$RUN_TYPE" == "GDB" ];then + cmd="gdb -ex r --args python3 -m oneflow.distributed.launch" +elif [ "$RUN_TYPE" == "NSYS" ];then + if [ ! -d "/home/panlichen/work/oneflow/log/nsys" ];then + mkdir -p /home/panlichen/work/oneflow/log/nsys + fi + # cmd="nsys profile -f true --trace=cuda,cudnn,cublas,osrt,nvtx -o /home/panlichen/work/oneflow/log/nsys/$NSYS_FILE python3 -m oneflow.distributed.launch" + cmd="nsys profile -f true -o /home/panlichen/work/oneflow/log/nsys/$NSYS_FILE python3 -m oneflow.distributed.launch" +fi +echo cmd=$cmd + +$cmd \ + --nproc_per_node $GPUS --nnodes $NODE --node_rank $NODE_RANK --master_addr $ADDR --master_port $PORT \ + $FILE --config-file $CONFIG ${@:4} \ + > /home/panlichen/work/oneflow/log/oneflow.log 2>&1 + From 283f0fb574c196842dd8724c8e9e94a3fa4f31d6 Mon Sep 17 00:00:00 2001 From: Panlichen Date: Tue, 28 Feb 2023 16:27:41 +0000 Subject: [PATCH 21/33] script --- tools/train.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/train.sh b/tools/train.sh index d170e7add..184247b4e 100755 --- a/tools/train.sh +++ b/tools/train.sh @@ -25,7 +25,7 @@ if [ -z $RUN_TYPE ];then # RUN_TYPE="NSYS" fi -# export ONEFLOW_ENABLE_OFCCL=1 +export ONEFLOW_ENABLE_OFCCL=1 export DISABLE_NCCL_COMPUTE_STREAM=1 # export ONEFLOW_TIME_SHAPE=1 export ONEFLOW_DEBUG_MODE=1 From 0ae238e880ed060b46240581961d5ca81a1581ff Mon Sep 17 00:00:00 2001 From: Panlichen Date: Sun, 26 Mar 2023 16:06:48 +0000 Subject: [PATCH 22/33] + 2 machine script --- tools/train_27_25.sh | 215 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 215 insertions(+) create mode 100755 tools/train_27_25.sh diff --git a/tools/train_27_25.sh b/tools/train_27_25.sh new file mode 100755 index 000000000..6c0252005 --- /dev/null +++ b/tools/train_27_25.sh @@ -0,0 +1,215 @@ +#!/usr/bin/env bash + +clear + +FILE=$1 +CONFIG=$2 +GPUS=$3 + +# NODE=${NODE:-1} +# NODE_RANK=${NODE_RANK:-0} +# ADDR=${ADDR:-127.0.0.1} +# PORT=${PORT:-12345} + +NODE=2 + +if [[ $HOST = "oneflow-27" ]]; then + NODE_RANK=0 +elif [[ $HOST = "oneflow-25" ]]; then + NODE_RANK=1 +fi +echo $NODE_RANK + +ADDR=11.11.1.27 +PORT=12345 + +export GLOG_logtostderr=1 +export ONEFLOW_ACTOR_ENABLE_LIGHT_ACTOR=0 # 禁用lightweight actor + +export NCCL_PROTO=Simple +export NCCL_ALGO=Ring +# export NCCL_MAX_NCHANNELS=1 +# export NCCL_MIN_NCHANNELS=1 +# export NCCL_NTHREADS=64 + +if [ -z $RUN_TYPE ];then + RUN_TYPE="PURE" + # RUN_TYPE="GDB" + # RUN_TYPE="NSYS" +fi + +export ONEFLOW_ENABLE_OFCCL=1 +export DISABLE_NCCL_COMPUTE_STREAM=1 +# export ONEFLOW_TIME_SHAPE=1 +export ONEFLOW_DEBUG_MODE=1 +export ONEFLOW_PROFILER_KERNEL_PROFILE_KERNEL_FORWARD_RANGE=1 + +export GLOG_vmodule=plan_util*=1,of_collective_actor*=1,of_collective_boxing_kernels*=1,collective_backend_ofccl*=1,hierarchical_sub_task_graph_builder_impl*=1,of_request_store*=1,request_store*=1,runtime*=1,scheduler*=1,collective_manager*=1,of_collective_boxing_sub_task_graph_builder*=1,collective_boxing_sub_task_graph_builder*=1 +# nn_graph*=1, +# export GLOG_v=1 + +export SHOW_ALL_PREPARED_COLL=1 + +export DEV_TRY_ROUND=10 +export CHECK_REMAINING_SQE_INTERVAL=10000 +export DEBUG_FILE="/home/panlichen/work/oneflow/log/oneflow_cpu_rank_" + +export NUM_ITER_ENV=20 +echo NUM_ITER_ENV=$NUM_ITER_ENV + +if [ $GPUS = 2 ]; then + export CUDA_VISIBLE_DEVICES=4,5 + + #pure dp + # export RECV_SUCCESS_FACTOR=5 + # export RECV_SUCCESS_THRESHOLD=10000 + # export BASE_CTX_SWITCH_THRESHOLD=100 + # export TOLERANT_UNPROGRESSED_CNT=2000 + # export NUM_TRY_TASKQ_HEAD=40 + + #pure tp + export RECV_SUCCESS_FACTOR=20 + export RECV_SUCCESS_THRESHOLD=10000 + export BASE_CTX_SWITCH_THRESHOLD=120 + export TOLERANT_UNPROGRESSED_CNT=10000 + export NUM_TRY_TASKQ_HEAD=100 +elif [ $GPUS = 4 ]; then + export CUDA_VISIBLE_DEVICES=0,1,4,5 + export ONEFLOW_OFCCL_SKIP_NEGO=0 + + #pure dp + # export ONEFLOW_OFCCL_SKIP_NEGO=0 + # export RECV_SUCCESS_FACTOR=40 + # export RECV_SUCCESS_THRESHOLD=10000 + # export BASE_CTX_SWITCH_THRESHOLD=30000 + # export TOLERANT_UNPROGRESSED_CNT=30000 + # export NUM_TRY_TASKQ_HEAD=200 + + #pure tp + export ONEFLOW_OFCCL_SKIP_NEGO=0 + export RECV_SUCCESS_FACTOR=40 + export RECV_SUCCESS_THRESHOLD=1000000000 + export BASE_CTX_SWITCH_THRESHOLD=100000 + export TOLERANT_UNPROGRESSED_CNT=16000 + export NUM_TRY_TASKQ_HEAD=200 + +elif [ $GPUS = 8 ]; then + + #pure dp + # export ONEFLOW_OFCCL_SKIP_NEGO=0 + # export RECV_SUCCESS_FACTOR=30 + # export RECV_SUCCESS_THRESHOLD=100000000 + # export BASE_CTX_SWITCH_THRESHOLD=120000 + # export TOLERANT_UNPROGRESSED_CNT=180000 + # export NUM_TRY_TASKQ_HEAD=240 + + #pure tp + # export ONEFLOW_OFCCL_SKIP_NEGO=0 + # export RECV_SUCCESS_FACTOR=10 + # export RECV_SUCCESS_THRESHOLD=1000000 + # export BASE_CTX_SWITCH_THRESHOLD=6000 + # export TOLERANT_UNPROGRESSED_CNT=8000 + # export NUM_TRY_TASKQ_HEAD=10 + + #4tp2dp + # export ONEFLOW_OFCCL_SKIP_NEGO=0 + # export RECV_SUCCESS_FACTOR=10 + # export RECV_SUCCESS_THRESHOLD=10000000 + # export BASE_CTX_SWITCH_THRESHOLD=20000 + # export TOLERANT_UNPROGRESSED_CNT=9000 + # export NUM_TRY_TASKQ_HEAD=10 + + #3d + export ONEFLOW_OFCCL_SKIP_NEGO=0 + export RECV_SUCCESS_FACTOR=5 + export RECV_SUCCESS_THRESHOLD=10000000 + export BASE_CTX_SWITCH_THRESHOLD=20000 + export TOLERANT_UNPROGRESSED_CNT=80000 + export NUM_TRY_TASKQ_HEAD=10 + + #2dp4pp + # export ONEFLOW_OFCCL_SKIP_NEGO=0 + # export RECV_SUCCESS_FACTOR=5 + # export RECV_SUCCESS_THRESHOLD=10000 + # export BASE_CTX_SWITCH_THRESHOLD=8000 + # export TOLERANT_UNPROGRESSED_CNT=80000 + # export NUM_TRY_TASKQ_HEAD=10 + + #2tp4pp + # export ONEFLOW_OFCCL_SKIP_NEGO=1 + # export RECV_SUCCESS_FACTOR=10 + # export RECV_SUCCESS_THRESHOLD=10000 + # export BASE_CTX_SWITCH_THRESHOLD=12000 + # export TOLERANT_UNPROGRESSED_CNT=8000 + # export NUM_TRY_TASKQ_HEAD=10 + + #4tp2pp + # export ONEFLOW_OFCCL_SKIP_NEGO=1 + # export RECV_SUCCESS_FACTOR=10 + # export RECV_SUCCESS_THRESHOLD=10000 + # export BASE_CTX_SWITCH_THRESHOLD=14000 + # export TOLERANT_UNPROGRESSED_CNT=8000 + # export NUM_TRY_TASKQ_HEAD=10 + +fi + +echo GPUS=$GPUS +echo ONEFLOW_ENABLE_OFCCL=$ONEFLOW_ENABLE_OFCCL +echo ONEFLOW_OFCCL_SKIP_NEGO=$ONEFLOW_OFCCL_SKIP_NEGO +echo ONEFLOW_ENABLE_OFCCL=$ONEFLOW_DEBUG_MODE +echo ONEFLOW_OFCCL_SKIP_NEGO=$ONEFLOW_PROFILER_KERNEL_PROFILE_KERNEL_FORWARD_RANGE +echo NCCL_PROTO=$NCCL_PROTO +echo NCCL_ALGO=$NCCL_ALGO +echo NCCL_MAX_NCHANNELS=$NCCL_MAX_NCHANNELS +echo NCCL_NTHREADS=$NCCL_NTHREADS +echo ONEFLOW_OFCCL_CHAIN=$ONEFLOW_OFCCL_CHAIN +echo GLOG_vmodule=$GLOG_vmodule +echo GLOG_v=$GLOG_v +echo GLOG_logtostderr=$GLOG_logtostderr + +echo RECV_SUCCESS_FACTOR=$RECV_SUCCESS_FACTOR +echo TOLERANT_UNPROGRESSED_CNT=$TOLERANT_UNPROGRESSED_CNT +echo BASE_CTX_SWITCH_THRESHOLD=$BASE_CTX_SWITCH_THRESHOLD +echo NUM_TRY_TASKQ_HEAD=$NUM_TRY_TASKQ_HEAD +echo DEV_TRY_ROUND=$DEV_TRY_ROUND +echo CHECK_REMAINING_SQE_INTERVAL=$CHECK_REMAINING_SQE_INTERVAL +echo DEBUG_FILE=$DEBUG_FILE + +export PYTHONUNBUFFERED=1 +echo PYTHONUNBUFFERED=$PYTHONUNBUFFERED +export NCCL_LAUNCH_MODE=PARALLEL +echo NCCL_LAUNCH_MODE=$NCCL_LAUNCH_MODE +# export NCCL_DEBUG=INFO + +rm -rf /home/panlichen/work/libai/log +mkdir -p /home/panlichen/work/libai/log + +rm -rf /home/panlichen/work/oneflow/log +mkdir -p /home/panlichen/work/oneflow/log + +export ONEFLOW_FUSE_OPTIMIZER_UPDATE_CAST=true + +if [ "$ONEFLOW_ENABLE_OFCCL" == "1" ]; then + NSYS_FILE="ofccl_vit"_${HOST}_${GPUS}_card +else + NSYS_FILE="nccl_vit"_${HOST}_${GPUS}_card +fi + +if [ "$RUN_TYPE" == "PURE" ];then + cmd="python3 -m oneflow.distributed.launch" +elif [ "$RUN_TYPE" == "GDB" ];then + cmd="gdb -ex r --args python3 -m oneflow.distributed.launch" +elif [ "$RUN_TYPE" == "NSYS" ];then + if [ ! -d "/home/panlichen/work/oneflow/log/nsys" ];then + mkdir -p /home/panlichen/work/oneflow/log/nsys + fi + # cmd="nsys profile -f true --trace=cuda,cudnn,cublas,osrt,nvtx -o /home/panlichen/work/oneflow/log/nsys/$NSYS_FILE python3 -m oneflow.distributed.launch" + cmd="nsys profile -f true -o /home/panlichen/work/oneflow/log/nsys/$NSYS_FILE python3 -m oneflow.distributed.launch" +fi +echo cmd=$cmd + +$cmd \ + --nproc_per_node $GPUS --nnodes $NODE --node_rank $NODE_RANK --master_addr $ADDR --master_port $PORT \ + $FILE --config-file $CONFIG ${@:4} \ + > /home/panlichen/work/oneflow/log/oneflow.log 2>&1 + From cf708ad435882da2f23268841c68b7a34af9116e Mon Sep 17 00:00:00 2001 From: Panlichen Date: Tue, 11 Apr 2023 11:46:30 +0000 Subject: [PATCH 23/33] scripts --- tools/train.sh | 102 ++++--------------------------------------------- 1 file changed, 7 insertions(+), 95 deletions(-) diff --git a/tools/train.sh b/tools/train.sh index 184247b4e..8c388d25b 100755 --- a/tools/train.sh +++ b/tools/train.sh @@ -41,103 +41,15 @@ export DEV_TRY_ROUND=10 export CHECK_REMAINING_SQE_INTERVAL=10000 export DEBUG_FILE="/home/panlichen/work/oneflow/log/oneflow_cpu_rank_" -export NUM_ITER_ENV=20 +export NUM_ITER_ENV=200 echo NUM_ITER_ENV=$NUM_ITER_ENV -if [ $GPUS = 2 ]; then - export CUDA_VISIBLE_DEVICES=4,5 - - #pure dp - # export RECV_SUCCESS_FACTOR=5 - # export RECV_SUCCESS_THRESHOLD=10000 - # export BASE_CTX_SWITCH_THRESHOLD=100 - # export TOLERANT_UNPROGRESSED_CNT=2000 - # export NUM_TRY_TASKQ_HEAD=40 - - #pure tp - export RECV_SUCCESS_FACTOR=20 - export RECV_SUCCESS_THRESHOLD=10000 - export BASE_CTX_SWITCH_THRESHOLD=120 - export TOLERANT_UNPROGRESSED_CNT=10000 - export NUM_TRY_TASKQ_HEAD=100 -elif [ $GPUS = 4 ]; then - export CUDA_VISIBLE_DEVICES=0,1,4,5 - export ONEFLOW_OFCCL_SKIP_NEGO=0 - - #pure dp - # export ONEFLOW_OFCCL_SKIP_NEGO=0 - # export RECV_SUCCESS_FACTOR=5 - # export RECV_SUCCESS_THRESHOLD=10000 - # export BASE_CTX_SWITCH_THRESHOLD=80 - # export TOLERANT_UNPROGRESSED_CNT=10000 - # export NUM_TRY_TASKQ_HEAD=50 - - #pure tp - export ONEFLOW_OFCCL_SKIP_NEGO=0 - export RECV_SUCCESS_FACTOR=40 - export RECV_SUCCESS_THRESHOLD=10000 - export BASE_CTX_SWITCH_THRESHOLD=3000 - export TOLERANT_UNPROGRESSED_CNT=16000 - export NUM_TRY_TASKQ_HEAD=200 - -elif [ $GPUS = 8 ]; then - - #pure dp - export ONEFLOW_OFCCL_SKIP_NEGO=0 - export RECV_SUCCESS_FACTOR=10 - export RECV_SUCCESS_THRESHOLD=10000 - export BASE_CTX_SWITCH_THRESHOLD=100000 - export TOLERANT_UNPROGRESSED_CNT=88000 - export NUM_TRY_TASKQ_HEAD=240 - - #pure tp - # export ONEFLOW_OFCCL_SKIP_NEGO=1 - # export RECV_SUCCESS_FACTOR=5 - # export RECV_SUCCESS_THRESHOLD=10000 - # export BASE_CTX_SWITCH_THRESHOLD=4000 - # export TOLERANT_UNPROGRESSED_CNT=8000 - # export NUM_TRY_TASKQ_HEAD=10 - - #3d - # export ONEFLOW_OFCCL_SKIP_NEGO=0 - # export RECV_SUCCESS_FACTOR=5 - # export RECV_SUCCESS_THRESHOLD=10000 - # export BASE_CTX_SWITCH_THRESHOLD=8000 - # export TOLERANT_UNPROGRESSED_CNT=80000 - # export NUM_TRY_TASKQ_HEAD=10 - - #2dp4pp - # export ONEFLOW_OFCCL_SKIP_NEGO=0 - # export RECV_SUCCESS_FACTOR=5 - # export RECV_SUCCESS_THRESHOLD=10000 - # export BASE_CTX_SWITCH_THRESHOLD=8000 - # export TOLERANT_UNPROGRESSED_CNT=80000 - # export NUM_TRY_TASKQ_HEAD=10 - - #2tp4pp - # export ONEFLOW_OFCCL_SKIP_NEGO=1 - # export RECV_SUCCESS_FACTOR=10 - # export RECV_SUCCESS_THRESHOLD=10000 - # export BASE_CTX_SWITCH_THRESHOLD=12000 - # export TOLERANT_UNPROGRESSED_CNT=8000 - # export NUM_TRY_TASKQ_HEAD=10 - - #4tp2pp - # export ONEFLOW_OFCCL_SKIP_NEGO=1 - # export RECV_SUCCESS_FACTOR=10 - # export RECV_SUCCESS_THRESHOLD=10000 - # export BASE_CTX_SWITCH_THRESHOLD=14000 - # export TOLERANT_UNPROGRESSED_CNT=8000 - # export NUM_TRY_TASKQ_HEAD=10 - - #4tp2dp - # export ONEFLOW_OFCCL_SKIP_NEGO=0 - # export RECV_SUCCESS_FACTOR=5 - # export RECV_SUCCESS_THRESHOLD=10000 - # export BASE_CTX_SWITCH_THRESHOLD=8000 - # export TOLERANT_UNPROGRESSED_CNT=9000 - # export NUM_TRY_TASKQ_HEAD=10 -fi +export ONEFLOW_OFCCL_SKIP_NEGO=0 +export RECV_SUCCESS_FACTOR=5 +export RECV_SUCCESS_THRESHOLD=10000000 +export BASE_CTX_SWITCH_THRESHOLD=20000 +export TOLERANT_UNPROGRESSED_CNT=80000 +export NUM_TRY_TASKQ_HEAD=10 echo GPUS=$GPUS echo ONEFLOW_ENABLE_OFCCL=$ONEFLOW_ENABLE_OFCCL From f30ea656f5ab95ff4d5caae61d99b8c44328924b Mon Sep 17 00:00:00 2001 From: Lichen Pan Date: Thu, 13 Apr 2023 11:06:24 +0800 Subject: [PATCH 24/33] Update vit_imagenet.py --- configs/vit_imagenet.py | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/configs/vit_imagenet.py b/configs/vit_imagenet.py index 7f84a6989..2c209b105 100644 --- a/configs/vit_imagenet.py +++ b/configs/vit_imagenet.py @@ -1,12 +1,5 @@ from libai.config import LazyCall - -import os -host = os.environ.get('HOST') -if (host == "oneflow-25" or host == "oneflow-27"): - from .common.models.vit.vit_base_patch16_224 import model -else: - from .common.models.vit.vit_small_patch16_224 import model - +from .common.models.vit.vit_base_patch16_224 import model from .common.models.graph import graph from .common.train import train from .common.optim import optim From 48a67b43388d3f88e66cadc39c63bc482afbde06 Mon Sep 17 00:00:00 2001 From: Lichen Pan Date: Thu, 13 Apr 2023 11:12:51 +0800 Subject: [PATCH 25/33] Update README.md --- README.md | 123 ++++-------------------------------------------------- 1 file changed, 9 insertions(+), 114 deletions(-) diff --git a/README.md b/README.md index 2f631b982..0f850cf5d 100644 --- a/README.md +++ b/README.md @@ -1,117 +1,12 @@ - +Please refer to the [official repository](https://github.com/Oneflow-Inc/libai) and the [official documentation page](https://libai.readthedocs.io/en/latest/) for guidance on installation and other related topics. -

LiBai

-

- - docs - - - GitHub - - - GitHub release - - - PRs Welcome - - - Python Checks - - - Docs Release Status - -

- - -## Introduction - -**English** | [简体中文](/README_zh-CN.md) - -LiBai is a large-scale open-source model training toolbox based on OneFlow. The main branch works with OneFlow 0.7.0. - -
- Highlights - -- **Support a collection of parallel training components** - - LiBai provides multiple parallelisms such as Data Parallelism, Tensor Parallelism, and Pipeline Parallelism. It's also extensible for other new parallelisms. - -- **Varied training techniques** - - LiBai provides many out-of-the-box training techniques such as Distributed Training, Mixed Precision Training, Activation Checkpointing, Recomputation, Gradient Accumulation, and Zero Redundancy Optimizer(ZeRO). - -- **Support for both CV and NLP tasks** - - LiBai has predifined data process for both CV and NLP datasets such as CIFAR, ImageNet, and BERT Dataset. - -- **Easy to use** - - LiBai's components are designed to be modular for easier usage as follows: - - LazyConfig system for more flexible syntax and no predefined structures - - Friendly trainer and engine - - Used as a library to support building research projects on it. See [projects/](/projects) for some projects that are built based on LiBai - -- **High Efficiency** - -
- -## Installation - -See [Installation instructions](https://libai.readthedocs.io/en/latest/tutorials/get_started/Installation.html). - -## Getting Started - -See [Quick Run](https://libai.readthedocs.io/en/latest/tutorials/get_started/quick_run.html) for the basic usage of LiBai. - -## Documentation - -See LiBai's [documentation](https://libai.readthedocs.io/en/latest/index.html) for full API documentation and tutorials. - -## ChangeLog - -**Beta 0.2.0** was released in 07/07/2022, the general changes in **0.2.0** version are as follows: - -**Features:** -- Support evaluation enabled and set `eval_iter` -- Support customized sampler in `config.py` -- Support rdma for pipeline-model-parallel -- Support multi fused kernel - - fused_scale_mask_softmax_dropout - - fused_scale_tril_softmax_mask_scale - - fused_self_attention in branch `libai_bench` -- User Experience Optimization -- Optimization for training throughput, see [benchmark](https://libai.readthedocs.io/en/latest/tutorials/get_started/Benchmark.html) for more details - -**Supported Models:** -- Support 3D parallel [Roberta](https://arxiv.org/abs/1907.11692) model -- Support 2D parallel (data parallel + tensor model parallel) [SimCSE](https://arxiv.org/abs/2104.08821) model -- Support Data parallel [MAE](https://arxiv.org/abs/2111.06377) model -- Support Data parallel [MOCOV3](https://arxiv.org/abs/2104.02057) model - -See [changelog](./changelog.md) for details and release history. - -## Contributing - -We appreciate all contributions to improve LiBai. See [CONTRIBUTING](./CONTRIBUTING.md) for the contributing guideline. - -## License - -This project is released under the [Apache 2.0 license](LICENSE). - -## Citation - -If you find this project useful for your research, consider cite: - -```BibTeX -@misc{of2021libai, - author = {Xingyu Liao and Peng Cheng and Tianhe Ren and Depeng Liang and - Kai Dang and Yi Wang and Xiaoyu Xu}, - title = {LiBai}, - howpublished = {\url{https://github.com/Oneflow-Inc/libai}}, - year = {2021} -} +## Running experiments in the OCCL paper +```shell +bash tools/train.sh tools/train_net.py configs/vit_imagenet.py ``` -## Join the WeChat group - -![LiBai_Wechat_QRcode](./docs/source/tutorials/assets/LiBai_Wechat.png) \ No newline at end of file +Notes: +- Prepare the ImageNet dataset in advance. +- Edit the [configs/vit_imagenet.py](configs/vit_imagenet.py#L84-L86) to switch among different distributed DNN training methods, following the guidelines in the [official doc](https://libai.readthedocs.io/en/latest/tutorials/basics/Distributed_Configuration.html). +- For training across multiple machines, edit the `NODE`, `NODE_RANK`, `ADDR`, and `ADDR_RANK` variables in [tools/train.sh](tools/train.sh#L8-L11). +- Edit [configs/vit_imagenet.py](configs/vit_imagenet.py#L2) to choose between the base ViT configuration or the large ViT configuration. From 354ac4437d1a13b5da80e36fa22233e72b1aa124 Mon Sep 17 00:00:00 2001 From: Lichen Pan Date: Thu, 13 Apr 2023 11:29:44 +0800 Subject: [PATCH 26/33] scripts --- tools/train.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/train.sh b/tools/train.sh index 8c388d25b..d526528b4 100755 --- a/tools/train.sh +++ b/tools/train.sh @@ -109,5 +109,5 @@ echo cmd=$cmd $cmd \ --nproc_per_node $GPUS --nnodes $NODE --node_rank $NODE_RANK --master_addr $ADDR --master_port $PORT \ $FILE --config-file $CONFIG ${@:4} \ - > /home/panlichen/work/oneflow/log/oneflow.log 2>&1 + # > /home/panlichen/work/oneflow/log/oneflow.log 2>&1 From 6867b512ed1fc72754ececab09f0672da9d07760 Mon Sep 17 00:00:00 2001 From: Lichen Pan Date: Fri, 14 Apr 2023 10:32:12 +0800 Subject: [PATCH 27/33] Update README.md --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 0f850cf5d..ea20ee6c9 100644 --- a/README.md +++ b/README.md @@ -10,3 +10,4 @@ Notes: - Edit the [configs/vit_imagenet.py](configs/vit_imagenet.py#L84-L86) to switch among different distributed DNN training methods, following the guidelines in the [official doc](https://libai.readthedocs.io/en/latest/tutorials/basics/Distributed_Configuration.html). - For training across multiple machines, edit the `NODE`, `NODE_RANK`, `ADDR`, and `ADDR_RANK` variables in [tools/train.sh](tools/train.sh#L8-L11). - Edit [configs/vit_imagenet.py](configs/vit_imagenet.py#L2) to choose between the base ViT configuration or the large ViT configuration. +- If the environment virable `ONEFLOW_ENABLE_OFCCL` in [train.sh](tools/train.sh#L28) is set to `1`, OCCL will be used during training; otherwise, NCCL will be employed. From b1c7e7fc9147431805b6e7a3f2040a57fa2ceb83 Mon Sep 17 00:00:00 2001 From: Lichen Pan Date: Fri, 14 Apr 2023 10:57:02 +0800 Subject: [PATCH 28/33] scripts --- configs/vit_imagenet.py | 1 + 1 file changed, 1 insertion(+) diff --git a/configs/vit_imagenet.py b/configs/vit_imagenet.py index 2c209b105..3b2215cbd 100644 --- a/configs/vit_imagenet.py +++ b/configs/vit_imagenet.py @@ -1,5 +1,6 @@ from libai.config import LazyCall from .common.models.vit.vit_base_patch16_224 import model +#from .common.models.vit.vit_large_patch16_224 import model from .common.models.graph import graph from .common.train import train from .common.optim import optim From 85a8cf8a75be563feb89966dfd0dca897fa9b5c4 Mon Sep 17 00:00:00 2001 From: Lichen Pan Date: Fri, 14 Apr 2023 10:57:22 +0800 Subject: [PATCH 29/33] scripts --- configs/vit_imagenet.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/configs/vit_imagenet.py b/configs/vit_imagenet.py index 3b2215cbd..fed91e74f 100644 --- a/configs/vit_imagenet.py +++ b/configs/vit_imagenet.py @@ -1,6 +1,5 @@ from libai.config import LazyCall -from .common.models.vit.vit_base_patch16_224 import model -#from .common.models.vit.vit_large_patch16_224 import model +from .common.models.vit.vit_base_patch16_224 import model #from .common.models.vit.vit_large_patch16_224 import model from .common.models.graph import graph from .common.train import train from .common.optim import optim From 90837d0bdf22155e3f27dcffd0eb175e45e67714 Mon Sep 17 00:00:00 2001 From: Panlichen Date: Mon, 24 Apr 2023 04:56:30 +0000 Subject: [PATCH 30/33] +scripts --- tools/train_a100.sh | 202 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 202 insertions(+) create mode 100755 tools/train_a100.sh diff --git a/tools/train_a100.sh b/tools/train_a100.sh new file mode 100755 index 000000000..20cc9930e --- /dev/null +++ b/tools/train_a100.sh @@ -0,0 +1,202 @@ +#!/usr/bin/env bash + +clear + +FILE=$1 +CONFIG=$2 +GPUS=$3 +NODE=${NODE:-1} +NODE_RANK=${NODE_RANK:-0} +ADDR=${ADDR:-127.0.0.1} +PORT=${PORT:-12345} + +export GLOG_logtostderr=1 +export ONEFLOW_ACTOR_ENABLE_LIGHT_ACTOR=0 # 禁用lightweight actor + +export NCCL_PROTO=Simple +export NCCL_ALGO=Ring +# export NCCL_MAX_NCHANNELS=1 +# export NCCL_MIN_NCHANNELS=1 +# export NCCL_NTHREADS=64 + +if [ -z $RUN_TYPE ];then + RUN_TYPE="PURE" + # RUN_TYPE="GDB" + # RUN_TYPE="NSYS" +fi + +export ONEFLOW_ENABLE_OFCCL=1 +export DISABLE_NCCL_COMPUTE_STREAM=1 +# export ONEFLOW_TIME_SHAPE=1 +export ONEFLOW_DEBUG_MODE=1 +export ONEFLOW_PROFILER_KERNEL_PROFILE_KERNEL_FORWARD_RANGE=1 + +export GLOG_vmodule=plan_util*=1,of_collective_actor*=1,of_collective_boxing_kernels*=1,collective_backend_ofccl*=1,hierarchical_sub_task_graph_builder_impl*=1,of_request_store*=1,request_store*=1,runtime*=1,scheduler*=1,collective_manager*=1,of_collective_boxing_sub_task_graph_builder*=1,collective_boxing_sub_task_graph_builder*=1 +# nn_graph*=1, +# export GLOG_v=1 + +export SHOW_ALL_PREPARED_COLL=1 + +export DEV_TRY_ROUND=10 +export CHECK_REMAINING_SQE_INTERVAL=10000 +export DEBUG_FILE="/home/panlichen/work/oneflow/log/oneflow_cpu_rank_" + +export NUM_ITER_ENV=200 +echo NUM_ITER_ENV=$NUM_ITER_ENV + +if [ $GPUS = 2 ]; then + export CUDA_VISIBLE_DEVICES=4,5 + + #pure dp + # export RECV_SUCCESS_FACTOR=5 + # export RECV_SUCCESS_THRESHOLD=10000 + # export BASE_CTX_SWITCH_THRESHOLD=100 + # export TOLERANT_UNPROGRESSED_CNT=2000 + # export NUM_TRY_TASKQ_HEAD=40 + + #pure tp + export RECV_SUCCESS_FACTOR=20 + export RECV_SUCCESS_THRESHOLD=10000 + export BASE_CTX_SWITCH_THRESHOLD=120 + export TOLERANT_UNPROGRESSED_CNT=10000 + export NUM_TRY_TASKQ_HEAD=100 +elif [ $GPUS = 4 ]; then + export CUDA_VISIBLE_DEVICES=0,1,4,5 + export ONEFLOW_OFCCL_SKIP_NEGO=0 + + #pure dp + # export ONEFLOW_OFCCL_SKIP_NEGO=0 + # export RECV_SUCCESS_FACTOR=40 + # export RECV_SUCCESS_THRESHOLD=10000 + # export BASE_CTX_SWITCH_THRESHOLD=30000 + # export TOLERANT_UNPROGRESSED_CNT=30000 + # export NUM_TRY_TASKQ_HEAD=200 + + #pure tp + export ONEFLOW_OFCCL_SKIP_NEGO=0 + export RECV_SUCCESS_FACTOR=40 + export RECV_SUCCESS_THRESHOLD=1000000000 + export BASE_CTX_SWITCH_THRESHOLD=100000 + export TOLERANT_UNPROGRESSED_CNT=16000 + export NUM_TRY_TASKQ_HEAD=200 + +elif [ $GPUS = 8 ]; then + + #pure dp + # export ONEFLOW_OFCCL_SKIP_NEGO=0 + # export RECV_SUCCESS_FACTOR=30 + # export RECV_SUCCESS_THRESHOLD=100000000 + # export BASE_CTX_SWITCH_THRESHOLD=120000 + # export TOLERANT_UNPROGRESSED_CNT=180000 + # export NUM_TRY_TASKQ_HEAD=240 + + #pure tp + # export ONEFLOW_OFCCL_SKIP_NEGO=0 + # export RECV_SUCCESS_FACTOR=10 + # export RECV_SUCCESS_THRESHOLD=1000000 + # export BASE_CTX_SWITCH_THRESHOLD=6000 + # export TOLERANT_UNPROGRESSED_CNT=8000 + # export NUM_TRY_TASKQ_HEAD=10 + + #4tp2dp + # export ONEFLOW_OFCCL_SKIP_NEGO=0 + # export RECV_SUCCESS_FACTOR=10 + # export RECV_SUCCESS_THRESHOLD=10000000 + # export BASE_CTX_SWITCH_THRESHOLD=20000 + # export TOLERANT_UNPROGRESSED_CNT=9000 + # export NUM_TRY_TASKQ_HEAD=10 + + #3d + export ONEFLOW_OFCCL_SKIP_NEGO=0 + export RECV_SUCCESS_FACTOR=5 + export RECV_SUCCESS_THRESHOLD=10000000 + export BASE_CTX_SWITCH_THRESHOLD=20000 + export TOLERANT_UNPROGRESSED_CNT=80000 + export NUM_TRY_TASKQ_HEAD=10 + + #2dp4pp + # export ONEFLOW_OFCCL_SKIP_NEGO=0 + # export RECV_SUCCESS_FACTOR=5 + # export RECV_SUCCESS_THRESHOLD=10000 + # export BASE_CTX_SWITCH_THRESHOLD=8000 + # export TOLERANT_UNPROGRESSED_CNT=80000 + # export NUM_TRY_TASKQ_HEAD=10 + + #2tp4pp + # export ONEFLOW_OFCCL_SKIP_NEGO=1 + # export RECV_SUCCESS_FACTOR=10 + # export RECV_SUCCESS_THRESHOLD=10000 + # export BASE_CTX_SWITCH_THRESHOLD=12000 + # export TOLERANT_UNPROGRESSED_CNT=8000 + # export NUM_TRY_TASKQ_HEAD=10 + + #4tp2pp + # export ONEFLOW_OFCCL_SKIP_NEGO=1 + # export RECV_SUCCESS_FACTOR=10 + # export RECV_SUCCESS_THRESHOLD=10000 + # export BASE_CTX_SWITCH_THRESHOLD=14000 + # export TOLERANT_UNPROGRESSED_CNT=8000 + # export NUM_TRY_TASKQ_HEAD=10 + +fi + +echo GPUS=$GPUS +echo ONEFLOW_ENABLE_OFCCL=$ONEFLOW_ENABLE_OFCCL +echo ONEFLOW_OFCCL_SKIP_NEGO=$ONEFLOW_OFCCL_SKIP_NEGO +echo ONEFLOW_ENABLE_OFCCL=$ONEFLOW_DEBUG_MODE +echo ONEFLOW_OFCCL_SKIP_NEGO=$ONEFLOW_PROFILER_KERNEL_PROFILE_KERNEL_FORWARD_RANGE +echo NCCL_PROTO=$NCCL_PROTO +echo NCCL_ALGO=$NCCL_ALGO +echo NCCL_MAX_NCHANNELS=$NCCL_MAX_NCHANNELS +echo NCCL_NTHREADS=$NCCL_NTHREADS +echo ONEFLOW_OFCCL_CHAIN=$ONEFLOW_OFCCL_CHAIN +echo GLOG_vmodule=$GLOG_vmodule +echo GLOG_v=$GLOG_v +echo GLOG_logtostderr=$GLOG_logtostderr + +echo RECV_SUCCESS_FACTOR=$RECV_SUCCESS_FACTOR +echo TOLERANT_UNPROGRESSED_CNT=$TOLERANT_UNPROGRESSED_CNT +echo BASE_CTX_SWITCH_THRESHOLD=$BASE_CTX_SWITCH_THRESHOLD +echo NUM_TRY_TASKQ_HEAD=$NUM_TRY_TASKQ_HEAD +echo DEV_TRY_ROUND=$DEV_TRY_ROUND +echo CHECK_REMAINING_SQE_INTERVAL=$CHECK_REMAINING_SQE_INTERVAL +echo DEBUG_FILE=$DEBUG_FILE + +export PYTHONUNBUFFERED=1 +echo PYTHONUNBUFFERED=$PYTHONUNBUFFERED +export NCCL_LAUNCH_MODE=PARALLEL +echo NCCL_LAUNCH_MODE=$NCCL_LAUNCH_MODE +# export NCCL_DEBUG=INFO + +rm -rf /home/panlichen/work/libai/log +mkdir -p /home/panlichen/work/libai/log + +rm -rf /home/panlichen/work/oneflow/log +mkdir -p /home/panlichen/work/oneflow/log + +export ONEFLOW_FUSE_OPTIMIZER_UPDATE_CAST=true + +if [ "$ONEFLOW_ENABLE_OFCCL" == "1" ]; then + NSYS_FILE="ofccl_vit"_${HOST}_${GPUS}_card +else + NSYS_FILE="nccl_vit"_${HOST}_${GPUS}_card +fi + +if [ "$RUN_TYPE" == "PURE" ];then + cmd="python3 -m oneflow.distributed.launch" +elif [ "$RUN_TYPE" == "GDB" ];then + cmd="gdb -ex r --args python3 -m oneflow.distributed.launch" +elif [ "$RUN_TYPE" == "NSYS" ];then + if [ ! -d "/home/panlichen/work/oneflow/log/nsys" ];then + mkdir -p /home/panlichen/work/oneflow/log/nsys + fi + # cmd="nsys profile -f true --trace=cuda,cudnn,cublas,osrt,nvtx -o /home/panlichen/work/oneflow/log/nsys/$NSYS_FILE python3 -m oneflow.distributed.launch" + cmd="nsys profile -f true -o /home/panlichen/work/oneflow/log/nsys/$NSYS_FILE python3 -m oneflow.distributed.launch" +fi +echo cmd=$cmd + +$cmd \ + --nproc_per_node $GPUS --nnodes $NODE --node_rank $NODE_RANK --master_addr $ADDR --master_port $PORT \ + $FILE --config-file $CONFIG ${@:4} \ + > /home/panlichen/work/oneflow/log/oneflow.log 2>&1 + From 3b51f8661dc320f0edc2cfe3340516df83ab592c Mon Sep 17 00:00:00 2001 From: Panlichen Date: Mon, 24 Apr 2023 15:56:57 +0000 Subject: [PATCH 31/33] +scripts --- configs/vit_imagenet_a100.py | 65 ++++++++++++++++++++++++++++++++++++ 1 file changed, 65 insertions(+) create mode 100644 configs/vit_imagenet_a100.py diff --git a/configs/vit_imagenet_a100.py b/configs/vit_imagenet_a100.py new file mode 100644 index 000000000..35e931d21 --- /dev/null +++ b/configs/vit_imagenet_a100.py @@ -0,0 +1,65 @@ +from libai.config import LazyCall +from .common.models.vit.vit_base_patch16_224 import model #from .common.models.vit.vit_large_patch16_224 import model +from .common.models.graph import graph +from .common.train import train +from .common.optim import optim +from .common.data.imagenet import dataloader + +from flowvision.data import Mixup +from flowvision.loss.cross_entropy import SoftTargetCrossEntropy + +# Refine data path to imagenet +dataloader.train.dataset[0].root = "/data/ImageNet/extract" +dataloader.test[0].dataset.root = "/data/ImageNet/extract" + +# Refine model cfg for vit training on imagenet +model.cfg.num_classes = 1000 +model.cfg.loss_func = SoftTargetCrossEntropy() + +# Add Mixup Func +dataloader.train.mixup_func = LazyCall(Mixup)( + mixup_alpha=0.8, + cutmix_alpha=1.0, + prob=1.0, + switch_prob=0.5, + mode="batch", + num_classes=model.cfg.num_classes, +) + +# Refine optimizer cfg for vit model +optim.lr = 1e-3 # 5e-4 * 1024 (batchsize) / 512 +optim.eps = 1e-8 +optim.weight_decay = 0.05 +optim.params.clip_grad_max_norm = None +optim.params.clip_grad_norm_type = None +optim.params.overrides = {"pos_embed": {"weight_decay": 0.0}, "cls_token": {"weight_decay": 0.0}} + +# Refine train cfg for vit model +train.train_micro_batch_size = 128 +train.test_micro_batch_size = 128 +# train.train_epoch = 300 +train.train_epoch = 0 +import os +train.train_iter = int(os.getenv("NUM_ITER_ENV")) +train.warmup_ratio = 5 / 300 +train.evaluation.enabled = False +# train.evaluation.eval_period = 100 +train.log_period = 1 + +# Scheduler +train.scheduler.warmup_factor = 0.001 +train.scheduler.alpha = 0.01 +train.scheduler.warmup_method = "linear" + +# Set fp16 ON +train.amp.enabled = True + +# zero +train.zero_optimization.enabled = False +train.zero_optimization.stage = 1 + +# Distributed Settings +train.dist.pipeline_num_layers = model.cfg.depth +train.dist.data_parallel_size = 2 +train.dist.tensor_parallel_size = 2 +train.dist.pipeline_parallel_size = 2 From 0679d006c0e58b3c408451e6c900bc484ab77430 Mon Sep 17 00:00:00 2001 From: Panlichen Date: Thu, 27 Apr 2023 02:58:01 +0000 Subject: [PATCH 32/33] scripts --- configs/vit_imagenet.py | 3 +++ tools/train_27.sh | 13 ++++++++++--- 2 files changed, 13 insertions(+), 3 deletions(-) diff --git a/configs/vit_imagenet.py b/configs/vit_imagenet.py index fed91e74f..33728b5b0 100644 --- a/configs/vit_imagenet.py +++ b/configs/vit_imagenet.py @@ -12,6 +12,9 @@ dataloader.train.dataset[0].root = "/path/to/imagenet" dataloader.test[0].dataset.root = "/path/to/imagenet" +import os +host = os.getenv("HOST") + if (host == "oneflow-28"): dataloader.train.dataset[0].root = "/ssd/dataset/ImageNet/extract" dataloader.test[0].dataset.root = "/ssd/dataset/ImageNet/extract" diff --git a/tools/train_27.sh b/tools/train_27.sh index 20cc9930e..44305b1fe 100755 --- a/tools/train_27.sh +++ b/tools/train_27.sh @@ -107,12 +107,19 @@ elif [ $GPUS = 8 ]; then # export NUM_TRY_TASKQ_HEAD=10 #3d + # export ONEFLOW_OFCCL_SKIP_NEGO=0 + # export RECV_SUCCESS_FACTOR=5 + # export RECV_SUCCESS_THRESHOLD=10000000 + # export BASE_CTX_SWITCH_THRESHOLD=20000 + # export TOLERANT_UNPROGRESSED_CNT=80000 + # export NUM_TRY_TASKQ_HEAD=10 + export ONEFLOW_OFCCL_SKIP_NEGO=0 export RECV_SUCCESS_FACTOR=5 - export RECV_SUCCESS_THRESHOLD=10000000 - export BASE_CTX_SWITCH_THRESHOLD=20000 + export RECV_SUCCESS_THRESHOLD=2000 + export BASE_CTX_SWITCH_THRESHOLD=200 export TOLERANT_UNPROGRESSED_CNT=80000 - export NUM_TRY_TASKQ_HEAD=10 + export NUM_TRY_TASKQ_HEAD=5 #2dp4pp # export ONEFLOW_OFCCL_SKIP_NEGO=0 From ddd7e4b4b2d1d188c162fd01f689e091ca5d256c Mon Sep 17 00:00:00 2001 From: Lichen Pan Date: Sun, 12 May 2024 16:23:31 +0800 Subject: [PATCH 33/33] +4090_para scripts --- configs/vit_imagenet_para_4090.py | 68 ++++++++++ tools/train_para_4090.sh | 202 ++++++++++++++++++++++++++++++ 2 files changed, 270 insertions(+) create mode 100644 configs/vit_imagenet_para_4090.py create mode 100755 tools/train_para_4090.sh diff --git a/configs/vit_imagenet_para_4090.py b/configs/vit_imagenet_para_4090.py new file mode 100644 index 000000000..6ddf5a1d8 --- /dev/null +++ b/configs/vit_imagenet_para_4090.py @@ -0,0 +1,68 @@ +from libai.config import LazyCall +from .common.models.vit.vit_base_patch16_224 import model #from .common.models.vit.vit_large_patch16_224 import model +from .common.models.graph import graph +from .common.train import train +from .common.optim import optim +from .common.data.imagenet import dataloader + +from flowvision.data import Mixup +from flowvision.loss.cross_entropy import SoftTargetCrossEntropy + + +import os +host = os.getenv("HOST") + + +dataloader.train.dataset[0].root = "/HOME/scw6cab/run/OCCL/ImageNet" +dataloader.test[0].dataset.root = "/HOME/scw6cab/run/OCCL/ImageNet" + +# Refine model cfg for vit training on imagenet +model.cfg.num_classes = 1000 +model.cfg.loss_func = SoftTargetCrossEntropy() + +# Add Mixup Func +dataloader.train.mixup_func = LazyCall(Mixup)( + mixup_alpha=0.8, + cutmix_alpha=1.0, + prob=1.0, + switch_prob=0.5, + mode="batch", + num_classes=model.cfg.num_classes, +) + +# Refine optimizer cfg for vit model +optim.lr = 1e-3 # 5e-4 * 1024 (batchsize) / 512 +optim.eps = 1e-8 +optim.weight_decay = 0.05 +optim.params.clip_grad_max_norm = None +optim.params.clip_grad_norm_type = None +optim.params.overrides = {"pos_embed": {"weight_decay": 0.0}, "cls_token": {"weight_decay": 0.0}} + +# Refine train cfg for vit model +train.train_micro_batch_size = 128 +train.test_micro_batch_size = 128 +# train.train_epoch = 300 +train.train_epoch = 0 +train.train_iter = int(os.getenv("NUM_ITER_ENV")) +train.warmup_ratio = 5 / 300 +train.evaluation.enabled = False +# train.evaluation.eval_period = 100 +train.log_period = 1 + +# Scheduler +train.scheduler.warmup_factor = 0.001 +train.scheduler.alpha = 0.01 +train.scheduler.warmup_method = "linear" + +# Set fp16 ON +train.amp.enabled = True + +# zero +train.zero_optimization.enabled = False +train.zero_optimization.stage = 1 + +# Distributed Settings +train.dist.pipeline_num_layers = model.cfg.depth +train.dist.data_parallel_size = 2 +train.dist.tensor_parallel_size = 2 +train.dist.pipeline_parallel_size = 2 diff --git a/tools/train_para_4090.sh b/tools/train_para_4090.sh new file mode 100755 index 000000000..4128a7f2a --- /dev/null +++ b/tools/train_para_4090.sh @@ -0,0 +1,202 @@ +#!/usr/bin/env bash + +clear + +FILE=$1 +CONFIG=$2 +GPUS=$3 +NODE=${NODE:-1} +NODE_RANK=${NODE_RANK:-0} +ADDR=${ADDR:-127.0.0.1} +PORT=${PORT:-12345} + +export GLOG_logtostderr=1 +export ONEFLOW_ACTOR_ENABLE_LIGHT_ACTOR=0 # 禁用lightweight actor + +export NCCL_PROTO=Simple +export NCCL_ALGO=Ring +# export NCCL_MAX_NCHANNELS=1 +# export NCCL_MIN_NCHANNELS=1 +# export NCCL_NTHREADS=64 + +if [ -z $RUN_TYPE ];then + RUN_TYPE="PURE" + # RUN_TYPE="GDB" + # RUN_TYPE="NSYS" +fi + +export ONEFLOW_ENABLE_OFCCL=0 +export DISABLE_NCCL_COMPUTE_STREAM=1 +# export ONEFLOW_TIME_SHAPE=1 +export ONEFLOW_DEBUG_MODE=1 +export ONEFLOW_PROFILER_KERNEL_PROFILE_KERNEL_FORWARD_RANGE=1 + +export GLOG_vmodule=plan_util*=1,of_collective_actor*=1,of_collective_boxing_kernels*=1,collective_backend_ofccl*=1,hierarchical_sub_task_graph_builder_impl*=1,of_request_store*=1,request_store*=1,runtime*=1,scheduler*=1,collective_manager*=1,of_collective_boxing_sub_task_graph_builder*=1,collective_boxing_sub_task_graph_builder*=1 +# nn_graph*=1, +# export GLOG_v=1 + +export SHOW_ALL_PREPARED_COLL=1 + +export DEV_TRY_ROUND=10 +export CHECK_REMAINING_SQE_INTERVAL=10000 +export DEBUG_FILE="/HOME/scw6cab/run/OCCL/oneflow/log/oneflow_cpu_rank_" + +export NUM_ITER_ENV=200 +echo NUM_ITER_ENV=$NUM_ITER_ENV + +if [ $GPUS = 2 ]; then + # export CUDA_VISIBLE_DEVICES=4,5 + + #pure dp + # export RECV_SUCCESS_FACTOR=5 + # export RECV_SUCCESS_THRESHOLD=10000 + # export BASE_CTX_SWITCH_THRESHOLD=100 + # export TOLERANT_UNPROGRESSED_CNT=2000 + # export NUM_TRY_TASKQ_HEAD=40 + + #pure tp + export RECV_SUCCESS_FACTOR=20 + export RECV_SUCCESS_THRESHOLD=10000 + export BASE_CTX_SWITCH_THRESHOLD=120 + export TOLERANT_UNPROGRESSED_CNT=10000 + export NUM_TRY_TASKQ_HEAD=100 +elif [ $GPUS = 4 ]; then + # export CUDA_VISIBLE_DEVICES=0,1,4,5 + export ONEFLOW_OFCCL_SKIP_NEGO=0 + + #pure dp + # export ONEFLOW_OFCCL_SKIP_NEGO=0 + # export RECV_SUCCESS_FACTOR=40 + # export RECV_SUCCESS_THRESHOLD=10000 + # export BASE_CTX_SWITCH_THRESHOLD=30000 + # export TOLERANT_UNPROGRESSED_CNT=30000 + # export NUM_TRY_TASKQ_HEAD=200 + + #pure tp + export ONEFLOW_OFCCL_SKIP_NEGO=0 + export RECV_SUCCESS_FACTOR=40 + export RECV_SUCCESS_THRESHOLD=1000000000 + export BASE_CTX_SWITCH_THRESHOLD=100000 + export TOLERANT_UNPROGRESSED_CNT=16000 + export NUM_TRY_TASKQ_HEAD=200 + +elif [ $GPUS = 8 ]; then + + #pure dp + # export ONEFLOW_OFCCL_SKIP_NEGO=0 + # export RECV_SUCCESS_FACTOR=30 + # export RECV_SUCCESS_THRESHOLD=100000000 + # export BASE_CTX_SWITCH_THRESHOLD=120000 + # export TOLERANT_UNPROGRESSED_CNT=180000 + # export NUM_TRY_TASKQ_HEAD=240 + + #pure tp + # export ONEFLOW_OFCCL_SKIP_NEGO=0 + # export RECV_SUCCESS_FACTOR=10 + # export RECV_SUCCESS_THRESHOLD=1000000 + # export BASE_CTX_SWITCH_THRESHOLD=6000 + # export TOLERANT_UNPROGRESSED_CNT=8000 + # export NUM_TRY_TASKQ_HEAD=10 + + #4tp2dp + # export ONEFLOW_OFCCL_SKIP_NEGO=0 + # export RECV_SUCCESS_FACTOR=10 + # export RECV_SUCCESS_THRESHOLD=10000000 + # export BASE_CTX_SWITCH_THRESHOLD=20000 + # export TOLERANT_UNPROGRESSED_CNT=9000 + # export NUM_TRY_TASKQ_HEAD=10 + + #3d + export ONEFLOW_OFCCL_SKIP_NEGO=0 + export RECV_SUCCESS_FACTOR=5 + export RECV_SUCCESS_THRESHOLD=10000000 + export BASE_CTX_SWITCH_THRESHOLD=20000 + export TOLERANT_UNPROGRESSED_CNT=80000 + export NUM_TRY_TASKQ_HEAD=10 + + #2dp4pp + # export ONEFLOW_OFCCL_SKIP_NEGO=0 + # export RECV_SUCCESS_FACTOR=5 + # export RECV_SUCCESS_THRESHOLD=10000 + # export BASE_CTX_SWITCH_THRESHOLD=8000 + # export TOLERANT_UNPROGRESSED_CNT=80000 + # export NUM_TRY_TASKQ_HEAD=10 + + #2tp4pp + # export ONEFLOW_OFCCL_SKIP_NEGO=1 + # export RECV_SUCCESS_FACTOR=10 + # export RECV_SUCCESS_THRESHOLD=10000 + # export BASE_CTX_SWITCH_THRESHOLD=12000 + # export TOLERANT_UNPROGRESSED_CNT=8000 + # export NUM_TRY_TASKQ_HEAD=10 + + #4tp2pp + # export ONEFLOW_OFCCL_SKIP_NEGO=1 + # export RECV_SUCCESS_FACTOR=10 + # export RECV_SUCCESS_THRESHOLD=10000 + # export BASE_CTX_SWITCH_THRESHOLD=14000 + # export TOLERANT_UNPROGRESSED_CNT=8000 + # export NUM_TRY_TASKQ_HEAD=10 + +fi + +echo GPUS=$GPUS +echo ONEFLOW_ENABLE_OFCCL=$ONEFLOW_ENABLE_OFCCL +echo ONEFLOW_OFCCL_SKIP_NEGO=$ONEFLOW_OFCCL_SKIP_NEGO +echo ONEFLOW_ENABLE_OFCCL=$ONEFLOW_DEBUG_MODE +echo ONEFLOW_OFCCL_SKIP_NEGO=$ONEFLOW_PROFILER_KERNEL_PROFILE_KERNEL_FORWARD_RANGE +echo NCCL_PROTO=$NCCL_PROTO +echo NCCL_ALGO=$NCCL_ALGO +echo NCCL_MAX_NCHANNELS=$NCCL_MAX_NCHANNELS +echo NCCL_NTHREADS=$NCCL_NTHREADS +echo ONEFLOW_OFCCL_CHAIN=$ONEFLOW_OFCCL_CHAIN +echo GLOG_vmodule=$GLOG_vmodule +echo GLOG_v=$GLOG_v +echo GLOG_logtostderr=$GLOG_logtostderr + +echo RECV_SUCCESS_FACTOR=$RECV_SUCCESS_FACTOR +echo TOLERANT_UNPROGRESSED_CNT=$TOLERANT_UNPROGRESSED_CNT +echo BASE_CTX_SWITCH_THRESHOLD=$BASE_CTX_SWITCH_THRESHOLD +echo NUM_TRY_TASKQ_HEAD=$NUM_TRY_TASKQ_HEAD +echo DEV_TRY_ROUND=$DEV_TRY_ROUND +echo CHECK_REMAINING_SQE_INTERVAL=$CHECK_REMAINING_SQE_INTERVAL +echo DEBUG_FILE=$DEBUG_FILE + +export PYTHONUNBUFFERED=1 +echo PYTHONUNBUFFERED=$PYTHONUNBUFFERED +export NCCL_LAUNCH_MODE=PARALLEL +echo NCCL_LAUNCH_MODE=$NCCL_LAUNCH_MODE +# export NCCL_DEBUG=INFO + +rm -rf /HOME/scw6cab/run/OCCL/libai/log +mkdir -p /HOME/scw6cab/run/OCCL/libai/log + +rm -rf /HOME/scw6cab/run/OCCL/oneflow/log +mkdir -p /HOME/scw6cab/run/OCCL/oneflow/log + +export ONEFLOW_FUSE_OPTIMIZER_UPDATE_CAST=true + +if [ "$ONEFLOW_ENABLE_OFCCL" == "1" ]; then + NSYS_FILE="ofccl_vit"_${HOST}_${GPUS}_card +else + NSYS_FILE="nccl_vit"_${HOST}_${GPUS}_card +fi + +if [ "$RUN_TYPE" == "PURE" ];then + cmd="python3 -m oneflow.distributed.launch" +elif [ "$RUN_TYPE" == "GDB" ];then + cmd="gdb -ex r --args python3 -m oneflow.distributed.launch" +elif [ "$RUN_TYPE" == "NSYS" ];then + if [ ! -d "/HOME/scw6cab/run/OCCL/oneflow/log/nsys" ];then + mkdir -p /HOME/scw6cab/run/OCCL/oneflow/log/nsys + fi + # cmd="nsys profile -f true --trace=cuda,cudnn,cublas,osrt,nvtx -o /HOME/scw6cab/run/OCCL/oneflow/log/nsys/$NSYS_FILE python3 -m oneflow.distributed.launch" + cmd="nsys profile -f true -o /HOME/scw6cab/run/OCCL/oneflow/log/nsys/$NSYS_FILE python3 -m oneflow.distributed.launch" +fi +echo cmd=$cmd + +$cmd \ + --nproc_per_node $GPUS --nnodes $NODE --node_rank $NODE_RANK --master_addr $ADDR --master_port $PORT \ + $FILE --config-file $CONFIG ${@:4} \ + > /HOME/scw6cab/run/OCCL/oneflow/log/oneflow.log 2>&1 +