From 15422e135f2c0c2079ef5d4fa46988c94339acc8 Mon Sep 17 00:00:00 2001 From: ljn7 Date: Tue, 8 Jul 2025 11:00:40 +0000 Subject: [PATCH 01/14] Updated for CUDA 11 and 12 --- .gitignore | 132 ++++++++++++++++++++ CMakeLists.txt | 214 +++++++++++++++----------------- README.md | 1 + pytorch_binding/README.md | 3 +- pytorch_binding/setup.py | 8 +- pytorch_binding/src/binding.cpp | 76 +++++++----- 6 files changed, 285 insertions(+), 149 deletions(-) create mode 100644 .gitignore diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..9d54d6b --- /dev/null +++ b/.gitignore @@ -0,0 +1,132 @@ +################################## +# Python +################################## + +# Byte-compiled +__pycache__/ +*.py[cod] +*$py.class + +# Virtual envs +.env/ +.venv/ +env/ +venv/ + +# Packaging +build/ +dist/ +*.egg-info/ +.eggs/ +*.whl + +# Caches / testing +.pytest_cache/ +htmlcov/ +coverage.* +.tox/ +.nox/ + +# Type checkers +.mypy_cache/ +.pytype/ +.pyre/ + +# Jupyter +.ipynb_checkpoints/ + +################################## +# C++ / CUDA / CMake +################################## + +# Objects and binaries +*.o +*.obj +*.a +*.so +*.dll +*.lib +*.dylib +*.out +*.exe +*.app + +# CUDA +*.cu.o +*.ptx +*.cubin +*.fatbin +*.nvvp +*.nvcpl +*.nvd* + +# Dependency / compiler files +*.d +*.mod + +# CMake and build +/build/ +CMakeFiles/ +CMakeCache.txt +cmake_install.cmake +Makefile +CTestTestfile.cmake +install_manifest.txt +compile_commands.json + +# Ninja +.ninja_log +.ninja_deps +build.ninja +rules.ninja + +################################## +# Python bindings (PyTorch / TensorFlow) +################################## + +# Compiled extensions +*.so +*.pyd +*.dll + +# Generated by setup.py +warprnnt_pytorch.egg-info/ +warprnnt_tensorflow.egg-info/ + +# Temporary output +*.log + +################################## +# Docs +################################## + +# LaTeX +*.aux +*.log +*.out +*.pdf +*.toc +*.lof +*.lot + +################################## +# IDE / Editor / OS +################################## + +# VSCode +.vscode/ + +# JetBrains / CLion / PyCharm +.idea/ +cmake-build-*/ + +# Vim / Emacs +*.swp +*.swo +*~ + +# macOS / Windows +.DS_Store +Thumbs.db +desktop.ini + diff --git a/CMakeLists.txt b/CMakeLists.txt index 2c570e0..c4b0ad3 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,136 +1,126 @@ -IF (APPLE) - cmake_minimum_required(VERSION 3.4) -ELSE() - cmake_minimum_required(VERSION 2.8) -ENDIF() +cmake_minimum_required(VERSION 3.18) -project(rnnt_release) +project(rnnt_release LANGUAGES CXX CUDA) -IF (NOT APPLE) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O2") -ENDIF() +# Set global flags +set(CMAKE_CXX_STANDARD 17) +set(CMAKE_POSITION_INDEPENDENT_CODE ON) -IF (APPLE) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -O2") +if(NOT APPLE) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O2") +else() + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++17 -O2") add_definitions(-DAPPLE) -ENDIF() +endif() +# Include headers include_directories(include) -FIND_PACKAGE(CUDA) -MESSAGE(STATUS "cuda found ${CUDA_FOUND}") +# Find CUDA Toolkit +find_package(CUDAToolkit REQUIRED) +message(STATUS "CUDA toolkit found") +# Options option(USE_NAIVE_KERNEL "use naive alpha-beta kernel" OFF) option(DEBUG_TIME "output kernel time" OFF) option(DEBUG_KERNEL "output alpha beta" OFF) -if (USE_NAIVE_KERNEL) - add_definitions(-DUSE_NAIVE_KERNEL) +option(WITH_GPU "compile warp-rnnt with cuda." ON) +option(WITH_OMP "compile warp-rnnt with openmp." ON) + +# Preprocessor flags +set(COMMON_DEFINITIONS "") +if(USE_NAIVE_KERNEL) + list(APPEND COMMON_DEFINITIONS -DUSE_NAIVE_KERNEL) endif() -if (DEBUG_TIME) - add_definitions(-DDEBUG_TIME) +if(DEBUG_TIME) + list(APPEND COMMON_DEFINITIONS -DDEBUG_TIME) endif() -if (DEBUG_KERNEL) - add_definitions(-DDEBUG_KERNEL) +if(DEBUG_KERNEL) + list(APPEND COMMON_DEFINITIONS -DDEBUG_KERNEL) endif() - -option(WITH_GPU "compile warp-rnnt with cuda." ${CUDA_FOUND}) -option(WITH_OMP "compile warp-rnnt with openmp." ON) - if(NOT WITH_OMP) - add_definitions(-DRNNT_DISABLE_OMP) + list(APPEND COMMON_DEFINITIONS -DRNNT_DISABLE_OMP) endif() -if (WITH_OMP) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fopenmp") - set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -Xcompiler -fopenmp") -endif() - -# need to be at least 30 or __shfl_down in reduce wont compile -set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_30,code=sm_30 -O2") -set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_35,code=sm_35") - -set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_50,code=sm_50") -set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_52,code=sm_52") -IF(CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 5) - SET(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -D_MWAITXINTRIN_H_INCLUDED -D_FORCE_INLINES") -ENDIF() - -IF (CUDA_VERSION GREATER 7.6) - set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_60,code=sm_60") - set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_61,code=sm_61") - set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_62,code=sm_62") -ENDIF() - -IF (CUDA_VERSION GREATER 8.9) - set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_70,code=sm_70") -ENDIF() - -IF (CUDA_VERSION GREATER 9.9) - set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_75,code=sm_75") -ENDIF() - -if (NOT APPLE) - set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} --std=c++11") - set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS}") -ENDIF() +# OpenMP +if(WITH_OMP) + find_package(OpenMP) + if(OpenMP_CXX_FOUND) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}") + endif() +endif() -IF (APPLE) - EXEC_PROGRAM(uname ARGS -v OUTPUT_VARIABLE DARWIN_VERSION) - STRING(REGEX MATCH "[0-9]+" DARWIN_VERSION ${DARWIN_VERSION}) - MESSAGE(STATUS "DARWIN_VERSION=${DARWIN_VERSION}") - - #for el capitain have to use rpath - - IF (DARWIN_VERSION LESS 15) - set(CMAKE_SKIP_RPATH TRUE) - ENDIF () - -ELSE() - #always skip for linux - set(CMAKE_SKIP_RPATH TRUE) -ENDIF() - - -IF (WITH_GPU) - - MESSAGE(STATUS "Building shared library with GPU support") - - CUDA_ADD_LIBRARY(warprnnt SHARED src/rnnt_entrypoint.cu) - IF (!Torch_FOUND) - TARGET_LINK_LIBRARIES(warprnnt ${CUDA_curand_LIBRARY}) - ENDIF() - - cuda_add_executable(test_time_gpu tests/test_time.cu tests/random.cpp ) - TARGET_LINK_LIBRARIES(test_time_gpu warprnnt ${CUDA_curand_LIBRARY}) - SET_TARGET_PROPERTIES(test_time_gpu PROPERTIES COMPILE_FLAGS "${CMAKE_CXX_FLAGS} --std=c++11") - - cuda_add_executable(test_gpu tests/test_gpu.cu tests/random.cpp ) - TARGET_LINK_LIBRARIES(test_gpu warprnnt ${CUDA_curand_LIBRARY}) - SET_TARGET_PROPERTIES(test_gpu PROPERTIES COMPILE_FLAGS "${CMAKE_CXX_FLAGS} --std=c++11") - -ELSE() - MESSAGE(STATUS "Building shared library with no GPU support") - - if (NOT APPLE) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -O2") - ENDIF() - - ADD_LIBRARY(warprnnt SHARED src/rnnt_entrypoint.cpp) +set(CMAKE_CUDA_ARCHITECTURES 86) -ENDIF() +# -------------------- +# Shared Library Target +# -------------------- +if(WITH_GPU) + message(STATUS "Building shared library with GPU support") + add_library(warprnnt SHARED src/rnnt_entrypoint.cu) + target_link_libraries(warprnnt PRIVATE CUDA::cudart CUDA::curand) +else() + message(STATUS "Building shared library with no GPU support") -add_executable(test_cpu tests/test_cpu.cpp tests/random.cpp ) -TARGET_LINK_LIBRARIES(test_cpu warprnnt) -SET_TARGET_PROPERTIES(test_cpu PROPERTIES COMPILE_FLAGS "${CMAKE_CXX_FLAGS} --std=c++11") + add_library(warprnnt SHARED src/rnnt_entrypoint.cpp) +endif() -add_executable(test_time tests/test_time.cpp tests/random.cpp ) -TARGET_LINK_LIBRARIES(test_time warprnnt) -SET_TARGET_PROPERTIES(test_time PROPERTIES COMPILE_FLAGS "${CMAKE_CXX_FLAGS} --std=c++11") +target_compile_features(warprnnt PRIVATE cxx_std_17) +target_compile_definitions(warprnnt PRIVATE ${COMMON_DEFINITIONS}) +target_include_directories(warprnnt PRIVATE include) +set_target_properties(warprnnt PROPERTIES + CUDA_SEPARABLE_COMPILATION ON + CUDA_STANDARD 17 + CUDA_STANDARD_REQUIRED ON +) +target_compile_options(warprnnt PRIVATE + $<$:--expt-relaxed-constexpr> +) + +# -------------------- +# Executables +# -------------------- +add_executable(test_cpu tests/test_cpu.cpp tests/random.cpp) +target_link_libraries(test_cpu PRIVATE warprnnt) +target_compile_features(test_cpu PRIVATE cxx_std_17) + +add_executable(test_time tests/test_time.cpp tests/random.cpp) +target_link_libraries(test_time PRIVATE warprnnt) +target_compile_features(test_time PRIVATE cxx_std_17) + +if(WITH_GPU) + add_executable(test_time_gpu tests/test_time.cu tests/random.cpp) + target_link_libraries(test_time_gpu PRIVATE warprnnt CUDA::curand) + target_compile_features(test_time_gpu PRIVATE cxx_std_17) + set_target_properties(test_time_gpu PROPERTIES + CUDA_SEPARABLE_COMPILATION ON + CUDA_STANDARD 17 + CUDA_STANDARD_REQUIRED ON + ) + target_compile_options(test_time_gpu PRIVATE + $<$:--expt-relaxed-constexpr> + ) + + add_executable(test_gpu tests/test_gpu.cu tests/random.cpp) + target_link_libraries(test_gpu PRIVATE warprnnt CUDA::curand) + target_compile_features(test_gpu PRIVATE cxx_std_17) + set_target_properties(test_gpu PROPERTIES + CUDA_SEPARABLE_COMPILATION ON + CUDA_STANDARD 17 + CUDA_STANDARD_REQUIRED ON + ) + target_compile_options(test_gpu PRIVATE + $<$:--expt-relaxed-constexpr> + ) +endif() -INSTALL(TARGETS warprnnt - RUNTIME DESTINATION "bin" - LIBRARY DESTINATION "lib" - ARCHIVE DESTINATION "lib") +# -------------------- +# Installation +# -------------------- +install(TARGETS warprnnt + RUNTIME DESTINATION bin + LIBRARY DESTINATION lib + ARCHIVE DESTINATION lib) -INSTALL(FILES include/rnnt.h DESTINATION "include") +install(FILES include/rnnt.h DESTINATION include) \ No newline at end of file diff --git a/README.md b/README.md index 11263ef..749a1e4 100644 --- a/README.md +++ b/README.md @@ -57,6 +57,7 @@ Run cmake and build: ```bash cmake -DCUDA_TOOLKIT_ROOT_DIR=$CUDA_HOME .. make +sudo make install #optional ``` if it logs ``` diff --git a/pytorch_binding/README.md b/pytorch_binding/README.md index cce44e7..190f56e 100644 --- a/pytorch_binding/README.md +++ b/pytorch_binding/README.md @@ -16,6 +16,7 @@ cd warp-transducer mkdir build; cd build cmake .. make +sudo make install # optional ``` Otherwise, set `WARP_RNNT_PATH` to wherever you have `libwarprnnt.so` @@ -30,7 +31,7 @@ export CUDA_HOME="/usr/local/cuda" Now install the bindings: (Please make sure the GCC version >= 4.9) ``` cd pytorch_binding -python setup.py install +pip install . # pip install . --break-package-system ``` If you try the above and get a dlopen error on OSX with anaconda3 (as recommended by pytorch): diff --git a/pytorch_binding/setup.py b/pytorch_binding/setup.py index ad960ea..08726f4 100644 --- a/pytorch_binding/setup.py +++ b/pytorch_binding/setup.py @@ -1,4 +1,4 @@ -from distutils.version import LooseVersion +from packaging.version import Version import os import platform import sys @@ -8,7 +8,9 @@ extra_compile_args = ['-fPIC'] -if LooseVersion(torch.__version__) >= LooseVersion("1.5.0"): +if Version(torch.__version__) >= Version("1.8.0"): + extra_compile_args += ['-std=c++17'] +elif Version(torch.__version__) >= Version("1.5.0"): extra_compile_args += ['-std=c++14'] else: extra_compile_args += ['-std=c++11'] @@ -39,7 +41,7 @@ setup( name='warprnnt_pytorch', - version="0.1", + version="0.2.0", description="PyTorch wrapper for RNN-Transducer", url="https://github.com/HawkAaron/warp-transducer", author="Mingkun Huang", diff --git a/pytorch_binding/src/binding.cpp b/pytorch_binding/src/binding.cpp index 7aaf000..53e5d2d 100644 --- a/pytorch_binding/src/binding.cpp +++ b/pytorch_binding/src/binding.cpp @@ -5,8 +5,8 @@ #include "rnnt.h" #ifdef WARPRNNT_ENABLE_GPU - #include "THC.h" - extern THCState* state; + #include + #include #endif int cpu_rnnt(torch::Tensor acts, @@ -23,11 +23,11 @@ int cpu_rnnt(torch::Tensor acts, int minibatch_size = acts.size(2); int alphabet_size = acts.size(3); - if (true) { - minibatch_size = acts.size(0); - maxT = acts.size(1); - maxU = acts.size(2); - } + if (true) { + minibatch_size = acts.size(0); + maxT = acts.size(1); + maxU = acts.size(2); + } rnntOptions options; memset(&options, 0, sizeof(options)); @@ -43,33 +43,33 @@ int cpu_rnnt(torch::Tensor acts, #endif size_t cpu_size_bytes = 0; - switch (acts.type().scalarType()) { - case torch::ScalarType::Float: + switch (acts.scalar_type()) { + case torch::kFloat: { get_workspace_size(maxT, maxU, minibatch_size, false, &cpu_size_bytes); float* cpu_workspace = (float*) new unsigned char[cpu_size_bytes]; - compute_rnnt_loss(acts.data(), grads.data(), - labels.data(), label_lengths.data(), - input_lengths.data(), alphabet_size, - minibatch_size, costs.data(), + compute_rnnt_loss(acts.data_ptr(), grads.data_ptr(), + labels.data_ptr(), label_lengths.data_ptr(), + input_lengths.data_ptr(), alphabet_size, + minibatch_size, costs.data_ptr(), cpu_workspace, options); delete cpu_workspace; return 0; } - case torch::ScalarType::Double: + case torch::kDouble: { get_workspace_size(maxT, maxU, minibatch_size, false, &cpu_size_bytes, sizeof(double)); double* cpu_workspace = (double*) new unsigned char[cpu_size_bytes]; - compute_rnnt_loss_fp64(acts.data(), grads.data(), - labels.data(), label_lengths.data(), - input_lengths.data(), alphabet_size, - minibatch_size, costs.data(), + compute_rnnt_loss_fp64(acts.data_ptr(), grads.data_ptr(), + labels.data_ptr(), label_lengths.data_ptr(), + input_lengths.data_ptr(), alphabet_size, + minibatch_size, costs.data_ptr(), cpu_workspace, options); delete cpu_workspace; @@ -108,8 +108,8 @@ int gpu_rnnt(torch::Tensor acts, options.num_threads = std::max(options.num_threads, (unsigned int) 1); #endif - switch (acts.type().scalarType()) { - case torch::ScalarType::Float: + switch (acts.scalar_type()) { + case torch::kFloat: { size_t gpu_size_bytes; get_workspace_size(maxT, maxU, minibatch_size, @@ -117,18 +117,23 @@ int gpu_rnnt(torch::Tensor acts, cudaSetDevice(acts.get_device()); - void* gpu_workspace = THCudaMalloc(state, gpu_size_bytes); + void* gpu_workspace = nullptr; + cudaMalloc(&gpu_workspace, gpu_size_bytes); + if (gpu_workspace == nullptr) { + std::cerr << __FILE__ << ':' << __LINE__ << ": " << "failed to allocate GPU workspace" << std::endl; + return -1; + } - compute_rnnt_loss(acts.data(), grads.data(), - labels.data(), label_lengths.data(), - input_lengths.data(), alphabet_size, - minibatch_size, costs.data(), + compute_rnnt_loss(acts.data_ptr(), grads.data_ptr(), + labels.data_ptr(), label_lengths.data_ptr(), + input_lengths.data_ptr(), alphabet_size, + minibatch_size, costs.data_ptr(), gpu_workspace, options); - THCudaFree(state, gpu_workspace); + cudaFree(gpu_workspace); return 0; } - case torch::ScalarType::Double: + case torch::kDouble: { size_t gpu_size_bytes; get_workspace_size(maxT, maxU, minibatch_size, @@ -136,15 +141,20 @@ int gpu_rnnt(torch::Tensor acts, cudaSetDevice(acts.get_device()); - void* gpu_workspace = THCudaMalloc(state, gpu_size_bytes); + void* gpu_workspace = nullptr; + cudaMalloc(&gpu_workspace, gpu_size_bytes); + if (gpu_workspace == nullptr) { + std::cerr << __FILE__ << ':' << __LINE__ << ": " << "failed to allocate GPU workspace" << std::endl; + return -1; + } - compute_rnnt_loss_fp64(acts.data(), grads.data(), - labels.data(), label_lengths.data(), - input_lengths.data(), alphabet_size, - minibatch_size, costs.data(), + compute_rnnt_loss_fp64(acts.data_ptr(), grads.data_ptr(), + labels.data_ptr(), label_lengths.data_ptr(), + input_lengths.data_ptr(), alphabet_size, + minibatch_size, costs.data_ptr(), gpu_workspace, options); - THCudaFree(state, gpu_workspace); + cudaFree(gpu_workspace); return 0; } default: From 5cbc6d685c6e13789a00c8e0cafbf34eed0685d7 Mon Sep 17 00:00:00 2001 From: John Nirmal Date: Wed, 9 Jul 2025 16:29:46 +0530 Subject: [PATCH 02/14] Update README.md --- pytorch_binding/README.md | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/pytorch_binding/README.md b/pytorch_binding/README.md index 190f56e..4be1252 100644 --- a/pytorch_binding/README.md +++ b/pytorch_binding/README.md @@ -10,6 +10,7 @@ Install [PyTorch](https://github.com/pytorch/pytorch#installation). (i.e. `libwarprnnt.so`). This defaults to `../build`, so from within a new warp-transducer clone you could build WarpRNNT like this: + ```bash git clone https://github.com/HawkAaron/warp-transducer cd warp-transducer @@ -90,3 +91,33 @@ forward(acts, labels, act_lens, label_lens): label_lens: Tensor of (batch) containing label length of each example """ ``` + +## Troubleshooting: `cuda_runtime_api.h: No such file or directory` + +If you encounter an error like this during installation: + +` +fatal error: cuda_runtime_api.h: No such file or directory +` + +This usually means the CUDA headers can't be found by the compiler. If your system has CUDA installed (e.g., version 12.9) and the file `cuda_runtime_api.h` exists somewhere like: + +` +/usr/local/cuda-12.9/targets/x86_64-linux/include/cuda_runtime_api.h +` + +Then you can fix this by manually setting the environment variables to point to the correct CUDA paths: + +```bash +export CUDA_HOME=/usr/local/cuda-12.9 +export CFLAGS="-I$CUDA_HOME/targets/x86_64-linux/include" +export LDFLAGS="-L$CUDA_HOME/targets/x86_64-linux/lib" +export PATH="$CUDA_HOME/bin:$PATH" +export LD_LIBRARY_PATH="$CUDA_HOME/lib64:$LD_LIBRARY_PATH" +``` + +Then run the installation again: + +```bash +pip install . +``` From a1de641cc96acc321079c712d986e84e98c0dd18 Mon Sep 17 00:00:00 2001 From: ljn7 Date: Wed, 16 Jul 2025 10:03:53 +0000 Subject: [PATCH 03/14] Changed to PyTorch CUDA allocator --- pytorch_binding/src/binding.cpp | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/pytorch_binding/src/binding.cpp b/pytorch_binding/src/binding.cpp index 53e5d2d..ec9ada7 100644 --- a/pytorch_binding/src/binding.cpp +++ b/pytorch_binding/src/binding.cpp @@ -5,8 +5,7 @@ #include "rnnt.h" #ifdef WARPRNNT_ENABLE_GPU - #include - #include + #include "c10/cuda/CUDACachingAllocator.h" #endif int cpu_rnnt(torch::Tensor acts, @@ -117,8 +116,7 @@ int gpu_rnnt(torch::Tensor acts, cudaSetDevice(acts.get_device()); - void* gpu_workspace = nullptr; - cudaMalloc(&gpu_workspace, gpu_size_bytes); + void* gpu_workspace = c10::cuda::CUDACachingAllocator::raw_alloc(gpu_size_bytes); if (gpu_workspace == nullptr) { std::cerr << __FILE__ << ':' << __LINE__ << ": " << "failed to allocate GPU workspace" << std::endl; return -1; @@ -130,7 +128,7 @@ int gpu_rnnt(torch::Tensor acts, minibatch_size, costs.data_ptr(), gpu_workspace, options); - cudaFree(gpu_workspace); + c10::cuda::CUDACachingAllocator::raw_delete(gpu_workspace); return 0; } case torch::kDouble: @@ -141,8 +139,7 @@ int gpu_rnnt(torch::Tensor acts, cudaSetDevice(acts.get_device()); - void* gpu_workspace = nullptr; - cudaMalloc(&gpu_workspace, gpu_size_bytes); + void* gpu_workspace = c10::cuda::CUDACachingAllocator::raw_alloc(gpu_size_bytes); if (gpu_workspace == nullptr) { std::cerr << __FILE__ << ':' << __LINE__ << ": " << "failed to allocate GPU workspace" << std::endl; return -1; @@ -154,7 +151,7 @@ int gpu_rnnt(torch::Tensor acts, minibatch_size, costs.data_ptr(), gpu_workspace, options); - cudaFree(gpu_workspace); + c10::cuda::CUDACachingAllocator::raw_delete(gpu_workspace); return 0; } default: From 2806d2da0ef163bf05a0507bdbab90c5463a2240 Mon Sep 17 00:00:00 2001 From: ljn7 Date: Wed, 16 Jul 2025 10:28:42 +0000 Subject: [PATCH 04/14] Requires C++17 for torch 2.1.0 and above and requires packaging --- pytorch_binding/setup.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/pytorch_binding/setup.py b/pytorch_binding/setup.py index 08726f4..af8c094 100644 --- a/pytorch_binding/setup.py +++ b/pytorch_binding/setup.py @@ -8,7 +8,7 @@ extra_compile_args = ['-fPIC'] -if Version(torch.__version__) >= Version("1.8.0"): +if Version(torch.__version__) >= Version("2.1.0"): extra_compile_args += ['-std=c++17'] elif Version(torch.__version__) >= Version("1.5.0"): extra_compile_args += ['-std=c++14'] @@ -47,6 +47,9 @@ author="Mingkun Huang", author_email="mingkunhuang95@gmail.com", packages=find_packages(), + install_requires=[ + 'packaging' + ], ext_modules=[ CppExtension( name='warprnnt_pytorch.warp_rnnt', From 64477aefb0ed1d1a1a033cba71a9703bd9410ef2 Mon Sep 17 00:00:00 2001 From: ljn7 Date: Wed, 16 Jul 2025 10:55:36 +0000 Subject: [PATCH 05/14] Fixed test file for GPU testing --- pytorch_binding/test/test.py | 20 ++++++++------------ 1 file changed, 8 insertions(+), 12 deletions(-) diff --git a/pytorch_binding/test/test.py b/pytorch_binding/test/test.py index 14feb09..c378ed0 100644 --- a/pytorch_binding/test/test.py +++ b/pytorch_binding/test/test.py @@ -23,28 +23,24 @@ fn = rnntloss() if args.np else RNNTLoss(reduction='sum') -gpu = 1 +#acts = autograd.Variable(acts, requires_grad=True) def wrap_and_call(acts, labels): acts = torch.FloatTensor(acts) - if use_cuda: - acts = acts.cuda(gpu) - #acts = autograd.Variable(acts, requires_grad=True) + device = torch.device("cuda" if use_cuda else "cpu") + acts = acts.to(device) acts.requires_grad = True lengths = [acts.shape[1]] * acts.shape[0] label_lengths = [len(l) for l in labels] - labels = torch.IntTensor(labels) - lengths = torch.IntTensor(lengths) - label_lengths = torch.IntTensor(label_lengths) - if use_cuda: - labels = labels.cuda(gpu) - lengths = lengths.cuda(gpu) - label_lengths = label_lengths.cuda(gpu) + + labels = torch.IntTensor(labels).to(device) + lengths = torch.IntTensor(lengths).to(device) + label_lengths = torch.IntTensor(label_lengths).to(device) costs = fn(acts, labels, lengths, label_lengths) cost = torch.sum(costs) cost.backward() - # print(repr(acts.grad.data.cpu().numpy())) + return costs.data.cpu().numpy(), acts.grad.data.cpu().numpy() From a11cedd01f0d197016d0db2ad11e618d1026cd7f Mon Sep 17 00:00:00 2001 From: John Nirmal Date: Thu, 17 Jul 2025 10:52:00 +0530 Subject: [PATCH 06/14] Update README.md --- README.md | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/README.md b/README.md index 749a1e4..1ec3b88 100644 --- a/README.md +++ b/README.md @@ -69,6 +69,30 @@ please run `rm CMakeCache.txt` and cmake again. The C library should now be built along with test executables. If CUDA was detected, then `test_gpu` will be built; `test_cpu` will always be built. +## Troubleshooting: `cuda_runtime_api.h: No such file or directory` + +If you encounter an error like this during installation: + +` +fatal error: cuda_runtime_api.h: No such file or directory +` + +This usually means the CUDA headers can't be found by the compiler. If your system has CUDA installed (e.g., version 12.9) and the file `cuda_runtime_api.h` exists somewhere like: + +` +/usr/local/cuda-12.9/targets/x86_64-linux/include/cuda_runtime_api.h +` + +Then you can fix this by manually setting the environment variables to point to the correct CUDA paths: + +```bash +export CUDA_HOME=/usr/local/cuda-12.9 +export CFLAGS="-I$CUDA_HOME/targets/x86_64-linux/include" +export LDFLAGS="-L$CUDA_HOME/targets/x86_64-linux/lib" +export PATH="$CUDA_HOME/bin:$PATH" +export LD_LIBRARY_PATH="$CUDA_HOME/lib64:$LD_LIBRARY_PATH" +``` + ## Test To run the tests, make sure the CUDA libraries are in `LD_LIBRARY_PATH` (DYLD_LIBRARY_PATH for OSX). From a8b11432712581123d4e27650b6150bf91825319 Mon Sep 17 00:00:00 2001 From: ljn7 Date: Thu, 17 Jul 2025 13:30:32 +0000 Subject: [PATCH 07/14] - Merged FastEmit code (originally by b-flo, forked from HawkAaron/warp-transducer) - Adapted and updated for compatibility and enhancements on my fork - Updated relevant documentation and options - Updated CMakeLists.txt to support CUDA and CMake version selection Updated CMakeLists.txt to support CUDA and CMake version detection/selection --- CMakeLists.txt | 233 ++++++++++++++----- include/detail/cpu_rnnt.h | 10 +- include/detail/gpu_rnnt.h | 18 +- include/detail/gpu_rnnt_kernel.h | 47 ++++ include/rnnt.h | 2 + pytorch_binding/src/binding.cpp | 4 + pytorch_binding/warprnnt_pytorch/__init__.py | 14 +- src/rnnt_entrypoint.cpp | 14 +- 8 files changed, 260 insertions(+), 82 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index c4b0ad3..f7e4034 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,10 +1,22 @@ -cmake_minimum_required(VERSION 3.18) +if(APPLE) + cmake_minimum_required(VERSION 3.4) +else() + cmake_minimum_required(VERSION 3.5) +endif() project(rnnt_release LANGUAGES CXX CUDA) -# Set global flags +# ==== USER OPTIONS ==== +option(WITH_GPU "Build with CUDA support" ON) +option(WITH_OMP "Build with OpenMP support" ON) +option(USE_NAIVE_KERNEL "Use naive alpha-beta kernel" OFF) +option(DEBUG_TIME "Output kernel time" OFF) +option(DEBUG_KERNEL "Output alpha beta debug" OFF) + set(CMAKE_CXX_STANDARD 17) +set(CMAKE_CXX_STANDARD_REQUIRED ON) set(CMAKE_POSITION_INDEPENDENT_CODE ON) +set(CMAKE_CXX_EXTENSIONS OFF) if(NOT APPLE) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O2") @@ -13,21 +25,9 @@ else() add_definitions(-DAPPLE) endif() -# Include headers include_directories(include) -# Find CUDA Toolkit -find_package(CUDAToolkit REQUIRED) -message(STATUS "CUDA toolkit found") - -# Options -option(USE_NAIVE_KERNEL "use naive alpha-beta kernel" OFF) -option(DEBUG_TIME "output kernel time" OFF) -option(DEBUG_KERNEL "output alpha beta" OFF) -option(WITH_GPU "compile warp-rnnt with cuda." ON) -option(WITH_OMP "compile warp-rnnt with openmp." ON) - -# Preprocessor flags +# ==== PREPROCESSOR FLAGS ==== set(COMMON_DEFINITIONS "") if(USE_NAIVE_KERNEL) list(APPEND COMMON_DEFINITIONS -DUSE_NAIVE_KERNEL) @@ -42,7 +42,6 @@ if(NOT WITH_OMP) list(APPEND COMMON_DEFINITIONS -DRNNT_DISABLE_OMP) endif() -# OpenMP if(WITH_OMP) find_package(OpenMP) if(OpenMP_CXX_FOUND) @@ -50,37 +49,137 @@ if(WITH_OMP) endif() endif() -set(CMAKE_CUDA_ARCHITECTURES 86) +# ==== SAFE CUDA VERSION DETECTION & ARCH SELECTION ==== + +# Try to detect CUDA version. Set major.minor or fallback to "0" +if(CMAKE_VERSION VERSION_GREATER_EQUAL "3.18") + find_package(CUDAToolkit REQUIRED) + set(CUDA_TOOLKIT_VERSION "${CUDAToolkit_VERSION}") + if(NOT CUDA_TOOLKIT_VERSION OR "${CUDA_TOOLKIT_VERSION}" STREQUAL "") + # CMake >= 3.18 but CUDAToolkit_VERSION not set? Fallback logic + set(CUDA_TOOLKIT_VERSION "0.0") + endif() +else() + find_package(CUDA REQUIRED) + if(DEFINED CUDA_VERSION AND NOT "${CUDA_VERSION}" STREQUAL "") + set(CUDA_TOOLKIT_VERSION "${CUDA_VERSION}") + else() + set(CUDA_TOOLKIT_VERSION "0.0") + endif() +endif() + +# Split CUDA version into major/minor +function(cuda_version_major VERSION OUTVAR) + if(NOT "${VERSION}" STREQUAL "") + string(REPLACE "." ";" TMP_LIST ${VERSION}) + list(GET TMP_LIST 0 MAJOR) + set(${OUTVAR} "${MAJOR}" PARENT_SCOPE) + else() + set(${OUTVAR} "0" PARENT_SCOPE) + endif() +endfunction() +function(cuda_version_minor VERSION OUTVAR) + if(NOT "${VERSION}" STREQUAL "") + string(REPLACE "." ";" TMP_LIST ${VERSION}) + list(LENGTH TMP_LIST TMP_LEN) + if(TMP_LEN GREATER 1) + list(GET TMP_LIST 1 MINOR) + else() + set(MINOR "0") + endif() + set(${OUTVAR} "${MINOR}" PARENT_SCOPE) + else() + set(${OUTVAR} "0" PARENT_SCOPE) + endif() +endfunction() + +cuda_version_major("${CUDA_TOOLKIT_VERSION}" CUDA_VERSION_MAJOR) +cuda_version_minor("${CUDA_TOOLKIT_VERSION}" CUDA_VERSION_MINOR) + +set(CUDA_ARCH_LIST "") +if(CUDA_TOOLKIT_VERSION STREQUAL "" OR CUDA_VERSION_MAJOR EQUAL 0) + set(CUDA_ARCH_LIST "52;60;70;75") + message(WARNING "Could not detect CUDA version. Defaulting to minimal arch set (52;60;70;75).") +elseif(CUDA_VERSION_MAJOR LESS 11) + set(CUDA_ARCH_LIST "52;60;70;75") +elseif(CUDA_VERSION_MAJOR EQUAL 11) + set(CUDA_ARCH_LIST "52;60;70;75;80;86") +elseif(CUDA_VERSION_MAJOR GREATER_EQUAL 12) + set(CUDA_ARCH_LIST "52;60;70;75;80;86;89") +endif() + +# Allow user override +if(DEFINED CMAKE_CUDA_ARCHITECTURES AND NOT "${CMAKE_CUDA_ARCHITECTURES}" STREQUAL "") + set(CUDA_ARCH_LIST "${CMAKE_CUDA_ARCHITECTURES}") +endif() + +if(CMAKE_VERSION VERSION_GREATER_EQUAL "3.18") + set(CMAKE_CUDA_ARCHITECTURES "${CUDA_ARCH_LIST}" CACHE STRING "Target GPU architectures") + message(STATUS "Auto-selected CUDA archs for CUDA ${CUDA_TOOLKIT_VERSION}: ${CMAKE_CUDA_ARCHITECTURES}") +else() + string(REPLACE ";" " " CUDA_ARCH_FLAGS "${CUDA_ARCH_LIST}") + set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} ") + foreach(ARCH ${CUDA_ARCH_LIST}) + set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_${ARCH},code=sm_${ARCH}") + endforeach() + set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} --std=c++11") + message(STATUS "Auto-selected CUDA NVCC_ARCH flags for CUDA ${CUDA_TOOLKIT_VERSION}: ${CUDA_NVCC_FLAGS}") +endif() + +if(CUDA_TOOLKIT_VERSION AND CUDA_VERSION_MAJOR LESS 11) + if(CUDA_ARCH_LIST MATCHES "80|86|89") + message(WARNING " +Detected CUDA ${CUDA_TOOLKIT_VERSION}. +Some requested architectures (${CUDA_ARCH_LIST}) require CUDA >= 11 (Ampere/Ada). +Remove sm_80/86/89 or upgrade CUDA Toolkit if you see NVCC errors. +") + endif() +endif() + +# ==== Apple/Mac rpath ==== +if(APPLE) + execute_process(COMMAND uname -v OUTPUT_VARIABLE DARWIN_VERSION) + string(REGEX MATCH "[0-9]+" DARWIN_VERSION "${DARWIN_VERSION}") + message(STATUS "DARWIN_VERSION=${DARWIN_VERSION}") + if(${DARWIN_VERSION} LESS 15) + set(CMAKE_SKIP_RPATH TRUE) + endif() +else() + set(CMAKE_SKIP_RPATH TRUE) +endif() -# -------------------- -# Shared Library Target -# -------------------- +# ==== Main library target ==== if(WITH_GPU) message(STATUS "Building shared library with GPU support") - - add_library(warprnnt SHARED src/rnnt_entrypoint.cu) - target_link_libraries(warprnnt PRIVATE CUDA::cudart CUDA::curand) + if(CMAKE_VERSION VERSION_GREATER_EQUAL "3.18") + add_library(warprnnt SHARED src/rnnt_entrypoint.cu) + target_link_libraries(warprnnt PRIVATE CUDA::cudart CUDA::curand) + else() + CUDA_ADD_LIBRARY(warprnnt SHARED src/rnnt_entrypoint.cu) + if(NOT Torch_FOUND) + TARGET_LINK_LIBRARIES(warprnnt ${CUDA_curand_LIBRARY}) + endif() + endif() else() message(STATUS "Building shared library with no GPU support") - add_library(warprnnt SHARED src/rnnt_entrypoint.cpp) endif() target_compile_features(warprnnt PRIVATE cxx_std_17) target_compile_definitions(warprnnt PRIVATE ${COMMON_DEFINITIONS}) target_include_directories(warprnnt PRIVATE include) -set_target_properties(warprnnt PROPERTIES - CUDA_SEPARABLE_COMPILATION ON - CUDA_STANDARD 17 - CUDA_STANDARD_REQUIRED ON -) -target_compile_options(warprnnt PRIVATE - $<$:--expt-relaxed-constexpr> -) - -# -------------------- -# Executables -# -------------------- +if(CMAKE_VERSION VERSION_GREATER_EQUAL "3.18" AND TARGET warprnnt) + set_target_properties(warprnnt PROPERTIES + CUDA_SEPARABLE_COMPILATION ON + CUDA_STANDARD 17 + CUDA_STANDARD_REQUIRED ON + ) + target_compile_options(warprnnt PRIVATE + $<$:--expt-relaxed-constexpr> + ) +endif() + +# ==== Test executables ==== add_executable(test_cpu tests/test_cpu.cpp tests/random.cpp) target_link_libraries(test_cpu PRIVATE warprnnt) target_compile_features(test_cpu PRIVATE cxx_std_17) @@ -90,37 +189,45 @@ target_link_libraries(test_time PRIVATE warprnnt) target_compile_features(test_time PRIVATE cxx_std_17) if(WITH_GPU) - add_executable(test_time_gpu tests/test_time.cu tests/random.cpp) - target_link_libraries(test_time_gpu PRIVATE warprnnt CUDA::curand) - target_compile_features(test_time_gpu PRIVATE cxx_std_17) - set_target_properties(test_time_gpu PROPERTIES - CUDA_SEPARABLE_COMPILATION ON - CUDA_STANDARD 17 - CUDA_STANDARD_REQUIRED ON - ) - target_compile_options(test_time_gpu PRIVATE - $<$:--expt-relaxed-constexpr> - ) - - add_executable(test_gpu tests/test_gpu.cu tests/random.cpp) - target_link_libraries(test_gpu PRIVATE warprnnt CUDA::curand) - target_compile_features(test_gpu PRIVATE cxx_std_17) - set_target_properties(test_gpu PROPERTIES - CUDA_SEPARABLE_COMPILATION ON - CUDA_STANDARD 17 - CUDA_STANDARD_REQUIRED ON - ) - target_compile_options(test_gpu PRIVATE - $<$:--expt-relaxed-constexpr> - ) + if(CMAKE_VERSION VERSION_GREATER_EQUAL "3.18") + add_executable(test_time_gpu tests/test_time.cu tests/random.cpp) + target_link_libraries(test_time_gpu PRIVATE warprnnt CUDA::curand) + target_compile_features(test_time_gpu PRIVATE cxx_std_17) + set_target_properties(test_time_gpu PROPERTIES + CUDA_SEPARABLE_COMPILATION ON + CUDA_STANDARD 17 + CUDA_STANDARD_REQUIRED ON + ) + target_compile_options(test_time_gpu PRIVATE + $<$:--expt-relaxed-constexpr> + ) + + add_executable(test_gpu tests/test_gpu.cu tests/random.cpp) + target_link_libraries(test_gpu PRIVATE warprnnt CUDA::curand) + target_compile_features(test_gpu PRIVATE cxx_std_17) + set_target_properties(test_gpu PROPERTIES + CUDA_SEPARABLE_COMPILATION ON + CUDA_STANDARD 17 + CUDA_STANDARD_REQUIRED ON + ) + target_compile_options(test_gpu PRIVATE + $<$:--expt-relaxed-constexpr> + ) + else() + cuda_add_executable(test_time_gpu tests/test_time.cu tests/random.cpp ) + TARGET_LINK_LIBRARIES(test_time_gpu warprnnt ${CUDA_curand_LIBRARY}) + set_target_properties(test_time_gpu PROPERTIES COMPILE_FLAGS "${CMAKE_CXX_FLAGS} --std=c++11") + + cuda_add_executable(test_gpu tests/test_gpu.cu tests/random.cpp ) + TARGET_LINK_LIBRARIES(test_gpu warprnnt ${CUDA_curand_LIBRARY}) + set_target_properties(test_gpu PROPERTIES COMPILE_FLAGS "${CMAKE_CXX_FLAGS} --std=c++11") + endif() endif() -# -------------------- -# Installation -# -------------------- +# ==== Install targets ==== install(TARGETS warprnnt RUNTIME DESTINATION bin LIBRARY DESTINATION lib ARCHIVE DESTINATION lib) -install(FILES include/rnnt.h DESTINATION include) \ No newline at end of file +install(FILES include/rnnt.h DESTINATION include) diff --git a/include/detail/cpu_rnnt.h b/include/detail/cpu_rnnt.h index 0042b34..f4e586f 100644 --- a/include/detail/cpu_rnnt.h +++ b/include/detail/cpu_rnnt.h @@ -20,9 +20,9 @@ class CpuRNNT { public: // Noncopyable CpuRNNT(int minibatch, int maxT, int maxU, int alphabet_size, void* workspace, - int blank, int num_threads, bool batch_first) : + int blank, float fastemit_lambda, int num_threads, bool batch_first) : minibatch_(minibatch), maxT_(maxT), maxU_(maxU), alphabet_size_(alphabet_size), - workspace_(workspace), blank_(blank), num_threads_(num_threads), batch_first(batch_first) { + workspace_(workspace), blank_(blank), fastemit_lambda_(fastemit_lambda), num_threads_(num_threads), batch_first(batch_first) { #if defined(RNNT_DISABLE_OMP) || defined(APPLE) #else if (num_threads > 0) { @@ -82,6 +82,7 @@ class CpuRNNT { int alphabet_size_; // Number of characters plus blank void* workspace_; int blank_; + float fastemit_lambda_; int num_threads_; bool batch_first; @@ -259,7 +260,10 @@ CpuRNNT::compute_betas_and_grad(ProbT* grad, const ProbT* const log_probs } if (u < U-1) { ProbT g = alphas[idx(t, u)] + betas[idx(t, u+1)]; - grad[idx(t, u, labels[u])] = -std::exp(log_probs[idx(t, u) * 2 + 1] + g - loglike); + grad[idx(t, u, labels[u])] = -( + (1. + fastemit_lambda_) + * std::exp(log_probs[idx(t, u) * 2 + 1] + g - loglike) + ); } } } diff --git a/include/detail/gpu_rnnt.h b/include/detail/gpu_rnnt.h index dd7f874..c8e67df 100644 --- a/include/detail/gpu_rnnt.h +++ b/include/detail/gpu_rnnt.h @@ -20,9 +20,9 @@ class GpuRNNT { public: // Noncopyable GpuRNNT(int minibatch, int maxT, int maxU, int alphabet_size, void* workspace, - int blank, int num_threads, CUstream stream) : + int blank, float fastemit_lambda, int num_threads, CUstream stream) : minibatch_(minibatch), maxT_(maxT), maxU_(maxU), alphabet_size_(alphabet_size), - gpu_workspace(workspace), blank_(blank), num_threads_(num_threads), stream_(stream) { + gpu_workspace(workspace), blank_(blank), fastemit_lambda_(fastemit_lambda), num_threads_(num_threads), stream_(stream) { #if defined(RNNT_DISABLE_OMP) || defined(APPLE) #else if (num_threads > 0) { @@ -65,6 +65,7 @@ class GpuRNNT { int alphabet_size_; // Number of characters plus blank void* gpu_workspace; int blank_; + float fastemit_lambda_; int num_threads_; CUstream stream_; @@ -195,9 +196,16 @@ GpuRNNT::compute_cost_and_score(const ProbT* const acts, start = std::chrono::high_resolution_clock::now(); #endif // TODO optimize gradient kernel - compute_grad_kernel<128, ProbT><<>>(grads, - acts, denom, alphas, betas, llForward, input_lengths, label_lengths, labels, - minibatch_, maxT_, maxU_, alphabet_size_, blank_); + + if (fastemit_lambda_ > 0.0f) { + compute_fastemit_grad_kernel<128, ProbT><<>>(grads, + acts, denom, alphas, betas, llForward, input_lengths, label_lengths, labels, + minibatch_, maxT_, maxU_, alphabet_size_, blank_, fastemit_lambda_); + } else { + compute_grad_kernel<128, ProbT><<>>(grads, + acts, denom, alphas, betas, llForward, input_lengths, label_lengths, labels, + minibatch_, maxT_, maxU_, alphabet_size_, blank_); + } #if defined(DEBUG_TIME) cudaStreamSynchronize(stream_); end = std::chrono::high_resolution_clock::now(); diff --git a/include/detail/gpu_rnnt_kernel.h b/include/detail/gpu_rnnt_kernel.h index 738d7ef..ed0b0e6 100644 --- a/include/detail/gpu_rnnt_kernel.h +++ b/include/detail/gpu_rnnt_kernel.h @@ -177,3 +177,50 @@ __global__ void compute_grad_kernel(Tp* grads, const Tp* const acts, const Tp* c } } } + +template +__global__ void compute_fastemit_grad_kernel(Tp* grads, const Tp* const acts, const Tp* const denom, const Tp* alphas, const Tp* betas, const Tp* const logll, const int* const xlen, const int* const ylen, + const int* const mlabels, const int minibatch, const int maxT, const int maxU, const int alphabet_size, const int blank_, const Tp fastemit_lambda) { + int tid = threadIdx.x; // alphabet dim + int idx = tid; + int col = blockIdx.x; // mb, t, u + + int u = col % maxU; + int bt = (col - u) / maxU; + int t = bt % maxT; + int mb = (bt - t) / maxT; + + const int T = xlen[mb]; + const int U = ylen[mb] + 1; + const int* labels = mlabels + mb * (maxU - 1); + + if (t < T && u < U) { + while (idx < alphabet_size) { + Tp logpk = denom[col] + acts[col * alphabet_size + idx]; + // Tp logpk = logp(denom, acts, maxT, maxU, alphabet_size, mb, t, u, idx); + Tp grad = exp(alphas[col] + betas[col] + logpk - logll[mb]); + + Tp logy_btu1 = rnnt_helper::neg_inf(); // log(y(t,u)) + log(beta(t, u+1)) + if (u < U-1) { + logy_btu1 = denom[col] + acts[col * alphabet_size + labels[u]] + betas[col+1]; + } + grad += fastemit_lambda * exp(alphas[col] + logy_btu1 + logpk - logll[mb]); + + // grad to last blank transition + if (idx == blank_ && t == T-1 && u == U-1) { + grad -= exp(alphas[col] + logpk - logll[mb]); + grad -= fastemit_lambda * exp(alphas[col] + logy_btu1 + logpk - logll[mb]); + } + if (idx == blank_ && t < T-1) { + grad -= exp(alphas[col] + logpk - logll[mb] + betas[col + maxU]); + } + if (u < U-1 && idx == labels[u]) { + grad -= exp(alphas[col] + logpk - logll[mb] + betas[col+1]); + grad -= fastemit_lambda * exp(alphas[col] + logy_btu1 - logll[mb]); + } + grads[col * alphabet_size + idx] = grad; + + idx += NT; + } + } +} diff --git a/include/rnnt.h b/include/rnnt.h index 4759e0a..207065f 100644 --- a/include/rnnt.h +++ b/include/rnnt.h @@ -61,6 +61,8 @@ struct rnntOptions { /// memory structure bool batch_first; + + float fastemit_lambda; }; /** Compute the RNN Transducer loss between a sequence diff --git a/pytorch_binding/src/binding.cpp b/pytorch_binding/src/binding.cpp index ec9ada7..95f5ff8 100644 --- a/pytorch_binding/src/binding.cpp +++ b/pytorch_binding/src/binding.cpp @@ -15,6 +15,7 @@ int cpu_rnnt(torch::Tensor acts, torch::Tensor costs, torch::Tensor grads, int blank_label, + float fastemit_lambda, int num_threads) { int maxT = acts.size(0); @@ -32,6 +33,7 @@ int cpu_rnnt(torch::Tensor acts, memset(&options, 0, sizeof(options)); options.maxT = maxT; options.maxU = maxU; + options.fastemit_lambda = fastemit_lambda; options.blank_label = blank_label; options.batch_first = true; options.loc = RNNT_CPU; @@ -87,6 +89,7 @@ int gpu_rnnt(torch::Tensor acts, torch::Tensor costs, torch::Tensor grads, int blank_label, + float fastemit_lambda, int num_threads) { int minibatch_size = acts.size(0); @@ -101,6 +104,7 @@ int gpu_rnnt(torch::Tensor acts, options.blank_label = blank_label; options.loc = RNNT_GPU; options.stream = at::cuda::getCurrentCUDAStream(); + options.fastemit_lambda = fastemit_lambda; options.num_threads = num_threads; #if defined(RNNT_DISABLE_OMP) || defined(APPLE) // have to use at least one diff --git a/pytorch_binding/warprnnt_pytorch/__init__.py b/pytorch_binding/warprnnt_pytorch/__init__.py index 76c6c30..a078fb0 100644 --- a/pytorch_binding/warprnnt_pytorch/__init__.py +++ b/pytorch_binding/warprnnt_pytorch/__init__.py @@ -15,6 +15,7 @@ def forward(ctx, acts, labels, act_lens, label_lens, blank, reduction): labels: 2 dimensional Tensor containing all the targets of the batch with zero padded act_lens: Tensor of size (batch) containing size of each output sequence from the network label_lens: Tensor of (batch) containing label length of each example + fastemit_lambda: Regularization parameter for FastEmit (https://arxiv.org/pdf/2010.11148.pdf) """ is_cuda = acts.is_cuda @@ -31,6 +32,7 @@ def forward(ctx, acts, labels, act_lens, label_lens, blank, reduction): costs, grads, blank, + fastemit_lambda, 0) if reduction in ['sum', 'mean']: @@ -47,10 +49,10 @@ def forward(ctx, acts, labels, act_lens, label_lens, blank, reduction): @staticmethod def backward(ctx, grad_output): grad_output = grad_output.view(-1, 1, 1, 1).to(ctx.grads) - return ctx.grads.mul_(grad_output), None, None, None, None, None + return ctx.grads.mul_(grad_output), None, None, None, None, None, None -def rnnt_loss(acts, labels, act_lens, label_lens, blank=0, reduction='mean'): +def rnnt_loss(acts, labels, act_lens, label_lens, blank=0, reduction='mean', fastemit_lambda=0.001): """ RNN Transducer Loss Args: @@ -63,11 +65,12 @@ def rnnt_loss(acts, labels, act_lens, label_lens, blank=0, reduction='mean'): 'none' | 'mean' | 'sum'. 'none': no reduction will be applied, 'mean': the output losses will be divided by the target lengths and then the mean over the batch is taken. Default: 'mean' + fastemit_lambda: Regularization parameter for FastEmit (https://arxiv.org/pdf/2010.11148.pdf) """ if not acts.is_cuda: acts = torch.nn.functional.log_softmax(acts, -1) - return _RNNT.apply(acts, labels, act_lens, label_lens, blank, reduction) + return _RNNT.apply(acts, labels, act_lens, label_lens, blank, reduction, fastemit_lambda) class RNNTLoss(Module): @@ -79,10 +82,11 @@ class RNNTLoss(Module): 'mean': the output losses will be divided by the target lengths and then the mean over the batch is taken. Default: 'mean' """ - def __init__(self, blank=0, reduction='mean'): + def __init__(self, blank=0, fastemit_lambda=0.001, reduction='mean'): super(RNNTLoss, self).__init__() self.blank = blank self.reduction = reduction + self.fastemit_lambda = fastemit_lambda self.loss = _RNNT.apply def forward(self, acts, labels, act_lens, label_lens): @@ -97,7 +101,7 @@ def forward(self, acts, labels, act_lens, label_lens): # log_softmax is computed within GPU version. acts = torch.nn.functional.log_softmax(acts, -1) - return self.loss(acts, labels, act_lens, label_lens, self.blank, self.reduction) + return self.loss(acts, labels, act_lens, label_lens, self.blank, self.reduction, self.fastemit_lambda) def check_type(var, t, name): diff --git a/src/rnnt_entrypoint.cpp b/src/rnnt_entrypoint.cpp index 0d742d8..9854470 100644 --- a/src/rnnt_entrypoint.cpp +++ b/src/rnnt_entrypoint.cpp @@ -55,12 +55,13 @@ rnntStatus_t compute_rnnt_loss(const float* const activations, //BTUV alphabet_size <= 0 || minibatch <= 0 || options.maxT <= 0 || - options.maxU <= 0) + options.maxU <= 0 || + options.fastemit_lambda < 0) return RNNT_STATUS_INVALID_VALUE; if (options.loc == RNNT_CPU) { CpuRNNT rnnt(minibatch, options.maxT, options.maxU, alphabet_size, workspace, - options.blank_label, options.num_threads, options.batch_first); + options.blank_label, options.fastemit_lambda, options.num_threads, options.batch_first); if (gradients != NULL) return rnnt.cost_and_grad(activations, gradients, @@ -73,7 +74,7 @@ rnntStatus_t compute_rnnt_loss(const float* const activations, //BTUV } else if (options.loc == RNNT_GPU) { #ifdef __CUDACC__ GpuRNNT rnnt(minibatch, options.maxT, options.maxU, alphabet_size, workspace, - options.blank_label, options.num_threads, options.stream); + options.blank_label, options.fastemit_lambda, options.num_threads, options.stream); if (gradients != NULL) return rnnt.cost_and_grad(activations, gradients, @@ -147,12 +148,13 @@ rnntStatus_t compute_rnnt_loss_fp64(const double* const activations, //BTUV alphabet_size <= 0 || minibatch <= 0 || options.maxT <= 0 || - options.maxU <= 0) + options.maxU <= 0 || + options.fastemit_lambda < 0) return RNNT_STATUS_INVALID_VALUE; if (options.loc == RNNT_CPU) { CpuRNNT rnnt(minibatch, options.maxT, options.maxU, alphabet_size, workspace, - options.blank_label, options.num_threads, options.batch_first); + options.blank_label, options.fastemit_lambda, options.num_threads, options.batch_first); if (gradients != NULL) return rnnt.cost_and_grad(activations, gradients, @@ -165,7 +167,7 @@ rnntStatus_t compute_rnnt_loss_fp64(const double* const activations, //BTUV } else if (options.loc == RNNT_GPU) { #ifdef __CUDACC__ GpuRNNT rnnt(minibatch, options.maxT, options.maxU, alphabet_size, workspace, - options.blank_label, options.num_threads, options.stream); + options.blank_label, options.fastemit_lambda, options.num_threads, options.stream); if (gradients != NULL) return rnnt.cost_and_grad(activations, gradients, From d4bdce529285711862d37586f3cb24bbe2e32590 Mon Sep 17 00:00:00 2001 From: John Nirmal Date: Sat, 19 Jul 2025 20:34:04 +0530 Subject: [PATCH 08/14] Update CMakeLists.txt Detect CUDA to on/off WITH_GPU by default --- CMakeLists.txt | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index f7e4034..273285b 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -6,8 +6,16 @@ endif() project(rnnt_release LANGUAGES CXX CUDA) +if(CMAKE_VERSION VERSION_GREATER_EQUAL "3.18") + find_package(CUDAToolkit) + set(CUDA_FOUND ${CUDAToolkit_FOUND}) +else() + find_package(CUDA) + set(CUDA_FOUND ${CUDA_FOUND}) +endif() + # ==== USER OPTIONS ==== -option(WITH_GPU "Build with CUDA support" ON) +option(WITH_GPU "Build with CUDA support" ${CUDA_FOUND}) option(WITH_OMP "Build with OpenMP support" ON) option(USE_NAIVE_KERNEL "Use naive alpha-beta kernel" OFF) option(DEBUG_TIME "Output kernel time" OFF) From 946669ae910d40dd0144be10c69c030017734dd4 Mon Sep 17 00:00:00 2001 From: John Nirmal Date: Sat, 19 Jul 2025 22:01:58 +0530 Subject: [PATCH 09/14] Add graceful CUDA detection and fallback to CPU-only build - Check CUDA availability before declaring it as project language - Fall back to CPU-only build when CUDA toolkit not found - Prevents "Failed to find nvcc" error on systems without CUDA - Maintains GPU support when CUDA is properly installed --- CMakeLists.txt | 170 ++++++++++++++++++++++++++----------------------- 1 file changed, 90 insertions(+), 80 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 273285b..fcb7766 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -4,16 +4,22 @@ else() cmake_minimum_required(VERSION 3.5) endif() -project(rnnt_release LANGUAGES CXX CUDA) - if(CMAKE_VERSION VERSION_GREATER_EQUAL "3.18") - find_package(CUDAToolkit) + find_package(CUDAToolkit QUIET) set(CUDA_FOUND ${CUDAToolkit_FOUND}) else() - find_package(CUDA) + find_package(CUDA QUIET) set(CUDA_FOUND ${CUDA_FOUND}) endif() +if(CUDA_FOUND) + project(rnnt_release LANGUAGES CXX CUDA) + message(STATUS "CUDA found, enabling GPU support") +else() + project(rnnt_release LANGUAGES CXX) + message(STATUS "CUDA not found, building CPU-only version") +endif() + # ==== USER OPTIONS ==== option(WITH_GPU "Build with CUDA support" ${CUDA_FOUND}) option(WITH_OMP "Build with OpenMP support" ON) @@ -21,6 +27,11 @@ option(USE_NAIVE_KERNEL "Use naive alpha-beta kernel" OFF) option(DEBUG_TIME "Output kernel time" OFF) option(DEBUG_KERNEL "Output alpha beta debug" OFF) +if(WITH_GPU AND NOT CUDA_FOUND) + message(WARNING "WITH_GPU requested but CUDA not found. Building CPU-only version.") + set(WITH_GPU OFF) +endif() + set(CMAKE_CXX_STANDARD 17) set(CMAKE_CXX_STANDARD_REQUIRED ON) set(CMAKE_POSITION_INDEPENDENT_CODE ON) @@ -57,90 +68,89 @@ if(WITH_OMP) endif() endif() -# ==== SAFE CUDA VERSION DETECTION & ARCH SELECTION ==== - -# Try to detect CUDA version. Set major.minor or fallback to "0" -if(CMAKE_VERSION VERSION_GREATER_EQUAL "3.18") - find_package(CUDAToolkit REQUIRED) - set(CUDA_TOOLKIT_VERSION "${CUDAToolkit_VERSION}") - if(NOT CUDA_TOOLKIT_VERSION OR "${CUDA_TOOLKIT_VERSION}" STREQUAL "") - # CMake >= 3.18 but CUDAToolkit_VERSION not set? Fallback logic - set(CUDA_TOOLKIT_VERSION "0.0") - endif() -else() - find_package(CUDA REQUIRED) - if(DEFINED CUDA_VERSION AND NOT "${CUDA_VERSION}" STREQUAL "") - set(CUDA_TOOLKIT_VERSION "${CUDA_VERSION}") +# ==== CUDA SETUP (only if WITH_GPU is ON) ==== +if(WITH_GPU) + if(CMAKE_VERSION VERSION_GREATER_EQUAL "3.18") + find_package(CUDAToolkit REQUIRED) + set(CUDA_TOOLKIT_VERSION "${CUDAToolkit_VERSION}") + if(NOT CUDA_TOOLKIT_VERSION OR "${CUDA_TOOLKIT_VERSION}" STREQUAL "") + set(CUDA_TOOLKIT_VERSION "0.0") + endif() else() - set(CUDA_TOOLKIT_VERSION "0.0") + find_package(CUDA REQUIRED) + if(DEFINED CUDA_VERSION AND NOT "${CUDA_VERSION}" STREQUAL "") + set(CUDA_TOOLKIT_VERSION "${CUDA_VERSION}") + else() + set(CUDA_TOOLKIT_VERSION "0.0") + endif() endif() -endif() -# Split CUDA version into major/minor -function(cuda_version_major VERSION OUTVAR) - if(NOT "${VERSION}" STREQUAL "") - string(REPLACE "." ";" TMP_LIST ${VERSION}) - list(GET TMP_LIST 0 MAJOR) - set(${OUTVAR} "${MAJOR}" PARENT_SCOPE) - else() - set(${OUTVAR} "0" PARENT_SCOPE) - endif() -endfunction() -function(cuda_version_minor VERSION OUTVAR) - if(NOT "${VERSION}" STREQUAL "") - string(REPLACE "." ";" TMP_LIST ${VERSION}) - list(LENGTH TMP_LIST TMP_LEN) - if(TMP_LEN GREATER 1) - list(GET TMP_LIST 1 MINOR) + # Split CUDA version into major/minor + function(cuda_version_major VERSION OUTVAR) + if(NOT "${VERSION}" STREQUAL "") + string(REPLACE "." ";" TMP_LIST ${VERSION}) + list(GET TMP_LIST 0 MAJOR) + set(${OUTVAR} "${MAJOR}" PARENT_SCOPE) else() - set(MINOR "0") + set(${OUTVAR} "0" PARENT_SCOPE) endif() - set(${OUTVAR} "${MINOR}" PARENT_SCOPE) - else() - set(${OUTVAR} "0" PARENT_SCOPE) + endfunction() + function(cuda_version_minor VERSION OUTVAR) + if(NOT "${VERSION}" STREQUAL "") + string(REPLACE "." ";" TMP_LIST ${VERSION}) + list(LENGTH TMP_LIST TMP_LEN) + if(TMP_LEN GREATER 1) + list(GET TMP_LIST 1 MINOR) + else() + set(MINOR "0") + endif() + set(${OUTVAR} "${MINOR}" PARENT_SCOPE) + else() + set(${OUTVAR} "0" PARENT_SCOPE) + endif() + endfunction() + + cuda_version_major("${CUDA_TOOLKIT_VERSION}" CUDA_VERSION_MAJOR) + cuda_version_minor("${CUDA_TOOLKIT_VERSION}" CUDA_VERSION_MINOR) + + set(CUDA_ARCH_LIST "") + if(CUDA_TOOLKIT_VERSION STREQUAL "" OR CUDA_VERSION_MAJOR EQUAL 0) + set(CUDA_ARCH_LIST "52;60;70;75") + message(WARNING "Could not detect CUDA version. Defaulting to minimal arch set (52;60;70;75).") + elseif(CUDA_VERSION_MAJOR LESS 11) + set(CUDA_ARCH_LIST "52;60;70;75") + elseif(CUDA_VERSION_MAJOR EQUAL 11) + set(CUDA_ARCH_LIST "52;60;70;75;80;86") + elseif(CUDA_VERSION_MAJOR GREATER_EQUAL 12) + set(CUDA_ARCH_LIST "52;60;70;75;80;86;89") endif() -endfunction() - -cuda_version_major("${CUDA_TOOLKIT_VERSION}" CUDA_VERSION_MAJOR) -cuda_version_minor("${CUDA_TOOLKIT_VERSION}" CUDA_VERSION_MINOR) - -set(CUDA_ARCH_LIST "") -if(CUDA_TOOLKIT_VERSION STREQUAL "" OR CUDA_VERSION_MAJOR EQUAL 0) - set(CUDA_ARCH_LIST "52;60;70;75") - message(WARNING "Could not detect CUDA version. Defaulting to minimal arch set (52;60;70;75).") -elseif(CUDA_VERSION_MAJOR LESS 11) - set(CUDA_ARCH_LIST "52;60;70;75") -elseif(CUDA_VERSION_MAJOR EQUAL 11) - set(CUDA_ARCH_LIST "52;60;70;75;80;86") -elseif(CUDA_VERSION_MAJOR GREATER_EQUAL 12) - set(CUDA_ARCH_LIST "52;60;70;75;80;86;89") -endif() -# Allow user override -if(DEFINED CMAKE_CUDA_ARCHITECTURES AND NOT "${CMAKE_CUDA_ARCHITECTURES}" STREQUAL "") - set(CUDA_ARCH_LIST "${CMAKE_CUDA_ARCHITECTURES}") -endif() -if(CMAKE_VERSION VERSION_GREATER_EQUAL "3.18") - set(CMAKE_CUDA_ARCHITECTURES "${CUDA_ARCH_LIST}" CACHE STRING "Target GPU architectures") - message(STATUS "Auto-selected CUDA archs for CUDA ${CUDA_TOOLKIT_VERSION}: ${CMAKE_CUDA_ARCHITECTURES}") -else() - string(REPLACE ";" " " CUDA_ARCH_FLAGS "${CUDA_ARCH_LIST}") - set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} ") - foreach(ARCH ${CUDA_ARCH_LIST}) - set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_${ARCH},code=sm_${ARCH}") - endforeach() - set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} --std=c++11") - message(STATUS "Auto-selected CUDA NVCC_ARCH flags for CUDA ${CUDA_TOOLKIT_VERSION}: ${CUDA_NVCC_FLAGS}") -endif() + if(DEFINED CMAKE_CUDA_ARCHITECTURES AND NOT "${CMAKE_CUDA_ARCHITECTURES}" STREQUAL "") + set(CUDA_ARCH_LIST "${CMAKE_CUDA_ARCHITECTURES}") + endif() + + if(CMAKE_VERSION VERSION_GREATER_EQUAL "3.18") + set(CMAKE_CUDA_ARCHITECTURES "${CUDA_ARCH_LIST}" CACHE STRING "Target GPU architectures") + message(STATUS "Auto-selected CUDA archs for CUDA ${CUDA_TOOLKIT_VERSION}: ${CMAKE_CUDA_ARCHITECTURES}") + else() + string(REPLACE ";" " " CUDA_ARCH_FLAGS "${CUDA_ARCH_LIST}") + set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} ") + foreach(ARCH ${CUDA_ARCH_LIST}) + set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_${ARCH},code=sm_${ARCH}") + endforeach() + set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} --std=c++11") + message(STATUS "Auto-selected CUDA NVCC_ARCH flags for CUDA ${CUDA_TOOLKIT_VERSION}: ${CUDA_NVCC_FLAGS}") + endif() -if(CUDA_TOOLKIT_VERSION AND CUDA_VERSION_MAJOR LESS 11) - if(CUDA_ARCH_LIST MATCHES "80|86|89") - message(WARNING " -Detected CUDA ${CUDA_TOOLKIT_VERSION}. -Some requested architectures (${CUDA_ARCH_LIST}) require CUDA >= 11 (Ampere/Ada). -Remove sm_80/86/89 or upgrade CUDA Toolkit if you see NVCC errors. -") + if(CUDA_TOOLKIT_VERSION AND CUDA_VERSION_MAJOR LESS 11) + if(CUDA_ARCH_LIST MATCHES "80|86|89") + message(WARNING " + Detected CUDA ${CUDA_TOOLKIT_VERSION}. + Some requested architectures (${CUDA_ARCH_LIST}) require CUDA >= 11 (Ampere/Ada). + Remove sm_80/86/89 or upgrade CUDA Toolkit if you see NVCC errors. + ") + endif() endif() endif() @@ -176,7 +186,7 @@ endif() target_compile_features(warprnnt PRIVATE cxx_std_17) target_compile_definitions(warprnnt PRIVATE ${COMMON_DEFINITIONS}) target_include_directories(warprnnt PRIVATE include) -if(CMAKE_VERSION VERSION_GREATER_EQUAL "3.18" AND TARGET warprnnt) +if(CMAKE_VERSION VERSION_GREATER_EQUAL "3.18" AND TARGET warprnnt AND WITH_GPU) set_target_properties(warprnnt PROPERTIES CUDA_SEPARABLE_COMPILATION ON CUDA_STANDARD 17 From 80f897970637d5d681de93ce1f6c35d48f425272 Mon Sep 17 00:00:00 2001 From: ljn7 Date: Sat, 19 Jul 2025 18:49:55 +0000 Subject: [PATCH 10/14] Added fastemit argument --- pytorch_binding/warprnnt_pytorch/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pytorch_binding/warprnnt_pytorch/__init__.py b/pytorch_binding/warprnnt_pytorch/__init__.py index a078fb0..8f1d261 100644 --- a/pytorch_binding/warprnnt_pytorch/__init__.py +++ b/pytorch_binding/warprnnt_pytorch/__init__.py @@ -9,7 +9,7 @@ class _RNNT(Function): @staticmethod - def forward(ctx, acts, labels, act_lens, label_lens, blank, reduction): + def forward(ctx, acts, labels, act_lens, label_lens, blank, reduction, fastemit_lambda): """ acts: Tensor of (batch x seqLength x labelLength x outputDim) containing output from network labels: 2 dimensional Tensor containing all the targets of the batch with zero padded From 2e0dc61a432f6e8418e5079f6d5ef98de31c99c8 Mon Sep 17 00:00:00 2001 From: ljn7 Date: Sun, 20 Jul 2025 18:27:34 +0000 Subject: [PATCH 11/14] feat(build): add Windows support for CUDA builds and improve cross-platform checks - Enabled robust detection of CUDA and platform compatibility - Improved build logic to support CUDA builds on Windows - Added fallback mechanisms and clearer error handling - Ensured smoother CPU-only builds without redundant checks - Reduced platform-specific pain points for easier maintenance --- CMakeLists.txt | 139 +++++++++++++++++----------- src/rnnt_entrypoint.cpp | 189 -------------------------------------- src/rnnt_entrypoint.cu | 198 +++++++++++++++++++++++++++++++++++++++- tests/test_cpu.cpp | 4 +- tests/test_time.cpp | 4 +- 5 files changed, 285 insertions(+), 249 deletions(-) delete mode 100644 src/rnnt_entrypoint.cpp mode change 120000 => 100644 src/rnnt_entrypoint.cu diff --git a/CMakeLists.txt b/CMakeLists.txt index fcb7766..a31e5a4 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,35 +1,64 @@ -if(APPLE) - cmake_minimum_required(VERSION 3.4) -else() - cmake_minimum_required(VERSION 3.5) -endif() +cmake_minimum_required(VERSION 3.10) + +option(WITH_GPU "Build with GPU (CUDA) support" ON) + +set(CUDA_FOUND FALSE) +set(CUDA_TOOLKIT_VERSION "0.0") if(CMAKE_VERSION VERSION_GREATER_EQUAL "3.18") find_package(CUDAToolkit QUIET) - set(CUDA_FOUND ${CUDAToolkit_FOUND}) + + if(CMAKE_VERSION VERSION_LESS "3.27") + find_package(CUDA QUIET) + endif() + + if(CUDAToolkit_FOUND OR CUDA_FOUND) + set(CUDA_FOUND TRUE) + if(CUDAToolkit_FOUND) + set(CUDA_TOOLKIT_VERSION "${CUDAToolkit_VERSION}") + elseif(DEFINED CUDA_VERSION AND NOT "${CUDA_VERSION}" STREQUAL "") + set(CUDA_TOOLKIT_VERSION "${CUDA_VERSION}") + endif() + endif() else() find_package(CUDA QUIET) - set(CUDA_FOUND ${CUDA_FOUND}) + if(CUDA_FOUND) + if(DEFINED CUDA_VERSION AND NOT "${CUDA_VERSION}" STREQUAL "") + set(CUDA_TOOLKIT_VERSION "${CUDA_VERSION}") + endif() + endif() +endif() + +if(NOT WITH_GPU) + message(STATUS "WITH_GPU=OFF: forcing CPU-only build, even if CUDA is available") + set(CUDA_FOUND FALSE) endif() -if(CUDA_FOUND) +if(WITH_GPU AND NOT CUDA_FOUND) + message(FATAL_ERROR "WITH_GPU=ON but no compatible CUDA toolkit was found") +endif() + +if(WITH_GPU AND CUDA_FOUND) project(rnnt_release LANGUAGES CXX CUDA) - message(STATUS "CUDA found, enabling GPU support") + set(USE_CUDA TRUE) + message(STATUS "CUDA ${CUDA_TOOLKIT_VERSION} found, GPU build enabled") else() project(rnnt_release LANGUAGES CXX) - message(STATUS "CUDA not found, building CPU-only version") + set(USE_CUDA FALSE) + message(STATUS "CUDA not found or disabled, building CPU-only version") endif() # ==== USER OPTIONS ==== -option(WITH_GPU "Build with CUDA support" ${CUDA_FOUND}) option(WITH_OMP "Build with OpenMP support" ON) option(USE_NAIVE_KERNEL "Use naive alpha-beta kernel" OFF) option(DEBUG_TIME "Output kernel time" OFF) option(DEBUG_KERNEL "Output alpha beta debug" OFF) -if(WITH_GPU AND NOT CUDA_FOUND) - message(WARNING "WITH_GPU requested but CUDA not found. Building CPU-only version.") - set(WITH_GPU OFF) +if(WITH_OMP) + find_package(OpenMP) + if(OpenMP_CXX_FOUND) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}") + endif() endif() set(CMAKE_CXX_STANDARD 17) @@ -61,31 +90,8 @@ if(NOT WITH_OMP) list(APPEND COMMON_DEFINITIONS -DRNNT_DISABLE_OMP) endif() -if(WITH_OMP) - find_package(OpenMP) - if(OpenMP_CXX_FOUND) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}") - endif() -endif() - # ==== CUDA SETUP (only if WITH_GPU is ON) ==== -if(WITH_GPU) - if(CMAKE_VERSION VERSION_GREATER_EQUAL "3.18") - find_package(CUDAToolkit REQUIRED) - set(CUDA_TOOLKIT_VERSION "${CUDAToolkit_VERSION}") - if(NOT CUDA_TOOLKIT_VERSION OR "${CUDA_TOOLKIT_VERSION}" STREQUAL "") - set(CUDA_TOOLKIT_VERSION "0.0") - endif() - else() - find_package(CUDA REQUIRED) - if(DEFINED CUDA_VERSION AND NOT "${CUDA_VERSION}" STREQUAL "") - set(CUDA_TOOLKIT_VERSION "${CUDA_VERSION}") - else() - set(CUDA_TOOLKIT_VERSION "0.0") - endif() - endif() - - # Split CUDA version into major/minor +if(USE_CUDA) function(cuda_version_major VERSION OUTVAR) if(NOT "${VERSION}" STREQUAL "") string(REPLACE "." ";" TMP_LIST ${VERSION}) @@ -95,6 +101,7 @@ if(WITH_GPU) set(${OUTVAR} "0" PARENT_SCOPE) endif() endfunction() + function(cuda_version_minor VERSION OUTVAR) if(NOT "${VERSION}" STREQUAL "") string(REPLACE "." ";" TMP_LIST ${VERSION}) @@ -125,7 +132,6 @@ if(WITH_GPU) set(CUDA_ARCH_LIST "52;60;70;75;80;86;89") endif() - if(DEFINED CMAKE_CUDA_ARCHITECTURES AND NOT "${CMAKE_CUDA_ARCHITECTURES}" STREQUAL "") set(CUDA_ARCH_LIST "${CMAKE_CUDA_ARCHITECTURES}") endif() @@ -145,11 +151,9 @@ if(WITH_GPU) if(CUDA_TOOLKIT_VERSION AND CUDA_VERSION_MAJOR LESS 11) if(CUDA_ARCH_LIST MATCHES "80|86|89") - message(WARNING " - Detected CUDA ${CUDA_TOOLKIT_VERSION}. - Some requested architectures (${CUDA_ARCH_LIST}) require CUDA >= 11 (Ampere/Ada). - Remove sm_80/86/89 or upgrade CUDA Toolkit if you see NVCC errors. - ") + message(WARNING "\nDetected CUDA ${CUDA_TOOLKIT_VERSION}. + Some requested architectures (${CUDA_ARCH_LIST}) require CUDA >= 11 (Ampere/Ada). + Remove sm_80/86/89 or upgrade CUDA Toolkit if you see NVCC errors.\n") endif() endif() endif() @@ -166,27 +170,50 @@ else() set(CMAKE_SKIP_RPATH TRUE) endif() +set(RNNT_ENTRYPOINT_SRC src/rnnt_entrypoint.cu) +if(NOT USE_CUDA) + configure_file( + ${RNNT_ENTRYPOINT_SRC} + ${CMAKE_CURRENT_BINARY_DIR}/rnnt_entrypoint_cpu.cpp + COPYONLY + ) + set(RNNT_ENTRYPOINT_SRC ${CMAKE_CURRENT_BINARY_DIR}/rnnt_entrypoint_cpu.cpp) +endif() + # ==== Main library target ==== -if(WITH_GPU) +if(USE_CUDA) message(STATUS "Building shared library with GPU support") if(CMAKE_VERSION VERSION_GREATER_EQUAL "3.18") - add_library(warprnnt SHARED src/rnnt_entrypoint.cu) + add_library(warprnnt SHARED ${RNNT_ENTRYPOINT_SRC}) target_link_libraries(warprnnt PRIVATE CUDA::cudart CUDA::curand) else() - CUDA_ADD_LIBRARY(warprnnt SHARED src/rnnt_entrypoint.cu) + CUDA_ADD_LIBRARY(warprnnt SHARED ${RNNT_ENTRYPOINT_SRC}) if(NOT Torch_FOUND) TARGET_LINK_LIBRARIES(warprnnt ${CUDA_curand_LIBRARY}) endif() endif() else() message(STATUS "Building shared library with no GPU support") - add_library(warprnnt SHARED src/rnnt_entrypoint.cpp) + add_library(warprnnt SHARED ${RNNT_ENTRYPOINT_SRC}) +endif() + +set_target_properties(warprnnt PROPERTIES + RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR} + LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR} + ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR} +) + +if(WIN32) + set_target_properties(warprnnt PROPERTIES + WINDOWS_EXPORT_ALL_SYMBOLS ON + ) endif() target_compile_features(warprnnt PRIVATE cxx_std_17) target_compile_definitions(warprnnt PRIVATE ${COMMON_DEFINITIONS}) target_include_directories(warprnnt PRIVATE include) -if(CMAKE_VERSION VERSION_GREATER_EQUAL "3.18" AND TARGET warprnnt AND WITH_GPU) + +if(CMAKE_VERSION VERSION_GREATER_EQUAL "3.18" AND TARGET warprnnt AND USE_CUDA) set_target_properties(warprnnt PROPERTIES CUDA_SEPARABLE_COMPILATION ON CUDA_STANDARD 17 @@ -206,7 +233,7 @@ add_executable(test_time tests/test_time.cpp tests/random.cpp) target_link_libraries(test_time PRIVATE warprnnt) target_compile_features(test_time PRIVATE cxx_std_17) -if(WITH_GPU) +if(USE_CUDA) if(CMAKE_VERSION VERSION_GREATER_EQUAL "3.18") add_executable(test_time_gpu tests/test_time.cu tests/random.cpp) target_link_libraries(test_time_gpu PRIVATE warprnnt CUDA::curand) @@ -232,11 +259,11 @@ if(WITH_GPU) $<$:--expt-relaxed-constexpr> ) else() - cuda_add_executable(test_time_gpu tests/test_time.cu tests/random.cpp ) + cuda_add_executable(test_time_gpu tests/test_time.cu tests/random.cpp) TARGET_LINK_LIBRARIES(test_time_gpu warprnnt ${CUDA_curand_LIBRARY}) set_target_properties(test_time_gpu PROPERTIES COMPILE_FLAGS "${CMAKE_CXX_FLAGS} --std=c++11") - cuda_add_executable(test_gpu tests/test_gpu.cu tests/random.cpp ) + cuda_add_executable(test_gpu tests/test_gpu.cu tests/random.cpp) TARGET_LINK_LIBRARIES(test_gpu warprnnt ${CUDA_curand_LIBRARY}) set_target_properties(test_gpu PROPERTIES COMPILE_FLAGS "${CMAKE_CXX_FLAGS} --std=c++11") endif() @@ -244,8 +271,10 @@ endif() # ==== Install targets ==== install(TARGETS warprnnt - RUNTIME DESTINATION bin - LIBRARY DESTINATION lib - ARCHIVE DESTINATION lib) + RUNTIME DESTINATION bin + LIBRARY DESTINATION lib + ARCHIVE DESTINATION lib +) install(FILES include/rnnt.h DESTINATION include) + diff --git a/src/rnnt_entrypoint.cpp b/src/rnnt_entrypoint.cpp deleted file mode 100644 index 9854470..0000000 --- a/src/rnnt_entrypoint.cpp +++ /dev/null @@ -1,189 +0,0 @@ -#include -#include -#include - -#include - -#include "detail/cpu_rnnt.h" -#ifdef __CUDACC__ - #include "detail/gpu_rnnt.h" -#endif - -extern "C" { - -int get_warprnnt_version() { - return 1; -} - -const char* rnntGetStatusString(rnntStatus_t status) { - switch (status) { - case RNNT_STATUS_SUCCESS: - return "no error"; - case RNNT_STATUS_MEMOPS_FAILED: - return "cuda memcpy or memset failed"; - case RNNT_STATUS_INVALID_VALUE: - return "invalid value"; - case RNNT_STATUS_EXECUTION_FAILED: - return "execution failed"; - - case RNNT_STATUS_UNKNOWN_ERROR: - default: - return "unknown error"; - - } - -} - - -rnntStatus_t compute_rnnt_loss(const float* const activations, //BTUV - float* gradients, - const int* const flat_labels, - const int* const label_lengths, - const int* const input_lengths, - int alphabet_size, - int minibatch, - float *costs, - void *workspace, - rnntOptions options) { - - if (activations == nullptr || - flat_labels == nullptr || - label_lengths == nullptr || - input_lengths == nullptr || - costs == nullptr || - workspace == nullptr || - alphabet_size <= 0 || - minibatch <= 0 || - options.maxT <= 0 || - options.maxU <= 0 || - options.fastemit_lambda < 0) - return RNNT_STATUS_INVALID_VALUE; - - if (options.loc == RNNT_CPU) { - CpuRNNT rnnt(minibatch, options.maxT, options.maxU, alphabet_size, workspace, - options.blank_label, options.fastemit_lambda, options.num_threads, options.batch_first); - - if (gradients != NULL) - return rnnt.cost_and_grad(activations, gradients, - costs, - flat_labels, label_lengths, - input_lengths); - else - return rnnt.score_forward(activations, costs, flat_labels, - label_lengths, input_lengths); - } else if (options.loc == RNNT_GPU) { -#ifdef __CUDACC__ - GpuRNNT rnnt(minibatch, options.maxT, options.maxU, alphabet_size, workspace, - options.blank_label, options.fastemit_lambda, options.num_threads, options.stream); - - if (gradients != NULL) - return rnnt.cost_and_grad(activations, gradients, - costs, - flat_labels, label_lengths, - input_lengths); - else - return rnnt.score_forward(activations, costs, flat_labels, - label_lengths, input_lengths); -#else - std::cerr << "GPU execution requested, but not compiled with GPU support" << std::endl; - return RNNT_STATUS_EXECUTION_FAILED; -#endif - } else { - return RNNT_STATUS_INVALID_VALUE; - } -} - - -rnntStatus_t get_workspace_size(int maxT, int maxU, - int minibatch, - bool gpu, - size_t* size_bytes, - size_t dtype_size) -{ - if (minibatch <= 0 || - maxT <= 0 || - maxU <= 0) - return RNNT_STATUS_INVALID_VALUE; - - *size_bytes = 0; - - // per minibatch memory - size_t per_minibatch_bytes = 0; - - // alphas & betas - per_minibatch_bytes += dtype_size * maxT * maxU * 2; - - if (!gpu) { - // blank & label log probability cache - per_minibatch_bytes += dtype_size * maxT * maxU * 2; - } else { - // softmax denominator - per_minibatch_bytes += dtype_size * maxT * maxU; - // forward-backward loglikelihood - per_minibatch_bytes += dtype_size * 2; - } - - *size_bytes = per_minibatch_bytes * minibatch; - - return RNNT_STATUS_SUCCESS; -} - -rnntStatus_t compute_rnnt_loss_fp64(const double* const activations, //BTUV - double* gradients, - const int* const flat_labels, - const int* const label_lengths, - const int* const input_lengths, - int alphabet_size, - int minibatch, - double *costs, - void *workspace, - rnntOptions options) { - - if (activations == nullptr || - flat_labels == nullptr || - label_lengths == nullptr || - input_lengths == nullptr || - costs == nullptr || - workspace == nullptr || - alphabet_size <= 0 || - minibatch <= 0 || - options.maxT <= 0 || - options.maxU <= 0 || - options.fastemit_lambda < 0) - return RNNT_STATUS_INVALID_VALUE; - - if (options.loc == RNNT_CPU) { - CpuRNNT rnnt(minibatch, options.maxT, options.maxU, alphabet_size, workspace, - options.blank_label, options.fastemit_lambda, options.num_threads, options.batch_first); - - if (gradients != NULL) - return rnnt.cost_and_grad(activations, gradients, - costs, - flat_labels, label_lengths, - input_lengths); - else - return rnnt.score_forward(activations, costs, flat_labels, - label_lengths, input_lengths); - } else if (options.loc == RNNT_GPU) { -#ifdef __CUDACC__ - GpuRNNT rnnt(minibatch, options.maxT, options.maxU, alphabet_size, workspace, - options.blank_label, options.fastemit_lambda, options.num_threads, options.stream); - - if (gradients != NULL) - return rnnt.cost_and_grad(activations, gradients, - costs, - flat_labels, label_lengths, - input_lengths); - else - return rnnt.score_forward(activations, costs, flat_labels, - label_lengths, input_lengths); -#else - std::cerr << "GPU execution requested, but not compiled with GPU support" << std::endl; - return RNNT_STATUS_EXECUTION_FAILED; -#endif - } else { - return RNNT_STATUS_INVALID_VALUE; - } -} - -} diff --git a/src/rnnt_entrypoint.cu b/src/rnnt_entrypoint.cu deleted file mode 120000 index c483c3c..0000000 --- a/src/rnnt_entrypoint.cu +++ /dev/null @@ -1 +0,0 @@ -rnnt_entrypoint.cpp \ No newline at end of file diff --git a/src/rnnt_entrypoint.cu b/src/rnnt_entrypoint.cu new file mode 100644 index 0000000..c213f4c --- /dev/null +++ b/src/rnnt_entrypoint.cu @@ -0,0 +1,197 @@ +#include +#include +#include + +#include + +#include "detail/cpu_rnnt.h" +#ifdef __CUDACC__ + #include "detail/gpu_rnnt.h" +#endif + +extern "C" { + +int get_warprnnt_version() { + return 1; +} + +const char* rnntGetStatusString(rnntStatus_t status) { + switch (status) { + case RNNT_STATUS_SUCCESS: + return "no error"; + case RNNT_STATUS_MEMOPS_FAILED: + return "cuda memcpy or memset failed"; + case RNNT_STATUS_INVALID_VALUE: + return "invalid value"; + case RNNT_STATUS_EXECUTION_FAILED: + return "execution failed"; + + case RNNT_STATUS_UNKNOWN_ERROR: + default: + return "unknown error"; + + } + +} + + +rnntStatus_t compute_rnnt_loss(const float* const activations, //BTUV + float* gradients, + const int* const flat_labels, + const int* const label_lengths, + const int* const input_lengths, + int alphabet_size, + int minibatch, + float *costs, + void *workspace, + rnntOptions options) { + + if (activations == nullptr || + flat_labels == nullptr || + label_lengths == nullptr || + input_lengths == nullptr || + costs == nullptr || + workspace == nullptr || + alphabet_size <= 0 || + minibatch <= 0 || + options.maxT <= 0 || + options.maxU <= 0 || + options.fastemit_lambda < 0) + return RNNT_STATUS_INVALID_VALUE; + + if (options.loc == RNNT_CPU) { + CpuRNNT rnnt(minibatch, options.maxT, options.maxU, alphabet_size, workspace, + options.blank_label, options.fastemit_lambda, options.num_threads, options.batch_first); + + if (gradients != NULL) + return rnnt.cost_and_grad(activations, gradients, + costs, + flat_labels, label_lengths, + input_lengths); + else + return rnnt.score_forward(activations, costs, flat_labels, + label_lengths, input_lengths); + } else if (options.loc == RNNT_GPU) { +#ifdef __CUDACC__ + GpuRNNT rnnt(minibatch, options.maxT, options.maxU, alphabet_size, workspace, + options.blank_label, options.fastemit_lambda, options.num_threads, options.stream); + + if (gradients != NULL) + return rnnt.cost_and_grad(activations, gradients, + costs, + flat_labels, label_lengths, + input_lengths); + else + return rnnt.score_forward(activations, costs, flat_labels, + label_lengths, input_lengths); +#else + std::cerr << "GPU execution requested, but not compiled with GPU support" << std::endl; + return RNNT_STATUS_EXECUTION_FAILED; +#endif + } else { + return RNNT_STATUS_INVALID_VALUE; + } +} + + +rnntStatus_t get_workspace_size(int maxT, int maxU, + int minibatch, + bool gpu, + size_t* size_bytes, + size_t dtype_size) +{ + if (minibatch <= 0 || + maxT <= 0 || + maxU <= 0) + return RNNT_STATUS_INVALID_VALUE; + + *size_bytes = 0; + + // per minibatch memory + size_t per_minibatch_bytes = 0; + + // alphas & betas + per_minibatch_bytes += dtype_size * maxT * maxU * 2; + + if (!gpu) { + // blank & label log probability cache + per_minibatch_bytes += dtype_size * maxT * maxU * 2; + } else { + // softmax denominator + per_minibatch_bytes += dtype_size * maxT * maxU; + // forward-backward loglikelihood + per_minibatch_bytes += dtype_size * 2; + } + + *size_bytes = per_minibatch_bytes * minibatch; + + return RNNT_STATUS_SUCCESS; +} + +rnntStatus_t compute_rnnt_loss_fp64(const double* const activations, //BTUV + double* gradients, + const int* const flat_labels, + const int* const label_lengths, + const int* const input_lengths, + int alphabet_size, + int minibatch, + double *costs, + void *workspace, + rnntOptions options) { + + if (activations == nullptr || + flat_labels == nullptr || + label_lengths == nullptr || + input_lengths == nullptr || + costs == nullptr || + workspace == nullptr || + alphabet_size <= 0 || + minibatch <= 0 || + options.maxT <= 0 || + options.maxU <= 0 || + options.fastemit_lambda < 0) + return RNNT_STATUS_INVALID_VALUE; + + if (options.loc == RNNT_CPU) { + CpuRNNT rnnt(minibatch, options.maxT, options.maxU, alphabet_size, workspace, + options.blank_label, options.fastemit_lambda, options.num_threads, options.batch_first); + + if (gradients != NULL) + return rnnt.cost_and_grad(activations, gradients, + costs, + flat_labels, label_lengths, + input_lengths); + else + return rnnt.score_forward(activations, costs, flat_labels, + label_lengths, input_lengths); + } else if (options.loc == RNNT_GPU) { +#ifdef __CUDACC__ + GpuRNNT rnnt(minibatch, options.maxT, options.maxU, alphabet_size, workspace, + options.blank_label, options.fastemit_lambda, options.num_threads, options.stream); + + if (gradients != NULL) + return rnnt.cost_and_grad(activations, gradients, + costs, + flat_labels, label_lengths, + input_lengths); + else + return rnnt.score_forward(activations, costs, flat_labels, + label_lengths, input_lengths); +#else + std::cerr << "GPU execution requested, but not compiled with GPU support" << std::endl; + return RNNT_STATUS_EXECUTION_FAILED; +#endif + } else { + return RNNT_STATUS_INVALID_VALUE; + } +} + +const char* rnnt_backend_mode() { +#ifdef __CUDACC__ + return "GPU"; +#else + return "CPU"; +#endif +} + +} diff --git a/tests/test_cpu.cpp b/tests/test_cpu.cpp index 554a18b..fee4eb5 100644 --- a/tests/test_cpu.cpp +++ b/tests/test_cpu.cpp @@ -239,7 +239,7 @@ bool inf_test() { return status; } -float numeric_grad(std::vector& acts, std::vector& flat_labels, std::vector& label_lengths, +void numeric_grad(std::vector& acts, std::vector& flat_labels, std::vector& label_lengths, std::vector sizes, int alphabet_size, int minibatch, void* rnnt_cpu_workspace, rnntOptions& options, std::vector& num_grad) { @@ -403,4 +403,4 @@ int main(void) { std::cout << "Some or all tests fail" << std::endl; return 1; } -} \ No newline at end of file +} diff --git a/tests/test_time.cpp b/tests/test_time.cpp index 3083b0e..ce77ba6 100644 --- a/tests/test_time.cpp +++ b/tests/test_time.cpp @@ -12,7 +12,7 @@ #include "test.h" -bool run_test(int B, int T, int L, int A, int num_threads) { +void run_test(int B, int T, int L, int A, int num_threads) { std::mt19937 gen(2); auto start = std::chrono::high_resolution_clock::now(); @@ -119,4 +119,4 @@ int main(int argc, char** argv) { } run_test(B, T, L, A, num_threads); -} \ No newline at end of file +} From a6f8c2210f8d41dd323404ff3fac35bab83e380d Mon Sep 17 00:00:00 2001 From: ljn7 Date: Sat, 26 Jul 2025 14:15:05 +0000 Subject: [PATCH 12/14] Improve GPU build configuration and clarify CUDA usage - Added robust detection of CUDA toolkit using CUDAToolkit and CUDA packages - Introduced WITH_GPU option to allow user-controlled GPU builds - Automatically fallback to CPU-only build if CUDA is unavailable or WITH_GPU=OFF - Improved status messages to reflect user intent and system CUDA availability - Added support for CUDA architecture detection and override via CMAKE_CUDA_ARCHITECTURES - Improved build summary output for clarity --- CMakeLists.txt | 231 +++++++++++++++++++++++++++++-------------------- 1 file changed, 137 insertions(+), 94 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index a31e5a4..9c8839c 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,17 +1,17 @@ cmake_minimum_required(VERSION 3.10) +project(rnnt_release LANGUAGES CXX) -option(WITH_GPU "Build with GPU (CUDA) support" ON) - +# ==== CUDA DETECTION ==== set(CUDA_FOUND FALSE) set(CUDA_TOOLKIT_VERSION "0.0") if(CMAKE_VERSION VERSION_GREATER_EQUAL "3.18") find_package(CUDAToolkit QUIET) - + if(CMAKE_VERSION VERSION_LESS "3.27") find_package(CUDA QUIET) endif() - + if(CUDAToolkit_FOUND OR CUDA_FOUND) set(CUDA_FOUND TRUE) if(CUDAToolkit_FOUND) @@ -22,75 +22,105 @@ if(CMAKE_VERSION VERSION_GREATER_EQUAL "3.18") endif() else() find_package(CUDA QUIET) - if(CUDA_FOUND) - if(DEFINED CUDA_VERSION AND NOT "${CUDA_VERSION}" STREQUAL "") - set(CUDA_TOOLKIT_VERSION "${CUDA_VERSION}") - endif() + if(CUDA_FOUND AND DEFINED CUDA_VERSION AND NOT "${CUDA_VERSION}" STREQUAL "") + set(CUDA_TOOLKIT_VERSION "${CUDA_VERSION}") endif() endif() -if(NOT WITH_GPU) - message(STATUS "WITH_GPU=OFF: forcing CPU-only build, even if CUDA is available") - set(CUDA_FOUND FALSE) -endif() - -if(WITH_GPU AND NOT CUDA_FOUND) - message(FATAL_ERROR "WITH_GPU=ON but no compatible CUDA toolkit was found") -endif() - -if(WITH_GPU AND CUDA_FOUND) - project(rnnt_release LANGUAGES CXX CUDA) - set(USE_CUDA TRUE) - message(STATUS "CUDA ${CUDA_TOOLKIT_VERSION} found, GPU build enabled") -else() - project(rnnt_release LANGUAGES CXX) - set(USE_CUDA FALSE) - message(STATUS "CUDA not found or disabled, building CPU-only version") -endif() - # ==== USER OPTIONS ==== +option(WITH_GPU "Build with GPU (CUDA) support" ${CUDA_FOUND}) option(WITH_OMP "Build with OpenMP support" ON) option(USE_NAIVE_KERNEL "Use naive alpha-beta kernel" OFF) option(DEBUG_TIME "Output kernel time" OFF) option(DEBUG_KERNEL "Output alpha beta debug" OFF) +# ==== GPU BUILD CONFIGURATION ==== +if(WITH_GPU) + if(NOT CUDA_FOUND) + message(WARNING "WITH_GPU=ON was requested, but no compatible CUDA toolkit was found. Falling back to CPU-only build.") + set(WITH_GPU OFF CACHE BOOL "Build with GPU (CUDA) support" FORCE) + set(USE_CUDA FALSE) + else() + enable_language(CUDA) + set(USE_CUDA TRUE) + message(STATUS "Building with CUDA ${CUDA_TOOLKIT_VERSION} support") + endif() +else() + set(USE_CUDA FALSE) + if(CUDA_FOUND) + message(STATUS "CUDA ${CUDA_TOOLKIT_VERSION} detected but GPU support disabled (WITH_GPU=OFF)") + else() + message(STATUS "Building CPU-only version (no CUDA toolkit found)") + endif() +endif() + +# ==== OPENMP CONFIGURATION ==== if(WITH_OMP) find_package(OpenMP) if(OpenMP_CXX_FOUND) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}") + message(STATUS "OpenMP support enabled") + else() + message(WARNING "OpenMP requested but not found") + set(WITH_OMP OFF) endif() endif() +# ==== COMPILER SETTINGS ==== set(CMAKE_CXX_STANDARD 17) set(CMAKE_CXX_STANDARD_REQUIRED ON) -set(CMAKE_POSITION_INDEPENDENT_CODE ON) set(CMAKE_CXX_EXTENSIONS OFF) +set(CMAKE_POSITION_INDEPENDENT_CODE ON) -if(NOT APPLE) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O2") +if(NOT CMAKE_BUILD_TYPE) + set(CMAKE_BUILD_TYPE "Release" CACHE STRING "Build type" FORCE) +endif() + +if(CMAKE_BUILD_TYPE STREQUAL "Debug") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O0 -g") else() - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++17 -O2") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O2") +endif() + +if(APPLE) add_definitions(-DAPPLE) + execute_process(COMMAND uname -v OUTPUT_VARIABLE DARWIN_VERSION) + string(REGEX MATCH "[0-9]+" DARWIN_VERSION "${DARWIN_VERSION}") + message(STATUS "Darwin version: ${DARWIN_VERSION}") + if(${DARWIN_VERSION} LESS 15) + set(CMAKE_SKIP_RPATH TRUE) + endif() +else() + set(CMAKE_SKIP_RPATH TRUE) endif() include_directories(include) -# ==== PREPROCESSOR FLAGS ==== +# ==== PREPROCESSOR DEFINITIONS ==== set(COMMON_DEFINITIONS "") + +if(USE_CUDA) + list(APPEND COMMON_DEFINITIONS -DWITH_CUDA=1) +else() + list(APPEND COMMON_DEFINITIONS -DWITH_CUDA=0) +endif() + if(USE_NAIVE_KERNEL) list(APPEND COMMON_DEFINITIONS -DUSE_NAIVE_KERNEL) endif() + if(DEBUG_TIME) list(APPEND COMMON_DEFINITIONS -DDEBUG_TIME) endif() + if(DEBUG_KERNEL) list(APPEND COMMON_DEFINITIONS -DDEBUG_KERNEL) endif() + if(NOT WITH_OMP) list(APPEND COMMON_DEFINITIONS -DRNNT_DISABLE_OMP) endif() -# ==== CUDA SETUP (only if WITH_GPU is ON) ==== +# ==== CUDA ARCHITECTURE SETUP ==== if(USE_CUDA) function(cuda_version_major VERSION OUTVAR) if(NOT "${VERSION}" STREQUAL "") @@ -123,7 +153,7 @@ if(USE_CUDA) set(CUDA_ARCH_LIST "") if(CUDA_TOOLKIT_VERSION STREQUAL "" OR CUDA_VERSION_MAJOR EQUAL 0) set(CUDA_ARCH_LIST "52;60;70;75") - message(WARNING "Could not detect CUDA version. Defaulting to minimal arch set (52;60;70;75).") + message(WARNING "Could not detect CUDA version. Using minimal architecture set: ${CUDA_ARCH_LIST}") elseif(CUDA_VERSION_MAJOR LESS 11) set(CUDA_ARCH_LIST "52;60;70;75") elseif(CUDA_VERSION_MAJOR EQUAL 11) @@ -134,42 +164,33 @@ if(USE_CUDA) if(DEFINED CMAKE_CUDA_ARCHITECTURES AND NOT "${CMAKE_CUDA_ARCHITECTURES}" STREQUAL "") set(CUDA_ARCH_LIST "${CMAKE_CUDA_ARCHITECTURES}") + message(STATUS "Using user-specified CUDA architectures: ${CUDA_ARCH_LIST}") + else() + message(STATUS "Auto-selected CUDA architectures for CUDA ${CUDA_TOOLKIT_VERSION}: ${CUDA_ARCH_LIST}") + message(STATUS "To override, set -DCMAKE_CUDA_ARCHITECTURES=xx;yy;zz") endif() if(CMAKE_VERSION VERSION_GREATER_EQUAL "3.18") set(CMAKE_CUDA_ARCHITECTURES "${CUDA_ARCH_LIST}" CACHE STRING "Target GPU architectures") - message(STATUS "Auto-selected CUDA archs for CUDA ${CUDA_TOOLKIT_VERSION}: ${CMAKE_CUDA_ARCHITECTURES}") else() - string(REPLACE ";" " " CUDA_ARCH_FLAGS "${CUDA_ARCH_LIST}") - set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} ") + set(CUDA_NVCC_FLAGS "") foreach(ARCH ${CUDA_ARCH_LIST}) set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_${ARCH},code=sm_${ARCH}") endforeach() - set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} --std=c++11") - message(STATUS "Auto-selected CUDA NVCC_ARCH flags for CUDA ${CUDA_TOOLKIT_VERSION}: ${CUDA_NVCC_FLAGS}") + message(STATUS "CUDA NVCC flags: ${CUDA_NVCC_FLAGS}") endif() - if(CUDA_TOOLKIT_VERSION AND CUDA_VERSION_MAJOR LESS 11) - if(CUDA_ARCH_LIST MATCHES "80|86|89") - message(WARNING "\nDetected CUDA ${CUDA_TOOLKIT_VERSION}. - Some requested architectures (${CUDA_ARCH_LIST}) require CUDA >= 11 (Ampere/Ada). - Remove sm_80/86/89 or upgrade CUDA Toolkit if you see NVCC errors.\n") + if(CUDA_VERSION_MAJOR GREATER 0 AND CUDA_VERSION_MAJOR LESS 11) + string(REGEX MATCH "8[0689]" AMPERE_ARCH "${CUDA_ARCH_LIST}") + if(AMPERE_ARCH) + message(WARNING + "Detected CUDA ${CUDA_TOOLKIT_VERSION} with Ampere/Ada architectures (${AMPERE_ARCH}) " + "which require CUDA >= 11. Consider upgrading CUDA or removing sm_80/86/89 architectures.") endif() endif() endif() -# ==== Apple/Mac rpath ==== -if(APPLE) - execute_process(COMMAND uname -v OUTPUT_VARIABLE DARWIN_VERSION) - string(REGEX MATCH "[0-9]+" DARWIN_VERSION "${DARWIN_VERSION}") - message(STATUS "DARWIN_VERSION=${DARWIN_VERSION}") - if(${DARWIN_VERSION} LESS 15) - set(CMAKE_SKIP_RPATH TRUE) - endif() -else() - set(CMAKE_SKIP_RPATH TRUE) -endif() - +# ==== SOURCE FILE PREPARATION ==== set(RNNT_ENTRYPOINT_SRC src/rnnt_entrypoint.cu) if(NOT USE_CUDA) configure_file( @@ -180,51 +201,53 @@ if(NOT USE_CUDA) set(RNNT_ENTRYPOINT_SRC ${CMAKE_CURRENT_BINARY_DIR}/rnnt_entrypoint_cpu.cpp) endif() -# ==== Main library target ==== -if(USE_CUDA) - message(STATUS "Building shared library with GPU support") - if(CMAKE_VERSION VERSION_GREATER_EQUAL "3.18") - add_library(warprnnt SHARED ${RNNT_ENTRYPOINT_SRC}) - target_link_libraries(warprnnt PRIVATE CUDA::cudart CUDA::curand) - else() - CUDA_ADD_LIBRARY(warprnnt SHARED ${RNNT_ENTRYPOINT_SRC}) - if(NOT Torch_FOUND) - TARGET_LINK_LIBRARIES(warprnnt ${CUDA_curand_LIBRARY}) - endif() - endif() -else() - message(STATUS "Building shared library with no GPU support") - add_library(warprnnt SHARED ${RNNT_ENTRYPOINT_SRC}) -endif() +# ==== MAIN LIBRARY TARGET ==== +add_library(warprnnt SHARED ${RNNT_ENTRYPOINT_SRC}) +# Configure library properties set_target_properties(warprnnt PROPERTIES RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR} LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR} ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR} + CXX_STANDARD 17 + CXX_STANDARD_REQUIRED ON ) if(WIN32) - set_target_properties(warprnnt PROPERTIES - WINDOWS_EXPORT_ALL_SYMBOLS ON - ) + set_target_properties(warprnnt PROPERTIES WINDOWS_EXPORT_ALL_SYMBOLS ON) endif() -target_compile_features(warprnnt PRIVATE cxx_std_17) target_compile_definitions(warprnnt PRIVATE ${COMMON_DEFINITIONS}) target_include_directories(warprnnt PRIVATE include) -if(CMAKE_VERSION VERSION_GREATER_EQUAL "3.18" AND TARGET warprnnt AND USE_CUDA) - set_target_properties(warprnnt PROPERTIES - CUDA_SEPARABLE_COMPILATION ON - CUDA_STANDARD 17 - CUDA_STANDARD_REQUIRED ON - ) - target_compile_options(warprnnt PRIVATE - $<$:--expt-relaxed-constexpr> - ) +if(USE_CUDA) + message(STATUS "Configuring library with GPU support") + + if(CMAKE_VERSION VERSION_GREATER_EQUAL "3.18") + target_link_libraries(warprnnt PRIVATE CUDA::cudart CUDA::curand) + set_target_properties(warprnnt PROPERTIES + CUDA_SEPARABLE_COMPILATION ON + CUDA_STANDARD 17 + CUDA_STANDARD_REQUIRED ON + ) + target_compile_options(warprnnt PRIVATE + $<$:--expt-relaxed-constexpr> + ) + else() + cuda_add_library(warprnnt SHARED ${RNNT_ENTRYPOINT_SRC}) + if(NOT Torch_FOUND) + target_link_libraries(warprnnt ${CUDA_curand_LIBRARY}) + endif() + endif() +else() + message(STATUS "Configuring library with CPU-only support") +endif() + +if(WITH_OMP AND OpenMP_CXX_FOUND) + target_link_libraries(warprnnt PRIVATE OpenMP::OpenMP_CXX) endif() -# ==== Test executables ==== +# ==== TEST EXECUTABLES ==== add_executable(test_cpu tests/test_cpu.cpp tests/random.cpp) target_link_libraries(test_cpu PRIVATE warprnnt) target_compile_features(test_cpu PRIVATE cxx_std_17) @@ -237,7 +260,6 @@ if(USE_CUDA) if(CMAKE_VERSION VERSION_GREATER_EQUAL "3.18") add_executable(test_time_gpu tests/test_time.cu tests/random.cpp) target_link_libraries(test_time_gpu PRIVATE warprnnt CUDA::curand) - target_compile_features(test_time_gpu PRIVATE cxx_std_17) set_target_properties(test_time_gpu PROPERTIES CUDA_SEPARABLE_COMPILATION ON CUDA_STANDARD 17 @@ -249,7 +271,6 @@ if(USE_CUDA) add_executable(test_gpu tests/test_gpu.cu tests/random.cpp) target_link_libraries(test_gpu PRIVATE warprnnt CUDA::curand) - target_compile_features(test_gpu PRIVATE cxx_std_17) set_target_properties(test_gpu PROPERTIES CUDA_SEPARABLE_COMPILATION ON CUDA_STANDARD 17 @@ -260,16 +281,20 @@ if(USE_CUDA) ) else() cuda_add_executable(test_time_gpu tests/test_time.cu tests/random.cpp) - TARGET_LINK_LIBRARIES(test_time_gpu warprnnt ${CUDA_curand_LIBRARY}) - set_target_properties(test_time_gpu PROPERTIES COMPILE_FLAGS "${CMAKE_CXX_FLAGS} --std=c++11") + target_link_libraries(test_time_gpu warprnnt ${CUDA_curand_LIBRARY}) + set_target_properties(test_time_gpu PROPERTIES + COMPILE_FLAGS "${CMAKE_CXX_FLAGS} --std=c++11" + ) cuda_add_executable(test_gpu tests/test_gpu.cu tests/random.cpp) - TARGET_LINK_LIBRARIES(test_gpu warprnnt ${CUDA_curand_LIBRARY}) - set_target_properties(test_gpu PROPERTIES COMPILE_FLAGS "${CMAKE_CXX_FLAGS} --std=c++11") + target_link_libraries(test_gpu warprnnt ${CUDA_curand_LIBRARY}) + set_target_properties(test_gpu PROPERTIES + COMPILE_FLAGS "${CMAKE_CXX_FLAGS} --std=c++11" + ) endif() endif() -# ==== Install targets ==== +# ==== INSTALL TARGETS ==== install(TARGETS warprnnt RUNTIME DESTINATION bin LIBRARY DESTINATION lib @@ -278,3 +303,21 @@ install(TARGETS warprnnt install(FILES include/rnnt.h DESTINATION include) +# ==== BUILD SUMMARY ==== +message(STATUS "") +message(STATUS "=== Build Configuration Summary ===") +message(STATUS "Build Type: ${CMAKE_BUILD_TYPE}") +message(STATUS "CUDA Available: ${CUDA_FOUND}") +if(CUDA_FOUND) + message(STATUS "CUDA Version: ${CUDA_TOOLKIT_VERSION}") +endif() +message(STATUS "GPU Support: ${USE_CUDA}") +if(USE_CUDA) + message(STATUS "CUDA Architectures: ${CMAKE_CUDA_ARCHITECTURES}") +endif() +message(STATUS "OpenMP Support: ${WITH_OMP}") +message(STATUS "Naive Kernel: ${USE_NAIVE_KERNEL}") +message(STATUS "Debug Time: ${DEBUG_TIME}") +message(STATUS "Debug Kernel: ${DEBUG_KERNEL}") +message(STATUS "=====================================") +message(STATUS "") \ No newline at end of file From cd86c036333a31813257cb697a50e5c1159012bc Mon Sep 17 00:00:00 2001 From: John Nirmal Date: Sat, 26 Jul 2025 21:09:09 +0530 Subject: [PATCH 13/14] ci: add CPU-only GitHub Actions workflow for build and test - Uses cmake and make to build and run test_cpu - Targets only CPU environments for faster and simpler CI --- .github/workflows/ci.yml | 42 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 42 insertions(+) create mode 100644 .github/workflows/ci.yml diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml new file mode 100644 index 0000000..65f01ae --- /dev/null +++ b/.github/workflows/ci.yml @@ -0,0 +1,42 @@ +name: CI (CPU-only) + +on: + push: + branches: [main, master] + pull_request: + branches: [main, master] + +jobs: + build-and-test: + runs-on: ubuntu-latest + + steps: + - name: Checkout repository + uses: actions/checkout@v4 + + - name: Install dependencies + run: | + sudo apt-get update + sudo apt-get install -y cmake build-essential + + - name: Configure CMake (CPU-only) + run: | + mkdir -p build + cd build + cmake .. + + - name: Build project + run: | + cd build + make -j$(nproc) + + - name: Install and update linker cache + run: | + cd build + sudo make install + sudo ldconfig + + - name: Run CPU tests + run: | + cd build + ./test_cpu From 48434133ea3e0c2d50fb01b27408d3df5f93715d Mon Sep 17 00:00:00 2001 From: ljn7 Date: Thu, 7 Aug 2025 17:18:05 +0530 Subject: [PATCH 14/14] Updated CMakeLists - Updated for compatability and performance similar to b-flo's implementation - Removed tensorflow and mxnet bindings (might add as git-modules) --- CMakeLists.txt | 232 ++++++------------ mxnet_binding | 1 - tensorflow_binding/README.md | 97 -------- tensorflow_binding/setup.py | 149 ----------- tensorflow_binding/src/warprnnt_op.cc | 191 -------------- tensorflow_binding/tests/test_basic.py | 56 ----- tensorflow_binding/tests/test_warprnnt_op.py | 97 -------- .../warprnnt_tensorflow/__init__.py | 48 ---- 8 files changed, 79 insertions(+), 792 deletions(-) mode change 100755 => 100644 CMakeLists.txt delete mode 160000 mxnet_binding delete mode 100644 tensorflow_binding/README.md delete mode 100644 tensorflow_binding/setup.py delete mode 100644 tensorflow_binding/src/warprnnt_op.cc delete mode 100644 tensorflow_binding/tests/test_basic.py delete mode 100644 tensorflow_binding/tests/test_warprnnt_op.py delete mode 100644 tensorflow_binding/warprnnt_tensorflow/__init__.py diff --git a/CMakeLists.txt b/CMakeLists.txt old mode 100755 new mode 100644 index 9c8839c..f422901 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -5,13 +5,11 @@ project(rnnt_release LANGUAGES CXX) set(CUDA_FOUND FALSE) set(CUDA_TOOLKIT_VERSION "0.0") +find_package(CUDA QUIET) + if(CMAKE_VERSION VERSION_GREATER_EQUAL "3.18") find_package(CUDAToolkit QUIET) - if(CMAKE_VERSION VERSION_LESS "3.27") - find_package(CUDA QUIET) - endif() - if(CUDAToolkit_FOUND OR CUDA_FOUND) set(CUDA_FOUND TRUE) if(CUDAToolkit_FOUND) @@ -21,7 +19,6 @@ if(CMAKE_VERSION VERSION_GREATER_EQUAL "3.18") endif() endif() else() - find_package(CUDA QUIET) if(CUDA_FOUND AND DEFINED CUDA_VERSION AND NOT "${CUDA_VERSION}" STREQUAL "") set(CUDA_TOOLKIT_VERSION "${CUDA_VERSION}") endif() @@ -41,7 +38,6 @@ if(WITH_GPU) set(WITH_GPU OFF CACHE BOOL "Build with GPU (CUDA) support" FORCE) set(USE_CUDA FALSE) else() - enable_language(CUDA) set(USE_CUDA TRUE) message(STATUS "Building with CUDA ${CUDA_TOOLKIT_VERSION} support") endif() @@ -69,7 +65,6 @@ endif() set(CMAKE_CXX_STANDARD 17) set(CMAKE_CXX_STANDARD_REQUIRED ON) set(CMAKE_CXX_EXTENSIONS OFF) -set(CMAKE_POSITION_INDEPENDENT_CODE ON) if(NOT CMAKE_BUILD_TYPE) set(CMAKE_BUILD_TYPE "Release" CACHE STRING "Build type" FORCE) @@ -81,6 +76,10 @@ else() set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O2") endif() +if(WITH_OMP) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fopenmp") +endif() + if(APPLE) add_definitions(-DAPPLE) execute_process(COMMAND uname -v OUTPUT_VARIABLE DARWIN_VERSION) @@ -96,98 +95,64 @@ endif() include_directories(include) # ==== PREPROCESSOR DEFINITIONS ==== -set(COMMON_DEFINITIONS "") - -if(USE_CUDA) - list(APPEND COMMON_DEFINITIONS -DWITH_CUDA=1) -else() - list(APPEND COMMON_DEFINITIONS -DWITH_CUDA=0) -endif() - if(USE_NAIVE_KERNEL) - list(APPEND COMMON_DEFINITIONS -DUSE_NAIVE_KERNEL) + add_definitions(-DUSE_NAIVE_KERNEL) endif() if(DEBUG_TIME) - list(APPEND COMMON_DEFINITIONS -DDEBUG_TIME) + add_definitions(-DDEBUG_TIME) endif() if(DEBUG_KERNEL) - list(APPEND COMMON_DEFINITIONS -DDEBUG_KERNEL) + add_definitions(-DDEBUG_KERNEL) endif() if(NOT WITH_OMP) - list(APPEND COMMON_DEFINITIONS -DRNNT_DISABLE_OMP) + add_definitions(-DRNNT_DISABLE_OMP) endif() -# ==== CUDA ARCHITECTURE SETUP ==== +# ==== CUDA CONFIGURATION ==== if(USE_CUDA) - function(cuda_version_major VERSION OUTVAR) - if(NOT "${VERSION}" STREQUAL "") - string(REPLACE "." ";" TMP_LIST ${VERSION}) - list(GET TMP_LIST 0 MAJOR) - set(${OUTVAR} "${MAJOR}" PARENT_SCOPE) - else() - set(${OUTVAR} "0" PARENT_SCOPE) - endif() - endfunction() - - function(cuda_version_minor VERSION OUTVAR) - if(NOT "${VERSION}" STREQUAL "") - string(REPLACE "." ";" TMP_LIST ${VERSION}) - list(LENGTH TMP_LIST TMP_LEN) - if(TMP_LEN GREATER 1) - list(GET TMP_LIST 1 MINOR) - else() - set(MINOR "0") - endif() - set(${OUTVAR} "${MINOR}" PARENT_SCOPE) - else() - set(${OUTVAR} "0" PARENT_SCOPE) - endif() - endfunction() - - cuda_version_major("${CUDA_TOOLKIT_VERSION}" CUDA_VERSION_MAJOR) - cuda_version_minor("${CUDA_TOOLKIT_VERSION}" CUDA_VERSION_MINOR) - - set(CUDA_ARCH_LIST "") - if(CUDA_TOOLKIT_VERSION STREQUAL "" OR CUDA_VERSION_MAJOR EQUAL 0) - set(CUDA_ARCH_LIST "52;60;70;75") - message(WARNING "Could not detect CUDA version. Using minimal architecture set: ${CUDA_ARCH_LIST}") - elseif(CUDA_VERSION_MAJOR LESS 11) - set(CUDA_ARCH_LIST "52;60;70;75") - elseif(CUDA_VERSION_MAJOR EQUAL 11) - set(CUDA_ARCH_LIST "52;60;70;75;80;86") - elseif(CUDA_VERSION_MAJOR GREATER_EQUAL 12) - set(CUDA_ARCH_LIST "52;60;70;75;80;86;89") + if(WITH_OMP) + set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -Xcompiler -fopenmp") endif() - if(DEFINED CMAKE_CUDA_ARCHITECTURES AND NOT "${CMAKE_CUDA_ARCHITECTURES}" STREQUAL "") - set(CUDA_ARCH_LIST "${CMAKE_CUDA_ARCHITECTURES}") - message(STATUS "Using user-specified CUDA architectures: ${CUDA_ARCH_LIST}") - else() - message(STATUS "Auto-selected CUDA architectures for CUDA ${CUDA_TOOLKIT_VERSION}: ${CUDA_ARCH_LIST}") - message(STATUS "To override, set -DCMAKE_CUDA_ARCHITECTURES=xx;yy;zz") + if(NOT (CUDA_VERSION GREATER 10.2)) + set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_30,code=sm_30 -O2") + set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_35,code=sm_35") + set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_50,code=sm_50") endif() - if(CMAKE_VERSION VERSION_GREATER_EQUAL "3.18") - set(CMAKE_CUDA_ARCHITECTURES "${CUDA_ARCH_LIST}" CACHE STRING "Target GPU architectures") - else() - set(CUDA_NVCC_FLAGS "") - foreach(ARCH ${CUDA_ARCH_LIST}) - set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_${ARCH},code=sm_${ARCH}") - endforeach() - message(STATUS "CUDA NVCC flags: ${CUDA_NVCC_FLAGS}") + set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_52,code=sm_52") + + if(CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 5) + set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -D_MWAITXINTRIN_H_INCLUDED -D_FORCE_INLINES") endif() - if(CUDA_VERSION_MAJOR GREATER 0 AND CUDA_VERSION_MAJOR LESS 11) - string(REGEX MATCH "8[0689]" AMPERE_ARCH "${CUDA_ARCH_LIST}") - if(AMPERE_ARCH) - message(WARNING - "Detected CUDA ${CUDA_TOOLKIT_VERSION} with Ampere/Ada architectures (${AMPERE_ARCH}) " - "which require CUDA >= 11. Consider upgrading CUDA or removing sm_80/86/89 architectures.") - endif() + if(CUDA_VERSION GREATER 7.6) + set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_60,code=sm_60") + set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_61,code=sm_61") + set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_62,code=sm_62") + endif() + + if(CUDA_VERSION GREATER 8.9) + set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_70,code=sm_70") + endif() + + if(CUDA_VERSION GREATER 9.9) + set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_75,code=sm_75") + endif() + + if(CUDA_VERSION GREATER 11.0) + set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_80,code=sm_80") + set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_86,code=sm_86") + endif() + + if(NOT APPLE) + set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} --std=c++17") endif() + + message(STATUS "CUDA NVCC flags: ${CUDA_NVCC_FLAGS}") endif() # ==== SOURCE FILE PREPARATION ==== @@ -202,96 +167,57 @@ if(NOT USE_CUDA) endif() # ==== MAIN LIBRARY TARGET ==== -add_library(warprnnt SHARED ${RNNT_ENTRYPOINT_SRC}) +if(USE_CUDA) + message(STATUS "Building shared library with GPU support") + + cuda_add_library(warprnnt SHARED ${RNNT_ENTRYPOINT_SRC}) + + if(NOT Torch_FOUND) + target_link_libraries(warprnnt ${CUDA_curand_LIBRARY}) + endif() + + if(WITH_OMP AND OpenMP_CXX_FOUND) + target_link_libraries(warprnnt ${OpenMP_CXX_LIBRARIES}) + endif() +else() + message(STATUS "Building shared library with CPU-only support") + add_library(warprnnt SHARED ${RNNT_ENTRYPOINT_SRC}) + + if(WITH_OMP AND OpenMP_CXX_FOUND) + target_link_libraries(warprnnt PRIVATE OpenMP::OpenMP_CXX) + endif() +endif() -# Configure library properties set_target_properties(warprnnt PROPERTIES RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR} LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR} ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR} - CXX_STANDARD 17 - CXX_STANDARD_REQUIRED ON ) if(WIN32) set_target_properties(warprnnt PROPERTIES WINDOWS_EXPORT_ALL_SYMBOLS ON) endif() -target_compile_definitions(warprnnt PRIVATE ${COMMON_DEFINITIONS}) -target_include_directories(warprnnt PRIVATE include) - -if(USE_CUDA) - message(STATUS "Configuring library with GPU support") - - if(CMAKE_VERSION VERSION_GREATER_EQUAL "3.18") - target_link_libraries(warprnnt PRIVATE CUDA::cudart CUDA::curand) - set_target_properties(warprnnt PROPERTIES - CUDA_SEPARABLE_COMPILATION ON - CUDA_STANDARD 17 - CUDA_STANDARD_REQUIRED ON - ) - target_compile_options(warprnnt PRIVATE - $<$:--expt-relaxed-constexpr> - ) - else() - cuda_add_library(warprnnt SHARED ${RNNT_ENTRYPOINT_SRC}) - if(NOT Torch_FOUND) - target_link_libraries(warprnnt ${CUDA_curand_LIBRARY}) - endif() - endif() -else() - message(STATUS "Configuring library with CPU-only support") -endif() - -if(WITH_OMP AND OpenMP_CXX_FOUND) - target_link_libraries(warprnnt PRIVATE OpenMP::OpenMP_CXX) -endif() +# Use include_directories (global) since we're mixing legacy/modern approaches +# target_include_directories(warprnnt PRIVATE include) # Don't use this with legacy CUDA # ==== TEST EXECUTABLES ==== add_executable(test_cpu tests/test_cpu.cpp tests/random.cpp) -target_link_libraries(test_cpu PRIVATE warprnnt) -target_compile_features(test_cpu PRIVATE cxx_std_17) +target_link_libraries(test_cpu warprnnt) +set_target_properties(test_cpu PROPERTIES COMPILE_FLAGS "${CMAKE_CXX_FLAGS} --std=c++11") add_executable(test_time tests/test_time.cpp tests/random.cpp) -target_link_libraries(test_time PRIVATE warprnnt) -target_compile_features(test_time PRIVATE cxx_std_17) +target_link_libraries(test_time warprnnt) +set_target_properties(test_time PROPERTIES COMPILE_FLAGS "${CMAKE_CXX_FLAGS} --std=c++11") if(USE_CUDA) - if(CMAKE_VERSION VERSION_GREATER_EQUAL "3.18") - add_executable(test_time_gpu tests/test_time.cu tests/random.cpp) - target_link_libraries(test_time_gpu PRIVATE warprnnt CUDA::curand) - set_target_properties(test_time_gpu PROPERTIES - CUDA_SEPARABLE_COMPILATION ON - CUDA_STANDARD 17 - CUDA_STANDARD_REQUIRED ON - ) - target_compile_options(test_time_gpu PRIVATE - $<$:--expt-relaxed-constexpr> - ) - - add_executable(test_gpu tests/test_gpu.cu tests/random.cpp) - target_link_libraries(test_gpu PRIVATE warprnnt CUDA::curand) - set_target_properties(test_gpu PROPERTIES - CUDA_SEPARABLE_COMPILATION ON - CUDA_STANDARD 17 - CUDA_STANDARD_REQUIRED ON - ) - target_compile_options(test_gpu PRIVATE - $<$:--expt-relaxed-constexpr> - ) - else() - cuda_add_executable(test_time_gpu tests/test_time.cu tests/random.cpp) - target_link_libraries(test_time_gpu warprnnt ${CUDA_curand_LIBRARY}) - set_target_properties(test_time_gpu PROPERTIES - COMPILE_FLAGS "${CMAKE_CXX_FLAGS} --std=c++11" - ) - - cuda_add_executable(test_gpu tests/test_gpu.cu tests/random.cpp) - target_link_libraries(test_gpu warprnnt ${CUDA_curand_LIBRARY}) - set_target_properties(test_gpu PROPERTIES - COMPILE_FLAGS "${CMAKE_CXX_FLAGS} --std=c++11" - ) - endif() + cuda_add_executable(test_time_gpu tests/test_time.cu tests/random.cpp) + target_link_libraries(test_time_gpu warprnnt ${CUDA_curand_LIBRARY}) + set_target_properties(test_time_gpu PROPERTIES COMPILE_FLAGS "${CMAKE_CXX_FLAGS} --std=c++11") + + cuda_add_executable(test_gpu tests/test_gpu.cu tests/random.cpp) + target_link_libraries(test_gpu warprnnt ${CUDA_curand_LIBRARY}) + set_target_properties(test_gpu PROPERTIES COMPILE_FLAGS "${CMAKE_CXX_FLAGS} --std=c++11") endif() # ==== INSTALL TARGETS ==== @@ -313,11 +239,11 @@ if(CUDA_FOUND) endif() message(STATUS "GPU Support: ${USE_CUDA}") if(USE_CUDA) - message(STATUS "CUDA Architectures: ${CMAKE_CUDA_ARCHITECTURES}") + message(STATUS "CUDA NVCC Flags: ${CUDA_NVCC_FLAGS}") endif() message(STATUS "OpenMP Support: ${WITH_OMP}") message(STATUS "Naive Kernel: ${USE_NAIVE_KERNEL}") message(STATUS "Debug Time: ${DEBUG_TIME}") message(STATUS "Debug Kernel: ${DEBUG_KERNEL}") message(STATUS "=====================================") -message(STATUS "") \ No newline at end of file +message(STATUS "") diff --git a/mxnet_binding b/mxnet_binding deleted file mode 160000 index 75874b2..0000000 --- a/mxnet_binding +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 75874b21a476dd9f9e5480841f0872f9d7c2723e diff --git a/tensorflow_binding/README.md b/tensorflow_binding/README.md deleted file mode 100644 index b35ae4c..0000000 --- a/tensorflow_binding/README.md +++ /dev/null @@ -1,97 +0,0 @@ -# TensorFlow binding for WarpRNNT - -This package provides TensorFlow kernels that wrap the WarpRNNT -library. - -## Installation - -To build the kernels it is necessary to have the TensorFlow source -code available, since TensorFlow doesn't currently install the -necessary headers to handle the SparseTensor that the CTCLoss op uses -to input the labels. You can retrieve the TensorFlow source from -github.com: - -```bash -git clone https://github.com/tensorflow/tensorflow.git -``` - -`WARP_RNNT_PATH` should be set to the location of a built WarpRNNT -(i.e. `libwarprnnt.so`). This defaults to `../build`, so from within a -new warp-rnnt clone you could build WarpRNNT like this: - -```bash -mkdir build; cd build -cmake -DCUDA_TOOLKIT_ROOT_DIR=/path/to/cuda .. -make -``` - -Otherwise, set `WARP_RNNT_PATH` to wherever you have `libwarprnnt.so` -installed. If you have a GPU, you should also make sure that -`CUDA_HOME` is set to the home cuda directory (i.e. where -`include/cuda.h` and `lib/libcudart.so` live). - -You should now be able to use `setup.py` to install the package into -your current Python environment: - -```bash -CUDA=/path/to/cuda python setup.py install -``` - -You can run a few unit tests with `setup.py` as well if you want: - -```bash -python setup.py test -``` - -## Using the kernels - -First import the module: - -```python -import warprnnt_tensorflow -``` - -The WarpRNNT op is available via the `warprnnt_tensorflow.rnnt_loss` function: - -```python -costs = warprnnt_tensorflow.rnnt_loss(acts, labels, input_lengths, label_lengths) -``` - -The `acts` is a 4 dimensional Tensor, `labels` -is 2 dimensinal Tensor, and all the others are single dimension Tensors. -See the main WarpRNNT documentation for more information. - -## Python interface -```python -def rnnt_loss(acts, labels, input_lengths, label_lengths, blank_label=0): - '''Computes the RNNT loss between a sequence of activations and a - ground truth labeling. - Args: - acts: A 4-D Tensor of floats. The dimensions - should be (B, T, U+1, V), where B is the minibatch index, - T is the time index, U is the label sequence - length (+1 means blank label prepanded), - and V indexes over activations for each - symbol in the alphabet. - labels: A 2-D Tensor of ints, a padded label sequences to make sure - labels for the minibatch are same length. - input_lengths: A 1-D Tensor of ints, the number of time steps - for each sequence in the minibatch. - label_lengths: A 1-D Tensor of ints, the length of each label - for each example in the minibatch. - blank_label: int, the label value/index that the RNNT - calculation should use as the blank label - Returns: - 1-D float Tensor, the cost of each example in the minibatch - (as negative log probabilities). - * This class performs the softmax operation internally. - * The label reserved for the blank symbol should be label 0. - ''' -``` diff --git a/tensorflow_binding/setup.py b/tensorflow_binding/setup.py deleted file mode 100644 index 170f4d6..0000000 --- a/tensorflow_binding/setup.py +++ /dev/null @@ -1,149 +0,0 @@ -"""setup.py script for warp-rnnt TensorFlow wrapper""" - -from __future__ import print_function - -import os -import platform -import re -import warnings -import setuptools -import sys -import unittest -from setuptools.command.build_ext import build_ext as orig_build_ext -from distutils.version import LooseVersion - -# We need to import tensorflow to find where its include directory is. -try: - import tensorflow as tf -except ImportError: - raise RuntimeError("Tensorflow must be installed to build the tensorflow wrapper.") - -if "CUDA_HOME" not in os.environ: - print("CUDA_HOME not found in the environment so building " - "without GPU support. To build with GPU support " - "please define the CUDA_HOME environment variable. " - "This should be a path which contains include/cuda.h", - file=sys.stderr) - enable_gpu = False -else: - enable_gpu = True - -if platform.system() == 'Darwin': - lib_ext = ".dylib" -else: - lib_ext = ".so" - -warp_rnnt_path = "../build" -if "WARP_RNNT_PATH" in os.environ: - warp_rnnt_path = os.environ["WARP_RNNT_PATH"] -if not os.path.exists(os.path.join(warp_rnnt_path, "libwarprnnt"+lib_ext)): - print(("Could not find libwarprnnt.so in {}.\n" - "Build warp-rnnt and set WARP_RNNT_PATH to the location of" - " libwarprnnt.so (default is '../build')").format(warp_rnnt_path), - file=sys.stderr) - sys.exit(1) - -root_path = os.path.realpath(os.path.dirname(__file__)) - -tf_include = tf.sysconfig.get_include() -tf_src_dir = tf.sysconfig.get_lib() # os.environ["TENSORFLOW_SRC_PATH"] -tf_includes = [tf_include, tf_src_dir] -warp_rnnt_includes = [os.path.join(root_path, '../include')] -include_dirs = tf_includes + warp_rnnt_includes - -if LooseVersion(tf.__version__) >= LooseVersion('1.4'): - nsync_dir = '../../external/nsync/public' - if LooseVersion(tf.__version__) >= LooseVersion('1.10'): - nsync_dir = 'external/nsync/public' - include_dirs += [os.path.join(tf_include, nsync_dir)] - -if os.getenv("TF_CXX11_ABI") is not None: - TF_CXX11_ABI = os.getenv("TF_CXX11_ABI") -else: - warnings.warn("Assuming tensorflow was compiled without C++11 ABI. " - "It is generally true if you are using binary pip package. " - "If you compiled tensorflow from source with gcc >= 5 and didn't set " - "-D_GLIBCXX_USE_CXX11_ABI=0 during compilation, you need to set " - "environment variable TF_CXX11_ABI=1 when compiling this bindings. " - "Also be sure to touch some files in src to trigger recompilation. " - "Also, you need to set (or unsed) this environment variable if getting " - "undefined symbol: _ZN10tensorflow... errors") - TF_CXX11_ABI = "0" - -extra_compile_args = ['-std=c++11', '-fPIC', '-D_GLIBCXX_USE_CXX11_ABI=' + TF_CXX11_ABI] -# current tensorflow code triggers return type errors, silence those for now -extra_compile_args += ['-Wno-return-type'] -if LooseVersion(tf.__version__) >= LooseVersion('1.4'): - extra_compile_args += tf.sysconfig.get_compile_flags() - -extra_link_args = [] -if LooseVersion(tf.__version__) >= LooseVersion('1.4'): - extra_link_args += tf.sysconfig.get_link_flags() - -if (enable_gpu): - extra_compile_args += ['-DWARPRNNT_ENABLE_GPU'] - include_dirs += [os.path.join(os.environ["CUDA_HOME"], 'include')] - - # mimic tensorflow cuda include setup so that their include command work - if not os.path.exists(os.path.join(root_path, "include")): - os.mkdir(os.path.join(root_path, "include")) - - cuda_inc_path = os.path.join(root_path, "include/cuda") - if not os.path.exists(cuda_inc_path) or os.readlink(cuda_inc_path) != os.environ["CUDA_HOME"]: - if os.path.exists(cuda_inc_path): - os.remove(cuda_inc_path) - os.symlink(os.environ["CUDA_HOME"], cuda_inc_path) - include_dirs += [os.path.join(root_path, 'include')] - -# Ensure that all expected files and directories exist. -for loc in include_dirs: - if not os.path.exists(loc): - print(("Could not find file or directory {}.\n" - "Check your environment variables and paths?").format(loc), - file=sys.stderr) - sys.exit(1) - -lib_srcs = ['src/warprnnt_op.cc'] - -ext = setuptools.Extension('warprnnt_tensorflow.kernels', - sources = lib_srcs, - language = 'c++', - include_dirs = include_dirs, - library_dirs = [warp_rnnt_path], - runtime_library_dirs = [os.path.realpath(warp_rnnt_path)], - libraries = ['warprnnt'], - extra_compile_args = extra_compile_args, - extra_link_args = extra_link_args) - -class build_tf_ext(orig_build_ext): - def build_extensions(self): - if LooseVersion(tf.__version__) < LooseVersion('1.4'): - self.compiler.compiler_so.remove('-Wstrict-prototypes') - orig_build_ext.build_extensions(self) - -def discover_test_suite(): - test_loader = unittest.TestLoader() - test_suite = test_loader.discover('tests', pattern='test_*.py') - return test_suite - -# Read the README.md file for the long description. This lets us avoid -# duplicating the package description in multiple places in the source. -README_PATH = os.path.join(os.path.dirname(__file__), "README.md") -with open(README_PATH, "r") as handle: - # Extract everything between the first set of ## headlines - LONG_DESCRIPTION = re.search("#.*([^#]*)##", handle.read()).group(1).strip() - -setuptools.setup( - name = "warprnnt_tensorflow", - version = "0.1", - description = "TensorFlow wrapper for warp-transducer", - url="https://github.com/HawkAaron/warp-transducer", - long_description = LONG_DESCRIPTION, - author = "Mingkun Huang", - author_email = "mingkunhuang95@gmail.com", - license = "Apache", - packages = ["warprnnt_tensorflow"], - ext_modules = [ext], - cmdclass = {'build_ext': build_tf_ext}, - test_suite = 'setup.discover_test_suite', -) diff --git a/tensorflow_binding/src/warprnnt_op.cc b/tensorflow_binding/src/warprnnt_op.cc deleted file mode 100644 index e3d2210..0000000 --- a/tensorflow_binding/src/warprnnt_op.cc +++ /dev/null @@ -1,191 +0,0 @@ -#ifdef WARPRNNT_ENABLE_GPU -#define EIGEN_USE_GPU -#include -#endif - -#include "tensorflow/core/framework/op.h" -#include "tensorflow/core/framework/op_kernel.h" -#include "tensorflow/core/framework/bounds_check.h" -#include "tensorflow/core/framework/allocator.h" -#include "rnnt.h" - - -REGISTER_OP("WarpRNNT") - .Input("acts: float32") - .Input("labels: int32") - .Input("input_lengths: int32") - .Input("label_lengths: int32") - .Attr("blank_label: int = 0") - .Output("costs: float32") - .Output("grads: float32"); - -namespace tf = tensorflow; - -namespace warp_rnnt { - -class WarpRNNTOpBase : public tf::OpKernel { - public: - explicit WarpRNNTOpBase(tf::OpKernelConstruction* ctx) : tf::OpKernel(ctx) { - OP_REQUIRES_OK(ctx, ctx->GetAttr("blank_label", &blank_label_)); - } - - void Compute(tf::OpKernelContext* ctx) override { - // Grab the input tensors - const tf::Tensor* acts; - const tf::Tensor* labels; - const tf::Tensor* label_lengths; - const tf::Tensor* input_lengths; - OP_REQUIRES_OK(ctx, ctx->input("acts", &acts)); - OP_REQUIRES_OK(ctx, ctx->input("labels", &labels)); - OP_REQUIRES_OK(ctx, ctx->input("label_lengths", &label_lengths)); - OP_REQUIRES_OK(ctx, ctx->input("input_lengths", &input_lengths)); - - OP_REQUIRES(ctx, acts->shape().dims() == 4, - tf::errors::InvalidArgument("acts is not a 4-Tensor")); - OP_REQUIRES(ctx, labels->shape().dims() == 2, - tf::errors::InvalidArgument("labels is not a 2-Tensor")); - OP_REQUIRES(ctx, tf::TensorShapeUtils::IsVector(label_lengths->shape()), - tf::errors::InvalidArgument("label_lengths is not a vector")); - OP_REQUIRES(ctx, tf::TensorShapeUtils::IsVector(input_lengths->shape()), - tf::errors::InvalidArgument("input_lengths is not a vector")); - - const auto& acts_shape = acts->shape(); - const auto batch_size = acts_shape.dim_size(0); - const auto max_time = acts_shape.dim_size(1); - const auto max_u = acts_shape.dim_size(2); - const auto num_classes_raw = acts_shape.dim_size(3); - - auto acts_t = acts->tensor(); - auto labels_t = labels->tensor(); - - OP_REQUIRES( - ctx, tf::FastBoundsCheck(num_classes_raw, std::numeric_limits::max()), - tf::errors::InvalidArgument("num_classes cannot exceed max int")); - const auto alphabet_size = static_cast(num_classes_raw); - - OP_REQUIRES( - ctx, batch_size == input_lengths->dim_size(0), - tf::errors::InvalidArgument("len(input_lengths) != batch_size. ", - "len(input_length): ", input_lengths->dim_size(0), - " batch_size: ", batch_size)); - auto input_lengths_t = input_lengths->vec(); - - OP_REQUIRES( - ctx, batch_size == label_lengths->dim_size(0), - tf::errors::InvalidArgument("len(label_lengths) != batch_size. ", - "len(label_length): ", label_lengths->dim_size(0), - " batch_size: ", batch_size)); - auto label_lengths_t = label_lengths->vec(); - - // TODO check that labels are in the alphabet? - // Refer to line 185, we know that - // Tensor input_lengths is in GPU, so cannot compare with CPU variable - //for (int b = 0; b < batch_size; b++) { - // OP_REQUIRES(ctx, input_lengths_t(b) <= max_time, - // tf::errors::InvalidArgument("input_lengths(", b, ") <= ", max_time)); - //} - - tf::Tensor* costs = nullptr; - OP_REQUIRES_OK(ctx, ctx->allocate_output("costs", input_lengths->shape(), &costs)); - auto costs_t = costs->vec(); - - tf::Tensor* grads = nullptr; - OP_REQUIRES_OK(ctx, ctx->allocate_output("grads", acts->shape(), &grads)); - set_zero(grads); - auto grads_t = grads->tensor(); - - auto options = create_options(ctx); - options.blank_label = blank_label_; - options.maxT = max_time; - options.maxU = max_u; - - size_t workspace_size_bytes; - bool use_gpu = false; - if(options.loc == RNNT_GPU) { - use_gpu = true; - } - auto warp_status = get_workspace_size(max_time, - max_u, - batch_size, - use_gpu, - &workspace_size_bytes); - - OP_REQUIRES(ctx, warp_status == RNNT_STATUS_SUCCESS, - tf::errors::Internal("warp_rnnt error in get_workspace_size: ", - rnntGetStatusString(warp_status))); - - auto workspace_shape = tf::TensorShape{static_cast(workspace_size_bytes)}; - tf::Tensor workspace; - OP_REQUIRES_OK(ctx, ctx->allocate_temp(tf::DT_UINT8, workspace_shape, &workspace)); - auto workspace_t = workspace.flat(); - - // compute RNNT - warp_status = compute_rnnt_loss(acts_t.data(), - grads_t.data(), - labels_t.data(), - label_lengths_t.data(), - input_lengths_t.data(), - alphabet_size, batch_size, - costs_t.data(), workspace_t.data(), options); - - OP_REQUIRES(ctx, warp_status == RNNT_STATUS_SUCCESS, - tf::errors::Internal("warp_rnnt error in compute_rnnt_loss: ", - rnntGetStatusString(warp_status))); - - } - private: - int blank_label_; - virtual void set_zero(tf::Tensor* t) = 0; - virtual rnntOptions create_options(tf::OpKernelContext* ctx) = 0; -}; - -class WarpRNNTOpCPU : public WarpRNNTOpBase { - public: - explicit WarpRNNTOpCPU(tf::OpKernelConstruction* ctx) : WarpRNNTOpBase(ctx) { - } - - private: - void set_zero(tf::Tensor* t) override { - t->flat().setZero(); - } - - rnntOptions create_options(tf::OpKernelContext* ctx) override { - auto options = rnntOptions{}; - options.loc = RNNT_CPU; - options.batch_first = true; - options.num_threads = ctx->device()->tensorflow_cpu_worker_threads()->num_threads; - return options; - } -}; - -REGISTER_KERNEL_BUILDER(Name("WarpRNNT").Device(::tensorflow::DEVICE_CPU), WarpRNNTOpCPU); - -#ifdef WARPRNNT_ENABLE_GPU - -class WarpRNNTOpGPU : public WarpRNNTOpBase { - public: - explicit WarpRNNTOpGPU(tf::OpKernelConstruction* ctx) : WarpRNNTOpBase(ctx) { - } - - private: - void set_zero(tf::Tensor* t) override { - // here is not need - // cudaMemset(t->flat().data(), 0, t->NumElements()*sizeof(float)); - } - - rnntOptions create_options(tf::OpKernelContext* ctx) override { - auto cuda_stream = ctx->eigen_device().stream(); - auto options = rnntOptions{}; - options.loc = RNNT_GPU; - options.stream = cuda_stream; - return options; - } -}; - -REGISTER_KERNEL_BUILDER(Name("WarpRNNT").Device(::tensorflow::DEVICE_GPU) - .HostMemory("costs"), - WarpRNNTOpGPU); -#undef EIGEN_USE_GPU -#endif - -} diff --git a/tensorflow_binding/tests/test_basic.py b/tensorflow_binding/tests/test_basic.py deleted file mode 100644 index d6d13e2..0000000 --- a/tensorflow_binding/tests/test_basic.py +++ /dev/null @@ -1,56 +0,0 @@ -import numpy as np -import tensorflow as tf -from warprnnt_tensorflow import rnnt_loss - -acts = tf.placeholder(tf.float32, [None, None, None, None]) -labels = tf.placeholder(tf.int32, [None, None]) -input_length = tf.placeholder(tf.int32, [None]) -label_length = tf.placeholder(tf.int32, [None]) - -B = 2; T = 4; U = 3; V = 3; blank = 0 - -logits = tf.nn.log_softmax(acts) -costs = rnnt_loss(logits, labels, input_length, label_length, blank) -grad = tf.gradients(costs, [acts]) - -a = np.array([[[[0.06535690384862791, 0.7875301411923206, 0.08159176605666074], - [0.5297155426466327, 0.7506749639230854, 0.7541348379087998], - [0.6097641124736383, 0.8681404965673826, 0.6225318186056529]], - - [[0.6685222872103057, 0.8580392805336061, 0.16453892311765583], - [0.989779515236694, 0.944298460961015, 0.6031678586829663], - [0.9467833543605416, 0.666202507295747, 0.28688179752461884]], - - [[0.09418426230195986, 0.3666735970751962, 0.736168049462793], - [0.1666804425271342, 0.7141542198635192, 0.3993997272216727], - [0.5359823524146038, 0.29182076440286386, 0.6126422611507932]], - - [[0.3242405528768486, 0.8007644367291621, 0.5241057606558068], - [0.779194617063042, 0.18331417220174862, 0.113745182072432], - [0.24022162381327106, 0.3394695622533106, 0.1341595066017014]]], - - [[[0.5055615569388828, 0.051597282072282646, 0.6402903936686337], - [0.43073311517251, 0.8294731834714112, 0.1774668847323424], - [0.3207001991262245, 0.04288308912457006, 0.30280282975568984]], - - [[0.6751777088333762, 0.569537369330242, 0.5584738347504452], - [0.08313242153985256, 0.06016544344162322, 0.10795752845152584], - [0.7486153608562472, 0.943918041459349, 0.4863558118797222]], - - [[0.4181986264486809, 0.6524078485043804, 0.024242983423721887], - [0.13458171554507403, 0.3663418070512402, 0.2958297395361563], - [0.9236695822497084, 0.6899291482654177, 0.7418981733448822]], - - [[0.25000547599982104, 0.6034295486281007, 0.9872887878887768], - [0.5926057265215715, 0.8846724004467684, 0.5434495396894328], - [0.6607698886038497, 0.3771277082495921, 0.3580209022231813]]]], dtype=np.float32) - -b = np.array([[1, 2], [1, 1]], dtype=np.int32) -c = np.array([4, 4], dtype=np.int32) -d = np.array([2, 2], dtype=np.int32) - -feed = {acts: a, labels: b, input_length: c, label_length: d} -with tf.Session() as sess: - cost, grads = sess.run([costs, grad], feed_dict=feed) - print(cost) - print(grads) \ No newline at end of file diff --git a/tensorflow_binding/tests/test_warprnnt_op.py b/tensorflow_binding/tests/test_warprnnt_op.py deleted file mode 100644 index f50d58e..0000000 --- a/tensorflow_binding/tests/test_warprnnt_op.py +++ /dev/null @@ -1,97 +0,0 @@ -import tensorflow as tf -import numpy as np -from warprnnt_tensorflow import rnnt_loss -from tensorflow.python.client import device_lib - -def is_gpu_available(): - """Returns whether Tensorflow can access a GPU.""" - return any(x.device_type == 'GPU' for x in device_lib.list_local_devices()) - -class WarpRNNTTest(tf.test.TestCase): - - def _run_rnnt(self, acts, labels, input_lengths, label_lengths, - expected_costs, expected_grads, blank, use_gpu=False): - self.assertEquals(acts.shape, expected_grads.shape) - acts_t = tf.constant(acts) - labels_t = tf.constant(labels) - input_lengths_t = tf.constant(input_lengths) - label_lengths_t = tf.constant(label_lengths) - - logits = acts_t if use_gpu else tf.nn.log_softmax(acts_t) - costs = rnnt_loss(logits, labels_t, input_lengths_t, label_lengths_t, blank) - - grads = tf.gradients(costs, [acts_t])[0] - - with self.test_session(use_gpu=use_gpu) as sess: - (tf_costs, tf_grad) = sess.run([costs, grads]) - self.assertAllClose(tf_costs, expected_costs, atol=1e-6) - self.assertAllClose(tf_grad, expected_grads, atol=1e-6) - - def test_forward(self): - # Softmax activations for the following inputs: - acts = np.array([0.1, 0.6, 0.1, 0.1, 0.1, 0.1, - 0.1, 0.6, 0.1, 0.1, 0.1, 0.1, - 0.2, 0.8, 0.1, 0.1, 0.6, 0.1, - 0.1, 0.1, 0.1, 0.1, 0.2, 0.1, - 0.1, 0.7, 0.1, 0.2, 0.1, 0.1], dtype=np.float32).reshape(1, 2, 3, 5) - - labels = np.array([[1, 2]], dtype=np.int32) - input_lengths = np.array([2], dtype=np.int32) - label_lengths = np.array([2], dtype=np.int32) - - acts_t = tf.constant(acts) - labels_t = tf.constant(labels) - input_lengths_t = tf.constant(input_lengths) - label_lengths_t = tf.constant(label_lengths) - acts_t = tf.nn.log_softmax(acts_t) # NOTE cpu - costs = rnnt_loss(acts_t, labels_t, input_lengths_t, label_lengths_t) - with self.test_session(): - print(costs.eval()) - - def _test_multiple_batches(self, use_gpu): - B = 2; T = 4; U = 3; V = 3 - - acts = np.array([0.065357, 0.787530, 0.081592, 0.529716, 0.750675, 0.754135, - 0.609764, 0.868140, 0.622532, 0.668522, 0.858039, 0.164539, - 0.989780, 0.944298, 0.603168, 0.946783, 0.666203, 0.286882, - 0.094184, 0.366674, 0.736168, 0.166680, 0.714154, 0.399400, - 0.535982, 0.291821, 0.612642, 0.324241, 0.800764, 0.524106, - 0.779195, 0.183314, 0.113745, 0.240222, 0.339470, 0.134160, - 0.505562, 0.051597, 0.640290, 0.430733, 0.829473, 0.177467, - 0.320700, 0.042883, 0.302803, 0.675178, 0.569537, 0.558474, - 0.083132, 0.060165, 0.107958, 0.748615, 0.943918, 0.486356, - 0.418199, 0.652408, 0.024243, 0.134582, 0.366342, 0.295830, - 0.923670, 0.689929, 0.741898, 0.250005, 0.603430, 0.987289, - 0.592606, 0.884672, 0.543450, 0.660770, 0.377128, 0.358021], dtype=np.float32).reshape(B, T, U, V); - - expected_costs = np.array([4.28065, 3.93844], dtype=np.float32) - expected_grads = np.array([-0.186844, -0.062555, 0.249399, -0.203377, 0.202399, 0.000977, - -0.141016, 0.079123, 0.061893, -0.011552, -0.081280, 0.092832, - -0.154257, 0.229433, -0.075176, -0.246593, 0.146405, 0.100188, - -0.012918, -0.061593, 0.074512, -0.055986, 0.219831, -0.163845, - -0.497627, 0.209240, 0.288387, 0.013605, -0.030220, 0.016615, - 0.113925, 0.062781, -0.176706, -0.667078, 0.367659, 0.299419, - -0.356344, -0.055347, 0.411691, -0.096922, 0.029459, 0.067463, - -0.063518, 0.027654, 0.035863, -0.154499, -0.073942, 0.228441, - -0.166790, -0.000088, 0.166878, -0.172370, 0.105565, 0.066804, - 0.023875, -0.118256, 0.094381, -0.104707, -0.108934, 0.213642, - -0.369844, 0.180118, 0.189726, 0.025714, -0.079462, 0.053748, - 0.122328, -0.238789, 0.116460, -0.598687, 0.302203, 0.296484], dtype=np.float32).reshape(B, T, U, V); - - labels = np.array([[1, 2], [1, 1]], dtype=np.int32) - input_lengths = np.array([4, 4], dtype=np.int32) - label_lengths = np.array([2, 2], dtype=np.int32) - - self._run_rnnt(acts, labels, input_lengths, label_lengths, expected_costs, expected_grads, 0, use_gpu) - - def test_multiple_batches_cpu(self): - self._test_multiple_batches(use_gpu=False) - - def test_multiple_batches_gpu(self): - if (is_gpu_available()): - self._test_multiple_batches(use_gpu=True) - else: - print('Skipping GPU test, no gpus available') - -if __name__ == '__main__': - tf.test.main() diff --git a/tensorflow_binding/warprnnt_tensorflow/__init__.py b/tensorflow_binding/warprnnt_tensorflow/__init__.py deleted file mode 100644 index de45611..0000000 --- a/tensorflow_binding/warprnnt_tensorflow/__init__.py +++ /dev/null @@ -1,48 +0,0 @@ -import imp -import tensorflow as tf -from tensorflow.python.framework import ops - -lib_file = imp.find_module('kernels', __path__)[1] -_warprnnt = tf.load_op_library(lib_file) - - -def rnnt_loss(acts, labels, input_lengths, label_lengths, blank_label=0): - '''Computes the RNNT loss between a sequence of activations and a - ground truth labeling. - Args: - acts: A 4-D Tensor of floats. The dimensions - should be (B, T, U, V), where B is the minibatch index, - T is the time index, U is the prediction network sequence - length, and V indexes over activations for each - symbol in the alphabet. - labels: A 2-D Tensor of ints, a padded label sequences to make sure - labels for the minibatch are same length. - input_lengths: A 1-D Tensor of ints, the number of time steps - for each sequence in the minibatch. - label_lengths: A 1-D Tensor of ints, the length of each label - for each example in the minibatch. - blank_label: int, the label value/index that the RNNT - calculation should use as the blank label - Returns: - 1-D float Tensor, the cost of each example in the minibatch - (as negative log probabilities). - * This class performs the softmax operation internally. - * The label reserved for the blank symbol should be label 0. - ''' - loss, _ = _warprnnt.warp_rnnt(acts, labels, input_lengths, - label_lengths, blank_label) - return loss - - -@ops.RegisterGradient("WarpRNNT") -def _RNNTLossGrad(op, grad_loss, _): - grad = op.outputs[1] - # NOTE since here we are batch first, cannot use _BroadcastMul - grad_loss = tf.reshape(grad_loss, (-1, 1, 1, 1)) - return [grad_loss * grad, None, None, None] - -@ops.RegisterShape("WarpRNNT") -def _RNNTLossShape(op): - inputs_shape = op.inputs[0].get_shape().with_rank(4) - batch_size = inputs_shape[0] - return [batch_size, inputs_shape] \ No newline at end of file