diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml new file mode 100644 index 0000000..65f01ae --- /dev/null +++ b/.github/workflows/ci.yml @@ -0,0 +1,42 @@ +name: CI (CPU-only) + +on: + push: + branches: [main, master] + pull_request: + branches: [main, master] + +jobs: + build-and-test: + runs-on: ubuntu-latest + + steps: + - name: Checkout repository + uses: actions/checkout@v4 + + - name: Install dependencies + run: | + sudo apt-get update + sudo apt-get install -y cmake build-essential + + - name: Configure CMake (CPU-only) + run: | + mkdir -p build + cd build + cmake .. + + - name: Build project + run: | + cd build + make -j$(nproc) + + - name: Install and update linker cache + run: | + cd build + sudo make install + sudo ldconfig + + - name: Run CPU tests + run: | + cd build + ./test_cpu diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..9d54d6b --- /dev/null +++ b/.gitignore @@ -0,0 +1,132 @@ +################################## +# Python +################################## + +# Byte-compiled +__pycache__/ +*.py[cod] +*$py.class + +# Virtual envs +.env/ +.venv/ +env/ +venv/ + +# Packaging +build/ +dist/ +*.egg-info/ +.eggs/ +*.whl + +# Caches / testing +.pytest_cache/ +htmlcov/ +coverage.* +.tox/ +.nox/ + +# Type checkers +.mypy_cache/ +.pytype/ +.pyre/ + +# Jupyter +.ipynb_checkpoints/ + +################################## +# C++ / CUDA / CMake +################################## + +# Objects and binaries +*.o +*.obj +*.a +*.so +*.dll +*.lib +*.dylib +*.out +*.exe +*.app + +# CUDA +*.cu.o +*.ptx +*.cubin +*.fatbin +*.nvvp +*.nvcpl +*.nvd* + +# Dependency / compiler files +*.d +*.mod + +# CMake and build +/build/ +CMakeFiles/ +CMakeCache.txt +cmake_install.cmake +Makefile +CTestTestfile.cmake +install_manifest.txt +compile_commands.json + +# Ninja +.ninja_log +.ninja_deps +build.ninja +rules.ninja + +################################## +# Python bindings (PyTorch / TensorFlow) +################################## + +# Compiled extensions +*.so +*.pyd +*.dll + +# Generated by setup.py +warprnnt_pytorch.egg-info/ +warprnnt_tensorflow.egg-info/ + +# Temporary output +*.log + +################################## +# Docs +################################## + +# LaTeX +*.aux +*.log +*.out +*.pdf +*.toc +*.lof +*.lot + +################################## +# IDE / Editor / OS +################################## + +# VSCode +.vscode/ + +# JetBrains / CLion / PyCharm +.idea/ +cmake-build-*/ + +# Vim / Emacs +*.swp +*.swo +*~ + +# macOS / Windows +.DS_Store +Thumbs.db +desktop.ini + diff --git a/CMakeLists.txt b/CMakeLists.txt old mode 100755 new mode 100644 index 2c570e0..f422901 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,136 +1,249 @@ -IF (APPLE) - cmake_minimum_required(VERSION 3.4) -ELSE() - cmake_minimum_required(VERSION 2.8) -ENDIF() +cmake_minimum_required(VERSION 3.10) +project(rnnt_release LANGUAGES CXX) + +# ==== CUDA DETECTION ==== +set(CUDA_FOUND FALSE) +set(CUDA_TOOLKIT_VERSION "0.0") + +find_package(CUDA QUIET) + +if(CMAKE_VERSION VERSION_GREATER_EQUAL "3.18") + find_package(CUDAToolkit QUIET) + + if(CUDAToolkit_FOUND OR CUDA_FOUND) + set(CUDA_FOUND TRUE) + if(CUDAToolkit_FOUND) + set(CUDA_TOOLKIT_VERSION "${CUDAToolkit_VERSION}") + elseif(DEFINED CUDA_VERSION AND NOT "${CUDA_VERSION}" STREQUAL "") + set(CUDA_TOOLKIT_VERSION "${CUDA_VERSION}") + endif() + endif() +else() + if(CUDA_FOUND AND DEFINED CUDA_VERSION AND NOT "${CUDA_VERSION}" STREQUAL "") + set(CUDA_TOOLKIT_VERSION "${CUDA_VERSION}") + endif() +endif() + +# ==== USER OPTIONS ==== +option(WITH_GPU "Build with GPU (CUDA) support" ${CUDA_FOUND}) +option(WITH_OMP "Build with OpenMP support" ON) +option(USE_NAIVE_KERNEL "Use naive alpha-beta kernel" OFF) +option(DEBUG_TIME "Output kernel time" OFF) +option(DEBUG_KERNEL "Output alpha beta debug" OFF) + +# ==== GPU BUILD CONFIGURATION ==== +if(WITH_GPU) + if(NOT CUDA_FOUND) + message(WARNING "WITH_GPU=ON was requested, but no compatible CUDA toolkit was found. Falling back to CPU-only build.") + set(WITH_GPU OFF CACHE BOOL "Build with GPU (CUDA) support" FORCE) + set(USE_CUDA FALSE) + else() + set(USE_CUDA TRUE) + message(STATUS "Building with CUDA ${CUDA_TOOLKIT_VERSION} support") + endif() +else() + set(USE_CUDA FALSE) + if(CUDA_FOUND) + message(STATUS "CUDA ${CUDA_TOOLKIT_VERSION} detected but GPU support disabled (WITH_GPU=OFF)") + else() + message(STATUS "Building CPU-only version (no CUDA toolkit found)") + endif() +endif() -project(rnnt_release) +# ==== OPENMP CONFIGURATION ==== +if(WITH_OMP) + find_package(OpenMP) + if(OpenMP_CXX_FOUND) + message(STATUS "OpenMP support enabled") + else() + message(WARNING "OpenMP requested but not found") + set(WITH_OMP OFF) + endif() +endif() + +# ==== COMPILER SETTINGS ==== +set(CMAKE_CXX_STANDARD 17) +set(CMAKE_CXX_STANDARD_REQUIRED ON) +set(CMAKE_CXX_EXTENSIONS OFF) -IF (NOT APPLE) +if(NOT CMAKE_BUILD_TYPE) + set(CMAKE_BUILD_TYPE "Release" CACHE STRING "Build type" FORCE) +endif() + +if(CMAKE_BUILD_TYPE STREQUAL "Debug") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O0 -g") +else() set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O2") -ENDIF() +endif() + +if(WITH_OMP) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fopenmp") +endif() -IF (APPLE) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -O2") +if(APPLE) add_definitions(-DAPPLE) -ENDIF() + execute_process(COMMAND uname -v OUTPUT_VARIABLE DARWIN_VERSION) + string(REGEX MATCH "[0-9]+" DARWIN_VERSION "${DARWIN_VERSION}") + message(STATUS "Darwin version: ${DARWIN_VERSION}") + if(${DARWIN_VERSION} LESS 15) + set(CMAKE_SKIP_RPATH TRUE) + endif() +else() + set(CMAKE_SKIP_RPATH TRUE) +endif() include_directories(include) -FIND_PACKAGE(CUDA) -MESSAGE(STATUS "cuda found ${CUDA_FOUND}") - -option(USE_NAIVE_KERNEL "use naive alpha-beta kernel" OFF) -option(DEBUG_TIME "output kernel time" OFF) -option(DEBUG_KERNEL "output alpha beta" OFF) -if (USE_NAIVE_KERNEL) +# ==== PREPROCESSOR DEFINITIONS ==== +if(USE_NAIVE_KERNEL) add_definitions(-DUSE_NAIVE_KERNEL) endif() -if (DEBUG_TIME) + +if(DEBUG_TIME) add_definitions(-DDEBUG_TIME) endif() -if (DEBUG_KERNEL) + +if(DEBUG_KERNEL) add_definitions(-DDEBUG_KERNEL) endif() -option(WITH_GPU "compile warp-rnnt with cuda." ${CUDA_FOUND}) -option(WITH_OMP "compile warp-rnnt with openmp." ON) - if(NOT WITH_OMP) add_definitions(-DRNNT_DISABLE_OMP) endif() -if (WITH_OMP) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fopenmp") - set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -Xcompiler -fopenmp") -endif() - -# need to be at least 30 or __shfl_down in reduce wont compile -set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_30,code=sm_30 -O2") -set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_35,code=sm_35") - -set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_50,code=sm_50") -set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_52,code=sm_52") -IF(CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 5) - SET(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -D_MWAITXINTRIN_H_INCLUDED -D_FORCE_INLINES") -ENDIF() -IF (CUDA_VERSION GREATER 7.6) - set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_60,code=sm_60") - set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_61,code=sm_61") - set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_62,code=sm_62") -ENDIF() - -IF (CUDA_VERSION GREATER 8.9) - set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_70,code=sm_70") -ENDIF() - -IF (CUDA_VERSION GREATER 9.9) - set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_75,code=sm_75") -ENDIF() - -if (NOT APPLE) - set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} --std=c++11") - set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS}") -ENDIF() - - -IF (APPLE) - EXEC_PROGRAM(uname ARGS -v OUTPUT_VARIABLE DARWIN_VERSION) - STRING(REGEX MATCH "[0-9]+" DARWIN_VERSION ${DARWIN_VERSION}) - MESSAGE(STATUS "DARWIN_VERSION=${DARWIN_VERSION}") - - #for el capitain have to use rpath - - IF (DARWIN_VERSION LESS 15) - set(CMAKE_SKIP_RPATH TRUE) - ENDIF () - -ELSE() - #always skip for linux - set(CMAKE_SKIP_RPATH TRUE) -ENDIF() - - -IF (WITH_GPU) - - MESSAGE(STATUS "Building shared library with GPU support") - - CUDA_ADD_LIBRARY(warprnnt SHARED src/rnnt_entrypoint.cu) - IF (!Torch_FOUND) - TARGET_LINK_LIBRARIES(warprnnt ${CUDA_curand_LIBRARY}) - ENDIF() - - cuda_add_executable(test_time_gpu tests/test_time.cu tests/random.cpp ) - TARGET_LINK_LIBRARIES(test_time_gpu warprnnt ${CUDA_curand_LIBRARY}) - SET_TARGET_PROPERTIES(test_time_gpu PROPERTIES COMPILE_FLAGS "${CMAKE_CXX_FLAGS} --std=c++11") +# ==== CUDA CONFIGURATION ==== +if(USE_CUDA) + if(WITH_OMP) + set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -Xcompiler -fopenmp") + endif() + + if(NOT (CUDA_VERSION GREATER 10.2)) + set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_30,code=sm_30 -O2") + set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_35,code=sm_35") + set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_50,code=sm_50") + endif() + + set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_52,code=sm_52") + + if(CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 5) + set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -D_MWAITXINTRIN_H_INCLUDED -D_FORCE_INLINES") + endif() + + if(CUDA_VERSION GREATER 7.6) + set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_60,code=sm_60") + set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_61,code=sm_61") + set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_62,code=sm_62") + endif() + + if(CUDA_VERSION GREATER 8.9) + set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_70,code=sm_70") + endif() + + if(CUDA_VERSION GREATER 9.9) + set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_75,code=sm_75") + endif() + + if(CUDA_VERSION GREATER 11.0) + set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_80,code=sm_80") + set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_86,code=sm_86") + endif() + + if(NOT APPLE) + set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} --std=c++17") + endif() + + message(STATUS "CUDA NVCC flags: ${CUDA_NVCC_FLAGS}") +endif() - cuda_add_executable(test_gpu tests/test_gpu.cu tests/random.cpp ) - TARGET_LINK_LIBRARIES(test_gpu warprnnt ${CUDA_curand_LIBRARY}) - SET_TARGET_PROPERTIES(test_gpu PROPERTIES COMPILE_FLAGS "${CMAKE_CXX_FLAGS} --std=c++11") +# ==== SOURCE FILE PREPARATION ==== +set(RNNT_ENTRYPOINT_SRC src/rnnt_entrypoint.cu) +if(NOT USE_CUDA) + configure_file( + ${RNNT_ENTRYPOINT_SRC} + ${CMAKE_CURRENT_BINARY_DIR}/rnnt_entrypoint_cpu.cpp + COPYONLY + ) + set(RNNT_ENTRYPOINT_SRC ${CMAKE_CURRENT_BINARY_DIR}/rnnt_entrypoint_cpu.cpp) +endif() -ELSE() - MESSAGE(STATUS "Building shared library with no GPU support") +# ==== MAIN LIBRARY TARGET ==== +if(USE_CUDA) + message(STATUS "Building shared library with GPU support") + + cuda_add_library(warprnnt SHARED ${RNNT_ENTRYPOINT_SRC}) + + if(NOT Torch_FOUND) + target_link_libraries(warprnnt ${CUDA_curand_LIBRARY}) + endif() + + if(WITH_OMP AND OpenMP_CXX_FOUND) + target_link_libraries(warprnnt ${OpenMP_CXX_LIBRARIES}) + endif() +else() + message(STATUS "Building shared library with CPU-only support") + add_library(warprnnt SHARED ${RNNT_ENTRYPOINT_SRC}) + + if(WITH_OMP AND OpenMP_CXX_FOUND) + target_link_libraries(warprnnt PRIVATE OpenMP::OpenMP_CXX) + endif() +endif() - if (NOT APPLE) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -O2") - ENDIF() +set_target_properties(warprnnt PROPERTIES + RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR} + LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR} + ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR} +) - ADD_LIBRARY(warprnnt SHARED src/rnnt_entrypoint.cpp) +if(WIN32) + set_target_properties(warprnnt PROPERTIES WINDOWS_EXPORT_ALL_SYMBOLS ON) +endif() -ENDIF() +# Use include_directories (global) since we're mixing legacy/modern approaches +# target_include_directories(warprnnt PRIVATE include) # Don't use this with legacy CUDA +# ==== TEST EXECUTABLES ==== +add_executable(test_cpu tests/test_cpu.cpp tests/random.cpp) +target_link_libraries(test_cpu warprnnt) +set_target_properties(test_cpu PROPERTIES COMPILE_FLAGS "${CMAKE_CXX_FLAGS} --std=c++11") -add_executable(test_cpu tests/test_cpu.cpp tests/random.cpp ) -TARGET_LINK_LIBRARIES(test_cpu warprnnt) -SET_TARGET_PROPERTIES(test_cpu PROPERTIES COMPILE_FLAGS "${CMAKE_CXX_FLAGS} --std=c++11") +add_executable(test_time tests/test_time.cpp tests/random.cpp) +target_link_libraries(test_time warprnnt) +set_target_properties(test_time PROPERTIES COMPILE_FLAGS "${CMAKE_CXX_FLAGS} --std=c++11") -add_executable(test_time tests/test_time.cpp tests/random.cpp ) -TARGET_LINK_LIBRARIES(test_time warprnnt) -SET_TARGET_PROPERTIES(test_time PROPERTIES COMPILE_FLAGS "${CMAKE_CXX_FLAGS} --std=c++11") +if(USE_CUDA) + cuda_add_executable(test_time_gpu tests/test_time.cu tests/random.cpp) + target_link_libraries(test_time_gpu warprnnt ${CUDA_curand_LIBRARY}) + set_target_properties(test_time_gpu PROPERTIES COMPILE_FLAGS "${CMAKE_CXX_FLAGS} --std=c++11") -INSTALL(TARGETS warprnnt - RUNTIME DESTINATION "bin" - LIBRARY DESTINATION "lib" - ARCHIVE DESTINATION "lib") + cuda_add_executable(test_gpu tests/test_gpu.cu tests/random.cpp) + target_link_libraries(test_gpu warprnnt ${CUDA_curand_LIBRARY}) + set_target_properties(test_gpu PROPERTIES COMPILE_FLAGS "${CMAKE_CXX_FLAGS} --std=c++11") +endif() -INSTALL(FILES include/rnnt.h DESTINATION "include") +# ==== INSTALL TARGETS ==== +install(TARGETS warprnnt + RUNTIME DESTINATION bin + LIBRARY DESTINATION lib + ARCHIVE DESTINATION lib +) + +install(FILES include/rnnt.h DESTINATION include) + +# ==== BUILD SUMMARY ==== +message(STATUS "") +message(STATUS "=== Build Configuration Summary ===") +message(STATUS "Build Type: ${CMAKE_BUILD_TYPE}") +message(STATUS "CUDA Available: ${CUDA_FOUND}") +if(CUDA_FOUND) + message(STATUS "CUDA Version: ${CUDA_TOOLKIT_VERSION}") +endif() +message(STATUS "GPU Support: ${USE_CUDA}") +if(USE_CUDA) + message(STATUS "CUDA NVCC Flags: ${CUDA_NVCC_FLAGS}") +endif() +message(STATUS "OpenMP Support: ${WITH_OMP}") +message(STATUS "Naive Kernel: ${USE_NAIVE_KERNEL}") +message(STATUS "Debug Time: ${DEBUG_TIME}") +message(STATUS "Debug Kernel: ${DEBUG_KERNEL}") +message(STATUS "=====================================") +message(STATUS "") diff --git a/README.md b/README.md index 11263ef..1ec3b88 100644 --- a/README.md +++ b/README.md @@ -57,6 +57,7 @@ Run cmake and build: ```bash cmake -DCUDA_TOOLKIT_ROOT_DIR=$CUDA_HOME .. make +sudo make install #optional ``` if it logs ``` @@ -68,6 +69,30 @@ please run `rm CMakeCache.txt` and cmake again. The C library should now be built along with test executables. If CUDA was detected, then `test_gpu` will be built; `test_cpu` will always be built. +## Troubleshooting: `cuda_runtime_api.h: No such file or directory` + +If you encounter an error like this during installation: + +` +fatal error: cuda_runtime_api.h: No such file or directory +` + +This usually means the CUDA headers can't be found by the compiler. If your system has CUDA installed (e.g., version 12.9) and the file `cuda_runtime_api.h` exists somewhere like: + +` +/usr/local/cuda-12.9/targets/x86_64-linux/include/cuda_runtime_api.h +` + +Then you can fix this by manually setting the environment variables to point to the correct CUDA paths: + +```bash +export CUDA_HOME=/usr/local/cuda-12.9 +export CFLAGS="-I$CUDA_HOME/targets/x86_64-linux/include" +export LDFLAGS="-L$CUDA_HOME/targets/x86_64-linux/lib" +export PATH="$CUDA_HOME/bin:$PATH" +export LD_LIBRARY_PATH="$CUDA_HOME/lib64:$LD_LIBRARY_PATH" +``` + ## Test To run the tests, make sure the CUDA libraries are in `LD_LIBRARY_PATH` (DYLD_LIBRARY_PATH for OSX). diff --git a/include/detail/cpu_rnnt.h b/include/detail/cpu_rnnt.h index 0042b34..f4e586f 100644 --- a/include/detail/cpu_rnnt.h +++ b/include/detail/cpu_rnnt.h @@ -20,9 +20,9 @@ class CpuRNNT { public: // Noncopyable CpuRNNT(int minibatch, int maxT, int maxU, int alphabet_size, void* workspace, - int blank, int num_threads, bool batch_first) : + int blank, float fastemit_lambda, int num_threads, bool batch_first) : minibatch_(minibatch), maxT_(maxT), maxU_(maxU), alphabet_size_(alphabet_size), - workspace_(workspace), blank_(blank), num_threads_(num_threads), batch_first(batch_first) { + workspace_(workspace), blank_(blank), fastemit_lambda_(fastemit_lambda), num_threads_(num_threads), batch_first(batch_first) { #if defined(RNNT_DISABLE_OMP) || defined(APPLE) #else if (num_threads > 0) { @@ -82,6 +82,7 @@ class CpuRNNT { int alphabet_size_; // Number of characters plus blank void* workspace_; int blank_; + float fastemit_lambda_; int num_threads_; bool batch_first; @@ -259,7 +260,10 @@ CpuRNNT::compute_betas_and_grad(ProbT* grad, const ProbT* const log_probs } if (u < U-1) { ProbT g = alphas[idx(t, u)] + betas[idx(t, u+1)]; - grad[idx(t, u, labels[u])] = -std::exp(log_probs[idx(t, u) * 2 + 1] + g - loglike); + grad[idx(t, u, labels[u])] = -( + (1. + fastemit_lambda_) + * std::exp(log_probs[idx(t, u) * 2 + 1] + g - loglike) + ); } } } diff --git a/include/detail/gpu_rnnt.h b/include/detail/gpu_rnnt.h index dd7f874..c8e67df 100644 --- a/include/detail/gpu_rnnt.h +++ b/include/detail/gpu_rnnt.h @@ -20,9 +20,9 @@ class GpuRNNT { public: // Noncopyable GpuRNNT(int minibatch, int maxT, int maxU, int alphabet_size, void* workspace, - int blank, int num_threads, CUstream stream) : + int blank, float fastemit_lambda, int num_threads, CUstream stream) : minibatch_(minibatch), maxT_(maxT), maxU_(maxU), alphabet_size_(alphabet_size), - gpu_workspace(workspace), blank_(blank), num_threads_(num_threads), stream_(stream) { + gpu_workspace(workspace), blank_(blank), fastemit_lambda_(fastemit_lambda), num_threads_(num_threads), stream_(stream) { #if defined(RNNT_DISABLE_OMP) || defined(APPLE) #else if (num_threads > 0) { @@ -65,6 +65,7 @@ class GpuRNNT { int alphabet_size_; // Number of characters plus blank void* gpu_workspace; int blank_; + float fastemit_lambda_; int num_threads_; CUstream stream_; @@ -195,9 +196,16 @@ GpuRNNT::compute_cost_and_score(const ProbT* const acts, start = std::chrono::high_resolution_clock::now(); #endif // TODO optimize gradient kernel - compute_grad_kernel<128, ProbT><<>>(grads, - acts, denom, alphas, betas, llForward, input_lengths, label_lengths, labels, - minibatch_, maxT_, maxU_, alphabet_size_, blank_); + + if (fastemit_lambda_ > 0.0f) { + compute_fastemit_grad_kernel<128, ProbT><<>>(grads, + acts, denom, alphas, betas, llForward, input_lengths, label_lengths, labels, + minibatch_, maxT_, maxU_, alphabet_size_, blank_, fastemit_lambda_); + } else { + compute_grad_kernel<128, ProbT><<>>(grads, + acts, denom, alphas, betas, llForward, input_lengths, label_lengths, labels, + minibatch_, maxT_, maxU_, alphabet_size_, blank_); + } #if defined(DEBUG_TIME) cudaStreamSynchronize(stream_); end = std::chrono::high_resolution_clock::now(); diff --git a/include/detail/gpu_rnnt_kernel.h b/include/detail/gpu_rnnt_kernel.h index 738d7ef..ed0b0e6 100644 --- a/include/detail/gpu_rnnt_kernel.h +++ b/include/detail/gpu_rnnt_kernel.h @@ -177,3 +177,50 @@ __global__ void compute_grad_kernel(Tp* grads, const Tp* const acts, const Tp* c } } } + +template +__global__ void compute_fastemit_grad_kernel(Tp* grads, const Tp* const acts, const Tp* const denom, const Tp* alphas, const Tp* betas, const Tp* const logll, const int* const xlen, const int* const ylen, + const int* const mlabels, const int minibatch, const int maxT, const int maxU, const int alphabet_size, const int blank_, const Tp fastemit_lambda) { + int tid = threadIdx.x; // alphabet dim + int idx = tid; + int col = blockIdx.x; // mb, t, u + + int u = col % maxU; + int bt = (col - u) / maxU; + int t = bt % maxT; + int mb = (bt - t) / maxT; + + const int T = xlen[mb]; + const int U = ylen[mb] + 1; + const int* labels = mlabels + mb * (maxU - 1); + + if (t < T && u < U) { + while (idx < alphabet_size) { + Tp logpk = denom[col] + acts[col * alphabet_size + idx]; + // Tp logpk = logp(denom, acts, maxT, maxU, alphabet_size, mb, t, u, idx); + Tp grad = exp(alphas[col] + betas[col] + logpk - logll[mb]); + + Tp logy_btu1 = rnnt_helper::neg_inf(); // log(y(t,u)) + log(beta(t, u+1)) + if (u < U-1) { + logy_btu1 = denom[col] + acts[col * alphabet_size + labels[u]] + betas[col+1]; + } + grad += fastemit_lambda * exp(alphas[col] + logy_btu1 + logpk - logll[mb]); + + // grad to last blank transition + if (idx == blank_ && t == T-1 && u == U-1) { + grad -= exp(alphas[col] + logpk - logll[mb]); + grad -= fastemit_lambda * exp(alphas[col] + logy_btu1 + logpk - logll[mb]); + } + if (idx == blank_ && t < T-1) { + grad -= exp(alphas[col] + logpk - logll[mb] + betas[col + maxU]); + } + if (u < U-1 && idx == labels[u]) { + grad -= exp(alphas[col] + logpk - logll[mb] + betas[col+1]); + grad -= fastemit_lambda * exp(alphas[col] + logy_btu1 - logll[mb]); + } + grads[col * alphabet_size + idx] = grad; + + idx += NT; + } + } +} diff --git a/include/rnnt.h b/include/rnnt.h index 4759e0a..207065f 100644 --- a/include/rnnt.h +++ b/include/rnnt.h @@ -61,6 +61,8 @@ struct rnntOptions { /// memory structure bool batch_first; + + float fastemit_lambda; }; /** Compute the RNN Transducer loss between a sequence diff --git a/mxnet_binding b/mxnet_binding deleted file mode 160000 index 75874b2..0000000 --- a/mxnet_binding +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 75874b21a476dd9f9e5480841f0872f9d7c2723e diff --git a/pytorch_binding/README.md b/pytorch_binding/README.md index cce44e7..4be1252 100644 --- a/pytorch_binding/README.md +++ b/pytorch_binding/README.md @@ -10,12 +10,14 @@ Install [PyTorch](https://github.com/pytorch/pytorch#installation). (i.e. `libwarprnnt.so`). This defaults to `../build`, so from within a new warp-transducer clone you could build WarpRNNT like this: + ```bash git clone https://github.com/HawkAaron/warp-transducer cd warp-transducer mkdir build; cd build cmake .. make +sudo make install # optional ``` Otherwise, set `WARP_RNNT_PATH` to wherever you have `libwarprnnt.so` @@ -30,7 +32,7 @@ export CUDA_HOME="/usr/local/cuda" Now install the bindings: (Please make sure the GCC version >= 4.9) ``` cd pytorch_binding -python setup.py install +pip install . # pip install . --break-package-system ``` If you try the above and get a dlopen error on OSX with anaconda3 (as recommended by pytorch): @@ -89,3 +91,33 @@ forward(acts, labels, act_lens, label_lens): label_lens: Tensor of (batch) containing label length of each example """ ``` + +## Troubleshooting: `cuda_runtime_api.h: No such file or directory` + +If you encounter an error like this during installation: + +` +fatal error: cuda_runtime_api.h: No such file or directory +` + +This usually means the CUDA headers can't be found by the compiler. If your system has CUDA installed (e.g., version 12.9) and the file `cuda_runtime_api.h` exists somewhere like: + +` +/usr/local/cuda-12.9/targets/x86_64-linux/include/cuda_runtime_api.h +` + +Then you can fix this by manually setting the environment variables to point to the correct CUDA paths: + +```bash +export CUDA_HOME=/usr/local/cuda-12.9 +export CFLAGS="-I$CUDA_HOME/targets/x86_64-linux/include" +export LDFLAGS="-L$CUDA_HOME/targets/x86_64-linux/lib" +export PATH="$CUDA_HOME/bin:$PATH" +export LD_LIBRARY_PATH="$CUDA_HOME/lib64:$LD_LIBRARY_PATH" +``` + +Then run the installation again: + +```bash +pip install . +``` diff --git a/pytorch_binding/setup.py b/pytorch_binding/setup.py index ad960ea..af8c094 100644 --- a/pytorch_binding/setup.py +++ b/pytorch_binding/setup.py @@ -1,4 +1,4 @@ -from distutils.version import LooseVersion +from packaging.version import Version import os import platform import sys @@ -8,7 +8,9 @@ extra_compile_args = ['-fPIC'] -if LooseVersion(torch.__version__) >= LooseVersion("1.5.0"): +if Version(torch.__version__) >= Version("2.1.0"): + extra_compile_args += ['-std=c++17'] +elif Version(torch.__version__) >= Version("1.5.0"): extra_compile_args += ['-std=c++14'] else: extra_compile_args += ['-std=c++11'] @@ -39,12 +41,15 @@ setup( name='warprnnt_pytorch', - version="0.1", + version="0.2.0", description="PyTorch wrapper for RNN-Transducer", url="https://github.com/HawkAaron/warp-transducer", author="Mingkun Huang", author_email="mingkunhuang95@gmail.com", packages=find_packages(), + install_requires=[ + 'packaging' + ], ext_modules=[ CppExtension( name='warprnnt_pytorch.warp_rnnt', diff --git a/pytorch_binding/src/binding.cpp b/pytorch_binding/src/binding.cpp index 7aaf000..95f5ff8 100644 --- a/pytorch_binding/src/binding.cpp +++ b/pytorch_binding/src/binding.cpp @@ -5,8 +5,7 @@ #include "rnnt.h" #ifdef WARPRNNT_ENABLE_GPU - #include "THC.h" - extern THCState* state; + #include "c10/cuda/CUDACachingAllocator.h" #endif int cpu_rnnt(torch::Tensor acts, @@ -16,6 +15,7 @@ int cpu_rnnt(torch::Tensor acts, torch::Tensor costs, torch::Tensor grads, int blank_label, + float fastemit_lambda, int num_threads) { int maxT = acts.size(0); @@ -23,16 +23,17 @@ int cpu_rnnt(torch::Tensor acts, int minibatch_size = acts.size(2); int alphabet_size = acts.size(3); - if (true) { - minibatch_size = acts.size(0); - maxT = acts.size(1); - maxU = acts.size(2); - } + if (true) { + minibatch_size = acts.size(0); + maxT = acts.size(1); + maxU = acts.size(2); + } rnntOptions options; memset(&options, 0, sizeof(options)); options.maxT = maxT; options.maxU = maxU; + options.fastemit_lambda = fastemit_lambda; options.blank_label = blank_label; options.batch_first = true; options.loc = RNNT_CPU; @@ -43,33 +44,33 @@ int cpu_rnnt(torch::Tensor acts, #endif size_t cpu_size_bytes = 0; - switch (acts.type().scalarType()) { - case torch::ScalarType::Float: + switch (acts.scalar_type()) { + case torch::kFloat: { get_workspace_size(maxT, maxU, minibatch_size, false, &cpu_size_bytes); float* cpu_workspace = (float*) new unsigned char[cpu_size_bytes]; - compute_rnnt_loss(acts.data(), grads.data(), - labels.data(), label_lengths.data(), - input_lengths.data(), alphabet_size, - minibatch_size, costs.data(), + compute_rnnt_loss(acts.data_ptr(), grads.data_ptr(), + labels.data_ptr(), label_lengths.data_ptr(), + input_lengths.data_ptr(), alphabet_size, + minibatch_size, costs.data_ptr(), cpu_workspace, options); delete cpu_workspace; return 0; } - case torch::ScalarType::Double: + case torch::kDouble: { get_workspace_size(maxT, maxU, minibatch_size, false, &cpu_size_bytes, sizeof(double)); double* cpu_workspace = (double*) new unsigned char[cpu_size_bytes]; - compute_rnnt_loss_fp64(acts.data(), grads.data(), - labels.data(), label_lengths.data(), - input_lengths.data(), alphabet_size, - minibatch_size, costs.data(), + compute_rnnt_loss_fp64(acts.data_ptr(), grads.data_ptr(), + labels.data_ptr(), label_lengths.data_ptr(), + input_lengths.data_ptr(), alphabet_size, + minibatch_size, costs.data_ptr(), cpu_workspace, options); delete cpu_workspace; @@ -88,6 +89,7 @@ int gpu_rnnt(torch::Tensor acts, torch::Tensor costs, torch::Tensor grads, int blank_label, + float fastemit_lambda, int num_threads) { int minibatch_size = acts.size(0); @@ -102,14 +104,15 @@ int gpu_rnnt(torch::Tensor acts, options.blank_label = blank_label; options.loc = RNNT_GPU; options.stream = at::cuda::getCurrentCUDAStream(); + options.fastemit_lambda = fastemit_lambda; options.num_threads = num_threads; #if defined(RNNT_DISABLE_OMP) || defined(APPLE) // have to use at least one options.num_threads = std::max(options.num_threads, (unsigned int) 1); #endif - switch (acts.type().scalarType()) { - case torch::ScalarType::Float: + switch (acts.scalar_type()) { + case torch::kFloat: { size_t gpu_size_bytes; get_workspace_size(maxT, maxU, minibatch_size, @@ -117,18 +120,22 @@ int gpu_rnnt(torch::Tensor acts, cudaSetDevice(acts.get_device()); - void* gpu_workspace = THCudaMalloc(state, gpu_size_bytes); + void* gpu_workspace = c10::cuda::CUDACachingAllocator::raw_alloc(gpu_size_bytes); + if (gpu_workspace == nullptr) { + std::cerr << __FILE__ << ':' << __LINE__ << ": " << "failed to allocate GPU workspace" << std::endl; + return -1; + } - compute_rnnt_loss(acts.data(), grads.data(), - labels.data(), label_lengths.data(), - input_lengths.data(), alphabet_size, - minibatch_size, costs.data(), + compute_rnnt_loss(acts.data_ptr(), grads.data_ptr(), + labels.data_ptr(), label_lengths.data_ptr(), + input_lengths.data_ptr(), alphabet_size, + minibatch_size, costs.data_ptr(), gpu_workspace, options); - THCudaFree(state, gpu_workspace); + c10::cuda::CUDACachingAllocator::raw_delete(gpu_workspace); return 0; } - case torch::ScalarType::Double: + case torch::kDouble: { size_t gpu_size_bytes; get_workspace_size(maxT, maxU, minibatch_size, @@ -136,15 +143,19 @@ int gpu_rnnt(torch::Tensor acts, cudaSetDevice(acts.get_device()); - void* gpu_workspace = THCudaMalloc(state, gpu_size_bytes); + void* gpu_workspace = c10::cuda::CUDACachingAllocator::raw_alloc(gpu_size_bytes); + if (gpu_workspace == nullptr) { + std::cerr << __FILE__ << ':' << __LINE__ << ": " << "failed to allocate GPU workspace" << std::endl; + return -1; + } - compute_rnnt_loss_fp64(acts.data(), grads.data(), - labels.data(), label_lengths.data(), - input_lengths.data(), alphabet_size, - minibatch_size, costs.data(), + compute_rnnt_loss_fp64(acts.data_ptr(), grads.data_ptr(), + labels.data_ptr(), label_lengths.data_ptr(), + input_lengths.data_ptr(), alphabet_size, + minibatch_size, costs.data_ptr(), gpu_workspace, options); - THCudaFree(state, gpu_workspace); + c10::cuda::CUDACachingAllocator::raw_delete(gpu_workspace); return 0; } default: diff --git a/pytorch_binding/test/test.py b/pytorch_binding/test/test.py index 14feb09..c378ed0 100644 --- a/pytorch_binding/test/test.py +++ b/pytorch_binding/test/test.py @@ -23,28 +23,24 @@ fn = rnntloss() if args.np else RNNTLoss(reduction='sum') -gpu = 1 +#acts = autograd.Variable(acts, requires_grad=True) def wrap_and_call(acts, labels): acts = torch.FloatTensor(acts) - if use_cuda: - acts = acts.cuda(gpu) - #acts = autograd.Variable(acts, requires_grad=True) + device = torch.device("cuda" if use_cuda else "cpu") + acts = acts.to(device) acts.requires_grad = True lengths = [acts.shape[1]] * acts.shape[0] label_lengths = [len(l) for l in labels] - labels = torch.IntTensor(labels) - lengths = torch.IntTensor(lengths) - label_lengths = torch.IntTensor(label_lengths) - if use_cuda: - labels = labels.cuda(gpu) - lengths = lengths.cuda(gpu) - label_lengths = label_lengths.cuda(gpu) + + labels = torch.IntTensor(labels).to(device) + lengths = torch.IntTensor(lengths).to(device) + label_lengths = torch.IntTensor(label_lengths).to(device) costs = fn(acts, labels, lengths, label_lengths) cost = torch.sum(costs) cost.backward() - # print(repr(acts.grad.data.cpu().numpy())) + return costs.data.cpu().numpy(), acts.grad.data.cpu().numpy() diff --git a/pytorch_binding/warprnnt_pytorch/__init__.py b/pytorch_binding/warprnnt_pytorch/__init__.py index 76c6c30..8f1d261 100644 --- a/pytorch_binding/warprnnt_pytorch/__init__.py +++ b/pytorch_binding/warprnnt_pytorch/__init__.py @@ -9,12 +9,13 @@ class _RNNT(Function): @staticmethod - def forward(ctx, acts, labels, act_lens, label_lens, blank, reduction): + def forward(ctx, acts, labels, act_lens, label_lens, blank, reduction, fastemit_lambda): """ acts: Tensor of (batch x seqLength x labelLength x outputDim) containing output from network labels: 2 dimensional Tensor containing all the targets of the batch with zero padded act_lens: Tensor of size (batch) containing size of each output sequence from the network label_lens: Tensor of (batch) containing label length of each example + fastemit_lambda: Regularization parameter for FastEmit (https://arxiv.org/pdf/2010.11148.pdf) """ is_cuda = acts.is_cuda @@ -31,6 +32,7 @@ def forward(ctx, acts, labels, act_lens, label_lens, blank, reduction): costs, grads, blank, + fastemit_lambda, 0) if reduction in ['sum', 'mean']: @@ -47,10 +49,10 @@ def forward(ctx, acts, labels, act_lens, label_lens, blank, reduction): @staticmethod def backward(ctx, grad_output): grad_output = grad_output.view(-1, 1, 1, 1).to(ctx.grads) - return ctx.grads.mul_(grad_output), None, None, None, None, None + return ctx.grads.mul_(grad_output), None, None, None, None, None, None -def rnnt_loss(acts, labels, act_lens, label_lens, blank=0, reduction='mean'): +def rnnt_loss(acts, labels, act_lens, label_lens, blank=0, reduction='mean', fastemit_lambda=0.001): """ RNN Transducer Loss Args: @@ -63,11 +65,12 @@ def rnnt_loss(acts, labels, act_lens, label_lens, blank=0, reduction='mean'): 'none' | 'mean' | 'sum'. 'none': no reduction will be applied, 'mean': the output losses will be divided by the target lengths and then the mean over the batch is taken. Default: 'mean' + fastemit_lambda: Regularization parameter for FastEmit (https://arxiv.org/pdf/2010.11148.pdf) """ if not acts.is_cuda: acts = torch.nn.functional.log_softmax(acts, -1) - return _RNNT.apply(acts, labels, act_lens, label_lens, blank, reduction) + return _RNNT.apply(acts, labels, act_lens, label_lens, blank, reduction, fastemit_lambda) class RNNTLoss(Module): @@ -79,10 +82,11 @@ class RNNTLoss(Module): 'mean': the output losses will be divided by the target lengths and then the mean over the batch is taken. Default: 'mean' """ - def __init__(self, blank=0, reduction='mean'): + def __init__(self, blank=0, fastemit_lambda=0.001, reduction='mean'): super(RNNTLoss, self).__init__() self.blank = blank self.reduction = reduction + self.fastemit_lambda = fastemit_lambda self.loss = _RNNT.apply def forward(self, acts, labels, act_lens, label_lens): @@ -97,7 +101,7 @@ def forward(self, acts, labels, act_lens, label_lens): # log_softmax is computed within GPU version. acts = torch.nn.functional.log_softmax(acts, -1) - return self.loss(acts, labels, act_lens, label_lens, self.blank, self.reduction) + return self.loss(acts, labels, act_lens, label_lens, self.blank, self.reduction, self.fastemit_lambda) def check_type(var, t, name): diff --git a/src/rnnt_entrypoint.cpp b/src/rnnt_entrypoint.cpp deleted file mode 100644 index 0d742d8..0000000 --- a/src/rnnt_entrypoint.cpp +++ /dev/null @@ -1,187 +0,0 @@ -#include -#include -#include - -#include - -#include "detail/cpu_rnnt.h" -#ifdef __CUDACC__ - #include "detail/gpu_rnnt.h" -#endif - -extern "C" { - -int get_warprnnt_version() { - return 1; -} - -const char* rnntGetStatusString(rnntStatus_t status) { - switch (status) { - case RNNT_STATUS_SUCCESS: - return "no error"; - case RNNT_STATUS_MEMOPS_FAILED: - return "cuda memcpy or memset failed"; - case RNNT_STATUS_INVALID_VALUE: - return "invalid value"; - case RNNT_STATUS_EXECUTION_FAILED: - return "execution failed"; - - case RNNT_STATUS_UNKNOWN_ERROR: - default: - return "unknown error"; - - } - -} - - -rnntStatus_t compute_rnnt_loss(const float* const activations, //BTUV - float* gradients, - const int* const flat_labels, - const int* const label_lengths, - const int* const input_lengths, - int alphabet_size, - int minibatch, - float *costs, - void *workspace, - rnntOptions options) { - - if (activations == nullptr || - flat_labels == nullptr || - label_lengths == nullptr || - input_lengths == nullptr || - costs == nullptr || - workspace == nullptr || - alphabet_size <= 0 || - minibatch <= 0 || - options.maxT <= 0 || - options.maxU <= 0) - return RNNT_STATUS_INVALID_VALUE; - - if (options.loc == RNNT_CPU) { - CpuRNNT rnnt(minibatch, options.maxT, options.maxU, alphabet_size, workspace, - options.blank_label, options.num_threads, options.batch_first); - - if (gradients != NULL) - return rnnt.cost_and_grad(activations, gradients, - costs, - flat_labels, label_lengths, - input_lengths); - else - return rnnt.score_forward(activations, costs, flat_labels, - label_lengths, input_lengths); - } else if (options.loc == RNNT_GPU) { -#ifdef __CUDACC__ - GpuRNNT rnnt(minibatch, options.maxT, options.maxU, alphabet_size, workspace, - options.blank_label, options.num_threads, options.stream); - - if (gradients != NULL) - return rnnt.cost_and_grad(activations, gradients, - costs, - flat_labels, label_lengths, - input_lengths); - else - return rnnt.score_forward(activations, costs, flat_labels, - label_lengths, input_lengths); -#else - std::cerr << "GPU execution requested, but not compiled with GPU support" << std::endl; - return RNNT_STATUS_EXECUTION_FAILED; -#endif - } else { - return RNNT_STATUS_INVALID_VALUE; - } -} - - -rnntStatus_t get_workspace_size(int maxT, int maxU, - int minibatch, - bool gpu, - size_t* size_bytes, - size_t dtype_size) -{ - if (minibatch <= 0 || - maxT <= 0 || - maxU <= 0) - return RNNT_STATUS_INVALID_VALUE; - - *size_bytes = 0; - - // per minibatch memory - size_t per_minibatch_bytes = 0; - - // alphas & betas - per_minibatch_bytes += dtype_size * maxT * maxU * 2; - - if (!gpu) { - // blank & label log probability cache - per_minibatch_bytes += dtype_size * maxT * maxU * 2; - } else { - // softmax denominator - per_minibatch_bytes += dtype_size * maxT * maxU; - // forward-backward loglikelihood - per_minibatch_bytes += dtype_size * 2; - } - - *size_bytes = per_minibatch_bytes * minibatch; - - return RNNT_STATUS_SUCCESS; -} - -rnntStatus_t compute_rnnt_loss_fp64(const double* const activations, //BTUV - double* gradients, - const int* const flat_labels, - const int* const label_lengths, - const int* const input_lengths, - int alphabet_size, - int minibatch, - double *costs, - void *workspace, - rnntOptions options) { - - if (activations == nullptr || - flat_labels == nullptr || - label_lengths == nullptr || - input_lengths == nullptr || - costs == nullptr || - workspace == nullptr || - alphabet_size <= 0 || - minibatch <= 0 || - options.maxT <= 0 || - options.maxU <= 0) - return RNNT_STATUS_INVALID_VALUE; - - if (options.loc == RNNT_CPU) { - CpuRNNT rnnt(minibatch, options.maxT, options.maxU, alphabet_size, workspace, - options.blank_label, options.num_threads, options.batch_first); - - if (gradients != NULL) - return rnnt.cost_and_grad(activations, gradients, - costs, - flat_labels, label_lengths, - input_lengths); - else - return rnnt.score_forward(activations, costs, flat_labels, - label_lengths, input_lengths); - } else if (options.loc == RNNT_GPU) { -#ifdef __CUDACC__ - GpuRNNT rnnt(minibatch, options.maxT, options.maxU, alphabet_size, workspace, - options.blank_label, options.num_threads, options.stream); - - if (gradients != NULL) - return rnnt.cost_and_grad(activations, gradients, - costs, - flat_labels, label_lengths, - input_lengths); - else - return rnnt.score_forward(activations, costs, flat_labels, - label_lengths, input_lengths); -#else - std::cerr << "GPU execution requested, but not compiled with GPU support" << std::endl; - return RNNT_STATUS_EXECUTION_FAILED; -#endif - } else { - return RNNT_STATUS_INVALID_VALUE; - } -} - -} diff --git a/src/rnnt_entrypoint.cu b/src/rnnt_entrypoint.cu deleted file mode 120000 index c483c3c..0000000 --- a/src/rnnt_entrypoint.cu +++ /dev/null @@ -1 +0,0 @@ -rnnt_entrypoint.cpp \ No newline at end of file diff --git a/src/rnnt_entrypoint.cu b/src/rnnt_entrypoint.cu new file mode 100644 index 0000000..c213f4c --- /dev/null +++ b/src/rnnt_entrypoint.cu @@ -0,0 +1,197 @@ +#include +#include +#include + +#include + +#include "detail/cpu_rnnt.h" +#ifdef __CUDACC__ + #include "detail/gpu_rnnt.h" +#endif + +extern "C" { + +int get_warprnnt_version() { + return 1; +} + +const char* rnntGetStatusString(rnntStatus_t status) { + switch (status) { + case RNNT_STATUS_SUCCESS: + return "no error"; + case RNNT_STATUS_MEMOPS_FAILED: + return "cuda memcpy or memset failed"; + case RNNT_STATUS_INVALID_VALUE: + return "invalid value"; + case RNNT_STATUS_EXECUTION_FAILED: + return "execution failed"; + + case RNNT_STATUS_UNKNOWN_ERROR: + default: + return "unknown error"; + + } + +} + + +rnntStatus_t compute_rnnt_loss(const float* const activations, //BTUV + float* gradients, + const int* const flat_labels, + const int* const label_lengths, + const int* const input_lengths, + int alphabet_size, + int minibatch, + float *costs, + void *workspace, + rnntOptions options) { + + if (activations == nullptr || + flat_labels == nullptr || + label_lengths == nullptr || + input_lengths == nullptr || + costs == nullptr || + workspace == nullptr || + alphabet_size <= 0 || + minibatch <= 0 || + options.maxT <= 0 || + options.maxU <= 0 || + options.fastemit_lambda < 0) + return RNNT_STATUS_INVALID_VALUE; + + if (options.loc == RNNT_CPU) { + CpuRNNT rnnt(minibatch, options.maxT, options.maxU, alphabet_size, workspace, + options.blank_label, options.fastemit_lambda, options.num_threads, options.batch_first); + + if (gradients != NULL) + return rnnt.cost_and_grad(activations, gradients, + costs, + flat_labels, label_lengths, + input_lengths); + else + return rnnt.score_forward(activations, costs, flat_labels, + label_lengths, input_lengths); + } else if (options.loc == RNNT_GPU) { +#ifdef __CUDACC__ + GpuRNNT rnnt(minibatch, options.maxT, options.maxU, alphabet_size, workspace, + options.blank_label, options.fastemit_lambda, options.num_threads, options.stream); + + if (gradients != NULL) + return rnnt.cost_and_grad(activations, gradients, + costs, + flat_labels, label_lengths, + input_lengths); + else + return rnnt.score_forward(activations, costs, flat_labels, + label_lengths, input_lengths); +#else + std::cerr << "GPU execution requested, but not compiled with GPU support" << std::endl; + return RNNT_STATUS_EXECUTION_FAILED; +#endif + } else { + return RNNT_STATUS_INVALID_VALUE; + } +} + + +rnntStatus_t get_workspace_size(int maxT, int maxU, + int minibatch, + bool gpu, + size_t* size_bytes, + size_t dtype_size) +{ + if (minibatch <= 0 || + maxT <= 0 || + maxU <= 0) + return RNNT_STATUS_INVALID_VALUE; + + *size_bytes = 0; + + // per minibatch memory + size_t per_minibatch_bytes = 0; + + // alphas & betas + per_minibatch_bytes += dtype_size * maxT * maxU * 2; + + if (!gpu) { + // blank & label log probability cache + per_minibatch_bytes += dtype_size * maxT * maxU * 2; + } else { + // softmax denominator + per_minibatch_bytes += dtype_size * maxT * maxU; + // forward-backward loglikelihood + per_minibatch_bytes += dtype_size * 2; + } + + *size_bytes = per_minibatch_bytes * minibatch; + + return RNNT_STATUS_SUCCESS; +} + +rnntStatus_t compute_rnnt_loss_fp64(const double* const activations, //BTUV + double* gradients, + const int* const flat_labels, + const int* const label_lengths, + const int* const input_lengths, + int alphabet_size, + int minibatch, + double *costs, + void *workspace, + rnntOptions options) { + + if (activations == nullptr || + flat_labels == nullptr || + label_lengths == nullptr || + input_lengths == nullptr || + costs == nullptr || + workspace == nullptr || + alphabet_size <= 0 || + minibatch <= 0 || + options.maxT <= 0 || + options.maxU <= 0 || + options.fastemit_lambda < 0) + return RNNT_STATUS_INVALID_VALUE; + + if (options.loc == RNNT_CPU) { + CpuRNNT rnnt(minibatch, options.maxT, options.maxU, alphabet_size, workspace, + options.blank_label, options.fastemit_lambda, options.num_threads, options.batch_first); + + if (gradients != NULL) + return rnnt.cost_and_grad(activations, gradients, + costs, + flat_labels, label_lengths, + input_lengths); + else + return rnnt.score_forward(activations, costs, flat_labels, + label_lengths, input_lengths); + } else if (options.loc == RNNT_GPU) { +#ifdef __CUDACC__ + GpuRNNT rnnt(minibatch, options.maxT, options.maxU, alphabet_size, workspace, + options.blank_label, options.fastemit_lambda, options.num_threads, options.stream); + + if (gradients != NULL) + return rnnt.cost_and_grad(activations, gradients, + costs, + flat_labels, label_lengths, + input_lengths); + else + return rnnt.score_forward(activations, costs, flat_labels, + label_lengths, input_lengths); +#else + std::cerr << "GPU execution requested, but not compiled with GPU support" << std::endl; + return RNNT_STATUS_EXECUTION_FAILED; +#endif + } else { + return RNNT_STATUS_INVALID_VALUE; + } +} + +const char* rnnt_backend_mode() { +#ifdef __CUDACC__ + return "GPU"; +#else + return "CPU"; +#endif +} + +} diff --git a/tensorflow_binding/README.md b/tensorflow_binding/README.md deleted file mode 100644 index b35ae4c..0000000 --- a/tensorflow_binding/README.md +++ /dev/null @@ -1,97 +0,0 @@ -# TensorFlow binding for WarpRNNT - -This package provides TensorFlow kernels that wrap the WarpRNNT -library. - -## Installation - -To build the kernels it is necessary to have the TensorFlow source -code available, since TensorFlow doesn't currently install the -necessary headers to handle the SparseTensor that the CTCLoss op uses -to input the labels. You can retrieve the TensorFlow source from -github.com: - -```bash -git clone https://github.com/tensorflow/tensorflow.git -``` - -`WARP_RNNT_PATH` should be set to the location of a built WarpRNNT -(i.e. `libwarprnnt.so`). This defaults to `../build`, so from within a -new warp-rnnt clone you could build WarpRNNT like this: - -```bash -mkdir build; cd build -cmake -DCUDA_TOOLKIT_ROOT_DIR=/path/to/cuda .. -make -``` - -Otherwise, set `WARP_RNNT_PATH` to wherever you have `libwarprnnt.so` -installed. If you have a GPU, you should also make sure that -`CUDA_HOME` is set to the home cuda directory (i.e. where -`include/cuda.h` and `lib/libcudart.so` live). - -You should now be able to use `setup.py` to install the package into -your current Python environment: - -```bash -CUDA=/path/to/cuda python setup.py install -``` - -You can run a few unit tests with `setup.py` as well if you want: - -```bash -python setup.py test -``` - -## Using the kernels - -First import the module: - -```python -import warprnnt_tensorflow -``` - -The WarpRNNT op is available via the `warprnnt_tensorflow.rnnt_loss` function: - -```python -costs = warprnnt_tensorflow.rnnt_loss(acts, labels, input_lengths, label_lengths) -``` - -The `acts` is a 4 dimensional Tensor, `labels` -is 2 dimensinal Tensor, and all the others are single dimension Tensors. -See the main WarpRNNT documentation for more information. - -## Python interface -```python -def rnnt_loss(acts, labels, input_lengths, label_lengths, blank_label=0): - '''Computes the RNNT loss between a sequence of activations and a - ground truth labeling. - Args: - acts: A 4-D Tensor of floats. The dimensions - should be (B, T, U+1, V), where B is the minibatch index, - T is the time index, U is the label sequence - length (+1 means blank label prepanded), - and V indexes over activations for each - symbol in the alphabet. - labels: A 2-D Tensor of ints, a padded label sequences to make sure - labels for the minibatch are same length. - input_lengths: A 1-D Tensor of ints, the number of time steps - for each sequence in the minibatch. - label_lengths: A 1-D Tensor of ints, the length of each label - for each example in the minibatch. - blank_label: int, the label value/index that the RNNT - calculation should use as the blank label - Returns: - 1-D float Tensor, the cost of each example in the minibatch - (as negative log probabilities). - * This class performs the softmax operation internally. - * The label reserved for the blank symbol should be label 0. - ''' -``` diff --git a/tensorflow_binding/setup.py b/tensorflow_binding/setup.py deleted file mode 100644 index 170f4d6..0000000 --- a/tensorflow_binding/setup.py +++ /dev/null @@ -1,149 +0,0 @@ -"""setup.py script for warp-rnnt TensorFlow wrapper""" - -from __future__ import print_function - -import os -import platform -import re -import warnings -import setuptools -import sys -import unittest -from setuptools.command.build_ext import build_ext as orig_build_ext -from distutils.version import LooseVersion - -# We need to import tensorflow to find where its include directory is. -try: - import tensorflow as tf -except ImportError: - raise RuntimeError("Tensorflow must be installed to build the tensorflow wrapper.") - -if "CUDA_HOME" not in os.environ: - print("CUDA_HOME not found in the environment so building " - "without GPU support. To build with GPU support " - "please define the CUDA_HOME environment variable. " - "This should be a path which contains include/cuda.h", - file=sys.stderr) - enable_gpu = False -else: - enable_gpu = True - -if platform.system() == 'Darwin': - lib_ext = ".dylib" -else: - lib_ext = ".so" - -warp_rnnt_path = "../build" -if "WARP_RNNT_PATH" in os.environ: - warp_rnnt_path = os.environ["WARP_RNNT_PATH"] -if not os.path.exists(os.path.join(warp_rnnt_path, "libwarprnnt"+lib_ext)): - print(("Could not find libwarprnnt.so in {}.\n" - "Build warp-rnnt and set WARP_RNNT_PATH to the location of" - " libwarprnnt.so (default is '../build')").format(warp_rnnt_path), - file=sys.stderr) - sys.exit(1) - -root_path = os.path.realpath(os.path.dirname(__file__)) - -tf_include = tf.sysconfig.get_include() -tf_src_dir = tf.sysconfig.get_lib() # os.environ["TENSORFLOW_SRC_PATH"] -tf_includes = [tf_include, tf_src_dir] -warp_rnnt_includes = [os.path.join(root_path, '../include')] -include_dirs = tf_includes + warp_rnnt_includes - -if LooseVersion(tf.__version__) >= LooseVersion('1.4'): - nsync_dir = '../../external/nsync/public' - if LooseVersion(tf.__version__) >= LooseVersion('1.10'): - nsync_dir = 'external/nsync/public' - include_dirs += [os.path.join(tf_include, nsync_dir)] - -if os.getenv("TF_CXX11_ABI") is not None: - TF_CXX11_ABI = os.getenv("TF_CXX11_ABI") -else: - warnings.warn("Assuming tensorflow was compiled without C++11 ABI. " - "It is generally true if you are using binary pip package. " - "If you compiled tensorflow from source with gcc >= 5 and didn't set " - "-D_GLIBCXX_USE_CXX11_ABI=0 during compilation, you need to set " - "environment variable TF_CXX11_ABI=1 when compiling this bindings. " - "Also be sure to touch some files in src to trigger recompilation. " - "Also, you need to set (or unsed) this environment variable if getting " - "undefined symbol: _ZN10tensorflow... errors") - TF_CXX11_ABI = "0" - -extra_compile_args = ['-std=c++11', '-fPIC', '-D_GLIBCXX_USE_CXX11_ABI=' + TF_CXX11_ABI] -# current tensorflow code triggers return type errors, silence those for now -extra_compile_args += ['-Wno-return-type'] -if LooseVersion(tf.__version__) >= LooseVersion('1.4'): - extra_compile_args += tf.sysconfig.get_compile_flags() - -extra_link_args = [] -if LooseVersion(tf.__version__) >= LooseVersion('1.4'): - extra_link_args += tf.sysconfig.get_link_flags() - -if (enable_gpu): - extra_compile_args += ['-DWARPRNNT_ENABLE_GPU'] - include_dirs += [os.path.join(os.environ["CUDA_HOME"], 'include')] - - # mimic tensorflow cuda include setup so that their include command work - if not os.path.exists(os.path.join(root_path, "include")): - os.mkdir(os.path.join(root_path, "include")) - - cuda_inc_path = os.path.join(root_path, "include/cuda") - if not os.path.exists(cuda_inc_path) or os.readlink(cuda_inc_path) != os.environ["CUDA_HOME"]: - if os.path.exists(cuda_inc_path): - os.remove(cuda_inc_path) - os.symlink(os.environ["CUDA_HOME"], cuda_inc_path) - include_dirs += [os.path.join(root_path, 'include')] - -# Ensure that all expected files and directories exist. -for loc in include_dirs: - if not os.path.exists(loc): - print(("Could not find file or directory {}.\n" - "Check your environment variables and paths?").format(loc), - file=sys.stderr) - sys.exit(1) - -lib_srcs = ['src/warprnnt_op.cc'] - -ext = setuptools.Extension('warprnnt_tensorflow.kernels', - sources = lib_srcs, - language = 'c++', - include_dirs = include_dirs, - library_dirs = [warp_rnnt_path], - runtime_library_dirs = [os.path.realpath(warp_rnnt_path)], - libraries = ['warprnnt'], - extra_compile_args = extra_compile_args, - extra_link_args = extra_link_args) - -class build_tf_ext(orig_build_ext): - def build_extensions(self): - if LooseVersion(tf.__version__) < LooseVersion('1.4'): - self.compiler.compiler_so.remove('-Wstrict-prototypes') - orig_build_ext.build_extensions(self) - -def discover_test_suite(): - test_loader = unittest.TestLoader() - test_suite = test_loader.discover('tests', pattern='test_*.py') - return test_suite - -# Read the README.md file for the long description. This lets us avoid -# duplicating the package description in multiple places in the source. -README_PATH = os.path.join(os.path.dirname(__file__), "README.md") -with open(README_PATH, "r") as handle: - # Extract everything between the first set of ## headlines - LONG_DESCRIPTION = re.search("#.*([^#]*)##", handle.read()).group(1).strip() - -setuptools.setup( - name = "warprnnt_tensorflow", - version = "0.1", - description = "TensorFlow wrapper for warp-transducer", - url="https://github.com/HawkAaron/warp-transducer", - long_description = LONG_DESCRIPTION, - author = "Mingkun Huang", - author_email = "mingkunhuang95@gmail.com", - license = "Apache", - packages = ["warprnnt_tensorflow"], - ext_modules = [ext], - cmdclass = {'build_ext': build_tf_ext}, - test_suite = 'setup.discover_test_suite', -) diff --git a/tensorflow_binding/src/warprnnt_op.cc b/tensorflow_binding/src/warprnnt_op.cc deleted file mode 100644 index e3d2210..0000000 --- a/tensorflow_binding/src/warprnnt_op.cc +++ /dev/null @@ -1,191 +0,0 @@ -#ifdef WARPRNNT_ENABLE_GPU -#define EIGEN_USE_GPU -#include -#endif - -#include "tensorflow/core/framework/op.h" -#include "tensorflow/core/framework/op_kernel.h" -#include "tensorflow/core/framework/bounds_check.h" -#include "tensorflow/core/framework/allocator.h" -#include "rnnt.h" - - -REGISTER_OP("WarpRNNT") - .Input("acts: float32") - .Input("labels: int32") - .Input("input_lengths: int32") - .Input("label_lengths: int32") - .Attr("blank_label: int = 0") - .Output("costs: float32") - .Output("grads: float32"); - -namespace tf = tensorflow; - -namespace warp_rnnt { - -class WarpRNNTOpBase : public tf::OpKernel { - public: - explicit WarpRNNTOpBase(tf::OpKernelConstruction* ctx) : tf::OpKernel(ctx) { - OP_REQUIRES_OK(ctx, ctx->GetAttr("blank_label", &blank_label_)); - } - - void Compute(tf::OpKernelContext* ctx) override { - // Grab the input tensors - const tf::Tensor* acts; - const tf::Tensor* labels; - const tf::Tensor* label_lengths; - const tf::Tensor* input_lengths; - OP_REQUIRES_OK(ctx, ctx->input("acts", &acts)); - OP_REQUIRES_OK(ctx, ctx->input("labels", &labels)); - OP_REQUIRES_OK(ctx, ctx->input("label_lengths", &label_lengths)); - OP_REQUIRES_OK(ctx, ctx->input("input_lengths", &input_lengths)); - - OP_REQUIRES(ctx, acts->shape().dims() == 4, - tf::errors::InvalidArgument("acts is not a 4-Tensor")); - OP_REQUIRES(ctx, labels->shape().dims() == 2, - tf::errors::InvalidArgument("labels is not a 2-Tensor")); - OP_REQUIRES(ctx, tf::TensorShapeUtils::IsVector(label_lengths->shape()), - tf::errors::InvalidArgument("label_lengths is not a vector")); - OP_REQUIRES(ctx, tf::TensorShapeUtils::IsVector(input_lengths->shape()), - tf::errors::InvalidArgument("input_lengths is not a vector")); - - const auto& acts_shape = acts->shape(); - const auto batch_size = acts_shape.dim_size(0); - const auto max_time = acts_shape.dim_size(1); - const auto max_u = acts_shape.dim_size(2); - const auto num_classes_raw = acts_shape.dim_size(3); - - auto acts_t = acts->tensor(); - auto labels_t = labels->tensor(); - - OP_REQUIRES( - ctx, tf::FastBoundsCheck(num_classes_raw, std::numeric_limits::max()), - tf::errors::InvalidArgument("num_classes cannot exceed max int")); - const auto alphabet_size = static_cast(num_classes_raw); - - OP_REQUIRES( - ctx, batch_size == input_lengths->dim_size(0), - tf::errors::InvalidArgument("len(input_lengths) != batch_size. ", - "len(input_length): ", input_lengths->dim_size(0), - " batch_size: ", batch_size)); - auto input_lengths_t = input_lengths->vec(); - - OP_REQUIRES( - ctx, batch_size == label_lengths->dim_size(0), - tf::errors::InvalidArgument("len(label_lengths) != batch_size. ", - "len(label_length): ", label_lengths->dim_size(0), - " batch_size: ", batch_size)); - auto label_lengths_t = label_lengths->vec(); - - // TODO check that labels are in the alphabet? - // Refer to line 185, we know that - // Tensor input_lengths is in GPU, so cannot compare with CPU variable - //for (int b = 0; b < batch_size; b++) { - // OP_REQUIRES(ctx, input_lengths_t(b) <= max_time, - // tf::errors::InvalidArgument("input_lengths(", b, ") <= ", max_time)); - //} - - tf::Tensor* costs = nullptr; - OP_REQUIRES_OK(ctx, ctx->allocate_output("costs", input_lengths->shape(), &costs)); - auto costs_t = costs->vec(); - - tf::Tensor* grads = nullptr; - OP_REQUIRES_OK(ctx, ctx->allocate_output("grads", acts->shape(), &grads)); - set_zero(grads); - auto grads_t = grads->tensor(); - - auto options = create_options(ctx); - options.blank_label = blank_label_; - options.maxT = max_time; - options.maxU = max_u; - - size_t workspace_size_bytes; - bool use_gpu = false; - if(options.loc == RNNT_GPU) { - use_gpu = true; - } - auto warp_status = get_workspace_size(max_time, - max_u, - batch_size, - use_gpu, - &workspace_size_bytes); - - OP_REQUIRES(ctx, warp_status == RNNT_STATUS_SUCCESS, - tf::errors::Internal("warp_rnnt error in get_workspace_size: ", - rnntGetStatusString(warp_status))); - - auto workspace_shape = tf::TensorShape{static_cast(workspace_size_bytes)}; - tf::Tensor workspace; - OP_REQUIRES_OK(ctx, ctx->allocate_temp(tf::DT_UINT8, workspace_shape, &workspace)); - auto workspace_t = workspace.flat(); - - // compute RNNT - warp_status = compute_rnnt_loss(acts_t.data(), - grads_t.data(), - labels_t.data(), - label_lengths_t.data(), - input_lengths_t.data(), - alphabet_size, batch_size, - costs_t.data(), workspace_t.data(), options); - - OP_REQUIRES(ctx, warp_status == RNNT_STATUS_SUCCESS, - tf::errors::Internal("warp_rnnt error in compute_rnnt_loss: ", - rnntGetStatusString(warp_status))); - - } - private: - int blank_label_; - virtual void set_zero(tf::Tensor* t) = 0; - virtual rnntOptions create_options(tf::OpKernelContext* ctx) = 0; -}; - -class WarpRNNTOpCPU : public WarpRNNTOpBase { - public: - explicit WarpRNNTOpCPU(tf::OpKernelConstruction* ctx) : WarpRNNTOpBase(ctx) { - } - - private: - void set_zero(tf::Tensor* t) override { - t->flat().setZero(); - } - - rnntOptions create_options(tf::OpKernelContext* ctx) override { - auto options = rnntOptions{}; - options.loc = RNNT_CPU; - options.batch_first = true; - options.num_threads = ctx->device()->tensorflow_cpu_worker_threads()->num_threads; - return options; - } -}; - -REGISTER_KERNEL_BUILDER(Name("WarpRNNT").Device(::tensorflow::DEVICE_CPU), WarpRNNTOpCPU); - -#ifdef WARPRNNT_ENABLE_GPU - -class WarpRNNTOpGPU : public WarpRNNTOpBase { - public: - explicit WarpRNNTOpGPU(tf::OpKernelConstruction* ctx) : WarpRNNTOpBase(ctx) { - } - - private: - void set_zero(tf::Tensor* t) override { - // here is not need - // cudaMemset(t->flat().data(), 0, t->NumElements()*sizeof(float)); - } - - rnntOptions create_options(tf::OpKernelContext* ctx) override { - auto cuda_stream = ctx->eigen_device().stream(); - auto options = rnntOptions{}; - options.loc = RNNT_GPU; - options.stream = cuda_stream; - return options; - } -}; - -REGISTER_KERNEL_BUILDER(Name("WarpRNNT").Device(::tensorflow::DEVICE_GPU) - .HostMemory("costs"), - WarpRNNTOpGPU); -#undef EIGEN_USE_GPU -#endif - -} diff --git a/tensorflow_binding/tests/test_basic.py b/tensorflow_binding/tests/test_basic.py deleted file mode 100644 index d6d13e2..0000000 --- a/tensorflow_binding/tests/test_basic.py +++ /dev/null @@ -1,56 +0,0 @@ -import numpy as np -import tensorflow as tf -from warprnnt_tensorflow import rnnt_loss - -acts = tf.placeholder(tf.float32, [None, None, None, None]) -labels = tf.placeholder(tf.int32, [None, None]) -input_length = tf.placeholder(tf.int32, [None]) -label_length = tf.placeholder(tf.int32, [None]) - -B = 2; T = 4; U = 3; V = 3; blank = 0 - -logits = tf.nn.log_softmax(acts) -costs = rnnt_loss(logits, labels, input_length, label_length, blank) -grad = tf.gradients(costs, [acts]) - -a = np.array([[[[0.06535690384862791, 0.7875301411923206, 0.08159176605666074], - [0.5297155426466327, 0.7506749639230854, 0.7541348379087998], - [0.6097641124736383, 0.8681404965673826, 0.6225318186056529]], - - [[0.6685222872103057, 0.8580392805336061, 0.16453892311765583], - [0.989779515236694, 0.944298460961015, 0.6031678586829663], - [0.9467833543605416, 0.666202507295747, 0.28688179752461884]], - - [[0.09418426230195986, 0.3666735970751962, 0.736168049462793], - [0.1666804425271342, 0.7141542198635192, 0.3993997272216727], - [0.5359823524146038, 0.29182076440286386, 0.6126422611507932]], - - [[0.3242405528768486, 0.8007644367291621, 0.5241057606558068], - [0.779194617063042, 0.18331417220174862, 0.113745182072432], - [0.24022162381327106, 0.3394695622533106, 0.1341595066017014]]], - - [[[0.5055615569388828, 0.051597282072282646, 0.6402903936686337], - [0.43073311517251, 0.8294731834714112, 0.1774668847323424], - [0.3207001991262245, 0.04288308912457006, 0.30280282975568984]], - - [[0.6751777088333762, 0.569537369330242, 0.5584738347504452], - [0.08313242153985256, 0.06016544344162322, 0.10795752845152584], - [0.7486153608562472, 0.943918041459349, 0.4863558118797222]], - - [[0.4181986264486809, 0.6524078485043804, 0.024242983423721887], - [0.13458171554507403, 0.3663418070512402, 0.2958297395361563], - [0.9236695822497084, 0.6899291482654177, 0.7418981733448822]], - - [[0.25000547599982104, 0.6034295486281007, 0.9872887878887768], - [0.5926057265215715, 0.8846724004467684, 0.5434495396894328], - [0.6607698886038497, 0.3771277082495921, 0.3580209022231813]]]], dtype=np.float32) - -b = np.array([[1, 2], [1, 1]], dtype=np.int32) -c = np.array([4, 4], dtype=np.int32) -d = np.array([2, 2], dtype=np.int32) - -feed = {acts: a, labels: b, input_length: c, label_length: d} -with tf.Session() as sess: - cost, grads = sess.run([costs, grad], feed_dict=feed) - print(cost) - print(grads) \ No newline at end of file diff --git a/tensorflow_binding/tests/test_warprnnt_op.py b/tensorflow_binding/tests/test_warprnnt_op.py deleted file mode 100644 index f50d58e..0000000 --- a/tensorflow_binding/tests/test_warprnnt_op.py +++ /dev/null @@ -1,97 +0,0 @@ -import tensorflow as tf -import numpy as np -from warprnnt_tensorflow import rnnt_loss -from tensorflow.python.client import device_lib - -def is_gpu_available(): - """Returns whether Tensorflow can access a GPU.""" - return any(x.device_type == 'GPU' for x in device_lib.list_local_devices()) - -class WarpRNNTTest(tf.test.TestCase): - - def _run_rnnt(self, acts, labels, input_lengths, label_lengths, - expected_costs, expected_grads, blank, use_gpu=False): - self.assertEquals(acts.shape, expected_grads.shape) - acts_t = tf.constant(acts) - labels_t = tf.constant(labels) - input_lengths_t = tf.constant(input_lengths) - label_lengths_t = tf.constant(label_lengths) - - logits = acts_t if use_gpu else tf.nn.log_softmax(acts_t) - costs = rnnt_loss(logits, labels_t, input_lengths_t, label_lengths_t, blank) - - grads = tf.gradients(costs, [acts_t])[0] - - with self.test_session(use_gpu=use_gpu) as sess: - (tf_costs, tf_grad) = sess.run([costs, grads]) - self.assertAllClose(tf_costs, expected_costs, atol=1e-6) - self.assertAllClose(tf_grad, expected_grads, atol=1e-6) - - def test_forward(self): - # Softmax activations for the following inputs: - acts = np.array([0.1, 0.6, 0.1, 0.1, 0.1, 0.1, - 0.1, 0.6, 0.1, 0.1, 0.1, 0.1, - 0.2, 0.8, 0.1, 0.1, 0.6, 0.1, - 0.1, 0.1, 0.1, 0.1, 0.2, 0.1, - 0.1, 0.7, 0.1, 0.2, 0.1, 0.1], dtype=np.float32).reshape(1, 2, 3, 5) - - labels = np.array([[1, 2]], dtype=np.int32) - input_lengths = np.array([2], dtype=np.int32) - label_lengths = np.array([2], dtype=np.int32) - - acts_t = tf.constant(acts) - labels_t = tf.constant(labels) - input_lengths_t = tf.constant(input_lengths) - label_lengths_t = tf.constant(label_lengths) - acts_t = tf.nn.log_softmax(acts_t) # NOTE cpu - costs = rnnt_loss(acts_t, labels_t, input_lengths_t, label_lengths_t) - with self.test_session(): - print(costs.eval()) - - def _test_multiple_batches(self, use_gpu): - B = 2; T = 4; U = 3; V = 3 - - acts = np.array([0.065357, 0.787530, 0.081592, 0.529716, 0.750675, 0.754135, - 0.609764, 0.868140, 0.622532, 0.668522, 0.858039, 0.164539, - 0.989780, 0.944298, 0.603168, 0.946783, 0.666203, 0.286882, - 0.094184, 0.366674, 0.736168, 0.166680, 0.714154, 0.399400, - 0.535982, 0.291821, 0.612642, 0.324241, 0.800764, 0.524106, - 0.779195, 0.183314, 0.113745, 0.240222, 0.339470, 0.134160, - 0.505562, 0.051597, 0.640290, 0.430733, 0.829473, 0.177467, - 0.320700, 0.042883, 0.302803, 0.675178, 0.569537, 0.558474, - 0.083132, 0.060165, 0.107958, 0.748615, 0.943918, 0.486356, - 0.418199, 0.652408, 0.024243, 0.134582, 0.366342, 0.295830, - 0.923670, 0.689929, 0.741898, 0.250005, 0.603430, 0.987289, - 0.592606, 0.884672, 0.543450, 0.660770, 0.377128, 0.358021], dtype=np.float32).reshape(B, T, U, V); - - expected_costs = np.array([4.28065, 3.93844], dtype=np.float32) - expected_grads = np.array([-0.186844, -0.062555, 0.249399, -0.203377, 0.202399, 0.000977, - -0.141016, 0.079123, 0.061893, -0.011552, -0.081280, 0.092832, - -0.154257, 0.229433, -0.075176, -0.246593, 0.146405, 0.100188, - -0.012918, -0.061593, 0.074512, -0.055986, 0.219831, -0.163845, - -0.497627, 0.209240, 0.288387, 0.013605, -0.030220, 0.016615, - 0.113925, 0.062781, -0.176706, -0.667078, 0.367659, 0.299419, - -0.356344, -0.055347, 0.411691, -0.096922, 0.029459, 0.067463, - -0.063518, 0.027654, 0.035863, -0.154499, -0.073942, 0.228441, - -0.166790, -0.000088, 0.166878, -0.172370, 0.105565, 0.066804, - 0.023875, -0.118256, 0.094381, -0.104707, -0.108934, 0.213642, - -0.369844, 0.180118, 0.189726, 0.025714, -0.079462, 0.053748, - 0.122328, -0.238789, 0.116460, -0.598687, 0.302203, 0.296484], dtype=np.float32).reshape(B, T, U, V); - - labels = np.array([[1, 2], [1, 1]], dtype=np.int32) - input_lengths = np.array([4, 4], dtype=np.int32) - label_lengths = np.array([2, 2], dtype=np.int32) - - self._run_rnnt(acts, labels, input_lengths, label_lengths, expected_costs, expected_grads, 0, use_gpu) - - def test_multiple_batches_cpu(self): - self._test_multiple_batches(use_gpu=False) - - def test_multiple_batches_gpu(self): - if (is_gpu_available()): - self._test_multiple_batches(use_gpu=True) - else: - print('Skipping GPU test, no gpus available') - -if __name__ == '__main__': - tf.test.main() diff --git a/tensorflow_binding/warprnnt_tensorflow/__init__.py b/tensorflow_binding/warprnnt_tensorflow/__init__.py deleted file mode 100644 index de45611..0000000 --- a/tensorflow_binding/warprnnt_tensorflow/__init__.py +++ /dev/null @@ -1,48 +0,0 @@ -import imp -import tensorflow as tf -from tensorflow.python.framework import ops - -lib_file = imp.find_module('kernels', __path__)[1] -_warprnnt = tf.load_op_library(lib_file) - - -def rnnt_loss(acts, labels, input_lengths, label_lengths, blank_label=0): - '''Computes the RNNT loss between a sequence of activations and a - ground truth labeling. - Args: - acts: A 4-D Tensor of floats. The dimensions - should be (B, T, U, V), where B is the minibatch index, - T is the time index, U is the prediction network sequence - length, and V indexes over activations for each - symbol in the alphabet. - labels: A 2-D Tensor of ints, a padded label sequences to make sure - labels for the minibatch are same length. - input_lengths: A 1-D Tensor of ints, the number of time steps - for each sequence in the minibatch. - label_lengths: A 1-D Tensor of ints, the length of each label - for each example in the minibatch. - blank_label: int, the label value/index that the RNNT - calculation should use as the blank label - Returns: - 1-D float Tensor, the cost of each example in the minibatch - (as negative log probabilities). - * This class performs the softmax operation internally. - * The label reserved for the blank symbol should be label 0. - ''' - loss, _ = _warprnnt.warp_rnnt(acts, labels, input_lengths, - label_lengths, blank_label) - return loss - - -@ops.RegisterGradient("WarpRNNT") -def _RNNTLossGrad(op, grad_loss, _): - grad = op.outputs[1] - # NOTE since here we are batch first, cannot use _BroadcastMul - grad_loss = tf.reshape(grad_loss, (-1, 1, 1, 1)) - return [grad_loss * grad, None, None, None] - -@ops.RegisterShape("WarpRNNT") -def _RNNTLossShape(op): - inputs_shape = op.inputs[0].get_shape().with_rank(4) - batch_size = inputs_shape[0] - return [batch_size, inputs_shape] \ No newline at end of file diff --git a/tests/test_cpu.cpp b/tests/test_cpu.cpp index 554a18b..fee4eb5 100644 --- a/tests/test_cpu.cpp +++ b/tests/test_cpu.cpp @@ -239,7 +239,7 @@ bool inf_test() { return status; } -float numeric_grad(std::vector& acts, std::vector& flat_labels, std::vector& label_lengths, +void numeric_grad(std::vector& acts, std::vector& flat_labels, std::vector& label_lengths, std::vector sizes, int alphabet_size, int minibatch, void* rnnt_cpu_workspace, rnntOptions& options, std::vector& num_grad) { @@ -403,4 +403,4 @@ int main(void) { std::cout << "Some or all tests fail" << std::endl; return 1; } -} \ No newline at end of file +} diff --git a/tests/test_time.cpp b/tests/test_time.cpp index 3083b0e..ce77ba6 100644 --- a/tests/test_time.cpp +++ b/tests/test_time.cpp @@ -12,7 +12,7 @@ #include "test.h" -bool run_test(int B, int T, int L, int A, int num_threads) { +void run_test(int B, int T, int L, int A, int num_threads) { std::mt19937 gen(2); auto start = std::chrono::high_resolution_clock::now(); @@ -119,4 +119,4 @@ int main(int argc, char** argv) { } run_test(B, T, L, A, num_threads); -} \ No newline at end of file +}