From 15422e135f2c0c2079ef5d4fa46988c94339acc8 Mon Sep 17 00:00:00 2001
From: ljn7 <john.nirmal@tihiitb.org>
Date: Tue, 8 Jul 2025 11:00:40 +0000
Subject: [PATCH 01/14] Updated for CUDA 11 and 12

---
 .gitignore                      | 132 ++++++++++++++++++++
 CMakeLists.txt                  | 214 +++++++++++++++-----------------
 README.md                       |   1 +
 pytorch_binding/README.md       |   3 +-
 pytorch_binding/setup.py        |   8 +-
 pytorch_binding/src/binding.cpp |  76 +++++++-----
 6 files changed, 285 insertions(+), 149 deletions(-)
 create mode 100644 .gitignore

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..9d54d6b
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,132 @@
+##################################
+# Python
+##################################
+
+# Byte-compiled
+__pycache__/
+*.py[cod]
+*$py.class
+
+# Virtual envs
+.env/
+.venv/
+env/
+venv/
+
+# Packaging
+build/
+dist/
+*.egg-info/
+.eggs/
+*.whl
+
+# Caches / testing
+.pytest_cache/
+htmlcov/
+coverage.*
+.tox/
+.nox/
+
+# Type checkers
+.mypy_cache/
+.pytype/
+.pyre/
+
+# Jupyter
+.ipynb_checkpoints/
+
+##################################
+# C++ / CUDA / CMake
+##################################
+
+# Objects and binaries
+*.o
+*.obj
+*.a
+*.so
+*.dll
+*.lib
+*.dylib
+*.out
+*.exe
+*.app
+
+# CUDA
+*.cu.o
+*.ptx
+*.cubin
+*.fatbin
+*.nvvp
+*.nvcpl
+*.nvd*
+
+# Dependency / compiler files
+*.d
+*.mod
+
+# CMake and build
+/build/
+CMakeFiles/
+CMakeCache.txt
+cmake_install.cmake
+Makefile
+CTestTestfile.cmake
+install_manifest.txt
+compile_commands.json
+
+# Ninja
+.ninja_log
+.ninja_deps
+build.ninja
+rules.ninja
+
+##################################
+# Python bindings (PyTorch / TensorFlow)
+##################################
+
+# Compiled extensions
+*.so
+*.pyd
+*.dll
+
+# Generated by setup.py
+warprnnt_pytorch.egg-info/
+warprnnt_tensorflow.egg-info/
+
+# Temporary output
+*.log
+
+##################################
+# Docs
+##################################
+
+# LaTeX
+*.aux
+*.log
+*.out
+*.pdf
+*.toc
+*.lof
+*.lot
+
+##################################
+# IDE / Editor / OS
+##################################
+
+# VSCode
+.vscode/
+
+# JetBrains / CLion / PyCharm
+.idea/
+cmake-build-*/
+
+# Vim / Emacs
+*.swp
+*.swo
+*~
+
+# macOS / Windows
+.DS_Store
+Thumbs.db
+desktop.ini
+
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 2c570e0..c4b0ad3 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,136 +1,126 @@
-IF (APPLE)
-    cmake_minimum_required(VERSION 3.4)
-ELSE()
-    cmake_minimum_required(VERSION 2.8)
-ENDIF()
+cmake_minimum_required(VERSION 3.18)
 
-project(rnnt_release)
+project(rnnt_release LANGUAGES CXX CUDA)
 
-IF (NOT APPLE)
-    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O2")
-ENDIF()
+# Set global flags
+set(CMAKE_CXX_STANDARD 17)
+set(CMAKE_POSITION_INDEPENDENT_CODE ON)
 
-IF (APPLE)
-    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -O2")
+if(NOT APPLE)
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O2")
+else()
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++17 -O2")
     add_definitions(-DAPPLE)
-ENDIF()
+endif()
 
+# Include headers
 include_directories(include)
 
-FIND_PACKAGE(CUDA)
-MESSAGE(STATUS "cuda found ${CUDA_FOUND}")
+# Find CUDA Toolkit
+find_package(CUDAToolkit REQUIRED)
+message(STATUS "CUDA toolkit found")
 
+# Options
 option(USE_NAIVE_KERNEL "use naive alpha-beta kernel" OFF)
 option(DEBUG_TIME "output kernel time" OFF)
 option(DEBUG_KERNEL "output alpha beta" OFF)
-if (USE_NAIVE_KERNEL)
-    add_definitions(-DUSE_NAIVE_KERNEL)
+option(WITH_GPU "compile warp-rnnt with cuda." ON)
+option(WITH_OMP "compile warp-rnnt with openmp." ON)
+
+# Preprocessor flags
+set(COMMON_DEFINITIONS "")
+if(USE_NAIVE_KERNEL)
+    list(APPEND COMMON_DEFINITIONS -DUSE_NAIVE_KERNEL)
 endif()
-if (DEBUG_TIME)
-    add_definitions(-DDEBUG_TIME)
+if(DEBUG_TIME)
+    list(APPEND COMMON_DEFINITIONS -DDEBUG_TIME)
 endif()
-if (DEBUG_KERNEL)
-    add_definitions(-DDEBUG_KERNEL)
+if(DEBUG_KERNEL)
+    list(APPEND COMMON_DEFINITIONS -DDEBUG_KERNEL)
 endif()
-
-option(WITH_GPU "compile warp-rnnt with cuda." ${CUDA_FOUND})
-option(WITH_OMP "compile warp-rnnt with openmp." ON)
-
 if(NOT WITH_OMP)
-    add_definitions(-DRNNT_DISABLE_OMP)
+    list(APPEND COMMON_DEFINITIONS -DRNNT_DISABLE_OMP)
 endif()
-if (WITH_OMP)
-    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fopenmp")
-    set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -Xcompiler -fopenmp")
-endif()
-
-# need to be at least 30 or __shfl_down in reduce wont compile
-set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_30,code=sm_30 -O2")
-set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_35,code=sm_35")
-
-set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_50,code=sm_50")
-set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_52,code=sm_52")
-IF(CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 5)
-  SET(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -D_MWAITXINTRIN_H_INCLUDED -D_FORCE_INLINES")
-ENDIF()
-
-IF (CUDA_VERSION GREATER 7.6)
-    set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_60,code=sm_60")
-    set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_61,code=sm_61")
-    set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_62,code=sm_62")
-ENDIF()
-
-IF (CUDA_VERSION GREATER 8.9)
-    set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_70,code=sm_70")
-ENDIF()
-
-IF (CUDA_VERSION GREATER 9.9)
-    set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_75,code=sm_75")
-ENDIF()
-
-if (NOT APPLE)
-    set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} --std=c++11")
-    set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS}")
-ENDIF()
 
+# OpenMP
+if(WITH_OMP)
+    find_package(OpenMP)
+    if(OpenMP_CXX_FOUND)
+        set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}")
+    endif()
+endif()
 
-IF (APPLE)
-    EXEC_PROGRAM(uname ARGS -v  OUTPUT_VARIABLE DARWIN_VERSION)
-    STRING(REGEX MATCH "[0-9]+" DARWIN_VERSION ${DARWIN_VERSION})
-    MESSAGE(STATUS "DARWIN_VERSION=${DARWIN_VERSION}")
-
-    #for el capitain have to use rpath
-
-    IF (DARWIN_VERSION LESS 15)
-        set(CMAKE_SKIP_RPATH TRUE)
-    ENDIF ()
-
-ELSE()
-    #always skip for linux
-    set(CMAKE_SKIP_RPATH TRUE)
-ENDIF()
-
-
-IF (WITH_GPU)
-
-    MESSAGE(STATUS "Building shared library with GPU support")
-
-    CUDA_ADD_LIBRARY(warprnnt SHARED src/rnnt_entrypoint.cu)
-    IF (!Torch_FOUND)
-        TARGET_LINK_LIBRARIES(warprnnt ${CUDA_curand_LIBRARY})
-    ENDIF()
-
-    cuda_add_executable(test_time_gpu tests/test_time.cu tests/random.cpp )
-    TARGET_LINK_LIBRARIES(test_time_gpu warprnnt ${CUDA_curand_LIBRARY})
-    SET_TARGET_PROPERTIES(test_time_gpu PROPERTIES COMPILE_FLAGS "${CMAKE_CXX_FLAGS} --std=c++11")
-
-    cuda_add_executable(test_gpu tests/test_gpu.cu tests/random.cpp )
-    TARGET_LINK_LIBRARIES(test_gpu warprnnt ${CUDA_curand_LIBRARY})
-    SET_TARGET_PROPERTIES(test_gpu PROPERTIES COMPILE_FLAGS "${CMAKE_CXX_FLAGS} --std=c++11")
-
-ELSE()
-    MESSAGE(STATUS "Building shared library with no GPU support")
-
-    if (NOT APPLE)
-        set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -O2")
-    ENDIF()
-
-    ADD_LIBRARY(warprnnt SHARED src/rnnt_entrypoint.cpp)
+set(CMAKE_CUDA_ARCHITECTURES 86)
 
-ENDIF()
+# --------------------
+# Shared Library Target
+# --------------------
+if(WITH_GPU)
+    message(STATUS "Building shared library with GPU support")
 
+    add_library(warprnnt SHARED src/rnnt_entrypoint.cu)
+    target_link_libraries(warprnnt PRIVATE CUDA::cudart CUDA::curand)
+else()
+    message(STATUS "Building shared library with no GPU support")
 
-add_executable(test_cpu tests/test_cpu.cpp tests/random.cpp )
-TARGET_LINK_LIBRARIES(test_cpu warprnnt)
-SET_TARGET_PROPERTIES(test_cpu PROPERTIES COMPILE_FLAGS "${CMAKE_CXX_FLAGS} --std=c++11")
+    add_library(warprnnt SHARED src/rnnt_entrypoint.cpp)
+endif()
 
-add_executable(test_time tests/test_time.cpp tests/random.cpp )
-TARGET_LINK_LIBRARIES(test_time warprnnt)
-SET_TARGET_PROPERTIES(test_time PROPERTIES COMPILE_FLAGS "${CMAKE_CXX_FLAGS} --std=c++11")
+target_compile_features(warprnnt PRIVATE cxx_std_17)
+target_compile_definitions(warprnnt PRIVATE ${COMMON_DEFINITIONS})
+target_include_directories(warprnnt PRIVATE include)
+set_target_properties(warprnnt PROPERTIES
+    CUDA_SEPARABLE_COMPILATION ON
+    CUDA_STANDARD 17
+    CUDA_STANDARD_REQUIRED ON
+)
+target_compile_options(warprnnt PRIVATE
+    $<$<COMPILE_LANGUAGE:CUDA>:--expt-relaxed-constexpr>
+)
+
+# --------------------
+# Executables
+# --------------------
+add_executable(test_cpu tests/test_cpu.cpp tests/random.cpp)
+target_link_libraries(test_cpu PRIVATE warprnnt)
+target_compile_features(test_cpu PRIVATE cxx_std_17)
+
+add_executable(test_time tests/test_time.cpp tests/random.cpp)
+target_link_libraries(test_time PRIVATE warprnnt)
+target_compile_features(test_time PRIVATE cxx_std_17)
+
+if(WITH_GPU)
+    add_executable(test_time_gpu tests/test_time.cu tests/random.cpp)
+    target_link_libraries(test_time_gpu PRIVATE warprnnt CUDA::curand)
+    target_compile_features(test_time_gpu PRIVATE cxx_std_17)
+    set_target_properties(test_time_gpu PROPERTIES
+        CUDA_SEPARABLE_COMPILATION ON
+        CUDA_STANDARD 17
+        CUDA_STANDARD_REQUIRED ON
+    )
+    target_compile_options(test_time_gpu PRIVATE
+        $<$<COMPILE_LANGUAGE:CUDA>:--expt-relaxed-constexpr>
+    )
+
+    add_executable(test_gpu tests/test_gpu.cu tests/random.cpp)
+    target_link_libraries(test_gpu PRIVATE warprnnt CUDA::curand)
+    target_compile_features(test_gpu PRIVATE cxx_std_17)
+    set_target_properties(test_gpu PROPERTIES
+        CUDA_SEPARABLE_COMPILATION ON
+        CUDA_STANDARD 17
+        CUDA_STANDARD_REQUIRED ON
+    )
+    target_compile_options(test_gpu PRIVATE
+        $<$<COMPILE_LANGUAGE:CUDA>:--expt-relaxed-constexpr>
+    )
+endif()
 
-INSTALL(TARGETS warprnnt
-        RUNTIME DESTINATION "bin"
-        LIBRARY DESTINATION "lib"
-        ARCHIVE DESTINATION "lib")
+# --------------------
+# Installation
+# --------------------
+install(TARGETS warprnnt
+        RUNTIME DESTINATION bin
+        LIBRARY DESTINATION lib
+        ARCHIVE DESTINATION lib)
 
-INSTALL(FILES include/rnnt.h DESTINATION "include")
+install(FILES include/rnnt.h DESTINATION include)
\ No newline at end of file
diff --git a/README.md b/README.md
index 11263ef..749a1e4 100644
--- a/README.md
+++ b/README.md
@@ -57,6 +57,7 @@ Run cmake and build:
 ```bash
 cmake -DCUDA_TOOLKIT_ROOT_DIR=$CUDA_HOME ..
 make
+sudo make install #optional
 ```
 if it logs
 ```
diff --git a/pytorch_binding/README.md b/pytorch_binding/README.md
index cce44e7..190f56e 100644
--- a/pytorch_binding/README.md
+++ b/pytorch_binding/README.md
@@ -16,6 +16,7 @@ cd warp-transducer
 mkdir build; cd build
 cmake ..
 make
+sudo make install # optional
 ```
 
 Otherwise, set `WARP_RNNT_PATH` to wherever you have `libwarprnnt.so`
@@ -30,7 +31,7 @@ export CUDA_HOME="/usr/local/cuda"
 Now install the bindings: (Please make sure the GCC version >= 4.9)
 ```
 cd pytorch_binding
-python setup.py install
+pip install . # pip install . --break-package-system
 ```
 
 If you try the above and get a dlopen error on OSX with anaconda3 (as recommended by pytorch):
diff --git a/pytorch_binding/setup.py b/pytorch_binding/setup.py
index ad960ea..08726f4 100644
--- a/pytorch_binding/setup.py
+++ b/pytorch_binding/setup.py
@@ -1,4 +1,4 @@
-from distutils.version import LooseVersion
+from packaging.version import Version
 import os
 import platform
 import sys
@@ -8,7 +8,9 @@
 
 
 extra_compile_args = ['-fPIC']
-if LooseVersion(torch.__version__) >= LooseVersion("1.5.0"):
+if Version(torch.__version__) >= Version("1.8.0"):
+    extra_compile_args += ['-std=c++17']
+elif Version(torch.__version__) >= Version("1.5.0"):
     extra_compile_args += ['-std=c++14']
 else:
     extra_compile_args += ['-std=c++11']
@@ -39,7 +41,7 @@
 
 setup(
     name='warprnnt_pytorch',
-    version="0.1",
+    version="0.2.0",
     description="PyTorch wrapper for RNN-Transducer",
     url="https://github.com/HawkAaron/warp-transducer",
     author="Mingkun Huang",
diff --git a/pytorch_binding/src/binding.cpp b/pytorch_binding/src/binding.cpp
index 7aaf000..53e5d2d 100644
--- a/pytorch_binding/src/binding.cpp
+++ b/pytorch_binding/src/binding.cpp
@@ -5,8 +5,8 @@
 #include "rnnt.h"
 
 #ifdef WARPRNNT_ENABLE_GPU
-    #include "THC.h"
-    extern THCState* state;
+    #include <ATen/cuda/CUDAContext.h>
+    #include <cuda_runtime_api.h>
 #endif
 
 int cpu_rnnt(torch::Tensor acts,
@@ -23,11 +23,11 @@ int cpu_rnnt(torch::Tensor acts,
     int minibatch_size = acts.size(2);
     int alphabet_size = acts.size(3);
 
-	if (true) {
-		minibatch_size = acts.size(0);
-		maxT = acts.size(1);
-		maxU = acts.size(2);
-	}
+    if (true) {
+      minibatch_size = acts.size(0);
+      maxT = acts.size(1);
+      maxU = acts.size(2);
+    }
 
     rnntOptions options;
     memset(&options, 0, sizeof(options));
@@ -43,33 +43,33 @@ int cpu_rnnt(torch::Tensor acts,
 #endif
 
     size_t cpu_size_bytes = 0;
-    switch (acts.type().scalarType()) {
-      case torch::ScalarType::Float:
+    switch (acts.scalar_type()) {
+      case torch::kFloat:
         {
         get_workspace_size(maxT, maxU, minibatch_size,
                            false, &cpu_size_bytes);
 
         float* cpu_workspace = (float*) new unsigned char[cpu_size_bytes];
-        compute_rnnt_loss(acts.data<float>(), grads.data<float>(),
-                         labels.data<int>(), label_lengths.data<int>(),
-                         input_lengths.data<int>(), alphabet_size,
-                         minibatch_size, costs.data<float>(),
+        compute_rnnt_loss(acts.data_ptr<float>(), grads.data_ptr<float>(),
+                         labels.data_ptr<int>(), label_lengths.data_ptr<int>(),
+                         input_lengths.data_ptr<int>(), alphabet_size,
+                         minibatch_size, costs.data_ptr<float>(),
                          cpu_workspace, options);
 
         delete cpu_workspace;
         return 0;
         }
-      case torch::ScalarType::Double:
+      case torch::kDouble:
         {
         get_workspace_size(maxT, maxU, minibatch_size,
                            false, &cpu_size_bytes,
                            sizeof(double));
 
         double* cpu_workspace = (double*) new unsigned char[cpu_size_bytes];
-        compute_rnnt_loss_fp64(acts.data<double>(), grads.data<double>(),
-                         labels.data<int>(), label_lengths.data<int>(),
-                         input_lengths.data<int>(), alphabet_size,
-                         minibatch_size, costs.data<double>(),
+        compute_rnnt_loss_fp64(acts.data_ptr<double>(), grads.data_ptr<double>(),
+                         labels.data_ptr<int>(), label_lengths.data_ptr<int>(),
+                         input_lengths.data_ptr<int>(), alphabet_size,
+                         minibatch_size, costs.data_ptr<double>(),
                          cpu_workspace, options);
 
         delete cpu_workspace;
@@ -108,8 +108,8 @@ int gpu_rnnt(torch::Tensor acts,
     options.num_threads = std::max(options.num_threads, (unsigned int) 1);
 #endif
 
-    switch (acts.type().scalarType()) {
-      case torch::ScalarType::Float:
+    switch (acts.scalar_type()) {
+      case torch::kFloat:
         {
         size_t gpu_size_bytes;
         get_workspace_size(maxT, maxU, minibatch_size,
@@ -117,18 +117,23 @@ int gpu_rnnt(torch::Tensor acts,
 
         cudaSetDevice(acts.get_device());
 
-        void* gpu_workspace = THCudaMalloc(state, gpu_size_bytes);
+        void* gpu_workspace = nullptr;
+        cudaMalloc(&gpu_workspace, gpu_size_bytes);
+        if (gpu_workspace == nullptr) {
+            std::cerr << __FILE__ << ':' << __LINE__ << ": " << "failed to allocate GPU workspace" << std::endl;
+            return -1;
+        }
 
-        compute_rnnt_loss(acts.data<float>(), grads.data<float>(),
-                         labels.data<int>(), label_lengths.data<int>(),
-                         input_lengths.data<int>(), alphabet_size,
-                         minibatch_size, costs.data<float>(),
+        compute_rnnt_loss(acts.data_ptr<float>(), grads.data_ptr<float>(),
+                         labels.data_ptr<int>(), label_lengths.data_ptr<int>(),
+                         input_lengths.data_ptr<int>(), alphabet_size,
+                         minibatch_size, costs.data_ptr<float>(),
                          gpu_workspace, options);
 
-        THCudaFree(state, gpu_workspace);
+        cudaFree(gpu_workspace);
         return 0;
         }
-      case torch::ScalarType::Double:
+      case torch::kDouble:
         {
         size_t gpu_size_bytes;
         get_workspace_size(maxT, maxU, minibatch_size,
@@ -136,15 +141,20 @@ int gpu_rnnt(torch::Tensor acts,
 
         cudaSetDevice(acts.get_device());
 
-        void* gpu_workspace = THCudaMalloc(state, gpu_size_bytes);
+        void* gpu_workspace = nullptr;
+        cudaMalloc(&gpu_workspace, gpu_size_bytes);
+        if (gpu_workspace == nullptr) {
+            std::cerr << __FILE__ << ':' << __LINE__ << ": " << "failed to allocate GPU workspace" << std::endl;
+            return -1;
+        }
 
-        compute_rnnt_loss_fp64(acts.data<double>(), grads.data<double>(),
-                         labels.data<int>(), label_lengths.data<int>(),
-                         input_lengths.data<int>(), alphabet_size,
-                         minibatch_size, costs.data<double>(),
+        compute_rnnt_loss_fp64(acts.data_ptr<double>(), grads.data_ptr<double>(),
+                         labels.data_ptr<int>(), label_lengths.data_ptr<int>(),
+                         input_lengths.data_ptr<int>(), alphabet_size,
+                         minibatch_size, costs.data_ptr<double>(),
                          gpu_workspace, options);
 
-        THCudaFree(state, gpu_workspace);
+        cudaFree(gpu_workspace);
         return 0;
         }
       default:

From 5cbc6d685c6e13789a00c8e0cafbf34eed0685d7 Mon Sep 17 00:00:00 2001
From: John Nirmal <john.nirmal@tihiitb.org>
Date: Wed, 9 Jul 2025 16:29:46 +0530
Subject: [PATCH 02/14] Update README.md

---
 pytorch_binding/README.md | 31 +++++++++++++++++++++++++++++++
 1 file changed, 31 insertions(+)

diff --git a/pytorch_binding/README.md b/pytorch_binding/README.md
index 190f56e..4be1252 100644
--- a/pytorch_binding/README.md
+++ b/pytorch_binding/README.md
@@ -10,6 +10,7 @@ Install [PyTorch](https://github.com/pytorch/pytorch#installation).
 (i.e. `libwarprnnt.so`).  This defaults to `../build`, so from within a
 new warp-transducer clone you could build WarpRNNT like this:
 
+
 ```bash
 git clone https://github.com/HawkAaron/warp-transducer
 cd warp-transducer
@@ -90,3 +91,33 @@ forward(acts, labels, act_lens, label_lens):
     label_lens: Tensor of (batch) containing label length of each example
     """
 ```
+
+## Troubleshooting: `cuda_runtime_api.h: No such file or directory`
+
+If you encounter an error like this during installation:
+
+`
+fatal error: cuda_runtime_api.h: No such file or directory
+`
+
+This usually means the CUDA headers can't be found by the compiler. If your system has CUDA installed (e.g., version 12.9) and the file `cuda_runtime_api.h` exists somewhere like:
+
+`
+/usr/local/cuda-12.9/targets/x86_64-linux/include/cuda_runtime_api.h
+`
+
+Then you can fix this by manually setting the environment variables to point to the correct CUDA paths:
+
+```bash
+export CUDA_HOME=/usr/local/cuda-12.9
+export CFLAGS="-I$CUDA_HOME/targets/x86_64-linux/include"
+export LDFLAGS="-L$CUDA_HOME/targets/x86_64-linux/lib"
+export PATH="$CUDA_HOME/bin:$PATH"
+export LD_LIBRARY_PATH="$CUDA_HOME/lib64:$LD_LIBRARY_PATH"
+```
+
+Then run the installation again:
+
+```bash
+pip install .
+```

From a1de641cc96acc321079c712d986e84e98c0dd18 Mon Sep 17 00:00:00 2001
From: ljn7 <john.nirmal@tihiitb.org>
Date: Wed, 16 Jul 2025 10:03:53 +0000
Subject: [PATCH 03/14] Changed to PyTorch CUDA allocator

---
 pytorch_binding/src/binding.cpp | 13 +++++--------
 1 file changed, 5 insertions(+), 8 deletions(-)

diff --git a/pytorch_binding/src/binding.cpp b/pytorch_binding/src/binding.cpp
index 53e5d2d..ec9ada7 100644
--- a/pytorch_binding/src/binding.cpp
+++ b/pytorch_binding/src/binding.cpp
@@ -5,8 +5,7 @@
 #include "rnnt.h"
 
 #ifdef WARPRNNT_ENABLE_GPU
-    #include <ATen/cuda/CUDAContext.h>
-    #include <cuda_runtime_api.h>
+    #include "c10/cuda/CUDACachingAllocator.h"
 #endif
 
 int cpu_rnnt(torch::Tensor acts,
@@ -117,8 +116,7 @@ int gpu_rnnt(torch::Tensor acts,
 
         cudaSetDevice(acts.get_device());
 
-        void* gpu_workspace = nullptr;
-        cudaMalloc(&gpu_workspace, gpu_size_bytes);
+        void* gpu_workspace = c10::cuda::CUDACachingAllocator::raw_alloc(gpu_size_bytes);
         if (gpu_workspace == nullptr) {
             std::cerr << __FILE__ << ':' << __LINE__ << ": " << "failed to allocate GPU workspace" << std::endl;
             return -1;
@@ -130,7 +128,7 @@ int gpu_rnnt(torch::Tensor acts,
                          minibatch_size, costs.data_ptr<float>(),
                          gpu_workspace, options);
 
-        cudaFree(gpu_workspace);
+        c10::cuda::CUDACachingAllocator::raw_delete(gpu_workspace);
         return 0;
         }
       case torch::kDouble:
@@ -141,8 +139,7 @@ int gpu_rnnt(torch::Tensor acts,
 
         cudaSetDevice(acts.get_device());
 
-        void* gpu_workspace = nullptr;
-        cudaMalloc(&gpu_workspace, gpu_size_bytes);
+        void* gpu_workspace = c10::cuda::CUDACachingAllocator::raw_alloc(gpu_size_bytes);
         if (gpu_workspace == nullptr) {
             std::cerr << __FILE__ << ':' << __LINE__ << ": " << "failed to allocate GPU workspace" << std::endl;
             return -1;
@@ -154,7 +151,7 @@ int gpu_rnnt(torch::Tensor acts,
                          minibatch_size, costs.data_ptr<double>(),
                          gpu_workspace, options);
 
-        cudaFree(gpu_workspace);
+        c10::cuda::CUDACachingAllocator::raw_delete(gpu_workspace);
         return 0;
         }
       default:

From 2806d2da0ef163bf05a0507bdbab90c5463a2240 Mon Sep 17 00:00:00 2001
From: ljn7 <john.nirmal@tihiitb.org>
Date: Wed, 16 Jul 2025 10:28:42 +0000
Subject: [PATCH 04/14] Requires C++17 for torch 2.1.0 and above and requires
 packaging

---
 pytorch_binding/setup.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/pytorch_binding/setup.py b/pytorch_binding/setup.py
index 08726f4..af8c094 100644
--- a/pytorch_binding/setup.py
+++ b/pytorch_binding/setup.py
@@ -8,7 +8,7 @@
 
 
 extra_compile_args = ['-fPIC']
-if Version(torch.__version__) >= Version("1.8.0"):
+if Version(torch.__version__) >= Version("2.1.0"):
     extra_compile_args += ['-std=c++17']
 elif Version(torch.__version__) >= Version("1.5.0"):
     extra_compile_args += ['-std=c++14']
@@ -47,6 +47,9 @@
     author="Mingkun Huang",
     author_email="mingkunhuang95@gmail.com",
     packages=find_packages(),
+    install_requires=[
+        'packaging'
+    ],
     ext_modules=[
         CppExtension(
             name='warprnnt_pytorch.warp_rnnt',

From 64477aefb0ed1d1a1a033cba71a9703bd9410ef2 Mon Sep 17 00:00:00 2001
From: ljn7 <john.nirmal@tihiitb.org>
Date: Wed, 16 Jul 2025 10:55:36 +0000
Subject: [PATCH 05/14] Fixed test file for GPU testing

---
 pytorch_binding/test/test.py | 20 ++++++++------------
 1 file changed, 8 insertions(+), 12 deletions(-)

diff --git a/pytorch_binding/test/test.py b/pytorch_binding/test/test.py
index 14feb09..c378ed0 100644
--- a/pytorch_binding/test/test.py
+++ b/pytorch_binding/test/test.py
@@ -23,28 +23,24 @@
 
 fn = rnntloss() if args.np else RNNTLoss(reduction='sum')
 
-gpu = 1
+#acts = autograd.Variable(acts, requires_grad=True)
 def wrap_and_call(acts, labels):
     acts = torch.FloatTensor(acts)
-    if use_cuda:
-        acts = acts.cuda(gpu)
-    #acts = autograd.Variable(acts, requires_grad=True)
+    device = torch.device("cuda" if use_cuda else "cpu")
+    acts = acts.to(device)
     acts.requires_grad = True
 
     lengths = [acts.shape[1]] * acts.shape[0]
     label_lengths = [len(l) for l in labels]
-    labels = torch.IntTensor(labels)
-    lengths = torch.IntTensor(lengths)
-    label_lengths = torch.IntTensor(label_lengths)
-    if use_cuda:
-        labels = labels.cuda(gpu)
-        lengths = lengths.cuda(gpu)
-        label_lengths = label_lengths.cuda(gpu)
+
+    labels = torch.IntTensor(labels).to(device)
+    lengths = torch.IntTensor(lengths).to(device)
+    label_lengths = torch.IntTensor(label_lengths).to(device)
 
     costs = fn(acts, labels, lengths, label_lengths)
     cost = torch.sum(costs)
     cost.backward()
-    # print(repr(acts.grad.data.cpu().numpy()))
+
     return costs.data.cpu().numpy(), acts.grad.data.cpu().numpy()
 
 

From a11cedd01f0d197016d0db2ad11e618d1026cd7f Mon Sep 17 00:00:00 2001
From: John Nirmal <john.nirmal@tihiitb.org>
Date: Thu, 17 Jul 2025 10:52:00 +0530
Subject: [PATCH 06/14] Update README.md

---
 README.md | 24 ++++++++++++++++++++++++
 1 file changed, 24 insertions(+)

diff --git a/README.md b/README.md
index 749a1e4..1ec3b88 100644
--- a/README.md
+++ b/README.md
@@ -69,6 +69,30 @@ please run `rm CMakeCache.txt` and cmake again.
 The C library should now be built along with test executables. If CUDA was detected, then `test_gpu` will be built;
 `test_cpu` will always be built.
 
+## Troubleshooting: `cuda_runtime_api.h: No such file or directory`
+
+If you encounter an error like this during installation:
+
+`
+fatal error: cuda_runtime_api.h: No such file or directory
+`
+
+This usually means the CUDA headers can't be found by the compiler. If your system has CUDA installed (e.g., version 12.9) and the file `cuda_runtime_api.h` exists somewhere like:
+
+`
+/usr/local/cuda-12.9/targets/x86_64-linux/include/cuda_runtime_api.h
+`
+
+Then you can fix this by manually setting the environment variables to point to the correct CUDA paths:
+
+```bash
+export CUDA_HOME=/usr/local/cuda-12.9
+export CFLAGS="-I$CUDA_HOME/targets/x86_64-linux/include"
+export LDFLAGS="-L$CUDA_HOME/targets/x86_64-linux/lib"
+export PATH="$CUDA_HOME/bin:$PATH"
+export LD_LIBRARY_PATH="$CUDA_HOME/lib64:$LD_LIBRARY_PATH"
+```
+
 ## Test
 To run the tests, make sure the CUDA libraries are in `LD_LIBRARY_PATH` (DYLD_LIBRARY_PATH for OSX).
 

From a8b11432712581123d4e27650b6150bf91825319 Mon Sep 17 00:00:00 2001
From: ljn7 <john.nirmal@tihiitb.org>
Date: Thu, 17 Jul 2025 13:30:32 +0000
Subject: [PATCH 07/14] - Merged FastEmit code (originally by b-flo, forked
 from HawkAaron/warp-transducer) - Adapted and updated for compatibility and
 enhancements on my fork - Updated relevant documentation and options -
 Updated CMakeLists.txt to support CUDA and CMake version selectionUpdated
 CMakeLists.txt to support CUDA and CMake version detection/selection

---
 CMakeLists.txt                               | 233 ++++++++++++++-----
 include/detail/cpu_rnnt.h                    |  10 +-
 include/detail/gpu_rnnt.h                    |  18 +-
 include/detail/gpu_rnnt_kernel.h             |  47 ++++
 include/rnnt.h                               |   2 +
 pytorch_binding/src/binding.cpp              |   4 +
 pytorch_binding/warprnnt_pytorch/__init__.py |  14 +-
 src/rnnt_entrypoint.cpp                      |  14 +-
 8 files changed, 260 insertions(+), 82 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index c4b0ad3..f7e4034 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,10 +1,22 @@
-cmake_minimum_required(VERSION 3.18)
+if(APPLE)
+    cmake_minimum_required(VERSION 3.4)
+else()
+    cmake_minimum_required(VERSION 3.5)
+endif()
 
 project(rnnt_release LANGUAGES CXX CUDA)
 
-# Set global flags
+# ==== USER OPTIONS ====
+option(WITH_GPU "Build with CUDA support" ON)
+option(WITH_OMP "Build with OpenMP support" ON)
+option(USE_NAIVE_KERNEL "Use naive alpha-beta kernel" OFF)
+option(DEBUG_TIME "Output kernel time" OFF)
+option(DEBUG_KERNEL "Output alpha beta debug" OFF)
+
 set(CMAKE_CXX_STANDARD 17)
+set(CMAKE_CXX_STANDARD_REQUIRED ON)
 set(CMAKE_POSITION_INDEPENDENT_CODE ON)
+set(CMAKE_CXX_EXTENSIONS OFF)
 
 if(NOT APPLE)
     set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O2")
@@ -13,21 +25,9 @@ else()
     add_definitions(-DAPPLE)
 endif()
 
-# Include headers
 include_directories(include)
 
-# Find CUDA Toolkit
-find_package(CUDAToolkit REQUIRED)
-message(STATUS "CUDA toolkit found")
-
-# Options
-option(USE_NAIVE_KERNEL "use naive alpha-beta kernel" OFF)
-option(DEBUG_TIME "output kernel time" OFF)
-option(DEBUG_KERNEL "output alpha beta" OFF)
-option(WITH_GPU "compile warp-rnnt with cuda." ON)
-option(WITH_OMP "compile warp-rnnt with openmp." ON)
-
-# Preprocessor flags
+# ==== PREPROCESSOR FLAGS ====
 set(COMMON_DEFINITIONS "")
 if(USE_NAIVE_KERNEL)
     list(APPEND COMMON_DEFINITIONS -DUSE_NAIVE_KERNEL)
@@ -42,7 +42,6 @@ if(NOT WITH_OMP)
     list(APPEND COMMON_DEFINITIONS -DRNNT_DISABLE_OMP)
 endif()
 
-# OpenMP
 if(WITH_OMP)
     find_package(OpenMP)
     if(OpenMP_CXX_FOUND)
@@ -50,37 +49,137 @@ if(WITH_OMP)
     endif()
 endif()
 
-set(CMAKE_CUDA_ARCHITECTURES 86)
+# ==== SAFE CUDA VERSION DETECTION & ARCH SELECTION ====
+
+# Try to detect CUDA version. Set major.minor or fallback to "0"
+if(CMAKE_VERSION VERSION_GREATER_EQUAL "3.18")
+    find_package(CUDAToolkit REQUIRED)
+    set(CUDA_TOOLKIT_VERSION "${CUDAToolkit_VERSION}")
+    if(NOT CUDA_TOOLKIT_VERSION OR "${CUDA_TOOLKIT_VERSION}" STREQUAL "")
+        # CMake >= 3.18 but CUDAToolkit_VERSION not set? Fallback logic
+        set(CUDA_TOOLKIT_VERSION "0.0")
+    endif()
+else()
+    find_package(CUDA REQUIRED)
+    if(DEFINED CUDA_VERSION AND NOT "${CUDA_VERSION}" STREQUAL "")
+        set(CUDA_TOOLKIT_VERSION "${CUDA_VERSION}")
+    else()
+        set(CUDA_TOOLKIT_VERSION "0.0")
+    endif()
+endif()
+
+# Split CUDA version into major/minor
+function(cuda_version_major VERSION OUTVAR)
+    if(NOT "${VERSION}" STREQUAL "")
+        string(REPLACE "." ";" TMP_LIST ${VERSION})
+        list(GET TMP_LIST 0 MAJOR)
+        set(${OUTVAR} "${MAJOR}" PARENT_SCOPE)
+    else()
+        set(${OUTVAR} "0" PARENT_SCOPE)
+    endif()
+endfunction()
+function(cuda_version_minor VERSION OUTVAR)
+    if(NOT "${VERSION}" STREQUAL "")
+        string(REPLACE "." ";" TMP_LIST ${VERSION})
+        list(LENGTH TMP_LIST TMP_LEN)
+        if(TMP_LEN GREATER 1)
+            list(GET TMP_LIST 1 MINOR)
+        else()
+            set(MINOR "0")
+        endif()
+        set(${OUTVAR} "${MINOR}" PARENT_SCOPE)
+    else()
+        set(${OUTVAR} "0" PARENT_SCOPE)
+    endif()
+endfunction()
+
+cuda_version_major("${CUDA_TOOLKIT_VERSION}" CUDA_VERSION_MAJOR)
+cuda_version_minor("${CUDA_TOOLKIT_VERSION}" CUDA_VERSION_MINOR)
+
+set(CUDA_ARCH_LIST "")
+if(CUDA_TOOLKIT_VERSION STREQUAL "" OR CUDA_VERSION_MAJOR EQUAL 0)
+    set(CUDA_ARCH_LIST "52;60;70;75")
+    message(WARNING "Could not detect CUDA version. Defaulting to minimal arch set (52;60;70;75).")
+elseif(CUDA_VERSION_MAJOR LESS 11)
+    set(CUDA_ARCH_LIST "52;60;70;75")
+elseif(CUDA_VERSION_MAJOR EQUAL 11)
+    set(CUDA_ARCH_LIST "52;60;70;75;80;86")
+elseif(CUDA_VERSION_MAJOR GREATER_EQUAL 12)
+    set(CUDA_ARCH_LIST "52;60;70;75;80;86;89")
+endif()
+
+# Allow user override
+if(DEFINED CMAKE_CUDA_ARCHITECTURES AND NOT "${CMAKE_CUDA_ARCHITECTURES}" STREQUAL "")
+    set(CUDA_ARCH_LIST "${CMAKE_CUDA_ARCHITECTURES}")
+endif()
+
+if(CMAKE_VERSION VERSION_GREATER_EQUAL "3.18")
+    set(CMAKE_CUDA_ARCHITECTURES "${CUDA_ARCH_LIST}" CACHE STRING "Target GPU architectures")
+    message(STATUS "Auto-selected CUDA archs for CUDA ${CUDA_TOOLKIT_VERSION}: ${CMAKE_CUDA_ARCHITECTURES}")
+else()
+    string(REPLACE ";" " " CUDA_ARCH_FLAGS "${CUDA_ARCH_LIST}")
+    set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} ")
+    foreach(ARCH ${CUDA_ARCH_LIST})
+        set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_${ARCH},code=sm_${ARCH}")
+    endforeach()
+    set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} --std=c++11")
+    message(STATUS "Auto-selected CUDA NVCC_ARCH flags for CUDA ${CUDA_TOOLKIT_VERSION}: ${CUDA_NVCC_FLAGS}")
+endif()
+
+if(CUDA_TOOLKIT_VERSION AND CUDA_VERSION_MAJOR LESS 11)
+    if(CUDA_ARCH_LIST MATCHES "80|86|89")
+        message(WARNING "
+Detected CUDA ${CUDA_TOOLKIT_VERSION}.
+Some requested architectures (${CUDA_ARCH_LIST}) require CUDA >= 11 (Ampere/Ada).
+Remove sm_80/86/89 or upgrade CUDA Toolkit if you see NVCC errors.
+")
+    endif()
+endif()
+
+# ==== Apple/Mac rpath ====
+if(APPLE)
+    execute_process(COMMAND uname -v OUTPUT_VARIABLE DARWIN_VERSION)
+    string(REGEX MATCH "[0-9]+" DARWIN_VERSION "${DARWIN_VERSION}")
+    message(STATUS "DARWIN_VERSION=${DARWIN_VERSION}")
+    if(${DARWIN_VERSION} LESS 15)
+        set(CMAKE_SKIP_RPATH TRUE)
+    endif()
+else()
+    set(CMAKE_SKIP_RPATH TRUE)
+endif()
 
-# --------------------
-# Shared Library Target
-# --------------------
+# ==== Main library target ====
 if(WITH_GPU)
     message(STATUS "Building shared library with GPU support")
-
-    add_library(warprnnt SHARED src/rnnt_entrypoint.cu)
-    target_link_libraries(warprnnt PRIVATE CUDA::cudart CUDA::curand)
+    if(CMAKE_VERSION VERSION_GREATER_EQUAL "3.18")
+        add_library(warprnnt SHARED src/rnnt_entrypoint.cu)
+        target_link_libraries(warprnnt PRIVATE CUDA::cudart CUDA::curand)
+    else()
+        CUDA_ADD_LIBRARY(warprnnt SHARED src/rnnt_entrypoint.cu)
+        if(NOT Torch_FOUND)
+            TARGET_LINK_LIBRARIES(warprnnt ${CUDA_curand_LIBRARY})
+        endif()
+    endif()
 else()
     message(STATUS "Building shared library with no GPU support")
-
     add_library(warprnnt SHARED src/rnnt_entrypoint.cpp)
 endif()
 
 target_compile_features(warprnnt PRIVATE cxx_std_17)
 target_compile_definitions(warprnnt PRIVATE ${COMMON_DEFINITIONS})
 target_include_directories(warprnnt PRIVATE include)
-set_target_properties(warprnnt PROPERTIES
-    CUDA_SEPARABLE_COMPILATION ON
-    CUDA_STANDARD 17
-    CUDA_STANDARD_REQUIRED ON
-)
-target_compile_options(warprnnt PRIVATE
-    $<$<COMPILE_LANGUAGE:CUDA>:--expt-relaxed-constexpr>
-)
-
-# --------------------
-# Executables
-# --------------------
+if(CMAKE_VERSION VERSION_GREATER_EQUAL "3.18" AND TARGET warprnnt)
+    set_target_properties(warprnnt PROPERTIES
+        CUDA_SEPARABLE_COMPILATION ON
+        CUDA_STANDARD 17
+        CUDA_STANDARD_REQUIRED ON
+    )
+    target_compile_options(warprnnt PRIVATE
+        $<$<COMPILE_LANGUAGE:CUDA>:--expt-relaxed-constexpr>
+    )
+endif()
+
+# ==== Test executables ====
 add_executable(test_cpu tests/test_cpu.cpp tests/random.cpp)
 target_link_libraries(test_cpu PRIVATE warprnnt)
 target_compile_features(test_cpu PRIVATE cxx_std_17)
@@ -90,37 +189,45 @@ target_link_libraries(test_time PRIVATE warprnnt)
 target_compile_features(test_time PRIVATE cxx_std_17)
 
 if(WITH_GPU)
-    add_executable(test_time_gpu tests/test_time.cu tests/random.cpp)
-    target_link_libraries(test_time_gpu PRIVATE warprnnt CUDA::curand)
-    target_compile_features(test_time_gpu PRIVATE cxx_std_17)
-    set_target_properties(test_time_gpu PROPERTIES
-        CUDA_SEPARABLE_COMPILATION ON
-        CUDA_STANDARD 17
-        CUDA_STANDARD_REQUIRED ON
-    )
-    target_compile_options(test_time_gpu PRIVATE
-        $<$<COMPILE_LANGUAGE:CUDA>:--expt-relaxed-constexpr>
-    )
-
-    add_executable(test_gpu tests/test_gpu.cu tests/random.cpp)
-    target_link_libraries(test_gpu PRIVATE warprnnt CUDA::curand)
-    target_compile_features(test_gpu PRIVATE cxx_std_17)
-    set_target_properties(test_gpu PROPERTIES
-        CUDA_SEPARABLE_COMPILATION ON
-        CUDA_STANDARD 17
-        CUDA_STANDARD_REQUIRED ON
-    )
-    target_compile_options(test_gpu PRIVATE
-        $<$<COMPILE_LANGUAGE:CUDA>:--expt-relaxed-constexpr>
-    )
+    if(CMAKE_VERSION VERSION_GREATER_EQUAL "3.18")
+        add_executable(test_time_gpu tests/test_time.cu tests/random.cpp)
+        target_link_libraries(test_time_gpu PRIVATE warprnnt CUDA::curand)
+        target_compile_features(test_time_gpu PRIVATE cxx_std_17)
+        set_target_properties(test_time_gpu PROPERTIES
+            CUDA_SEPARABLE_COMPILATION ON
+            CUDA_STANDARD 17
+            CUDA_STANDARD_REQUIRED ON
+        )
+        target_compile_options(test_time_gpu PRIVATE
+            $<$<COMPILE_LANGUAGE:CUDA>:--expt-relaxed-constexpr>
+        )
+
+        add_executable(test_gpu tests/test_gpu.cu tests/random.cpp)
+        target_link_libraries(test_gpu PRIVATE warprnnt CUDA::curand)
+        target_compile_features(test_gpu PRIVATE cxx_std_17)
+        set_target_properties(test_gpu PROPERTIES
+            CUDA_SEPARABLE_COMPILATION ON
+            CUDA_STANDARD 17
+            CUDA_STANDARD_REQUIRED ON
+        )
+        target_compile_options(test_gpu PRIVATE
+            $<$<COMPILE_LANGUAGE:CUDA>:--expt-relaxed-constexpr>
+        )
+    else()
+        cuda_add_executable(test_time_gpu tests/test_time.cu tests/random.cpp )
+        TARGET_LINK_LIBRARIES(test_time_gpu warprnnt ${CUDA_curand_LIBRARY})
+        set_target_properties(test_time_gpu PROPERTIES COMPILE_FLAGS "${CMAKE_CXX_FLAGS} --std=c++11")
+
+        cuda_add_executable(test_gpu tests/test_gpu.cu tests/random.cpp )
+        TARGET_LINK_LIBRARIES(test_gpu warprnnt ${CUDA_curand_LIBRARY})
+        set_target_properties(test_gpu PROPERTIES COMPILE_FLAGS "${CMAKE_CXX_FLAGS} --std=c++11")
+    endif()
 endif()
 
-# --------------------
-# Installation
-# --------------------
+# ==== Install targets ====
 install(TARGETS warprnnt
         RUNTIME DESTINATION bin
         LIBRARY DESTINATION lib
         ARCHIVE DESTINATION lib)
 
-install(FILES include/rnnt.h DESTINATION include)
\ No newline at end of file
+install(FILES include/rnnt.h DESTINATION include)
diff --git a/include/detail/cpu_rnnt.h b/include/detail/cpu_rnnt.h
index 0042b34..f4e586f 100644
--- a/include/detail/cpu_rnnt.h
+++ b/include/detail/cpu_rnnt.h
@@ -20,9 +20,9 @@ class CpuRNNT {
 public:
     // Noncopyable
     CpuRNNT(int minibatch, int maxT, int maxU, int alphabet_size, void* workspace, 
-            int blank, int num_threads, bool batch_first) :
+            int blank, float fastemit_lambda, int num_threads, bool batch_first) :
         minibatch_(minibatch), maxT_(maxT), maxU_(maxU), alphabet_size_(alphabet_size), 
-        workspace_(workspace), blank_(blank), num_threads_(num_threads), batch_first(batch_first) {
+          workspace_(workspace), blank_(blank), fastemit_lambda_(fastemit_lambda), num_threads_(num_threads), batch_first(batch_first) {
 #if defined(RNNT_DISABLE_OMP) || defined(APPLE)
 #else
         if (num_threads > 0) {
@@ -82,6 +82,7 @@ class CpuRNNT {
     int alphabet_size_; // Number of characters plus blank
     void* workspace_;
     int blank_;
+    float fastemit_lambda_;
     int num_threads_;
     bool batch_first;
     
@@ -259,7 +260,10 @@ CpuRNNT<ProbT>::compute_betas_and_grad(ProbT* grad, const ProbT* const log_probs
             }
             if (u < U-1) {
                 ProbT g = alphas[idx(t, u)] + betas[idx(t, u+1)];
-                grad[idx(t, u, labels[u])] = -std::exp(log_probs[idx(t, u) * 2 + 1] + g - loglike);
+                grad[idx(t, u, labels[u])] = -(
+                                               (1. + fastemit_lambda_)
+                                               * std::exp(log_probs[idx(t, u) * 2 + 1] + g - loglike)
+                                               );
             }
         }
     }
diff --git a/include/detail/gpu_rnnt.h b/include/detail/gpu_rnnt.h
index dd7f874..c8e67df 100644
--- a/include/detail/gpu_rnnt.h
+++ b/include/detail/gpu_rnnt.h
@@ -20,9 +20,9 @@ class GpuRNNT {
 public:
     // Noncopyable
     GpuRNNT(int minibatch, int maxT, int maxU, int alphabet_size, void* workspace, 
-            int blank, int num_threads, CUstream stream) :
+            int blank, float fastemit_lambda, int num_threads, CUstream stream) :
         minibatch_(minibatch), maxT_(maxT), maxU_(maxU), alphabet_size_(alphabet_size), 
-        gpu_workspace(workspace), blank_(blank), num_threads_(num_threads), stream_(stream) {
+          gpu_workspace(workspace), blank_(blank), fastemit_lambda_(fastemit_lambda), num_threads_(num_threads), stream_(stream) {
 #if defined(RNNT_DISABLE_OMP) || defined(APPLE)
 #else
         if (num_threads > 0) {
@@ -65,6 +65,7 @@ class GpuRNNT {
     int alphabet_size_; // Number of characters plus blank
     void* gpu_workspace;
     int blank_;
+    float fastemit_lambda_;
     int num_threads_;
     CUstream stream_;
     
@@ -195,9 +196,16 @@ GpuRNNT<ProbT>::compute_cost_and_score(const ProbT* const acts,
         start = std::chrono::high_resolution_clock::now();
 #endif
         // TODO optimize gradient kernel
-        compute_grad_kernel<128, ProbT><<<minibatch_ * maxT_ * maxU_, 128, 0, stream_>>>(grads, 
-            acts, denom, alphas, betas, llForward, input_lengths, label_lengths, labels, 
-            minibatch_, maxT_, maxU_, alphabet_size_, blank_);
+
+        if (fastemit_lambda_ > 0.0f) {
+            compute_fastemit_grad_kernel<128, ProbT><<<minibatch_ * maxT_ * maxU_, 128, 0, stream_>>>(grads, 
+                acts, denom, alphas, betas, llForward, input_lengths, label_lengths, labels, 
+                minibatch_, maxT_, maxU_, alphabet_size_, blank_, fastemit_lambda_);
+        } else {
+            compute_grad_kernel<128, ProbT><<<minibatch_ * maxT_ * maxU_, 128, 0, stream_>>>(grads, 
+                acts, denom, alphas, betas, llForward, input_lengths, label_lengths, labels, 
+                minibatch_, maxT_, maxU_, alphabet_size_, blank_);
+        }
 #if defined(DEBUG_TIME)
         cudaStreamSynchronize(stream_);
         end = std::chrono::high_resolution_clock::now();
diff --git a/include/detail/gpu_rnnt_kernel.h b/include/detail/gpu_rnnt_kernel.h
index 738d7ef..ed0b0e6 100644
--- a/include/detail/gpu_rnnt_kernel.h
+++ b/include/detail/gpu_rnnt_kernel.h
@@ -177,3 +177,50 @@ __global__ void compute_grad_kernel(Tp* grads, const Tp* const acts, const Tp* c
         }
     }
 }
+
+template<int NT, typename Tp>
+__global__ void compute_fastemit_grad_kernel(Tp* grads, const Tp* const acts, const Tp* const denom, const Tp* alphas, const Tp* betas, const Tp* const logll, const int* const xlen, const int* const ylen, 
+    const int* const mlabels, const int minibatch, const int maxT, const int maxU, const int alphabet_size, const int blank_, const Tp fastemit_lambda) {
+    int tid = threadIdx.x; // alphabet dim
+    int idx = tid;
+    int col = blockIdx.x; // mb, t, u
+
+    int u = col % maxU;
+    int bt = (col - u) / maxU;
+    int t = bt % maxT;
+    int mb = (bt - t) / maxT;
+
+    const int T = xlen[mb];
+    const int U = ylen[mb] + 1;
+    const int* labels = mlabels + mb * (maxU - 1);
+
+    if (t < T && u < U) {
+        while (idx < alphabet_size) {
+            Tp logpk = denom[col] + acts[col * alphabet_size + idx];
+            // Tp logpk = logp(denom, acts, maxT, maxU, alphabet_size, mb, t, u, idx);
+            Tp grad = exp(alphas[col] + betas[col] + logpk - logll[mb]);
+
+            Tp logy_btu1 = rnnt_helper::neg_inf<Tp>(); // log(y(t,u)) + log(beta(t, u+1))
+            if (u < U-1) {
+                logy_btu1 = denom[col] + acts[col * alphabet_size + labels[u]] + betas[col+1];
+            }
+            grad += fastemit_lambda * exp(alphas[col] + logy_btu1 + logpk - logll[mb]);
+
+            // grad to last blank transition
+            if (idx == blank_ && t == T-1 && u == U-1) {
+                grad -= exp(alphas[col] + logpk - logll[mb]);
+                grad -= fastemit_lambda * exp(alphas[col] + logy_btu1 + logpk - logll[mb]);
+            }
+            if (idx == blank_ && t < T-1) {
+                grad -= exp(alphas[col] + logpk - logll[mb] + betas[col + maxU]);
+            }
+            if (u < U-1 && idx == labels[u]) {
+                grad -= exp(alphas[col] + logpk - logll[mb] + betas[col+1]);
+                grad -= fastemit_lambda * exp(alphas[col] + logy_btu1 - logll[mb]);
+            }
+            grads[col * alphabet_size + idx] = grad;
+
+            idx += NT;
+        }
+    }
+}
diff --git a/include/rnnt.h b/include/rnnt.h
index 4759e0a..207065f 100644
--- a/include/rnnt.h
+++ b/include/rnnt.h
@@ -61,6 +61,8 @@ struct rnntOptions {
 
     /// memory structure
     bool batch_first;
+
+    float fastemit_lambda;
 };
 
 /** Compute the RNN Transducer loss between a sequence
diff --git a/pytorch_binding/src/binding.cpp b/pytorch_binding/src/binding.cpp
index ec9ada7..95f5ff8 100644
--- a/pytorch_binding/src/binding.cpp
+++ b/pytorch_binding/src/binding.cpp
@@ -15,6 +15,7 @@ int cpu_rnnt(torch::Tensor acts,
             torch::Tensor costs,
             torch::Tensor grads,
             int blank_label,
+            float fastemit_lambda,
             int num_threads) {
 
     int maxT = acts.size(0);
@@ -32,6 +33,7 @@ int cpu_rnnt(torch::Tensor acts,
     memset(&options, 0, sizeof(options));
     options.maxT = maxT;
     options.maxU = maxU;
+    options.fastemit_lambda = fastemit_lambda;
     options.blank_label = blank_label;
     options.batch_first = true;
     options.loc = RNNT_CPU;
@@ -87,6 +89,7 @@ int gpu_rnnt(torch::Tensor acts,
             torch::Tensor costs,
             torch::Tensor grads,
             int blank_label,
+            float fastemit_lambda,
             int num_threads) {
 
     int minibatch_size = acts.size(0);
@@ -101,6 +104,7 @@ int gpu_rnnt(torch::Tensor acts,
     options.blank_label = blank_label;
     options.loc = RNNT_GPU;
     options.stream = at::cuda::getCurrentCUDAStream();
+    options.fastemit_lambda = fastemit_lambda;
     options.num_threads = num_threads;
 #if defined(RNNT_DISABLE_OMP) || defined(APPLE)
     // have to use at least one
diff --git a/pytorch_binding/warprnnt_pytorch/__init__.py b/pytorch_binding/warprnnt_pytorch/__init__.py
index 76c6c30..a078fb0 100644
--- a/pytorch_binding/warprnnt_pytorch/__init__.py
+++ b/pytorch_binding/warprnnt_pytorch/__init__.py
@@ -15,6 +15,7 @@ def forward(ctx, acts, labels, act_lens, label_lens, blank, reduction):
         labels: 2 dimensional Tensor containing all the targets of the batch with zero padded
         act_lens: Tensor of size (batch) containing size of each output sequence from the network
         label_lens: Tensor of (batch) containing label length of each example
+        fastemit_lambda: Regularization parameter for FastEmit (https://arxiv.org/pdf/2010.11148.pdf)
         """
         is_cuda = acts.is_cuda
 
@@ -31,6 +32,7 @@ def forward(ctx, acts, labels, act_lens, label_lens, blank, reduction):
                   costs,
                   grads,
                   blank,
+                  fastemit_lambda,
                   0)
 
         if reduction in ['sum', 'mean']:
@@ -47,10 +49,10 @@ def forward(ctx, acts, labels, act_lens, label_lens, blank, reduction):
     @staticmethod
     def backward(ctx, grad_output):
         grad_output = grad_output.view(-1, 1, 1, 1).to(ctx.grads)
-        return ctx.grads.mul_(grad_output), None, None, None, None, None
+        return ctx.grads.mul_(grad_output), None, None, None, None, None, None
 
 
-def rnnt_loss(acts, labels, act_lens, label_lens, blank=0, reduction='mean'):
+def rnnt_loss(acts, labels, act_lens, label_lens, blank=0, reduction='mean', fastemit_lambda=0.001):
     """ RNN Transducer Loss
 
     Args:
@@ -63,11 +65,12 @@ def rnnt_loss(acts, labels, act_lens, label_lens, blank=0, reduction='mean'):
             'none' | 'mean' | 'sum'. 'none': no reduction will be applied, 
             'mean': the output losses will be divided by the target lengths and
             then the mean over the batch is taken. Default: 'mean'
+       fastemit_lambda: Regularization parameter for FastEmit (https://arxiv.org/pdf/2010.11148.pdf)
     """
     if not acts.is_cuda:
         acts = torch.nn.functional.log_softmax(acts, -1)
 
-    return _RNNT.apply(acts, labels, act_lens, label_lens, blank, reduction)
+    return _RNNT.apply(acts, labels, act_lens, label_lens, blank, reduction, fastemit_lambda)
 
 
 class RNNTLoss(Module):
@@ -79,10 +82,11 @@ class RNNTLoss(Module):
             'mean': the output losses will be divided by the target lengths and
             then the mean over the batch is taken. Default: 'mean'
     """
-    def __init__(self, blank=0, reduction='mean'):
+    def __init__(self, blank=0, fastemit_lambda=0.001, reduction='mean'):
         super(RNNTLoss, self).__init__()
         self.blank = blank
         self.reduction = reduction
+        self.fastemit_lambda = fastemit_lambda
         self.loss = _RNNT.apply
 
     def forward(self, acts, labels, act_lens, label_lens):
@@ -97,7 +101,7 @@ def forward(self, acts, labels, act_lens, label_lens):
             # log_softmax is computed within GPU version.
             acts = torch.nn.functional.log_softmax(acts, -1)
 
-        return self.loss(acts, labels, act_lens, label_lens, self.blank, self.reduction)
+        return self.loss(acts, labels, act_lens, label_lens, self.blank, self.reduction, self.fastemit_lambda)
 
 
 def check_type(var, t, name):
diff --git a/src/rnnt_entrypoint.cpp b/src/rnnt_entrypoint.cpp
index 0d742d8..9854470 100644
--- a/src/rnnt_entrypoint.cpp
+++ b/src/rnnt_entrypoint.cpp
@@ -55,12 +55,13 @@ rnntStatus_t compute_rnnt_loss(const float* const activations, //BTUV
         alphabet_size <= 0 ||
         minibatch <= 0 ||
         options.maxT <= 0 ||
-        options.maxU <= 0)
+        options.maxU <= 0 ||
+        options.fastemit_lambda < 0)
         return RNNT_STATUS_INVALID_VALUE;
 
     if (options.loc == RNNT_CPU) {
         CpuRNNT<float> rnnt(minibatch, options.maxT, options.maxU, alphabet_size, workspace, 
-                                options.blank_label, options.num_threads, options.batch_first);
+                            options.blank_label, options.fastemit_lambda, options.num_threads, options.batch_first);
 
         if (gradients != NULL)
             return rnnt.cost_and_grad(activations, gradients,
@@ -73,7 +74,7 @@ rnntStatus_t compute_rnnt_loss(const float* const activations, //BTUV
     } else if (options.loc == RNNT_GPU) {
 #ifdef __CUDACC__
         GpuRNNT<float> rnnt(minibatch, options.maxT, options.maxU, alphabet_size, workspace,
-                                options.blank_label, options.num_threads, options.stream);
+                            options.blank_label, options.fastemit_lambda, options.num_threads, options.stream);
 
         if (gradients != NULL)
             return rnnt.cost_and_grad(activations, gradients,
@@ -147,12 +148,13 @@ rnntStatus_t compute_rnnt_loss_fp64(const double* const activations, //BTUV
         alphabet_size <= 0 ||
         minibatch <= 0 ||
         options.maxT <= 0 ||
-        options.maxU <= 0)
+        options.maxU <= 0 ||
+        options.fastemit_lambda < 0)
         return RNNT_STATUS_INVALID_VALUE;
 
     if (options.loc == RNNT_CPU) {
         CpuRNNT<double> rnnt(minibatch, options.maxT, options.maxU, alphabet_size, workspace, 
-                                options.blank_label, options.num_threads, options.batch_first);
+                                options.blank_label, options.fastemit_lambda, options.num_threads, options.batch_first);
 
         if (gradients != NULL)
             return rnnt.cost_and_grad(activations, gradients,
@@ -165,7 +167,7 @@ rnntStatus_t compute_rnnt_loss_fp64(const double* const activations, //BTUV
     } else if (options.loc == RNNT_GPU) {
 #ifdef __CUDACC__
         GpuRNNT<double> rnnt(minibatch, options.maxT, options.maxU, alphabet_size, workspace,
-                                options.blank_label, options.num_threads, options.stream);
+                                options.blank_label, options.fastemit_lambda, options.num_threads, options.stream);
 
         if (gradients != NULL)
             return rnnt.cost_and_grad(activations, gradients,

From d4bdce529285711862d37586f3cb24bbe2e32590 Mon Sep 17 00:00:00 2001
From: John Nirmal <johnnirmal.works@gmail.com>
Date: Sat, 19 Jul 2025 20:34:04 +0530
Subject: [PATCH 08/14] Update CMakeLists.txt

Detect CUDA to on/off WITH_GPU by default
---
 CMakeLists.txt | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index f7e4034..273285b 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -6,8 +6,16 @@ endif()
 
 project(rnnt_release LANGUAGES CXX CUDA)
 
+if(CMAKE_VERSION VERSION_GREATER_EQUAL "3.18")
+    find_package(CUDAToolkit)
+    set(CUDA_FOUND ${CUDAToolkit_FOUND})
+else()
+    find_package(CUDA)
+    set(CUDA_FOUND ${CUDA_FOUND})
+endif()
+
 # ==== USER OPTIONS ====
-option(WITH_GPU "Build with CUDA support" ON)
+option(WITH_GPU "Build with CUDA support" ${CUDA_FOUND})
 option(WITH_OMP "Build with OpenMP support" ON)
 option(USE_NAIVE_KERNEL "Use naive alpha-beta kernel" OFF)
 option(DEBUG_TIME "Output kernel time" OFF)

From 946669ae910d40dd0144be10c69c030017734dd4 Mon Sep 17 00:00:00 2001
From: John Nirmal <johnnirmal.works@gmail.com>
Date: Sat, 19 Jul 2025 22:01:58 +0530
Subject: [PATCH 09/14] Add graceful CUDA detection and fallback to CPU-only
 build

- Check CUDA availability before declaring it as project language
- Fall back to CPU-only build when CUDA toolkit not found
- Prevents "Failed to find nvcc" error on systems without CUDA
- Maintains GPU support when CUDA is properly installed
---
 CMakeLists.txt | 170 ++++++++++++++++++++++++++-----------------------
 1 file changed, 90 insertions(+), 80 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 273285b..fcb7766 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -4,16 +4,22 @@ else()
     cmake_minimum_required(VERSION 3.5)
 endif()
 
-project(rnnt_release LANGUAGES CXX CUDA)
-
 if(CMAKE_VERSION VERSION_GREATER_EQUAL "3.18")
-    find_package(CUDAToolkit)
+    find_package(CUDAToolkit QUIET)
     set(CUDA_FOUND ${CUDAToolkit_FOUND})
 else()
-    find_package(CUDA)
+    find_package(CUDA QUIET)
     set(CUDA_FOUND ${CUDA_FOUND})
 endif()
 
+if(CUDA_FOUND)
+    project(rnnt_release LANGUAGES CXX CUDA)
+    message(STATUS "CUDA found, enabling GPU support")
+else()
+    project(rnnt_release LANGUAGES CXX)
+    message(STATUS "CUDA not found, building CPU-only version")
+endif()
+
 # ==== USER OPTIONS ====
 option(WITH_GPU "Build with CUDA support" ${CUDA_FOUND})
 option(WITH_OMP "Build with OpenMP support" ON)
@@ -21,6 +27,11 @@ option(USE_NAIVE_KERNEL "Use naive alpha-beta kernel" OFF)
 option(DEBUG_TIME "Output kernel time" OFF)
 option(DEBUG_KERNEL "Output alpha beta debug" OFF)
 
+if(WITH_GPU AND NOT CUDA_FOUND)
+    message(WARNING "WITH_GPU requested but CUDA not found. Building CPU-only version.")
+    set(WITH_GPU OFF)
+endif()
+
 set(CMAKE_CXX_STANDARD 17)
 set(CMAKE_CXX_STANDARD_REQUIRED ON)
 set(CMAKE_POSITION_INDEPENDENT_CODE ON)
@@ -57,90 +68,89 @@ if(WITH_OMP)
     endif()
 endif()
 
-# ==== SAFE CUDA VERSION DETECTION & ARCH SELECTION ====
-
-# Try to detect CUDA version. Set major.minor or fallback to "0"
-if(CMAKE_VERSION VERSION_GREATER_EQUAL "3.18")
-    find_package(CUDAToolkit REQUIRED)
-    set(CUDA_TOOLKIT_VERSION "${CUDAToolkit_VERSION}")
-    if(NOT CUDA_TOOLKIT_VERSION OR "${CUDA_TOOLKIT_VERSION}" STREQUAL "")
-        # CMake >= 3.18 but CUDAToolkit_VERSION not set? Fallback logic
-        set(CUDA_TOOLKIT_VERSION "0.0")
-    endif()
-else()
-    find_package(CUDA REQUIRED)
-    if(DEFINED CUDA_VERSION AND NOT "${CUDA_VERSION}" STREQUAL "")
-        set(CUDA_TOOLKIT_VERSION "${CUDA_VERSION}")
+# ==== CUDA SETUP (only if WITH_GPU is ON) ====
+if(WITH_GPU)
+    if(CMAKE_VERSION VERSION_GREATER_EQUAL "3.18")
+        find_package(CUDAToolkit REQUIRED)
+        set(CUDA_TOOLKIT_VERSION "${CUDAToolkit_VERSION}")
+        if(NOT CUDA_TOOLKIT_VERSION OR "${CUDA_TOOLKIT_VERSION}" STREQUAL "")
+            set(CUDA_TOOLKIT_VERSION "0.0")
+        endif()
     else()
-        set(CUDA_TOOLKIT_VERSION "0.0")
+        find_package(CUDA REQUIRED)
+        if(DEFINED CUDA_VERSION AND NOT "${CUDA_VERSION}" STREQUAL "")
+            set(CUDA_TOOLKIT_VERSION "${CUDA_VERSION}")
+        else()
+            set(CUDA_TOOLKIT_VERSION "0.0")
+        endif()
     endif()
-endif()
 
-# Split CUDA version into major/minor
-function(cuda_version_major VERSION OUTVAR)
-    if(NOT "${VERSION}" STREQUAL "")
-        string(REPLACE "." ";" TMP_LIST ${VERSION})
-        list(GET TMP_LIST 0 MAJOR)
-        set(${OUTVAR} "${MAJOR}" PARENT_SCOPE)
-    else()
-        set(${OUTVAR} "0" PARENT_SCOPE)
-    endif()
-endfunction()
-function(cuda_version_minor VERSION OUTVAR)
-    if(NOT "${VERSION}" STREQUAL "")
-        string(REPLACE "." ";" TMP_LIST ${VERSION})
-        list(LENGTH TMP_LIST TMP_LEN)
-        if(TMP_LEN GREATER 1)
-            list(GET TMP_LIST 1 MINOR)
+    # Split CUDA version into major/minor
+    function(cuda_version_major VERSION OUTVAR)
+        if(NOT "${VERSION}" STREQUAL "")
+            string(REPLACE "." ";" TMP_LIST ${VERSION})
+            list(GET TMP_LIST 0 MAJOR)
+            set(${OUTVAR} "${MAJOR}" PARENT_SCOPE)
         else()
-            set(MINOR "0")
+            set(${OUTVAR} "0" PARENT_SCOPE)
         endif()
-        set(${OUTVAR} "${MINOR}" PARENT_SCOPE)
-    else()
-        set(${OUTVAR} "0" PARENT_SCOPE)
+    endfunction()
+    function(cuda_version_minor VERSION OUTVAR)
+        if(NOT "${VERSION}" STREQUAL "")
+            string(REPLACE "." ";" TMP_LIST ${VERSION})
+            list(LENGTH TMP_LIST TMP_LEN)
+            if(TMP_LEN GREATER 1)
+                list(GET TMP_LIST 1 MINOR)
+            else()
+                set(MINOR "0")
+            endif()
+            set(${OUTVAR} "${MINOR}" PARENT_SCOPE)
+        else()
+            set(${OUTVAR} "0" PARENT_SCOPE)
+        endif()
+    endfunction()
+
+    cuda_version_major("${CUDA_TOOLKIT_VERSION}" CUDA_VERSION_MAJOR)
+    cuda_version_minor("${CUDA_TOOLKIT_VERSION}" CUDA_VERSION_MINOR)
+
+    set(CUDA_ARCH_LIST "")
+    if(CUDA_TOOLKIT_VERSION STREQUAL "" OR CUDA_VERSION_MAJOR EQUAL 0)
+        set(CUDA_ARCH_LIST "52;60;70;75")
+        message(WARNING "Could not detect CUDA version. Defaulting to minimal arch set (52;60;70;75).")
+    elseif(CUDA_VERSION_MAJOR LESS 11)
+        set(CUDA_ARCH_LIST "52;60;70;75")
+    elseif(CUDA_VERSION_MAJOR EQUAL 11)
+        set(CUDA_ARCH_LIST "52;60;70;75;80;86")
+    elseif(CUDA_VERSION_MAJOR GREATER_EQUAL 12)
+        set(CUDA_ARCH_LIST "52;60;70;75;80;86;89")
     endif()
-endfunction()
-
-cuda_version_major("${CUDA_TOOLKIT_VERSION}" CUDA_VERSION_MAJOR)
-cuda_version_minor("${CUDA_TOOLKIT_VERSION}" CUDA_VERSION_MINOR)
-
-set(CUDA_ARCH_LIST "")
-if(CUDA_TOOLKIT_VERSION STREQUAL "" OR CUDA_VERSION_MAJOR EQUAL 0)
-    set(CUDA_ARCH_LIST "52;60;70;75")
-    message(WARNING "Could not detect CUDA version. Defaulting to minimal arch set (52;60;70;75).")
-elseif(CUDA_VERSION_MAJOR LESS 11)
-    set(CUDA_ARCH_LIST "52;60;70;75")
-elseif(CUDA_VERSION_MAJOR EQUAL 11)
-    set(CUDA_ARCH_LIST "52;60;70;75;80;86")
-elseif(CUDA_VERSION_MAJOR GREATER_EQUAL 12)
-    set(CUDA_ARCH_LIST "52;60;70;75;80;86;89")
-endif()
 
-# Allow user override
-if(DEFINED CMAKE_CUDA_ARCHITECTURES AND NOT "${CMAKE_CUDA_ARCHITECTURES}" STREQUAL "")
-    set(CUDA_ARCH_LIST "${CMAKE_CUDA_ARCHITECTURES}")
-endif()
 
-if(CMAKE_VERSION VERSION_GREATER_EQUAL "3.18")
-    set(CMAKE_CUDA_ARCHITECTURES "${CUDA_ARCH_LIST}" CACHE STRING "Target GPU architectures")
-    message(STATUS "Auto-selected CUDA archs for CUDA ${CUDA_TOOLKIT_VERSION}: ${CMAKE_CUDA_ARCHITECTURES}")
-else()
-    string(REPLACE ";" " " CUDA_ARCH_FLAGS "${CUDA_ARCH_LIST}")
-    set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} ")
-    foreach(ARCH ${CUDA_ARCH_LIST})
-        set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_${ARCH},code=sm_${ARCH}")
-    endforeach()
-    set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} --std=c++11")
-    message(STATUS "Auto-selected CUDA NVCC_ARCH flags for CUDA ${CUDA_TOOLKIT_VERSION}: ${CUDA_NVCC_FLAGS}")
-endif()
+    if(DEFINED CMAKE_CUDA_ARCHITECTURES AND NOT "${CMAKE_CUDA_ARCHITECTURES}" STREQUAL "")
+        set(CUDA_ARCH_LIST "${CMAKE_CUDA_ARCHITECTURES}")
+    endif()
+
+    if(CMAKE_VERSION VERSION_GREATER_EQUAL "3.18")
+        set(CMAKE_CUDA_ARCHITECTURES "${CUDA_ARCH_LIST}" CACHE STRING "Target GPU architectures")
+        message(STATUS "Auto-selected CUDA archs for CUDA ${CUDA_TOOLKIT_VERSION}: ${CMAKE_CUDA_ARCHITECTURES}")
+    else()
+        string(REPLACE ";" " " CUDA_ARCH_FLAGS "${CUDA_ARCH_LIST}")
+        set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} ")
+        foreach(ARCH ${CUDA_ARCH_LIST})
+            set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_${ARCH},code=sm_${ARCH}")
+        endforeach()
+        set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} --std=c++11")
+        message(STATUS "Auto-selected CUDA NVCC_ARCH flags for CUDA ${CUDA_TOOLKIT_VERSION}: ${CUDA_NVCC_FLAGS}")
+    endif()
 
-if(CUDA_TOOLKIT_VERSION AND CUDA_VERSION_MAJOR LESS 11)
-    if(CUDA_ARCH_LIST MATCHES "80|86|89")
-        message(WARNING "
-Detected CUDA ${CUDA_TOOLKIT_VERSION}.
-Some requested architectures (${CUDA_ARCH_LIST}) require CUDA >= 11 (Ampere/Ada).
-Remove sm_80/86/89 or upgrade CUDA Toolkit if you see NVCC errors.
-")
+    if(CUDA_TOOLKIT_VERSION AND CUDA_VERSION_MAJOR LESS 11)
+        if(CUDA_ARCH_LIST MATCHES "80|86|89")
+            message(WARNING "
+    Detected CUDA ${CUDA_TOOLKIT_VERSION}.
+    Some requested architectures (${CUDA_ARCH_LIST}) require CUDA >= 11 (Ampere/Ada).
+    Remove sm_80/86/89 or upgrade CUDA Toolkit if you see NVCC errors.
+    ")
+        endif()
     endif()
 endif()
 
@@ -176,7 +186,7 @@ endif()
 target_compile_features(warprnnt PRIVATE cxx_std_17)
 target_compile_definitions(warprnnt PRIVATE ${COMMON_DEFINITIONS})
 target_include_directories(warprnnt PRIVATE include)
-if(CMAKE_VERSION VERSION_GREATER_EQUAL "3.18" AND TARGET warprnnt)
+if(CMAKE_VERSION VERSION_GREATER_EQUAL "3.18" AND TARGET warprnnt AND WITH_GPU)
     set_target_properties(warprnnt PROPERTIES
         CUDA_SEPARABLE_COMPILATION ON
         CUDA_STANDARD 17

From 80f897970637d5d681de93ce1f6c35d48f425272 Mon Sep 17 00:00:00 2001
From: ljn7 <john.nirmal@tihiitb.org>
Date: Sat, 19 Jul 2025 18:49:55 +0000
Subject: [PATCH 10/14] Added fastemit argument

---
 pytorch_binding/warprnnt_pytorch/__init__.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pytorch_binding/warprnnt_pytorch/__init__.py b/pytorch_binding/warprnnt_pytorch/__init__.py
index a078fb0..8f1d261 100644
--- a/pytorch_binding/warprnnt_pytorch/__init__.py
+++ b/pytorch_binding/warprnnt_pytorch/__init__.py
@@ -9,7 +9,7 @@
 
 class _RNNT(Function):
     @staticmethod
-    def forward(ctx, acts, labels, act_lens, label_lens, blank, reduction):
+    def forward(ctx, acts, labels, act_lens, label_lens, blank, reduction, fastemit_lambda):
         """
         acts: Tensor of (batch x seqLength x labelLength x outputDim) containing output from network
         labels: 2 dimensional Tensor containing all the targets of the batch with zero padded

From 2e0dc61a432f6e8418e5079f6d5ef98de31c99c8 Mon Sep 17 00:00:00 2001
From: ljn7 <john.nirmal@tihiitb.org>
Date: Sun, 20 Jul 2025 18:27:34 +0000
Subject: [PATCH 11/14] feat(build): add Windows support for CUDA builds and
 improve cross-platform checks

- Enabled robust detection of CUDA and platform compatibility
- Improved build logic to support CUDA builds on Windows
- Added fallback mechanisms and clearer error handling
- Ensured smoother CPU-only builds without redundant checks
- Reduced platform-specific pain points for easier maintenance
---
 CMakeLists.txt          | 139 +++++++++++++++++-----------
 src/rnnt_entrypoint.cpp | 189 --------------------------------------
 src/rnnt_entrypoint.cu  | 198 +++++++++++++++++++++++++++++++++++++++-
 tests/test_cpu.cpp      |   4 +-
 tests/test_time.cpp     |   4 +-
 5 files changed, 285 insertions(+), 249 deletions(-)
 delete mode 100644 src/rnnt_entrypoint.cpp
 mode change 120000 => 100644 src/rnnt_entrypoint.cu

diff --git a/CMakeLists.txt b/CMakeLists.txt
index fcb7766..a31e5a4 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,35 +1,64 @@
-if(APPLE)
-    cmake_minimum_required(VERSION 3.4)
-else()
-    cmake_minimum_required(VERSION 3.5)
-endif()
+cmake_minimum_required(VERSION 3.10)
+
+option(WITH_GPU "Build with GPU (CUDA) support" ON)
+
+set(CUDA_FOUND FALSE)
+set(CUDA_TOOLKIT_VERSION "0.0")
 
 if(CMAKE_VERSION VERSION_GREATER_EQUAL "3.18")
     find_package(CUDAToolkit QUIET)
-    set(CUDA_FOUND ${CUDAToolkit_FOUND})
+
+    if(CMAKE_VERSION VERSION_LESS "3.27")
+        find_package(CUDA QUIET)
+    endif()
+
+    if(CUDAToolkit_FOUND OR CUDA_FOUND)
+        set(CUDA_FOUND TRUE)
+        if(CUDAToolkit_FOUND)
+            set(CUDA_TOOLKIT_VERSION "${CUDAToolkit_VERSION}")
+        elseif(DEFINED CUDA_VERSION AND NOT "${CUDA_VERSION}" STREQUAL "")
+            set(CUDA_TOOLKIT_VERSION "${CUDA_VERSION}")
+        endif()
+    endif()
 else()
     find_package(CUDA QUIET)
-    set(CUDA_FOUND ${CUDA_FOUND})
+    if(CUDA_FOUND)
+        if(DEFINED CUDA_VERSION AND NOT "${CUDA_VERSION}" STREQUAL "")
+            set(CUDA_TOOLKIT_VERSION "${CUDA_VERSION}")
+        endif()
+    endif()
+endif()
+
+if(NOT WITH_GPU)
+    message(STATUS "WITH_GPU=OFF: forcing CPU-only build, even if CUDA is available")
+    set(CUDA_FOUND FALSE)
 endif()
 
-if(CUDA_FOUND)
+if(WITH_GPU AND NOT CUDA_FOUND)
+    message(FATAL_ERROR "WITH_GPU=ON but no compatible CUDA toolkit was found")
+endif()
+
+if(WITH_GPU AND CUDA_FOUND)
     project(rnnt_release LANGUAGES CXX CUDA)
-    message(STATUS "CUDA found, enabling GPU support")
+    set(USE_CUDA TRUE)
+    message(STATUS "CUDA ${CUDA_TOOLKIT_VERSION} found, GPU build enabled")
 else()
     project(rnnt_release LANGUAGES CXX)
-    message(STATUS "CUDA not found, building CPU-only version")
+    set(USE_CUDA FALSE)
+    message(STATUS "CUDA not found or disabled, building CPU-only version")
 endif()
 
 # ==== USER OPTIONS ====
-option(WITH_GPU "Build with CUDA support" ${CUDA_FOUND})
 option(WITH_OMP "Build with OpenMP support" ON)
 option(USE_NAIVE_KERNEL "Use naive alpha-beta kernel" OFF)
 option(DEBUG_TIME "Output kernel time" OFF)
 option(DEBUG_KERNEL "Output alpha beta debug" OFF)
 
-if(WITH_GPU AND NOT CUDA_FOUND)
-    message(WARNING "WITH_GPU requested but CUDA not found. Building CPU-only version.")
-    set(WITH_GPU OFF)
+if(WITH_OMP)
+    find_package(OpenMP)
+    if(OpenMP_CXX_FOUND)
+        set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}")
+    endif()
 endif()
 
 set(CMAKE_CXX_STANDARD 17)
@@ -61,31 +90,8 @@ if(NOT WITH_OMP)
     list(APPEND COMMON_DEFINITIONS -DRNNT_DISABLE_OMP)
 endif()
 
-if(WITH_OMP)
-    find_package(OpenMP)
-    if(OpenMP_CXX_FOUND)
-        set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}")
-    endif()
-endif()
-
 # ==== CUDA SETUP (only if WITH_GPU is ON) ====
-if(WITH_GPU)
-    if(CMAKE_VERSION VERSION_GREATER_EQUAL "3.18")
-        find_package(CUDAToolkit REQUIRED)
-        set(CUDA_TOOLKIT_VERSION "${CUDAToolkit_VERSION}")
-        if(NOT CUDA_TOOLKIT_VERSION OR "${CUDA_TOOLKIT_VERSION}" STREQUAL "")
-            set(CUDA_TOOLKIT_VERSION "0.0")
-        endif()
-    else()
-        find_package(CUDA REQUIRED)
-        if(DEFINED CUDA_VERSION AND NOT "${CUDA_VERSION}" STREQUAL "")
-            set(CUDA_TOOLKIT_VERSION "${CUDA_VERSION}")
-        else()
-            set(CUDA_TOOLKIT_VERSION "0.0")
-        endif()
-    endif()
-
-    # Split CUDA version into major/minor
+if(USE_CUDA)
     function(cuda_version_major VERSION OUTVAR)
         if(NOT "${VERSION}" STREQUAL "")
             string(REPLACE "." ";" TMP_LIST ${VERSION})
@@ -95,6 +101,7 @@ if(WITH_GPU)
             set(${OUTVAR} "0" PARENT_SCOPE)
         endif()
     endfunction()
+
     function(cuda_version_minor VERSION OUTVAR)
         if(NOT "${VERSION}" STREQUAL "")
             string(REPLACE "." ";" TMP_LIST ${VERSION})
@@ -125,7 +132,6 @@ if(WITH_GPU)
         set(CUDA_ARCH_LIST "52;60;70;75;80;86;89")
     endif()
 
-
     if(DEFINED CMAKE_CUDA_ARCHITECTURES AND NOT "${CMAKE_CUDA_ARCHITECTURES}" STREQUAL "")
         set(CUDA_ARCH_LIST "${CMAKE_CUDA_ARCHITECTURES}")
     endif()
@@ -145,11 +151,9 @@ if(WITH_GPU)
 
     if(CUDA_TOOLKIT_VERSION AND CUDA_VERSION_MAJOR LESS 11)
         if(CUDA_ARCH_LIST MATCHES "80|86|89")
-            message(WARNING "
-    Detected CUDA ${CUDA_TOOLKIT_VERSION}.
-    Some requested architectures (${CUDA_ARCH_LIST}) require CUDA >= 11 (Ampere/Ada).
-    Remove sm_80/86/89 or upgrade CUDA Toolkit if you see NVCC errors.
-    ")
+            message(WARNING "\nDetected CUDA ${CUDA_TOOLKIT_VERSION}.
+            Some requested architectures (${CUDA_ARCH_LIST}) require CUDA >= 11 (Ampere/Ada).
+            Remove sm_80/86/89 or upgrade CUDA Toolkit if you see NVCC errors.\n")
         endif()
     endif()
 endif()
@@ -166,27 +170,50 @@ else()
     set(CMAKE_SKIP_RPATH TRUE)
 endif()
 
+set(RNNT_ENTRYPOINT_SRC src/rnnt_entrypoint.cu)
+if(NOT USE_CUDA)
+    configure_file(
+        ${RNNT_ENTRYPOINT_SRC}
+        ${CMAKE_CURRENT_BINARY_DIR}/rnnt_entrypoint_cpu.cpp
+        COPYONLY
+    )
+    set(RNNT_ENTRYPOINT_SRC ${CMAKE_CURRENT_BINARY_DIR}/rnnt_entrypoint_cpu.cpp)
+endif()
+
 # ==== Main library target ====
-if(WITH_GPU)
+if(USE_CUDA)
     message(STATUS "Building shared library with GPU support")
     if(CMAKE_VERSION VERSION_GREATER_EQUAL "3.18")
-        add_library(warprnnt SHARED src/rnnt_entrypoint.cu)
+        add_library(warprnnt SHARED ${RNNT_ENTRYPOINT_SRC})
         target_link_libraries(warprnnt PRIVATE CUDA::cudart CUDA::curand)
     else()
-        CUDA_ADD_LIBRARY(warprnnt SHARED src/rnnt_entrypoint.cu)
+        CUDA_ADD_LIBRARY(warprnnt SHARED ${RNNT_ENTRYPOINT_SRC})
         if(NOT Torch_FOUND)
             TARGET_LINK_LIBRARIES(warprnnt ${CUDA_curand_LIBRARY})
         endif()
     endif()
 else()
     message(STATUS "Building shared library with no GPU support")
-    add_library(warprnnt SHARED src/rnnt_entrypoint.cpp)
+    add_library(warprnnt SHARED ${RNNT_ENTRYPOINT_SRC})
+endif()
+
+set_target_properties(warprnnt PROPERTIES
+    RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}
+    LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}
+    ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}
+)
+
+if(WIN32)
+    set_target_properties(warprnnt PROPERTIES
+        WINDOWS_EXPORT_ALL_SYMBOLS ON
+    )
 endif()
 
 target_compile_features(warprnnt PRIVATE cxx_std_17)
 target_compile_definitions(warprnnt PRIVATE ${COMMON_DEFINITIONS})
 target_include_directories(warprnnt PRIVATE include)
-if(CMAKE_VERSION VERSION_GREATER_EQUAL "3.18" AND TARGET warprnnt AND WITH_GPU)
+
+if(CMAKE_VERSION VERSION_GREATER_EQUAL "3.18" AND TARGET warprnnt AND USE_CUDA)
     set_target_properties(warprnnt PROPERTIES
         CUDA_SEPARABLE_COMPILATION ON
         CUDA_STANDARD 17
@@ -206,7 +233,7 @@ add_executable(test_time tests/test_time.cpp tests/random.cpp)
 target_link_libraries(test_time PRIVATE warprnnt)
 target_compile_features(test_time PRIVATE cxx_std_17)
 
-if(WITH_GPU)
+if(USE_CUDA)
     if(CMAKE_VERSION VERSION_GREATER_EQUAL "3.18")
         add_executable(test_time_gpu tests/test_time.cu tests/random.cpp)
         target_link_libraries(test_time_gpu PRIVATE warprnnt CUDA::curand)
@@ -232,11 +259,11 @@ if(WITH_GPU)
             $<$<COMPILE_LANGUAGE:CUDA>:--expt-relaxed-constexpr>
         )
     else()
-        cuda_add_executable(test_time_gpu tests/test_time.cu tests/random.cpp )
+        cuda_add_executable(test_time_gpu tests/test_time.cu tests/random.cpp)
         TARGET_LINK_LIBRARIES(test_time_gpu warprnnt ${CUDA_curand_LIBRARY})
         set_target_properties(test_time_gpu PROPERTIES COMPILE_FLAGS "${CMAKE_CXX_FLAGS} --std=c++11")
 
-        cuda_add_executable(test_gpu tests/test_gpu.cu tests/random.cpp )
+        cuda_add_executable(test_gpu tests/test_gpu.cu tests/random.cpp)
         TARGET_LINK_LIBRARIES(test_gpu warprnnt ${CUDA_curand_LIBRARY})
         set_target_properties(test_gpu PROPERTIES COMPILE_FLAGS "${CMAKE_CXX_FLAGS} --std=c++11")
     endif()
@@ -244,8 +271,10 @@ endif()
 
 # ==== Install targets ====
 install(TARGETS warprnnt
-        RUNTIME DESTINATION bin
-        LIBRARY DESTINATION lib
-        ARCHIVE DESTINATION lib)
+    RUNTIME DESTINATION bin
+    LIBRARY DESTINATION lib
+    ARCHIVE DESTINATION lib
+)
 
 install(FILES include/rnnt.h DESTINATION include)
+
diff --git a/src/rnnt_entrypoint.cpp b/src/rnnt_entrypoint.cpp
deleted file mode 100644
index 9854470..0000000
--- a/src/rnnt_entrypoint.cpp
+++ /dev/null
@@ -1,189 +0,0 @@
-#include <cstddef>
-#include <iostream>
-#include <algorithm>
-
-#include <rnnt.h>
-
-#include "detail/cpu_rnnt.h"
-#ifdef __CUDACC__
-    #include "detail/gpu_rnnt.h"
-#endif
-
-extern "C" {
-
-int get_warprnnt_version() {
-    return 1;
-}
-
-const char* rnntGetStatusString(rnntStatus_t status) {
-    switch (status) {
-    case RNNT_STATUS_SUCCESS:
-        return "no error";
-    case RNNT_STATUS_MEMOPS_FAILED:
-        return "cuda memcpy or memset failed";
-    case RNNT_STATUS_INVALID_VALUE:
-        return "invalid value";
-    case RNNT_STATUS_EXECUTION_FAILED:
-        return "execution failed";
-
-    case RNNT_STATUS_UNKNOWN_ERROR:
-    default:
-        return "unknown error";
-
-    }
-
-}
-
-
-rnntStatus_t compute_rnnt_loss(const float* const activations, //BTUV
-                             float* gradients,
-                             const int* const flat_labels,
-                             const int* const label_lengths,
-                             const int* const input_lengths,
-                             int alphabet_size,
-                             int minibatch,
-                             float *costs,
-                             void *workspace,
-                             rnntOptions options) {
-
-    if (activations == nullptr ||
-        flat_labels == nullptr ||
-        label_lengths == nullptr ||
-        input_lengths == nullptr ||
-        costs == nullptr ||
-        workspace == nullptr ||
-        alphabet_size <= 0 ||
-        minibatch <= 0 ||
-        options.maxT <= 0 ||
-        options.maxU <= 0 ||
-        options.fastemit_lambda < 0)
-        return RNNT_STATUS_INVALID_VALUE;
-
-    if (options.loc == RNNT_CPU) {
-        CpuRNNT<float> rnnt(minibatch, options.maxT, options.maxU, alphabet_size, workspace, 
-                            options.blank_label, options.fastemit_lambda, options.num_threads, options.batch_first);
-
-        if (gradients != NULL)
-            return rnnt.cost_and_grad(activations, gradients,
-                                        costs,
-                                        flat_labels, label_lengths,
-                                        input_lengths);
-        else
-            return rnnt.score_forward(activations, costs, flat_labels,
-                                        label_lengths, input_lengths);
-    } else if (options.loc == RNNT_GPU) {
-#ifdef __CUDACC__
-        GpuRNNT<float> rnnt(minibatch, options.maxT, options.maxU, alphabet_size, workspace,
-                            options.blank_label, options.fastemit_lambda, options.num_threads, options.stream);
-
-        if (gradients != NULL)
-            return rnnt.cost_and_grad(activations, gradients,
-                                        costs,
-                                        flat_labels, label_lengths,
-                                        input_lengths);
-        else
-            return rnnt.score_forward(activations, costs, flat_labels,
-                                        label_lengths, input_lengths);
-#else
-        std::cerr << "GPU execution requested, but not compiled with GPU support" << std::endl;
-        return RNNT_STATUS_EXECUTION_FAILED;
-#endif
-    } else {
-        return RNNT_STATUS_INVALID_VALUE;
-    }
-}
-
-
-rnntStatus_t get_workspace_size(int maxT, int maxU,
-                               int minibatch,
-                               bool gpu,
-                               size_t* size_bytes,
-                               size_t dtype_size)
-{
-    if (minibatch <= 0 ||
-        maxT <= 0 ||
-        maxU <= 0)
-        return RNNT_STATUS_INVALID_VALUE;
-
-    *size_bytes = 0;
-
-    // per minibatch memory
-    size_t per_minibatch_bytes = 0;
-
-    // alphas & betas
-    per_minibatch_bytes += dtype_size * maxT * maxU * 2;
-
-    if (!gpu) {
-        // blank & label log probability cache
-        per_minibatch_bytes += dtype_size * maxT * maxU * 2;
-    } else {
-        // softmax denominator
-        per_minibatch_bytes += dtype_size * maxT * maxU;
-        // forward-backward loglikelihood
-        per_minibatch_bytes += dtype_size * 2;
-    }
-
-    *size_bytes = per_minibatch_bytes * minibatch;
-
-    return RNNT_STATUS_SUCCESS;
-}
-
-rnntStatus_t compute_rnnt_loss_fp64(const double* const activations, //BTUV
-                             double* gradients,
-                             const int* const flat_labels,
-                             const int* const label_lengths,
-                             const int* const input_lengths,
-                             int alphabet_size,
-                             int minibatch,
-                             double *costs,
-                             void *workspace,
-                             rnntOptions options) {
-
-    if (activations == nullptr ||
-        flat_labels == nullptr ||
-        label_lengths == nullptr ||
-        input_lengths == nullptr ||
-        costs == nullptr ||
-        workspace == nullptr ||
-        alphabet_size <= 0 ||
-        minibatch <= 0 ||
-        options.maxT <= 0 ||
-        options.maxU <= 0 ||
-        options.fastemit_lambda < 0)
-        return RNNT_STATUS_INVALID_VALUE;
-
-    if (options.loc == RNNT_CPU) {
-        CpuRNNT<double> rnnt(minibatch, options.maxT, options.maxU, alphabet_size, workspace, 
-                                options.blank_label, options.fastemit_lambda, options.num_threads, options.batch_first);
-
-        if (gradients != NULL)
-            return rnnt.cost_and_grad(activations, gradients,
-                                        costs,
-                                        flat_labels, label_lengths,
-                                        input_lengths);
-        else
-            return rnnt.score_forward(activations, costs, flat_labels,
-                                        label_lengths, input_lengths);
-    } else if (options.loc == RNNT_GPU) {
-#ifdef __CUDACC__
-        GpuRNNT<double> rnnt(minibatch, options.maxT, options.maxU, alphabet_size, workspace,
-                                options.blank_label, options.fastemit_lambda, options.num_threads, options.stream);
-
-        if (gradients != NULL)
-            return rnnt.cost_and_grad(activations, gradients,
-                                        costs,
-                                        flat_labels, label_lengths,
-                                        input_lengths);
-        else
-            return rnnt.score_forward(activations, costs, flat_labels,
-                                        label_lengths, input_lengths);
-#else
-        std::cerr << "GPU execution requested, but not compiled with GPU support" << std::endl;
-        return RNNT_STATUS_EXECUTION_FAILED;
-#endif
-    } else {
-        return RNNT_STATUS_INVALID_VALUE;
-    }
-}
-
-}
diff --git a/src/rnnt_entrypoint.cu b/src/rnnt_entrypoint.cu
deleted file mode 120000
index c483c3c..0000000
--- a/src/rnnt_entrypoint.cu
+++ /dev/null
@@ -1 +0,0 @@
-rnnt_entrypoint.cpp
\ No newline at end of file
diff --git a/src/rnnt_entrypoint.cu b/src/rnnt_entrypoint.cu
new file mode 100644
index 0000000..c213f4c
--- /dev/null
+++ b/src/rnnt_entrypoint.cu
@@ -0,0 +1,197 @@
+#include <cstddef>
+#include <iostream>
+#include <algorithm>
+
+#include <rnnt.h>
+
+#include "detail/cpu_rnnt.h"
+#ifdef __CUDACC__
+    #include "detail/gpu_rnnt.h"
+#endif
+
+extern "C" {
+
+int get_warprnnt_version() {
+    return 1;
+}
+
+const char* rnntGetStatusString(rnntStatus_t status) {
+    switch (status) {
+    case RNNT_STATUS_SUCCESS:
+        return "no error";
+    case RNNT_STATUS_MEMOPS_FAILED:
+        return "cuda memcpy or memset failed";
+    case RNNT_STATUS_INVALID_VALUE:
+        return "invalid value";
+    case RNNT_STATUS_EXECUTION_FAILED:
+        return "execution failed";
+
+    case RNNT_STATUS_UNKNOWN_ERROR:
+    default:
+        return "unknown error";
+
+    }
+
+}
+
+
+rnntStatus_t compute_rnnt_loss(const float* const activations, //BTUV
+                             float* gradients,
+                             const int* const flat_labels,
+                             const int* const label_lengths,
+                             const int* const input_lengths,
+                             int alphabet_size,
+                             int minibatch,
+                             float *costs,
+                             void *workspace,
+                             rnntOptions options) {
+
+    if (activations == nullptr ||
+        flat_labels == nullptr ||
+        label_lengths == nullptr ||
+        input_lengths == nullptr ||
+        costs == nullptr ||
+        workspace == nullptr ||
+        alphabet_size <= 0 ||
+        minibatch <= 0 ||
+        options.maxT <= 0 ||
+        options.maxU <= 0 ||
+        options.fastemit_lambda < 0)
+        return RNNT_STATUS_INVALID_VALUE;
+
+    if (options.loc == RNNT_CPU) {
+        CpuRNNT<float> rnnt(minibatch, options.maxT, options.maxU, alphabet_size, workspace, 
+                            options.blank_label, options.fastemit_lambda, options.num_threads, options.batch_first);
+
+        if (gradients != NULL)
+            return rnnt.cost_and_grad(activations, gradients,
+                                        costs,
+                                        flat_labels, label_lengths,
+                                        input_lengths);
+        else
+            return rnnt.score_forward(activations, costs, flat_labels,
+                                        label_lengths, input_lengths);
+    } else if (options.loc == RNNT_GPU) {
+#ifdef __CUDACC__
+        GpuRNNT<float> rnnt(minibatch, options.maxT, options.maxU, alphabet_size, workspace,
+                            options.blank_label, options.fastemit_lambda, options.num_threads, options.stream);
+
+        if (gradients != NULL)
+            return rnnt.cost_and_grad(activations, gradients,
+                                        costs,
+                                        flat_labels, label_lengths,
+                                        input_lengths);
+        else
+            return rnnt.score_forward(activations, costs, flat_labels,
+                                        label_lengths, input_lengths);
+#else
+        std::cerr << "GPU execution requested, but not compiled with GPU support" << std::endl;
+        return RNNT_STATUS_EXECUTION_FAILED;
+#endif
+    } else {
+        return RNNT_STATUS_INVALID_VALUE;
+    }
+}
+
+
+rnntStatus_t get_workspace_size(int maxT, int maxU,
+                               int minibatch,
+                               bool gpu,
+                               size_t* size_bytes,
+                               size_t dtype_size)
+{
+    if (minibatch <= 0 ||
+        maxT <= 0 ||
+        maxU <= 0)
+        return RNNT_STATUS_INVALID_VALUE;
+
+    *size_bytes = 0;
+
+    // per minibatch memory
+    size_t per_minibatch_bytes = 0;
+
+    // alphas & betas
+    per_minibatch_bytes += dtype_size * maxT * maxU * 2;
+
+    if (!gpu) {
+        // blank & label log probability cache
+        per_minibatch_bytes += dtype_size * maxT * maxU * 2;
+    } else {
+        // softmax denominator
+        per_minibatch_bytes += dtype_size * maxT * maxU;
+        // forward-backward loglikelihood
+        per_minibatch_bytes += dtype_size * 2;
+    }
+
+    *size_bytes = per_minibatch_bytes * minibatch;
+
+    return RNNT_STATUS_SUCCESS;
+}
+
+rnntStatus_t compute_rnnt_loss_fp64(const double* const activations, //BTUV
+                             double* gradients,
+                             const int* const flat_labels,
+                             const int* const label_lengths,
+                             const int* const input_lengths,
+                             int alphabet_size,
+                             int minibatch,
+                             double *costs,
+                             void *workspace,
+                             rnntOptions options) {
+
+    if (activations == nullptr ||
+        flat_labels == nullptr ||
+        label_lengths == nullptr ||
+        input_lengths == nullptr ||
+        costs == nullptr ||
+        workspace == nullptr ||
+        alphabet_size <= 0 ||
+        minibatch <= 0 ||
+        options.maxT <= 0 ||
+        options.maxU <= 0 ||
+        options.fastemit_lambda < 0)
+        return RNNT_STATUS_INVALID_VALUE;
+
+    if (options.loc == RNNT_CPU) {
+        CpuRNNT<double> rnnt(minibatch, options.maxT, options.maxU, alphabet_size, workspace, 
+                                options.blank_label, options.fastemit_lambda, options.num_threads, options.batch_first);
+
+        if (gradients != NULL)
+            return rnnt.cost_and_grad(activations, gradients,
+                                        costs,
+                                        flat_labels, label_lengths,
+                                        input_lengths);
+        else
+            return rnnt.score_forward(activations, costs, flat_labels,
+                                        label_lengths, input_lengths);
+    } else if (options.loc == RNNT_GPU) {
+#ifdef __CUDACC__
+        GpuRNNT<double> rnnt(minibatch, options.maxT, options.maxU, alphabet_size, workspace,
+                                options.blank_label, options.fastemit_lambda, options.num_threads, options.stream);
+
+        if (gradients != NULL)
+            return rnnt.cost_and_grad(activations, gradients,
+                                        costs,
+                                        flat_labels, label_lengths,
+                                        input_lengths);
+        else
+            return rnnt.score_forward(activations, costs, flat_labels,
+                                        label_lengths, input_lengths);
+#else
+        std::cerr << "GPU execution requested, but not compiled with GPU support" << std::endl;
+        return RNNT_STATUS_EXECUTION_FAILED;
+#endif
+    } else {
+        return RNNT_STATUS_INVALID_VALUE;
+    }
+}
+
+const char* rnnt_backend_mode() {
+#ifdef __CUDACC__
+    return "GPU";
+#else
+    return "CPU";
+#endif
+}
+
+}
diff --git a/tests/test_cpu.cpp b/tests/test_cpu.cpp
index 554a18b..fee4eb5 100644
--- a/tests/test_cpu.cpp
+++ b/tests/test_cpu.cpp
@@ -239,7 +239,7 @@ bool inf_test() {
     return status;
 }
 
-float numeric_grad(std::vector<float>& acts, std::vector<int>& flat_labels, std::vector<int>& label_lengths,
+void numeric_grad(std::vector<float>& acts, std::vector<int>& flat_labels, std::vector<int>& label_lengths,
                 std::vector<int> sizes, int alphabet_size, int minibatch, 
                 void* rnnt_cpu_workspace, rnntOptions& options, std::vector<float>& num_grad) {
 
@@ -403,4 +403,4 @@ int main(void) {
         std::cout << "Some or all tests fail" << std::endl;
         return 1;
     }
-}
\ No newline at end of file
+}
diff --git a/tests/test_time.cpp b/tests/test_time.cpp
index 3083b0e..ce77ba6 100644
--- a/tests/test_time.cpp
+++ b/tests/test_time.cpp
@@ -12,7 +12,7 @@
 
 #include "test.h"
 
-bool run_test(int B, int T, int L, int A, int num_threads) {
+void run_test(int B, int T, int L, int A, int num_threads) {
     std::mt19937 gen(2);
 
     auto start = std::chrono::high_resolution_clock::now();
@@ -119,4 +119,4 @@ int main(int argc, char** argv) {
     }
 
     run_test(B, T, L, A, num_threads);
-}
\ No newline at end of file
+}

From a6f8c2210f8d41dd323404ff3fac35bab83e380d Mon Sep 17 00:00:00 2001
From: ljn7 <john.nirmal@tihiitb.org>
Date: Sat, 26 Jul 2025 14:15:05 +0000
Subject: [PATCH 12/14] Improve GPU build configuration and clarify CUDA usage

- Added robust detection of CUDA toolkit using CUDAToolkit and CUDA packages
- Introduced WITH_GPU option to allow user-controlled GPU builds
- Automatically fallback to CPU-only build if CUDA is unavailable or WITH_GPU=OFF
- Improved status messages to reflect user intent and system CUDA availability
- Added support for CUDA architecture detection and override via CMAKE_CUDA_ARCHITECTURES
- Improved build summary output for clarity
---
 CMakeLists.txt | 231 +++++++++++++++++++++++++++++--------------------
 1 file changed, 137 insertions(+), 94 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index a31e5a4..9c8839c 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,17 +1,17 @@
 cmake_minimum_required(VERSION 3.10)
+project(rnnt_release LANGUAGES CXX)
 
-option(WITH_GPU "Build with GPU (CUDA) support" ON)
-
+# ==== CUDA DETECTION ====
 set(CUDA_FOUND FALSE)
 set(CUDA_TOOLKIT_VERSION "0.0")
 
 if(CMAKE_VERSION VERSION_GREATER_EQUAL "3.18")
     find_package(CUDAToolkit QUIET)
-
+    
     if(CMAKE_VERSION VERSION_LESS "3.27")
         find_package(CUDA QUIET)
     endif()
-
+    
     if(CUDAToolkit_FOUND OR CUDA_FOUND)
         set(CUDA_FOUND TRUE)
         if(CUDAToolkit_FOUND)
@@ -22,75 +22,105 @@ if(CMAKE_VERSION VERSION_GREATER_EQUAL "3.18")
     endif()
 else()
     find_package(CUDA QUIET)
-    if(CUDA_FOUND)
-        if(DEFINED CUDA_VERSION AND NOT "${CUDA_VERSION}" STREQUAL "")
-            set(CUDA_TOOLKIT_VERSION "${CUDA_VERSION}")
-        endif()
+    if(CUDA_FOUND AND DEFINED CUDA_VERSION AND NOT "${CUDA_VERSION}" STREQUAL "")
+        set(CUDA_TOOLKIT_VERSION "${CUDA_VERSION}")
     endif()
 endif()
 
-if(NOT WITH_GPU)
-    message(STATUS "WITH_GPU=OFF: forcing CPU-only build, even if CUDA is available")
-    set(CUDA_FOUND FALSE)
-endif()
-
-if(WITH_GPU AND NOT CUDA_FOUND)
-    message(FATAL_ERROR "WITH_GPU=ON but no compatible CUDA toolkit was found")
-endif()
-
-if(WITH_GPU AND CUDA_FOUND)
-    project(rnnt_release LANGUAGES CXX CUDA)
-    set(USE_CUDA TRUE)
-    message(STATUS "CUDA ${CUDA_TOOLKIT_VERSION} found, GPU build enabled")
-else()
-    project(rnnt_release LANGUAGES CXX)
-    set(USE_CUDA FALSE)
-    message(STATUS "CUDA not found or disabled, building CPU-only version")
-endif()
-
 # ==== USER OPTIONS ====
+option(WITH_GPU "Build with GPU (CUDA) support" ${CUDA_FOUND})
 option(WITH_OMP "Build with OpenMP support" ON)
 option(USE_NAIVE_KERNEL "Use naive alpha-beta kernel" OFF)
 option(DEBUG_TIME "Output kernel time" OFF)
 option(DEBUG_KERNEL "Output alpha beta debug" OFF)
 
+# ==== GPU BUILD CONFIGURATION ====
+if(WITH_GPU)
+    if(NOT CUDA_FOUND)
+        message(WARNING "WITH_GPU=ON was requested, but no compatible CUDA toolkit was found. Falling back to CPU-only build.")
+        set(WITH_GPU OFF CACHE BOOL "Build with GPU (CUDA) support" FORCE)
+        set(USE_CUDA FALSE)
+    else()
+        enable_language(CUDA)
+        set(USE_CUDA TRUE)
+        message(STATUS "Building with CUDA ${CUDA_TOOLKIT_VERSION} support")
+    endif()
+else()
+    set(USE_CUDA FALSE)
+    if(CUDA_FOUND)
+        message(STATUS "CUDA ${CUDA_TOOLKIT_VERSION} detected but GPU support disabled (WITH_GPU=OFF)")
+    else()
+        message(STATUS "Building CPU-only version (no CUDA toolkit found)")
+    endif()
+endif()
+
+# ==== OPENMP CONFIGURATION ====
 if(WITH_OMP)
     find_package(OpenMP)
     if(OpenMP_CXX_FOUND)
-        set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}")
+        message(STATUS "OpenMP support enabled")
+    else()
+        message(WARNING "OpenMP requested but not found")
+        set(WITH_OMP OFF)
     endif()
 endif()
 
+# ==== COMPILER SETTINGS ====
 set(CMAKE_CXX_STANDARD 17)
 set(CMAKE_CXX_STANDARD_REQUIRED ON)
-set(CMAKE_POSITION_INDEPENDENT_CODE ON)
 set(CMAKE_CXX_EXTENSIONS OFF)
+set(CMAKE_POSITION_INDEPENDENT_CODE ON)
 
-if(NOT APPLE)
-    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O2")
+if(NOT CMAKE_BUILD_TYPE)
+    set(CMAKE_BUILD_TYPE "Release" CACHE STRING "Build type" FORCE)
+endif()
+
+if(CMAKE_BUILD_TYPE STREQUAL "Debug")
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O0 -g")
 else()
-    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++17 -O2")
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O2")
+endif()
+
+if(APPLE)
     add_definitions(-DAPPLE)
+    execute_process(COMMAND uname -v OUTPUT_VARIABLE DARWIN_VERSION)
+    string(REGEX MATCH "[0-9]+" DARWIN_VERSION "${DARWIN_VERSION}")
+    message(STATUS "Darwin version: ${DARWIN_VERSION}")
+    if(${DARWIN_VERSION} LESS 15)
+        set(CMAKE_SKIP_RPATH TRUE)
+    endif()
+else()
+    set(CMAKE_SKIP_RPATH TRUE)
 endif()
 
 include_directories(include)
 
-# ==== PREPROCESSOR FLAGS ====
+# ==== PREPROCESSOR DEFINITIONS ====
 set(COMMON_DEFINITIONS "")
+
+if(USE_CUDA)
+    list(APPEND COMMON_DEFINITIONS -DWITH_CUDA=1)
+else()
+    list(APPEND COMMON_DEFINITIONS -DWITH_CUDA=0)
+endif()
+
 if(USE_NAIVE_KERNEL)
     list(APPEND COMMON_DEFINITIONS -DUSE_NAIVE_KERNEL)
 endif()
+
 if(DEBUG_TIME)
     list(APPEND COMMON_DEFINITIONS -DDEBUG_TIME)
 endif()
+
 if(DEBUG_KERNEL)
     list(APPEND COMMON_DEFINITIONS -DDEBUG_KERNEL)
 endif()
+
 if(NOT WITH_OMP)
     list(APPEND COMMON_DEFINITIONS -DRNNT_DISABLE_OMP)
 endif()
 
-# ==== CUDA SETUP (only if WITH_GPU is ON) ====
+# ==== CUDA ARCHITECTURE SETUP ====
 if(USE_CUDA)
     function(cuda_version_major VERSION OUTVAR)
         if(NOT "${VERSION}" STREQUAL "")
@@ -123,7 +153,7 @@ if(USE_CUDA)
     set(CUDA_ARCH_LIST "")
     if(CUDA_TOOLKIT_VERSION STREQUAL "" OR CUDA_VERSION_MAJOR EQUAL 0)
         set(CUDA_ARCH_LIST "52;60;70;75")
-        message(WARNING "Could not detect CUDA version. Defaulting to minimal arch set (52;60;70;75).")
+        message(WARNING "Could not detect CUDA version. Using minimal architecture set: ${CUDA_ARCH_LIST}")
     elseif(CUDA_VERSION_MAJOR LESS 11)
         set(CUDA_ARCH_LIST "52;60;70;75")
     elseif(CUDA_VERSION_MAJOR EQUAL 11)
@@ -134,42 +164,33 @@ if(USE_CUDA)
 
     if(DEFINED CMAKE_CUDA_ARCHITECTURES AND NOT "${CMAKE_CUDA_ARCHITECTURES}" STREQUAL "")
         set(CUDA_ARCH_LIST "${CMAKE_CUDA_ARCHITECTURES}")
+        message(STATUS "Using user-specified CUDA architectures: ${CUDA_ARCH_LIST}")
+    else()
+        message(STATUS "Auto-selected CUDA architectures for CUDA ${CUDA_TOOLKIT_VERSION}: ${CUDA_ARCH_LIST}")
+        message(STATUS "To override, set -DCMAKE_CUDA_ARCHITECTURES=xx;yy;zz")
     endif()
 
     if(CMAKE_VERSION VERSION_GREATER_EQUAL "3.18")
         set(CMAKE_CUDA_ARCHITECTURES "${CUDA_ARCH_LIST}" CACHE STRING "Target GPU architectures")
-        message(STATUS "Auto-selected CUDA archs for CUDA ${CUDA_TOOLKIT_VERSION}: ${CMAKE_CUDA_ARCHITECTURES}")
     else()
-        string(REPLACE ";" " " CUDA_ARCH_FLAGS "${CUDA_ARCH_LIST}")
-        set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} ")
+        set(CUDA_NVCC_FLAGS "")
         foreach(ARCH ${CUDA_ARCH_LIST})
             set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_${ARCH},code=sm_${ARCH}")
         endforeach()
-        set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} --std=c++11")
-        message(STATUS "Auto-selected CUDA NVCC_ARCH flags for CUDA ${CUDA_TOOLKIT_VERSION}: ${CUDA_NVCC_FLAGS}")
+        message(STATUS "CUDA NVCC flags: ${CUDA_NVCC_FLAGS}")
     endif()
 
-    if(CUDA_TOOLKIT_VERSION AND CUDA_VERSION_MAJOR LESS 11)
-        if(CUDA_ARCH_LIST MATCHES "80|86|89")
-            message(WARNING "\nDetected CUDA ${CUDA_TOOLKIT_VERSION}.
-            Some requested architectures (${CUDA_ARCH_LIST}) require CUDA >= 11 (Ampere/Ada).
-            Remove sm_80/86/89 or upgrade CUDA Toolkit if you see NVCC errors.\n")
+    if(CUDA_VERSION_MAJOR GREATER 0 AND CUDA_VERSION_MAJOR LESS 11)
+        string(REGEX MATCH "8[0689]" AMPERE_ARCH "${CUDA_ARCH_LIST}")
+        if(AMPERE_ARCH)
+            message(WARNING 
+                "Detected CUDA ${CUDA_TOOLKIT_VERSION} with Ampere/Ada architectures (${AMPERE_ARCH}) "
+                "which require CUDA >= 11. Consider upgrading CUDA or removing sm_80/86/89 architectures.")
         endif()
     endif()
 endif()
 
-# ==== Apple/Mac rpath ====
-if(APPLE)
-    execute_process(COMMAND uname -v OUTPUT_VARIABLE DARWIN_VERSION)
-    string(REGEX MATCH "[0-9]+" DARWIN_VERSION "${DARWIN_VERSION}")
-    message(STATUS "DARWIN_VERSION=${DARWIN_VERSION}")
-    if(${DARWIN_VERSION} LESS 15)
-        set(CMAKE_SKIP_RPATH TRUE)
-    endif()
-else()
-    set(CMAKE_SKIP_RPATH TRUE)
-endif()
-
+# ==== SOURCE FILE PREPARATION ====
 set(RNNT_ENTRYPOINT_SRC src/rnnt_entrypoint.cu)
 if(NOT USE_CUDA)
     configure_file(
@@ -180,51 +201,53 @@ if(NOT USE_CUDA)
     set(RNNT_ENTRYPOINT_SRC ${CMAKE_CURRENT_BINARY_DIR}/rnnt_entrypoint_cpu.cpp)
 endif()
 
-# ==== Main library target ====
-if(USE_CUDA)
-    message(STATUS "Building shared library with GPU support")
-    if(CMAKE_VERSION VERSION_GREATER_EQUAL "3.18")
-        add_library(warprnnt SHARED ${RNNT_ENTRYPOINT_SRC})
-        target_link_libraries(warprnnt PRIVATE CUDA::cudart CUDA::curand)
-    else()
-        CUDA_ADD_LIBRARY(warprnnt SHARED ${RNNT_ENTRYPOINT_SRC})
-        if(NOT Torch_FOUND)
-            TARGET_LINK_LIBRARIES(warprnnt ${CUDA_curand_LIBRARY})
-        endif()
-    endif()
-else()
-    message(STATUS "Building shared library with no GPU support")
-    add_library(warprnnt SHARED ${RNNT_ENTRYPOINT_SRC})
-endif()
+# ==== MAIN LIBRARY TARGET ====
+add_library(warprnnt SHARED ${RNNT_ENTRYPOINT_SRC})
 
+# Configure library properties
 set_target_properties(warprnnt PROPERTIES
     RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}
     LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}
     ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}
+    CXX_STANDARD 17
+    CXX_STANDARD_REQUIRED ON
 )
 
 if(WIN32)
-    set_target_properties(warprnnt PROPERTIES
-        WINDOWS_EXPORT_ALL_SYMBOLS ON
-    )
+    set_target_properties(warprnnt PROPERTIES WINDOWS_EXPORT_ALL_SYMBOLS ON)
 endif()
 
-target_compile_features(warprnnt PRIVATE cxx_std_17)
 target_compile_definitions(warprnnt PRIVATE ${COMMON_DEFINITIONS})
 target_include_directories(warprnnt PRIVATE include)
 
-if(CMAKE_VERSION VERSION_GREATER_EQUAL "3.18" AND TARGET warprnnt AND USE_CUDA)
-    set_target_properties(warprnnt PROPERTIES
-        CUDA_SEPARABLE_COMPILATION ON
-        CUDA_STANDARD 17
-        CUDA_STANDARD_REQUIRED ON
-    )
-    target_compile_options(warprnnt PRIVATE
-        $<$<COMPILE_LANGUAGE:CUDA>:--expt-relaxed-constexpr>
-    )
+if(USE_CUDA)
+    message(STATUS "Configuring library with GPU support")
+    
+    if(CMAKE_VERSION VERSION_GREATER_EQUAL "3.18")
+        target_link_libraries(warprnnt PRIVATE CUDA::cudart CUDA::curand)
+        set_target_properties(warprnnt PROPERTIES
+            CUDA_SEPARABLE_COMPILATION ON
+            CUDA_STANDARD 17
+            CUDA_STANDARD_REQUIRED ON
+        )
+        target_compile_options(warprnnt PRIVATE
+            $<$<COMPILE_LANGUAGE:CUDA>:--expt-relaxed-constexpr>
+        )
+    else()
+        cuda_add_library(warprnnt SHARED ${RNNT_ENTRYPOINT_SRC})
+        if(NOT Torch_FOUND)
+            target_link_libraries(warprnnt ${CUDA_curand_LIBRARY})
+        endif()
+    endif()
+else()
+    message(STATUS "Configuring library with CPU-only support")
+endif()
+
+if(WITH_OMP AND OpenMP_CXX_FOUND)
+    target_link_libraries(warprnnt PRIVATE OpenMP::OpenMP_CXX)
 endif()
 
-# ==== Test executables ====
+# ==== TEST EXECUTABLES ====
 add_executable(test_cpu tests/test_cpu.cpp tests/random.cpp)
 target_link_libraries(test_cpu PRIVATE warprnnt)
 target_compile_features(test_cpu PRIVATE cxx_std_17)
@@ -237,7 +260,6 @@ if(USE_CUDA)
     if(CMAKE_VERSION VERSION_GREATER_EQUAL "3.18")
         add_executable(test_time_gpu tests/test_time.cu tests/random.cpp)
         target_link_libraries(test_time_gpu PRIVATE warprnnt CUDA::curand)
-        target_compile_features(test_time_gpu PRIVATE cxx_std_17)
         set_target_properties(test_time_gpu PROPERTIES
             CUDA_SEPARABLE_COMPILATION ON
             CUDA_STANDARD 17
@@ -249,7 +271,6 @@ if(USE_CUDA)
 
         add_executable(test_gpu tests/test_gpu.cu tests/random.cpp)
         target_link_libraries(test_gpu PRIVATE warprnnt CUDA::curand)
-        target_compile_features(test_gpu PRIVATE cxx_std_17)
         set_target_properties(test_gpu PROPERTIES
             CUDA_SEPARABLE_COMPILATION ON
             CUDA_STANDARD 17
@@ -260,16 +281,20 @@ if(USE_CUDA)
         )
     else()
         cuda_add_executable(test_time_gpu tests/test_time.cu tests/random.cpp)
-        TARGET_LINK_LIBRARIES(test_time_gpu warprnnt ${CUDA_curand_LIBRARY})
-        set_target_properties(test_time_gpu PROPERTIES COMPILE_FLAGS "${CMAKE_CXX_FLAGS} --std=c++11")
+        target_link_libraries(test_time_gpu warprnnt ${CUDA_curand_LIBRARY})
+        set_target_properties(test_time_gpu PROPERTIES 
+            COMPILE_FLAGS "${CMAKE_CXX_FLAGS} --std=c++11"
+        )
 
         cuda_add_executable(test_gpu tests/test_gpu.cu tests/random.cpp)
-        TARGET_LINK_LIBRARIES(test_gpu warprnnt ${CUDA_curand_LIBRARY})
-        set_target_properties(test_gpu PROPERTIES COMPILE_FLAGS "${CMAKE_CXX_FLAGS} --std=c++11")
+        target_link_libraries(test_gpu warprnnt ${CUDA_curand_LIBRARY})
+        set_target_properties(test_gpu PROPERTIES 
+            COMPILE_FLAGS "${CMAKE_CXX_FLAGS} --std=c++11"
+        )
     endif()
 endif()
 
-# ==== Install targets ====
+# ==== INSTALL TARGETS ====
 install(TARGETS warprnnt
     RUNTIME DESTINATION bin
     LIBRARY DESTINATION lib
@@ -278,3 +303,21 @@ install(TARGETS warprnnt
 
 install(FILES include/rnnt.h DESTINATION include)
 
+# ==== BUILD SUMMARY ====
+message(STATUS "")
+message(STATUS "=== Build Configuration Summary ===")
+message(STATUS "Build Type: ${CMAKE_BUILD_TYPE}")
+message(STATUS "CUDA Available: ${CUDA_FOUND}")
+if(CUDA_FOUND)
+    message(STATUS "CUDA Version: ${CUDA_TOOLKIT_VERSION}")
+endif()
+message(STATUS "GPU Support: ${USE_CUDA}")
+if(USE_CUDA)
+    message(STATUS "CUDA Architectures: ${CMAKE_CUDA_ARCHITECTURES}")
+endif()
+message(STATUS "OpenMP Support: ${WITH_OMP}")
+message(STATUS "Naive Kernel: ${USE_NAIVE_KERNEL}")
+message(STATUS "Debug Time: ${DEBUG_TIME}")
+message(STATUS "Debug Kernel: ${DEBUG_KERNEL}")
+message(STATUS "=====================================")
+message(STATUS "")
\ No newline at end of file

From cd86c036333a31813257cb697a50e5c1159012bc Mon Sep 17 00:00:00 2001
From: John Nirmal <johnnirmal.works@gmail.com>
Date: Sat, 26 Jul 2025 21:09:09 +0530
Subject: [PATCH 13/14] ci: add CPU-only GitHub Actions workflow for build and
 test

- Uses cmake and make to build and run test_cpu
- Targets only CPU environments for faster and simpler CI
---
 .github/workflows/ci.yml | 42 ++++++++++++++++++++++++++++++++++++++++
 1 file changed, 42 insertions(+)
 create mode 100644 .github/workflows/ci.yml

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
new file mode 100644
index 0000000..65f01ae
--- /dev/null
+++ b/.github/workflows/ci.yml
@@ -0,0 +1,42 @@
+name: CI (CPU-only)
+
+on:
+  push:
+    branches: [main, master]
+  pull_request:
+    branches: [main, master]
+
+jobs:
+  build-and-test:
+    runs-on: ubuntu-latest
+
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+
+      - name: Install dependencies
+        run: |
+          sudo apt-get update
+          sudo apt-get install -y cmake build-essential
+
+      - name: Configure CMake (CPU-only)
+        run: |
+          mkdir -p build
+          cd build
+          cmake ..
+
+      - name: Build project
+        run: |
+          cd build
+          make -j$(nproc)
+
+      - name: Install and update linker cache
+        run: |
+          cd build
+          sudo make install
+          sudo ldconfig
+
+      - name: Run CPU tests
+        run: |
+          cd build
+          ./test_cpu

From 48434133ea3e0c2d50fb01b27408d3df5f93715d Mon Sep 17 00:00:00 2001
From: ljn7 <john.nirmal@tihiitb.org>
Date: Thu, 7 Aug 2025 17:18:05 +0530
Subject: [PATCH 14/14] Updated CMakeLists

- Updated for compatability and performance similar to b-flo's implementation
- Removed tensorflow and mxnet bindings (might add as git-modules)
---
 CMakeLists.txt                                | 232 ++++++------------
 mxnet_binding                                 |   1 -
 tensorflow_binding/README.md                  |  97 --------
 tensorflow_binding/setup.py                   | 149 -----------
 tensorflow_binding/src/warprnnt_op.cc         | 191 --------------
 tensorflow_binding/tests/test_basic.py        |  56 -----
 tensorflow_binding/tests/test_warprnnt_op.py  |  97 --------
 .../warprnnt_tensorflow/__init__.py           |  48 ----
 8 files changed, 79 insertions(+), 792 deletions(-)
 mode change 100755 => 100644 CMakeLists.txt
 delete mode 160000 mxnet_binding
 delete mode 100644 tensorflow_binding/README.md
 delete mode 100644 tensorflow_binding/setup.py
 delete mode 100644 tensorflow_binding/src/warprnnt_op.cc
 delete mode 100644 tensorflow_binding/tests/test_basic.py
 delete mode 100644 tensorflow_binding/tests/test_warprnnt_op.py
 delete mode 100644 tensorflow_binding/warprnnt_tensorflow/__init__.py

diff --git a/CMakeLists.txt b/CMakeLists.txt
old mode 100755
new mode 100644
index 9c8839c..f422901
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -5,13 +5,11 @@ project(rnnt_release LANGUAGES CXX)
 set(CUDA_FOUND FALSE)
 set(CUDA_TOOLKIT_VERSION "0.0")
 
+find_package(CUDA QUIET)
+
 if(CMAKE_VERSION VERSION_GREATER_EQUAL "3.18")
     find_package(CUDAToolkit QUIET)
     
-    if(CMAKE_VERSION VERSION_LESS "3.27")
-        find_package(CUDA QUIET)
-    endif()
-    
     if(CUDAToolkit_FOUND OR CUDA_FOUND)
         set(CUDA_FOUND TRUE)
         if(CUDAToolkit_FOUND)
@@ -21,7 +19,6 @@ if(CMAKE_VERSION VERSION_GREATER_EQUAL "3.18")
         endif()
     endif()
 else()
-    find_package(CUDA QUIET)
     if(CUDA_FOUND AND DEFINED CUDA_VERSION AND NOT "${CUDA_VERSION}" STREQUAL "")
         set(CUDA_TOOLKIT_VERSION "${CUDA_VERSION}")
     endif()
@@ -41,7 +38,6 @@ if(WITH_GPU)
         set(WITH_GPU OFF CACHE BOOL "Build with GPU (CUDA) support" FORCE)
         set(USE_CUDA FALSE)
     else()
-        enable_language(CUDA)
         set(USE_CUDA TRUE)
         message(STATUS "Building with CUDA ${CUDA_TOOLKIT_VERSION} support")
     endif()
@@ -69,7 +65,6 @@ endif()
 set(CMAKE_CXX_STANDARD 17)
 set(CMAKE_CXX_STANDARD_REQUIRED ON)
 set(CMAKE_CXX_EXTENSIONS OFF)
-set(CMAKE_POSITION_INDEPENDENT_CODE ON)
 
 if(NOT CMAKE_BUILD_TYPE)
     set(CMAKE_BUILD_TYPE "Release" CACHE STRING "Build type" FORCE)
@@ -81,6 +76,10 @@ else()
     set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O2")
 endif()
 
+if(WITH_OMP)
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fopenmp")
+endif()
+
 if(APPLE)
     add_definitions(-DAPPLE)
     execute_process(COMMAND uname -v OUTPUT_VARIABLE DARWIN_VERSION)
@@ -96,98 +95,64 @@ endif()
 include_directories(include)
 
 # ==== PREPROCESSOR DEFINITIONS ====
-set(COMMON_DEFINITIONS "")
-
-if(USE_CUDA)
-    list(APPEND COMMON_DEFINITIONS -DWITH_CUDA=1)
-else()
-    list(APPEND COMMON_DEFINITIONS -DWITH_CUDA=0)
-endif()
-
 if(USE_NAIVE_KERNEL)
-    list(APPEND COMMON_DEFINITIONS -DUSE_NAIVE_KERNEL)
+    add_definitions(-DUSE_NAIVE_KERNEL)
 endif()
 
 if(DEBUG_TIME)
-    list(APPEND COMMON_DEFINITIONS -DDEBUG_TIME)
+    add_definitions(-DDEBUG_TIME)
 endif()
 
 if(DEBUG_KERNEL)
-    list(APPEND COMMON_DEFINITIONS -DDEBUG_KERNEL)
+    add_definitions(-DDEBUG_KERNEL)
 endif()
 
 if(NOT WITH_OMP)
-    list(APPEND COMMON_DEFINITIONS -DRNNT_DISABLE_OMP)
+    add_definitions(-DRNNT_DISABLE_OMP)
 endif()
 
-# ==== CUDA ARCHITECTURE SETUP ====
+# ==== CUDA CONFIGURATION ====
 if(USE_CUDA)
-    function(cuda_version_major VERSION OUTVAR)
-        if(NOT "${VERSION}" STREQUAL "")
-            string(REPLACE "." ";" TMP_LIST ${VERSION})
-            list(GET TMP_LIST 0 MAJOR)
-            set(${OUTVAR} "${MAJOR}" PARENT_SCOPE)
-        else()
-            set(${OUTVAR} "0" PARENT_SCOPE)
-        endif()
-    endfunction()
-
-    function(cuda_version_minor VERSION OUTVAR)
-        if(NOT "${VERSION}" STREQUAL "")
-            string(REPLACE "." ";" TMP_LIST ${VERSION})
-            list(LENGTH TMP_LIST TMP_LEN)
-            if(TMP_LEN GREATER 1)
-                list(GET TMP_LIST 1 MINOR)
-            else()
-                set(MINOR "0")
-            endif()
-            set(${OUTVAR} "${MINOR}" PARENT_SCOPE)
-        else()
-            set(${OUTVAR} "0" PARENT_SCOPE)
-        endif()
-    endfunction()
-
-    cuda_version_major("${CUDA_TOOLKIT_VERSION}" CUDA_VERSION_MAJOR)
-    cuda_version_minor("${CUDA_TOOLKIT_VERSION}" CUDA_VERSION_MINOR)
-
-    set(CUDA_ARCH_LIST "")
-    if(CUDA_TOOLKIT_VERSION STREQUAL "" OR CUDA_VERSION_MAJOR EQUAL 0)
-        set(CUDA_ARCH_LIST "52;60;70;75")
-        message(WARNING "Could not detect CUDA version. Using minimal architecture set: ${CUDA_ARCH_LIST}")
-    elseif(CUDA_VERSION_MAJOR LESS 11)
-        set(CUDA_ARCH_LIST "52;60;70;75")
-    elseif(CUDA_VERSION_MAJOR EQUAL 11)
-        set(CUDA_ARCH_LIST "52;60;70;75;80;86")
-    elseif(CUDA_VERSION_MAJOR GREATER_EQUAL 12)
-        set(CUDA_ARCH_LIST "52;60;70;75;80;86;89")
+    if(WITH_OMP)
+        set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -Xcompiler -fopenmp")
     endif()
 
-    if(DEFINED CMAKE_CUDA_ARCHITECTURES AND NOT "${CMAKE_CUDA_ARCHITECTURES}" STREQUAL "")
-        set(CUDA_ARCH_LIST "${CMAKE_CUDA_ARCHITECTURES}")
-        message(STATUS "Using user-specified CUDA architectures: ${CUDA_ARCH_LIST}")
-    else()
-        message(STATUS "Auto-selected CUDA architectures for CUDA ${CUDA_TOOLKIT_VERSION}: ${CUDA_ARCH_LIST}")
-        message(STATUS "To override, set -DCMAKE_CUDA_ARCHITECTURES=xx;yy;zz")
+    if(NOT (CUDA_VERSION GREATER 10.2))
+        set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_30,code=sm_30 -O2")
+        set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_35,code=sm_35")
+        set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_50,code=sm_50")
     endif()
 
-    if(CMAKE_VERSION VERSION_GREATER_EQUAL "3.18")
-        set(CMAKE_CUDA_ARCHITECTURES "${CUDA_ARCH_LIST}" CACHE STRING "Target GPU architectures")
-    else()
-        set(CUDA_NVCC_FLAGS "")
-        foreach(ARCH ${CUDA_ARCH_LIST})
-            set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_${ARCH},code=sm_${ARCH}")
-        endforeach()
-        message(STATUS "CUDA NVCC flags: ${CUDA_NVCC_FLAGS}")
+    set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_52,code=sm_52")
+    
+    if(CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 5)
+        set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -D_MWAITXINTRIN_H_INCLUDED -D_FORCE_INLINES")
     endif()
 
-    if(CUDA_VERSION_MAJOR GREATER 0 AND CUDA_VERSION_MAJOR LESS 11)
-        string(REGEX MATCH "8[0689]" AMPERE_ARCH "${CUDA_ARCH_LIST}")
-        if(AMPERE_ARCH)
-            message(WARNING 
-                "Detected CUDA ${CUDA_TOOLKIT_VERSION} with Ampere/Ada architectures (${AMPERE_ARCH}) "
-                "which require CUDA >= 11. Consider upgrading CUDA or removing sm_80/86/89 architectures.")
-        endif()
+    if(CUDA_VERSION GREATER 7.6)
+        set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_60,code=sm_60")
+        set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_61,code=sm_61")
+        set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_62,code=sm_62")
+    endif()
+
+    if(CUDA_VERSION GREATER 8.9)
+        set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_70,code=sm_70")
+    endif()
+
+    if(CUDA_VERSION GREATER 9.9)
+        set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_75,code=sm_75")
+    endif()
+
+    if(CUDA_VERSION GREATER 11.0)
+        set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_80,code=sm_80")
+        set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_86,code=sm_86")
+    endif()
+
+    if(NOT APPLE)
+        set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} --std=c++17")
     endif()
+
+    message(STATUS "CUDA NVCC flags: ${CUDA_NVCC_FLAGS}")
 endif()
 
 # ==== SOURCE FILE PREPARATION ====
@@ -202,96 +167,57 @@ if(NOT USE_CUDA)
 endif()
 
 # ==== MAIN LIBRARY TARGET ====
-add_library(warprnnt SHARED ${RNNT_ENTRYPOINT_SRC})
+if(USE_CUDA)
+    message(STATUS "Building shared library with GPU support")
+    
+    cuda_add_library(warprnnt SHARED ${RNNT_ENTRYPOINT_SRC})
+    
+    if(NOT Torch_FOUND)
+        target_link_libraries(warprnnt ${CUDA_curand_LIBRARY})
+    endif()
+    
+    if(WITH_OMP AND OpenMP_CXX_FOUND)
+        target_link_libraries(warprnnt ${OpenMP_CXX_LIBRARIES})
+    endif()
+else()
+    message(STATUS "Building shared library with CPU-only support")
+    add_library(warprnnt SHARED ${RNNT_ENTRYPOINT_SRC})
+    
+    if(WITH_OMP AND OpenMP_CXX_FOUND)
+        target_link_libraries(warprnnt PRIVATE OpenMP::OpenMP_CXX)
+    endif()
+endif()
 
-# Configure library properties
 set_target_properties(warprnnt PROPERTIES
     RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}
     LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}
     ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}
-    CXX_STANDARD 17
-    CXX_STANDARD_REQUIRED ON
 )
 
 if(WIN32)
     set_target_properties(warprnnt PROPERTIES WINDOWS_EXPORT_ALL_SYMBOLS ON)
 endif()
 
-target_compile_definitions(warprnnt PRIVATE ${COMMON_DEFINITIONS})
-target_include_directories(warprnnt PRIVATE include)
-
-if(USE_CUDA)
-    message(STATUS "Configuring library with GPU support")
-    
-    if(CMAKE_VERSION VERSION_GREATER_EQUAL "3.18")
-        target_link_libraries(warprnnt PRIVATE CUDA::cudart CUDA::curand)
-        set_target_properties(warprnnt PROPERTIES
-            CUDA_SEPARABLE_COMPILATION ON
-            CUDA_STANDARD 17
-            CUDA_STANDARD_REQUIRED ON
-        )
-        target_compile_options(warprnnt PRIVATE
-            $<$<COMPILE_LANGUAGE:CUDA>:--expt-relaxed-constexpr>
-        )
-    else()
-        cuda_add_library(warprnnt SHARED ${RNNT_ENTRYPOINT_SRC})
-        if(NOT Torch_FOUND)
-            target_link_libraries(warprnnt ${CUDA_curand_LIBRARY})
-        endif()
-    endif()
-else()
-    message(STATUS "Configuring library with CPU-only support")
-endif()
-
-if(WITH_OMP AND OpenMP_CXX_FOUND)
-    target_link_libraries(warprnnt PRIVATE OpenMP::OpenMP_CXX)
-endif()
+# Use include_directories (global) since we're mixing legacy/modern approaches
+# target_include_directories(warprnnt PRIVATE include) # Don't use this with legacy CUDA
 
 # ==== TEST EXECUTABLES ====
 add_executable(test_cpu tests/test_cpu.cpp tests/random.cpp)
-target_link_libraries(test_cpu PRIVATE warprnnt)
-target_compile_features(test_cpu PRIVATE cxx_std_17)
+target_link_libraries(test_cpu warprnnt)
+set_target_properties(test_cpu PROPERTIES COMPILE_FLAGS "${CMAKE_CXX_FLAGS} --std=c++11")
 
 add_executable(test_time tests/test_time.cpp tests/random.cpp)
-target_link_libraries(test_time PRIVATE warprnnt)
-target_compile_features(test_time PRIVATE cxx_std_17)
+target_link_libraries(test_time warprnnt)
+set_target_properties(test_time PROPERTIES COMPILE_FLAGS "${CMAKE_CXX_FLAGS} --std=c++11")
 
 if(USE_CUDA)
-    if(CMAKE_VERSION VERSION_GREATER_EQUAL "3.18")
-        add_executable(test_time_gpu tests/test_time.cu tests/random.cpp)
-        target_link_libraries(test_time_gpu PRIVATE warprnnt CUDA::curand)
-        set_target_properties(test_time_gpu PROPERTIES
-            CUDA_SEPARABLE_COMPILATION ON
-            CUDA_STANDARD 17
-            CUDA_STANDARD_REQUIRED ON
-        )
-        target_compile_options(test_time_gpu PRIVATE
-            $<$<COMPILE_LANGUAGE:CUDA>:--expt-relaxed-constexpr>
-        )
-
-        add_executable(test_gpu tests/test_gpu.cu tests/random.cpp)
-        target_link_libraries(test_gpu PRIVATE warprnnt CUDA::curand)
-        set_target_properties(test_gpu PROPERTIES
-            CUDA_SEPARABLE_COMPILATION ON
-            CUDA_STANDARD 17
-            CUDA_STANDARD_REQUIRED ON
-        )
-        target_compile_options(test_gpu PRIVATE
-            $<$<COMPILE_LANGUAGE:CUDA>:--expt-relaxed-constexpr>
-        )
-    else()
-        cuda_add_executable(test_time_gpu tests/test_time.cu tests/random.cpp)
-        target_link_libraries(test_time_gpu warprnnt ${CUDA_curand_LIBRARY})
-        set_target_properties(test_time_gpu PROPERTIES 
-            COMPILE_FLAGS "${CMAKE_CXX_FLAGS} --std=c++11"
-        )
-
-        cuda_add_executable(test_gpu tests/test_gpu.cu tests/random.cpp)
-        target_link_libraries(test_gpu warprnnt ${CUDA_curand_LIBRARY})
-        set_target_properties(test_gpu PROPERTIES 
-            COMPILE_FLAGS "${CMAKE_CXX_FLAGS} --std=c++11"
-        )
-    endif()
+    cuda_add_executable(test_time_gpu tests/test_time.cu tests/random.cpp)
+    target_link_libraries(test_time_gpu warprnnt ${CUDA_curand_LIBRARY})
+    set_target_properties(test_time_gpu PROPERTIES COMPILE_FLAGS "${CMAKE_CXX_FLAGS} --std=c++11")
+
+    cuda_add_executable(test_gpu tests/test_gpu.cu tests/random.cpp)
+    target_link_libraries(test_gpu warprnnt ${CUDA_curand_LIBRARY})
+    set_target_properties(test_gpu PROPERTIES COMPILE_FLAGS "${CMAKE_CXX_FLAGS} --std=c++11")
 endif()
 
 # ==== INSTALL TARGETS ====
@@ -313,11 +239,11 @@ if(CUDA_FOUND)
 endif()
 message(STATUS "GPU Support: ${USE_CUDA}")
 if(USE_CUDA)
-    message(STATUS "CUDA Architectures: ${CMAKE_CUDA_ARCHITECTURES}")
+    message(STATUS "CUDA NVCC Flags: ${CUDA_NVCC_FLAGS}")
 endif()
 message(STATUS "OpenMP Support: ${WITH_OMP}")
 message(STATUS "Naive Kernel: ${USE_NAIVE_KERNEL}")
 message(STATUS "Debug Time: ${DEBUG_TIME}")
 message(STATUS "Debug Kernel: ${DEBUG_KERNEL}")
 message(STATUS "=====================================")
-message(STATUS "")
\ No newline at end of file
+message(STATUS "")
diff --git a/mxnet_binding b/mxnet_binding
deleted file mode 160000
index 75874b2..0000000
--- a/mxnet_binding
+++ /dev/null
@@ -1 +0,0 @@
-Subproject commit 75874b21a476dd9f9e5480841f0872f9d7c2723e
diff --git a/tensorflow_binding/README.md b/tensorflow_binding/README.md
deleted file mode 100644
index b35ae4c..0000000
--- a/tensorflow_binding/README.md
+++ /dev/null
@@ -1,97 +0,0 @@
-# TensorFlow binding for WarpRNNT
-
-This package provides TensorFlow kernels that wrap the WarpRNNT
-library.
-
-## Installation
-
-To build the kernels it is necessary to have the TensorFlow source
-code available, since TensorFlow doesn't currently install the
-necessary headers to handle the SparseTensor that the CTCLoss op uses
-to input the labels.  You can retrieve the TensorFlow source from
-github.com:
-
-```bash
-git clone https://github.com/tensorflow/tensorflow.git
-```
-<!--
-Tell the build scripts where you have the TensorFlow source tree by
-setting the `TENSORFLOW_SRC_PATH` environment variable:
-
-```bash
-export TENSORFLOW_SRC_PATH=/path/to/tensorflow
-```
--->
-`WARP_RNNT_PATH` should be set to the location of a built WarpRNNT
-(i.e. `libwarprnnt.so`).  This defaults to `../build`, so from within a
-new warp-rnnt clone you could build WarpRNNT like this:
-
-```bash
-mkdir build; cd build
-cmake -DCUDA_TOOLKIT_ROOT_DIR=/path/to/cuda ..
-make
-```
-
-Otherwise, set `WARP_RNNT_PATH` to wherever you have `libwarprnnt.so`
-installed. If you have a GPU, you should also make sure that
-`CUDA_HOME` is set to the home cuda directory (i.e. where
-`include/cuda.h` and `lib/libcudart.so` live).
-
-You should now be able to use `setup.py` to install the package into
-your current Python environment:
-
-```bash
-CUDA=/path/to/cuda python setup.py install
-```
-
-You can run a few unit tests with `setup.py` as well if you want:
-
-```bash
-python setup.py test
-```
-
-## Using the kernels
-
-First import the module:
-
-```python
-import warprnnt_tensorflow
-```
-
-The WarpRNNT op is available via the `warprnnt_tensorflow.rnnt_loss` function:
-
-```python
-costs = warprnnt_tensorflow.rnnt_loss(acts, labels, input_lengths, label_lengths)
-```
-
-The `acts` is a 4 dimensional Tensor, `labels`
-is 2 dimensinal Tensor, and all the others are single dimension Tensors.
-See the main WarpRNNT documentation for more information.
-
-## Python interface
-```python
-def rnnt_loss(acts, labels, input_lengths, label_lengths, blank_label=0):
-    '''Computes the RNNT loss between a sequence of activations and a
-    ground truth labeling.
-    Args:
-        acts: A 4-D Tensor of floats.  The dimensions
-                     should be (B, T, U+1, V), where B is the minibatch index,
-                     T is the time index, U is the label sequence
-                     length (+1 means blank label prepanded), 
-                     and V indexes over activations for each 
-                     symbol in the alphabet.
-        labels: A 2-D Tensor of ints, a padded label sequences to make sure 
-                     labels for the minibatch are same length.
-        input_lengths: A 1-D Tensor of ints, the number of time steps
-                       for each sequence in the minibatch.
-        label_lengths: A 1-D Tensor of ints, the length of each label
-                       for each example in the minibatch.
-        blank_label: int, the label value/index that the RNNT
-                     calculation should use as the blank label
-    Returns:
-        1-D float Tensor, the cost of each example in the minibatch
-        (as negative log probabilities).
-    * This class performs the softmax operation internally.
-    * The label reserved for the blank symbol should be label 0.
-    '''
-```
diff --git a/tensorflow_binding/setup.py b/tensorflow_binding/setup.py
deleted file mode 100644
index 170f4d6..0000000
--- a/tensorflow_binding/setup.py
+++ /dev/null
@@ -1,149 +0,0 @@
-"""setup.py script for warp-rnnt TensorFlow wrapper"""
-
-from __future__ import print_function
-
-import os
-import platform
-import re
-import warnings
-import setuptools
-import sys
-import unittest
-from setuptools.command.build_ext import build_ext as orig_build_ext
-from distutils.version import LooseVersion
-
-# We need to import tensorflow to find where its include directory is.
-try:
-    import tensorflow as tf
-except ImportError:
-    raise RuntimeError("Tensorflow must be installed to build the tensorflow wrapper.")
-
-if "CUDA_HOME" not in os.environ:
-    print("CUDA_HOME not found in the environment so building "
-          "without GPU support. To build with GPU support "
-          "please define the CUDA_HOME environment variable. "
-          "This should be a path which contains include/cuda.h",
-          file=sys.stderr)
-    enable_gpu = False
-else:
-    enable_gpu = True
-
-if platform.system() == 'Darwin':
-    lib_ext = ".dylib"
-else:
-    lib_ext = ".so"
-
-warp_rnnt_path = "../build"
-if "WARP_RNNT_PATH" in os.environ:
-    warp_rnnt_path = os.environ["WARP_RNNT_PATH"]
-if not os.path.exists(os.path.join(warp_rnnt_path, "libwarprnnt"+lib_ext)):
-    print(("Could not find libwarprnnt.so in {}.\n"
-           "Build warp-rnnt and set WARP_RNNT_PATH to the location of"
-           " libwarprnnt.so (default is '../build')").format(warp_rnnt_path),
-          file=sys.stderr)
-    sys.exit(1)
-
-root_path = os.path.realpath(os.path.dirname(__file__))
-
-tf_include = tf.sysconfig.get_include()
-tf_src_dir = tf.sysconfig.get_lib() # os.environ["TENSORFLOW_SRC_PATH"]
-tf_includes = [tf_include, tf_src_dir]
-warp_rnnt_includes = [os.path.join(root_path, '../include')]
-include_dirs = tf_includes + warp_rnnt_includes
-
-if LooseVersion(tf.__version__) >= LooseVersion('1.4'):
-    nsync_dir = '../../external/nsync/public'
-    if LooseVersion(tf.__version__) >= LooseVersion('1.10'):
-        nsync_dir = 'external/nsync/public'
-    include_dirs += [os.path.join(tf_include, nsync_dir)]
-
-if os.getenv("TF_CXX11_ABI") is not None:
-    TF_CXX11_ABI = os.getenv("TF_CXX11_ABI")
-else:
-    warnings.warn("Assuming tensorflow was compiled without C++11 ABI. "
-                  "It is generally true if you are using binary pip package. "
-                  "If you compiled tensorflow from source with gcc >= 5 and didn't set "
-                  "-D_GLIBCXX_USE_CXX11_ABI=0 during compilation, you need to set "
-                  "environment variable TF_CXX11_ABI=1 when compiling this bindings. "
-                  "Also be sure to touch some files in src to trigger recompilation. "
-                  "Also, you need to set (or unsed) this environment variable if getting "
-                  "undefined symbol: _ZN10tensorflow... errors")
-    TF_CXX11_ABI = "0"
-
-extra_compile_args = ['-std=c++11', '-fPIC', '-D_GLIBCXX_USE_CXX11_ABI=' + TF_CXX11_ABI]
-# current tensorflow code triggers return type errors, silence those for now
-extra_compile_args += ['-Wno-return-type']
-if LooseVersion(tf.__version__) >= LooseVersion('1.4'):
-    extra_compile_args += tf.sysconfig.get_compile_flags()
-
-extra_link_args = []
-if LooseVersion(tf.__version__) >= LooseVersion('1.4'):
-    extra_link_args += tf.sysconfig.get_link_flags()
-
-if (enable_gpu):
-    extra_compile_args += ['-DWARPRNNT_ENABLE_GPU']
-    include_dirs += [os.path.join(os.environ["CUDA_HOME"], 'include')]
-
-    # mimic tensorflow cuda include setup so that their include command work
-    if not os.path.exists(os.path.join(root_path, "include")):
-        os.mkdir(os.path.join(root_path, "include"))
-
-    cuda_inc_path = os.path.join(root_path, "include/cuda")
-    if not os.path.exists(cuda_inc_path) or os.readlink(cuda_inc_path) != os.environ["CUDA_HOME"]:
-        if os.path.exists(cuda_inc_path):
-            os.remove(cuda_inc_path)
-        os.symlink(os.environ["CUDA_HOME"], cuda_inc_path)
-    include_dirs += [os.path.join(root_path, 'include')]
-
-# Ensure that all expected files and directories exist.
-for loc in include_dirs:
-    if not os.path.exists(loc):
-        print(("Could not find file or directory {}.\n"
-               "Check your environment variables and paths?").format(loc),
-              file=sys.stderr)
-        sys.exit(1)
-
-lib_srcs = ['src/warprnnt_op.cc']
-
-ext = setuptools.Extension('warprnnt_tensorflow.kernels',
-                           sources = lib_srcs,
-                           language = 'c++',
-                           include_dirs = include_dirs,
-                           library_dirs = [warp_rnnt_path],
-                           runtime_library_dirs = [os.path.realpath(warp_rnnt_path)],
-                           libraries = ['warprnnt'],
-                           extra_compile_args = extra_compile_args,
-                           extra_link_args = extra_link_args)
-
-class build_tf_ext(orig_build_ext):
-    def build_extensions(self):
-        if LooseVersion(tf.__version__) < LooseVersion('1.4'):
-            self.compiler.compiler_so.remove('-Wstrict-prototypes')
-        orig_build_ext.build_extensions(self)
-
-def discover_test_suite():
-    test_loader = unittest.TestLoader()
-    test_suite = test_loader.discover('tests', pattern='test_*.py')
-    return test_suite
-
-# Read the README.md file for the long description. This lets us avoid
-# duplicating the package description in multiple places in the source.
-README_PATH = os.path.join(os.path.dirname(__file__), "README.md")
-with open(README_PATH, "r") as handle:
-    # Extract everything between the first set of ## headlines
-    LONG_DESCRIPTION = re.search("#.*([^#]*)##", handle.read()).group(1).strip()
-
-setuptools.setup(
-    name = "warprnnt_tensorflow",
-    version = "0.1",
-    description = "TensorFlow wrapper for warp-transducer",
-    url="https://github.com/HawkAaron/warp-transducer",
-    long_description = LONG_DESCRIPTION,
-    author = "Mingkun Huang",
-    author_email = "mingkunhuang95@gmail.com",
-    license = "Apache",
-    packages = ["warprnnt_tensorflow"],
-    ext_modules = [ext],
-    cmdclass = {'build_ext': build_tf_ext},
-    test_suite = 'setup.discover_test_suite',
-)
diff --git a/tensorflow_binding/src/warprnnt_op.cc b/tensorflow_binding/src/warprnnt_op.cc
deleted file mode 100644
index e3d2210..0000000
--- a/tensorflow_binding/src/warprnnt_op.cc
+++ /dev/null
@@ -1,191 +0,0 @@
-#ifdef WARPRNNT_ENABLE_GPU
-#define EIGEN_USE_GPU
-#include <cuda.h>
-#endif
-
-#include "tensorflow/core/framework/op.h"
-#include "tensorflow/core/framework/op_kernel.h"
-#include "tensorflow/core/framework/bounds_check.h"
-#include "tensorflow/core/framework/allocator.h"
-#include "rnnt.h"
-
-
-REGISTER_OP("WarpRNNT")
-    .Input("acts: float32")
-    .Input("labels: int32")
-    .Input("input_lengths: int32")
-    .Input("label_lengths: int32")
-    .Attr("blank_label: int = 0")
-    .Output("costs: float32")
-    .Output("grads: float32");
-
-namespace tf = tensorflow;
-
-namespace warp_rnnt {
-
-class WarpRNNTOpBase : public tf::OpKernel {
-  public:
-    explicit WarpRNNTOpBase(tf::OpKernelConstruction* ctx) : tf::OpKernel(ctx) {
-        OP_REQUIRES_OK(ctx, ctx->GetAttr("blank_label", &blank_label_));
-    }
-
-    void Compute(tf::OpKernelContext* ctx) override {
-        // Grab the input tensors
-        const tf::Tensor* acts;
-        const tf::Tensor* labels;
-        const tf::Tensor* label_lengths;
-        const tf::Tensor* input_lengths;
-        OP_REQUIRES_OK(ctx, ctx->input("acts", &acts));
-        OP_REQUIRES_OK(ctx, ctx->input("labels", &labels));
-        OP_REQUIRES_OK(ctx, ctx->input("label_lengths", &label_lengths));
-        OP_REQUIRES_OK(ctx, ctx->input("input_lengths", &input_lengths));
-
-        OP_REQUIRES(ctx, acts->shape().dims() == 4,
-                    tf::errors::InvalidArgument("acts is not a 4-Tensor"));
-        OP_REQUIRES(ctx, labels->shape().dims() == 2,
-                    tf::errors::InvalidArgument("labels is not a 2-Tensor"));
-        OP_REQUIRES(ctx, tf::TensorShapeUtils::IsVector(label_lengths->shape()),
-                     tf::errors::InvalidArgument("label_lengths is not a vector"));
-        OP_REQUIRES(ctx, tf::TensorShapeUtils::IsVector(input_lengths->shape()),
-                     tf::errors::InvalidArgument("input_lengths is not a vector"));
-
-        const auto& acts_shape = acts->shape();
-        const auto batch_size = acts_shape.dim_size(0);
-        const auto max_time = acts_shape.dim_size(1);
-        const auto max_u = acts_shape.dim_size(2);
-        const auto num_classes_raw = acts_shape.dim_size(3);
-
-        auto acts_t = acts->tensor<float, 4>();
-        auto labels_t = labels->tensor<int32_t, 2>();
-
-        OP_REQUIRES(
-                ctx, tf::FastBoundsCheck(num_classes_raw, std::numeric_limits<int>::max()),
-                tf::errors::InvalidArgument("num_classes cannot exceed max int"));
-        const auto alphabet_size = static_cast<const int>(num_classes_raw);
-
-        OP_REQUIRES(
-                ctx, batch_size == input_lengths->dim_size(0),
-                tf::errors::InvalidArgument("len(input_lengths) != batch_size.  ",
-                                            "len(input_length):  ", input_lengths->dim_size(0),
-                                            " batch_size: ", batch_size));
-        auto input_lengths_t = input_lengths->vec<int32_t>();
-
-        OP_REQUIRES(
-                ctx, batch_size == label_lengths->dim_size(0),
-                tf::errors::InvalidArgument("len(label_lengths) != batch_size.  ",
-                                            "len(label_length):  ", label_lengths->dim_size(0),
-                                            " batch_size: ", batch_size));
-        auto label_lengths_t = label_lengths->vec<int32_t>();
-
-        // TODO check that labels are in the alphabet?
-        // Refer to line 185, we know that
-        // Tensor input_lengths is in GPU, so cannot compare with CPU variable
-        //for (int b = 0; b < batch_size; b++) {
-        //    OP_REQUIRES(ctx, input_lengths_t(b) <= max_time,
-        //                tf::errors::InvalidArgument("input_lengths(", b, ") <= ", max_time));
-        //}
-
-        tf::Tensor* costs = nullptr;
-        OP_REQUIRES_OK(ctx, ctx->allocate_output("costs", input_lengths->shape(), &costs));
-        auto costs_t = costs->vec<float>();
-
-        tf::Tensor* grads = nullptr;
-        OP_REQUIRES_OK(ctx, ctx->allocate_output("grads", acts->shape(), &grads));
-        set_zero(grads);
-        auto grads_t = grads->tensor<float, 4>();
-
-        auto options = create_options(ctx);
-        options.blank_label = blank_label_;
-        options.maxT = max_time;
-        options.maxU = max_u;
-
-        size_t workspace_size_bytes;
-        bool use_gpu = false;
-        if(options.loc == RNNT_GPU) {
-            use_gpu = true;
-        }
-        auto warp_status = get_workspace_size(max_time,
-                                              max_u,
-                                              batch_size,
-                                              use_gpu,
-                                              &workspace_size_bytes);
-
-        OP_REQUIRES(ctx, warp_status == RNNT_STATUS_SUCCESS,
-                    tf::errors::Internal("warp_rnnt error in get_workspace_size: ",
-                                         rnntGetStatusString(warp_status)));
-
-        auto workspace_shape = tf::TensorShape{static_cast<int64_t>(workspace_size_bytes)};
-        tf::Tensor workspace;
-        OP_REQUIRES_OK(ctx, ctx->allocate_temp(tf::DT_UINT8, workspace_shape, &workspace));
-        auto workspace_t = workspace.flat<uint8_t>();
-
-        // compute RNNT
-        warp_status = compute_rnnt_loss(acts_t.data(),
-                                        grads_t.data(),
-                                        labels_t.data(),
-                                        label_lengths_t.data(),
-                                        input_lengths_t.data(),
-                                        alphabet_size, batch_size,
-                                        costs_t.data(), workspace_t.data(), options);
-
-        OP_REQUIRES(ctx, warp_status == RNNT_STATUS_SUCCESS,
-                    tf::errors::Internal("warp_rnnt error in compute_rnnt_loss: ",
-                                         rnntGetStatusString(warp_status)));
-
-    }
-  private:
-    int blank_label_;
-    virtual void set_zero(tf::Tensor* t) = 0;
-    virtual rnntOptions create_options(tf::OpKernelContext* ctx) = 0;
-};
-
-class WarpRNNTOpCPU : public WarpRNNTOpBase {
-  public:
-    explicit WarpRNNTOpCPU(tf::OpKernelConstruction* ctx) : WarpRNNTOpBase(ctx) {
-    }
-
-  private:
-    void set_zero(tf::Tensor* t) override {
-        t->flat<float>().setZero();
-    }
-
-    rnntOptions create_options(tf::OpKernelContext* ctx) override {
-        auto options = rnntOptions{};
-        options.loc = RNNT_CPU;
-        options.batch_first = true;
-        options.num_threads = ctx->device()->tensorflow_cpu_worker_threads()->num_threads;
-        return options;
-    }
-};
-
-REGISTER_KERNEL_BUILDER(Name("WarpRNNT").Device(::tensorflow::DEVICE_CPU), WarpRNNTOpCPU);
-
-#ifdef WARPRNNT_ENABLE_GPU
-
-class WarpRNNTOpGPU : public WarpRNNTOpBase {
-  public:
-    explicit WarpRNNTOpGPU(tf::OpKernelConstruction* ctx) : WarpRNNTOpBase(ctx) {
-    }
-
-  private:
-    void set_zero(tf::Tensor* t) override {
-        // here is not need
-        // cudaMemset(t->flat<float>().data(), 0, t->NumElements()*sizeof(float));
-    }
-
-    rnntOptions create_options(tf::OpKernelContext* ctx) override {
-        auto cuda_stream = ctx->eigen_device<Eigen::GpuDevice>().stream();
-        auto options = rnntOptions{};
-        options.loc = RNNT_GPU;
-        options.stream = cuda_stream;
-        return options;
-    }
-};
-
-REGISTER_KERNEL_BUILDER(Name("WarpRNNT").Device(::tensorflow::DEVICE_GPU)
-                        .HostMemory("costs"),
-                        WarpRNNTOpGPU);
-#undef EIGEN_USE_GPU
-#endif
-
-}
diff --git a/tensorflow_binding/tests/test_basic.py b/tensorflow_binding/tests/test_basic.py
deleted file mode 100644
index d6d13e2..0000000
--- a/tensorflow_binding/tests/test_basic.py
+++ /dev/null
@@ -1,56 +0,0 @@
-import numpy as np
-import tensorflow as tf
-from warprnnt_tensorflow import rnnt_loss
-
-acts = tf.placeholder(tf.float32, [None, None, None, None])
-labels = tf.placeholder(tf.int32, [None, None])
-input_length = tf.placeholder(tf.int32, [None])
-label_length = tf.placeholder(tf.int32, [None])
-
-B = 2; T = 4; U = 3; V = 3; blank = 0
-
-logits = tf.nn.log_softmax(acts)
-costs = rnnt_loss(logits, labels, input_length, label_length, blank)
-grad = tf.gradients(costs, [acts])
-
-a = np.array([[[[0.06535690384862791, 0.7875301411923206, 0.08159176605666074],
-            [0.5297155426466327, 0.7506749639230854, 0.7541348379087998],
-            [0.6097641124736383, 0.8681404965673826, 0.6225318186056529]],
-
-            [[0.6685222872103057, 0.8580392805336061, 0.16453892311765583],
-            [0.989779515236694, 0.944298460961015, 0.6031678586829663],
-            [0.9467833543605416, 0.666202507295747, 0.28688179752461884]],
-
-            [[0.09418426230195986, 0.3666735970751962, 0.736168049462793],
-            [0.1666804425271342, 0.7141542198635192, 0.3993997272216727],
-            [0.5359823524146038, 0.29182076440286386, 0.6126422611507932]],
-
-            [[0.3242405528768486, 0.8007644367291621, 0.5241057606558068],
-            [0.779194617063042, 0.18331417220174862, 0.113745182072432],
-            [0.24022162381327106, 0.3394695622533106, 0.1341595066017014]]],
-
-        [[[0.5055615569388828, 0.051597282072282646, 0.6402903936686337],
-            [0.43073311517251, 0.8294731834714112, 0.1774668847323424],
-            [0.3207001991262245, 0.04288308912457006, 0.30280282975568984]],
-
-            [[0.6751777088333762, 0.569537369330242, 0.5584738347504452],
-            [0.08313242153985256, 0.06016544344162322, 0.10795752845152584],
-            [0.7486153608562472, 0.943918041459349, 0.4863558118797222]],
-
-            [[0.4181986264486809, 0.6524078485043804, 0.024242983423721887],
-            [0.13458171554507403, 0.3663418070512402, 0.2958297395361563],
-            [0.9236695822497084, 0.6899291482654177, 0.7418981733448822]],
-
-            [[0.25000547599982104, 0.6034295486281007, 0.9872887878887768],
-            [0.5926057265215715, 0.8846724004467684, 0.5434495396894328],
-            [0.6607698886038497, 0.3771277082495921, 0.3580209022231813]]]], dtype=np.float32)
-
-b = np.array([[1, 2], [1, 1]], dtype=np.int32)
-c = np.array([4, 4], dtype=np.int32)
-d = np.array([2, 2], dtype=np.int32)
-
-feed = {acts: a, labels: b, input_length: c, label_length: d}
-with tf.Session() as sess:
-    cost, grads = sess.run([costs, grad], feed_dict=feed)
-    print(cost)
-    print(grads)
\ No newline at end of file
diff --git a/tensorflow_binding/tests/test_warprnnt_op.py b/tensorflow_binding/tests/test_warprnnt_op.py
deleted file mode 100644
index f50d58e..0000000
--- a/tensorflow_binding/tests/test_warprnnt_op.py
+++ /dev/null
@@ -1,97 +0,0 @@
-import tensorflow as tf
-import numpy as np
-from warprnnt_tensorflow import rnnt_loss
-from tensorflow.python.client import device_lib
-
-def is_gpu_available():
-    """Returns whether Tensorflow can access a GPU."""
-    return any(x.device_type == 'GPU' for x in device_lib.list_local_devices())
-
-class WarpRNNTTest(tf.test.TestCase):
-
-    def _run_rnnt(self, acts, labels, input_lengths, label_lengths,
-                    expected_costs, expected_grads, blank, use_gpu=False):
-        self.assertEquals(acts.shape, expected_grads.shape)
-        acts_t = tf.constant(acts)
-        labels_t = tf.constant(labels)
-        input_lengths_t = tf.constant(input_lengths)
-        label_lengths_t = tf.constant(label_lengths)
-
-        logits = acts_t if use_gpu else tf.nn.log_softmax(acts_t)
-        costs = rnnt_loss(logits, labels_t, input_lengths_t, label_lengths_t, blank)
-
-        grads = tf.gradients(costs, [acts_t])[0]
-
-        with self.test_session(use_gpu=use_gpu) as sess:
-            (tf_costs, tf_grad) = sess.run([costs, grads])
-            self.assertAllClose(tf_costs, expected_costs, atol=1e-6)
-            self.assertAllClose(tf_grad, expected_grads, atol=1e-6)
-    
-    def test_forward(self):
-        # Softmax activations for the following inputs:
-        acts = np.array([0.1, 0.6, 0.1, 0.1, 0.1, 0.1, 
-                        0.1, 0.6, 0.1, 0.1, 0.1, 0.1, 
-                        0.2, 0.8, 0.1, 0.1, 0.6, 0.1, 
-                        0.1, 0.1, 0.1, 0.1, 0.2, 0.1, 
-                        0.1, 0.7, 0.1, 0.2, 0.1, 0.1], dtype=np.float32).reshape(1, 2, 3, 5)
-
-        labels = np.array([[1, 2]], dtype=np.int32)
-        input_lengths = np.array([2], dtype=np.int32)
-        label_lengths = np.array([2], dtype=np.int32)
-
-        acts_t = tf.constant(acts)
-        labels_t = tf.constant(labels)
-        input_lengths_t = tf.constant(input_lengths)
-        label_lengths_t = tf.constant(label_lengths)
-        acts_t = tf.nn.log_softmax(acts_t) # NOTE cpu
-        costs = rnnt_loss(acts_t, labels_t, input_lengths_t, label_lengths_t)
-        with self.test_session():
-            print(costs.eval())
-
-    def _test_multiple_batches(self, use_gpu):
-        B = 2; T = 4; U = 3; V = 3
-
-        acts = np.array([0.065357, 0.787530, 0.081592, 0.529716, 0.750675, 0.754135, 
-                        0.609764, 0.868140, 0.622532, 0.668522, 0.858039, 0.164539, 
-                        0.989780, 0.944298, 0.603168, 0.946783, 0.666203, 0.286882, 
-                        0.094184, 0.366674, 0.736168, 0.166680, 0.714154, 0.399400, 
-                        0.535982, 0.291821, 0.612642, 0.324241, 0.800764, 0.524106, 
-                        0.779195, 0.183314, 0.113745, 0.240222, 0.339470, 0.134160, 
-                        0.505562, 0.051597, 0.640290, 0.430733, 0.829473, 0.177467, 
-                        0.320700, 0.042883, 0.302803, 0.675178, 0.569537, 0.558474, 
-                        0.083132, 0.060165, 0.107958, 0.748615, 0.943918, 0.486356, 
-                        0.418199, 0.652408, 0.024243, 0.134582, 0.366342, 0.295830, 
-                        0.923670, 0.689929, 0.741898, 0.250005, 0.603430, 0.987289, 
-                        0.592606, 0.884672, 0.543450, 0.660770, 0.377128, 0.358021], dtype=np.float32).reshape(B, T, U, V);
-
-        expected_costs = np.array([4.28065, 3.93844], dtype=np.float32)
-        expected_grads = np.array([-0.186844, -0.062555, 0.249399, -0.203377, 0.202399, 0.000977,
-                                    -0.141016, 0.079123, 0.061893, -0.011552, -0.081280, 0.092832,
-                                    -0.154257, 0.229433, -0.075176, -0.246593, 0.146405, 0.100188,
-                                    -0.012918, -0.061593, 0.074512, -0.055986, 0.219831, -0.163845,
-                                    -0.497627, 0.209240, 0.288387, 0.013605, -0.030220, 0.016615,
-                                    0.113925, 0.062781, -0.176706, -0.667078, 0.367659, 0.299419,
-                                    -0.356344, -0.055347, 0.411691, -0.096922, 0.029459, 0.067463,
-                                    -0.063518, 0.027654, 0.035863, -0.154499, -0.073942, 0.228441,
-                                    -0.166790, -0.000088, 0.166878, -0.172370, 0.105565, 0.066804,
-                                    0.023875, -0.118256, 0.094381, -0.104707, -0.108934, 0.213642,
-                                    -0.369844, 0.180118, 0.189726, 0.025714, -0.079462, 0.053748,
-                                    0.122328, -0.238789, 0.116460, -0.598687, 0.302203, 0.296484], dtype=np.float32).reshape(B, T, U, V);
-
-        labels = np.array([[1, 2], [1, 1]], dtype=np.int32)
-        input_lengths = np.array([4, 4], dtype=np.int32)
-        label_lengths = np.array([2, 2], dtype=np.int32)
-
-        self._run_rnnt(acts, labels, input_lengths, label_lengths, expected_costs, expected_grads, 0, use_gpu)
-
-    def test_multiple_batches_cpu(self):
-        self._test_multiple_batches(use_gpu=False)
-    
-    def test_multiple_batches_gpu(self):
-        if (is_gpu_available()):
-            self._test_multiple_batches(use_gpu=True)
-        else:
-            print('Skipping GPU test, no gpus available')
-
-if __name__ == '__main__':
-    tf.test.main()
diff --git a/tensorflow_binding/warprnnt_tensorflow/__init__.py b/tensorflow_binding/warprnnt_tensorflow/__init__.py
deleted file mode 100644
index de45611..0000000
--- a/tensorflow_binding/warprnnt_tensorflow/__init__.py
+++ /dev/null
@@ -1,48 +0,0 @@
-import imp
-import tensorflow as tf
-from tensorflow.python.framework import ops
-
-lib_file = imp.find_module('kernels', __path__)[1]
-_warprnnt = tf.load_op_library(lib_file)
-
-
-def rnnt_loss(acts, labels, input_lengths, label_lengths, blank_label=0):
-    '''Computes the RNNT loss between a sequence of activations and a
-    ground truth labeling.
-    Args:
-        acts: A 4-D Tensor of floats.  The dimensions
-                     should be (B, T, U, V), where B is the minibatch index,
-                     T is the time index, U is the prediction network sequence
-                     length, and V indexes over activations for each 
-                     symbol in the alphabet.
-        labels: A 2-D Tensor of ints, a padded label sequences to make sure 
-                     labels for the minibatch are same length.
-        input_lengths: A 1-D Tensor of ints, the number of time steps
-                       for each sequence in the minibatch.
-        label_lengths: A 1-D Tensor of ints, the length of each label
-                       for each example in the minibatch.
-        blank_label: int, the label value/index that the RNNT
-                     calculation should use as the blank label
-    Returns:
-        1-D float Tensor, the cost of each example in the minibatch
-        (as negative log probabilities).
-    * This class performs the softmax operation internally.
-    * The label reserved for the blank symbol should be label 0.
-    '''
-    loss, _ = _warprnnt.warp_rnnt(acts, labels, input_lengths,
-                                  label_lengths, blank_label)
-    return loss
-
-
-@ops.RegisterGradient("WarpRNNT")
-def _RNNTLossGrad(op, grad_loss, _):
-    grad = op.outputs[1]
-    # NOTE since here we are batch first, cannot use _BroadcastMul
-    grad_loss = tf.reshape(grad_loss, (-1, 1, 1, 1))
-    return [grad_loss * grad, None, None, None]
-
-@ops.RegisterShape("WarpRNNT")
-def _RNNTLossShape(op):
-    inputs_shape = op.inputs[0].get_shape().with_rank(4)
-    batch_size = inputs_shape[0]
-    return [batch_size, inputs_shape]
\ No newline at end of file