diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
new file mode 100644
index 0000000..65f01ae
--- /dev/null
+++ b/.github/workflows/ci.yml
@@ -0,0 +1,42 @@
+name: CI (CPU-only)
+
+on:
+  push:
+    branches: [main, master]
+  pull_request:
+    branches: [main, master]
+
+jobs:
+  build-and-test:
+    runs-on: ubuntu-latest
+
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+
+      - name: Install dependencies
+        run: |
+          sudo apt-get update
+          sudo apt-get install -y cmake build-essential
+
+      - name: Configure CMake (CPU-only)
+        run: |
+          mkdir -p build
+          cd build
+          cmake ..
+
+      - name: Build project
+        run: |
+          cd build
+          make -j$(nproc)
+
+      - name: Install and update linker cache
+        run: |
+          cd build
+          sudo make install
+          sudo ldconfig
+
+      - name: Run CPU tests
+        run: |
+          cd build
+          ./test_cpu
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..9d54d6b
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,132 @@
+##################################
+# Python
+##################################
+
+# Byte-compiled
+__pycache__/
+*.py[cod]
+*$py.class
+
+# Virtual envs
+.env/
+.venv/
+env/
+venv/
+
+# Packaging
+build/
+dist/
+*.egg-info/
+.eggs/
+*.whl
+
+# Caches / testing
+.pytest_cache/
+htmlcov/
+coverage.*
+.tox/
+.nox/
+
+# Type checkers
+.mypy_cache/
+.pytype/
+.pyre/
+
+# Jupyter
+.ipynb_checkpoints/
+
+##################################
+# C++ / CUDA / CMake
+##################################
+
+# Objects and binaries
+*.o
+*.obj
+*.a
+*.so
+*.dll
+*.lib
+*.dylib
+*.out
+*.exe
+*.app
+
+# CUDA
+*.cu.o
+*.ptx
+*.cubin
+*.fatbin
+*.nvvp
+*.nvcpl
+*.nvd*
+
+# Dependency / compiler files
+*.d
+*.mod
+
+# CMake and build
+/build/
+CMakeFiles/
+CMakeCache.txt
+cmake_install.cmake
+Makefile
+CTestTestfile.cmake
+install_manifest.txt
+compile_commands.json
+
+# Ninja
+.ninja_log
+.ninja_deps
+build.ninja
+rules.ninja
+
+##################################
+# Python bindings (PyTorch / TensorFlow)
+##################################
+
+# Compiled extensions
+*.so
+*.pyd
+*.dll
+
+# Generated by setup.py
+warprnnt_pytorch.egg-info/
+warprnnt_tensorflow.egg-info/
+
+# Temporary output
+*.log
+
+##################################
+# Docs
+##################################
+
+# LaTeX
+*.aux
+*.log
+*.out
+*.pdf
+*.toc
+*.lof
+*.lot
+
+##################################
+# IDE / Editor / OS
+##################################
+
+# VSCode
+.vscode/
+
+# JetBrains / CLion / PyCharm
+.idea/
+cmake-build-*/
+
+# Vim / Emacs
+*.swp
+*.swo
+*~
+
+# macOS / Windows
+.DS_Store
+Thumbs.db
+desktop.ini
+
diff --git a/CMakeLists.txt b/CMakeLists.txt
old mode 100755
new mode 100644
index 2c570e0..f422901
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,136 +1,249 @@
-IF (APPLE)
-    cmake_minimum_required(VERSION 3.4)
-ELSE()
-    cmake_minimum_required(VERSION 2.8)
-ENDIF()
+cmake_minimum_required(VERSION 3.10)
+project(rnnt_release LANGUAGES CXX)
+
+# ==== CUDA DETECTION ====
+set(CUDA_FOUND FALSE)
+set(CUDA_TOOLKIT_VERSION "0.0")
+
+find_package(CUDA QUIET)
+
+if(CMAKE_VERSION VERSION_GREATER_EQUAL "3.18")
+    find_package(CUDAToolkit QUIET)
+    
+    if(CUDAToolkit_FOUND OR CUDA_FOUND)
+        set(CUDA_FOUND TRUE)
+        if(CUDAToolkit_FOUND)
+            set(CUDA_TOOLKIT_VERSION "${CUDAToolkit_VERSION}")
+        elseif(DEFINED CUDA_VERSION AND NOT "${CUDA_VERSION}" STREQUAL "")
+            set(CUDA_TOOLKIT_VERSION "${CUDA_VERSION}")
+        endif()
+    endif()
+else()
+    if(CUDA_FOUND AND DEFINED CUDA_VERSION AND NOT "${CUDA_VERSION}" STREQUAL "")
+        set(CUDA_TOOLKIT_VERSION "${CUDA_VERSION}")
+    endif()
+endif()
+
+# ==== USER OPTIONS ====
+option(WITH_GPU "Build with GPU (CUDA) support" ${CUDA_FOUND})
+option(WITH_OMP "Build with OpenMP support" ON)
+option(USE_NAIVE_KERNEL "Use naive alpha-beta kernel" OFF)
+option(DEBUG_TIME "Output kernel time" OFF)
+option(DEBUG_KERNEL "Output alpha beta debug" OFF)
+
+# ==== GPU BUILD CONFIGURATION ====
+if(WITH_GPU)
+    if(NOT CUDA_FOUND)
+        message(WARNING "WITH_GPU=ON was requested, but no compatible CUDA toolkit was found. Falling back to CPU-only build.")
+        set(WITH_GPU OFF CACHE BOOL "Build with GPU (CUDA) support" FORCE)
+        set(USE_CUDA FALSE)
+    else()
+        set(USE_CUDA TRUE)
+        message(STATUS "Building with CUDA ${CUDA_TOOLKIT_VERSION} support")
+    endif()
+else()
+    set(USE_CUDA FALSE)
+    if(CUDA_FOUND)
+        message(STATUS "CUDA ${CUDA_TOOLKIT_VERSION} detected but GPU support disabled (WITH_GPU=OFF)")
+    else()
+        message(STATUS "Building CPU-only version (no CUDA toolkit found)")
+    endif()
+endif()
 
-project(rnnt_release)
+# ==== OPENMP CONFIGURATION ====
+if(WITH_OMP)
+    find_package(OpenMP)
+    if(OpenMP_CXX_FOUND)
+        message(STATUS "OpenMP support enabled")
+    else()
+        message(WARNING "OpenMP requested but not found")
+        set(WITH_OMP OFF)
+    endif()
+endif()
+
+# ==== COMPILER SETTINGS ====
+set(CMAKE_CXX_STANDARD 17)
+set(CMAKE_CXX_STANDARD_REQUIRED ON)
+set(CMAKE_CXX_EXTENSIONS OFF)
 
-IF (NOT APPLE)
+if(NOT CMAKE_BUILD_TYPE)
+    set(CMAKE_BUILD_TYPE "Release" CACHE STRING "Build type" FORCE)
+endif()
+
+if(CMAKE_BUILD_TYPE STREQUAL "Debug")
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O0 -g")
+else()
     set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O2")
-ENDIF()
+endif()
+
+if(WITH_OMP)
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fopenmp")
+endif()
 
-IF (APPLE)
-    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -O2")
+if(APPLE)
     add_definitions(-DAPPLE)
-ENDIF()
+    execute_process(COMMAND uname -v OUTPUT_VARIABLE DARWIN_VERSION)
+    string(REGEX MATCH "[0-9]+" DARWIN_VERSION "${DARWIN_VERSION}")
+    message(STATUS "Darwin version: ${DARWIN_VERSION}")
+    if(${DARWIN_VERSION} LESS 15)
+        set(CMAKE_SKIP_RPATH TRUE)
+    endif()
+else()
+    set(CMAKE_SKIP_RPATH TRUE)
+endif()
 
 include_directories(include)
 
-FIND_PACKAGE(CUDA)
-MESSAGE(STATUS "cuda found ${CUDA_FOUND}")
-
-option(USE_NAIVE_KERNEL "use naive alpha-beta kernel" OFF)
-option(DEBUG_TIME "output kernel time" OFF)
-option(DEBUG_KERNEL "output alpha beta" OFF)
-if (USE_NAIVE_KERNEL)
+# ==== PREPROCESSOR DEFINITIONS ====
+if(USE_NAIVE_KERNEL)
     add_definitions(-DUSE_NAIVE_KERNEL)
 endif()
-if (DEBUG_TIME)
+
+if(DEBUG_TIME)
     add_definitions(-DDEBUG_TIME)
 endif()
-if (DEBUG_KERNEL)
+
+if(DEBUG_KERNEL)
     add_definitions(-DDEBUG_KERNEL)
 endif()
 
-option(WITH_GPU "compile warp-rnnt with cuda." ${CUDA_FOUND})
-option(WITH_OMP "compile warp-rnnt with openmp." ON)
-
 if(NOT WITH_OMP)
     add_definitions(-DRNNT_DISABLE_OMP)
 endif()
-if (WITH_OMP)
-    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fopenmp")
-    set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -Xcompiler -fopenmp")
-endif()
-
-# need to be at least 30 or __shfl_down in reduce wont compile
-set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_30,code=sm_30 -O2")
-set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_35,code=sm_35")
-
-set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_50,code=sm_50")
-set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_52,code=sm_52")
-IF(CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 5)
-  SET(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -D_MWAITXINTRIN_H_INCLUDED -D_FORCE_INLINES")
-ENDIF()
 
-IF (CUDA_VERSION GREATER 7.6)
-    set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_60,code=sm_60")
-    set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_61,code=sm_61")
-    set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_62,code=sm_62")
-ENDIF()
-
-IF (CUDA_VERSION GREATER 8.9)
-    set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_70,code=sm_70")
-ENDIF()
-
-IF (CUDA_VERSION GREATER 9.9)
-    set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_75,code=sm_75")
-ENDIF()
-
-if (NOT APPLE)
-    set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} --std=c++11")
-    set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS}")
-ENDIF()
-
-
-IF (APPLE)
-    EXEC_PROGRAM(uname ARGS -v  OUTPUT_VARIABLE DARWIN_VERSION)
-    STRING(REGEX MATCH "[0-9]+" DARWIN_VERSION ${DARWIN_VERSION})
-    MESSAGE(STATUS "DARWIN_VERSION=${DARWIN_VERSION}")
-
-    #for el capitain have to use rpath
-
-    IF (DARWIN_VERSION LESS 15)
-        set(CMAKE_SKIP_RPATH TRUE)
-    ENDIF ()
-
-ELSE()
-    #always skip for linux
-    set(CMAKE_SKIP_RPATH TRUE)
-ENDIF()
-
-
-IF (WITH_GPU)
-
-    MESSAGE(STATUS "Building shared library with GPU support")
-
-    CUDA_ADD_LIBRARY(warprnnt SHARED src/rnnt_entrypoint.cu)
-    IF (!Torch_FOUND)
-        TARGET_LINK_LIBRARIES(warprnnt ${CUDA_curand_LIBRARY})
-    ENDIF()
-
-    cuda_add_executable(test_time_gpu tests/test_time.cu tests/random.cpp )
-    TARGET_LINK_LIBRARIES(test_time_gpu warprnnt ${CUDA_curand_LIBRARY})
-    SET_TARGET_PROPERTIES(test_time_gpu PROPERTIES COMPILE_FLAGS "${CMAKE_CXX_FLAGS} --std=c++11")
+# ==== CUDA CONFIGURATION ====
+if(USE_CUDA)
+    if(WITH_OMP)
+        set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -Xcompiler -fopenmp")
+    endif()
+
+    if(NOT (CUDA_VERSION GREATER 10.2))
+        set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_30,code=sm_30 -O2")
+        set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_35,code=sm_35")
+        set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_50,code=sm_50")
+    endif()
+
+    set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_52,code=sm_52")
+    
+    if(CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 5)
+        set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -D_MWAITXINTRIN_H_INCLUDED -D_FORCE_INLINES")
+    endif()
+
+    if(CUDA_VERSION GREATER 7.6)
+        set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_60,code=sm_60")
+        set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_61,code=sm_61")
+        set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_62,code=sm_62")
+    endif()
+
+    if(CUDA_VERSION GREATER 8.9)
+        set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_70,code=sm_70")
+    endif()
+
+    if(CUDA_VERSION GREATER 9.9)
+        set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_75,code=sm_75")
+    endif()
+
+    if(CUDA_VERSION GREATER 11.0)
+        set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_80,code=sm_80")
+        set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_86,code=sm_86")
+    endif()
+
+    if(NOT APPLE)
+        set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} --std=c++17")
+    endif()
+
+    message(STATUS "CUDA NVCC flags: ${CUDA_NVCC_FLAGS}")
+endif()
 
-    cuda_add_executable(test_gpu tests/test_gpu.cu tests/random.cpp )
-    TARGET_LINK_LIBRARIES(test_gpu warprnnt ${CUDA_curand_LIBRARY})
-    SET_TARGET_PROPERTIES(test_gpu PROPERTIES COMPILE_FLAGS "${CMAKE_CXX_FLAGS} --std=c++11")
+# ==== SOURCE FILE PREPARATION ====
+set(RNNT_ENTRYPOINT_SRC src/rnnt_entrypoint.cu)
+if(NOT USE_CUDA)
+    configure_file(
+        ${RNNT_ENTRYPOINT_SRC}
+        ${CMAKE_CURRENT_BINARY_DIR}/rnnt_entrypoint_cpu.cpp
+        COPYONLY
+    )
+    set(RNNT_ENTRYPOINT_SRC ${CMAKE_CURRENT_BINARY_DIR}/rnnt_entrypoint_cpu.cpp)
+endif()
 
-ELSE()
-    MESSAGE(STATUS "Building shared library with no GPU support")
+# ==== MAIN LIBRARY TARGET ====
+if(USE_CUDA)
+    message(STATUS "Building shared library with GPU support")
+    
+    cuda_add_library(warprnnt SHARED ${RNNT_ENTRYPOINT_SRC})
+    
+    if(NOT Torch_FOUND)
+        target_link_libraries(warprnnt ${CUDA_curand_LIBRARY})
+    endif()
+    
+    if(WITH_OMP AND OpenMP_CXX_FOUND)
+        target_link_libraries(warprnnt ${OpenMP_CXX_LIBRARIES})
+    endif()
+else()
+    message(STATUS "Building shared library with CPU-only support")
+    add_library(warprnnt SHARED ${RNNT_ENTRYPOINT_SRC})
+    
+    if(WITH_OMP AND OpenMP_CXX_FOUND)
+        target_link_libraries(warprnnt PRIVATE OpenMP::OpenMP_CXX)
+    endif()
+endif()
 
-    if (NOT APPLE)
-        set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -O2")
-    ENDIF()
+set_target_properties(warprnnt PROPERTIES
+    RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}
+    LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}
+    ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}
+)
 
-    ADD_LIBRARY(warprnnt SHARED src/rnnt_entrypoint.cpp)
+if(WIN32)
+    set_target_properties(warprnnt PROPERTIES WINDOWS_EXPORT_ALL_SYMBOLS ON)
+endif()
 
-ENDIF()
+# Use include_directories (global) since we're mixing legacy/modern approaches
+# target_include_directories(warprnnt PRIVATE include) # Don't use this with legacy CUDA
 
+# ==== TEST EXECUTABLES ====
+add_executable(test_cpu tests/test_cpu.cpp tests/random.cpp)
+target_link_libraries(test_cpu warprnnt)
+set_target_properties(test_cpu PROPERTIES COMPILE_FLAGS "${CMAKE_CXX_FLAGS} --std=c++11")
 
-add_executable(test_cpu tests/test_cpu.cpp tests/random.cpp )
-TARGET_LINK_LIBRARIES(test_cpu warprnnt)
-SET_TARGET_PROPERTIES(test_cpu PROPERTIES COMPILE_FLAGS "${CMAKE_CXX_FLAGS} --std=c++11")
+add_executable(test_time tests/test_time.cpp tests/random.cpp)
+target_link_libraries(test_time warprnnt)
+set_target_properties(test_time PROPERTIES COMPILE_FLAGS "${CMAKE_CXX_FLAGS} --std=c++11")
 
-add_executable(test_time tests/test_time.cpp tests/random.cpp )
-TARGET_LINK_LIBRARIES(test_time warprnnt)
-SET_TARGET_PROPERTIES(test_time PROPERTIES COMPILE_FLAGS "${CMAKE_CXX_FLAGS} --std=c++11")
+if(USE_CUDA)
+    cuda_add_executable(test_time_gpu tests/test_time.cu tests/random.cpp)
+    target_link_libraries(test_time_gpu warprnnt ${CUDA_curand_LIBRARY})
+    set_target_properties(test_time_gpu PROPERTIES COMPILE_FLAGS "${CMAKE_CXX_FLAGS} --std=c++11")
 
-INSTALL(TARGETS warprnnt
-        RUNTIME DESTINATION "bin"
-        LIBRARY DESTINATION "lib"
-        ARCHIVE DESTINATION "lib")
+    cuda_add_executable(test_gpu tests/test_gpu.cu tests/random.cpp)
+    target_link_libraries(test_gpu warprnnt ${CUDA_curand_LIBRARY})
+    set_target_properties(test_gpu PROPERTIES COMPILE_FLAGS "${CMAKE_CXX_FLAGS} --std=c++11")
+endif()
 
-INSTALL(FILES include/rnnt.h DESTINATION "include")
+# ==== INSTALL TARGETS ====
+install(TARGETS warprnnt
+    RUNTIME DESTINATION bin
+    LIBRARY DESTINATION lib
+    ARCHIVE DESTINATION lib
+)
+
+install(FILES include/rnnt.h DESTINATION include)
+
+# ==== BUILD SUMMARY ====
+message(STATUS "")
+message(STATUS "=== Build Configuration Summary ===")
+message(STATUS "Build Type: ${CMAKE_BUILD_TYPE}")
+message(STATUS "CUDA Available: ${CUDA_FOUND}")
+if(CUDA_FOUND)
+    message(STATUS "CUDA Version: ${CUDA_TOOLKIT_VERSION}")
+endif()
+message(STATUS "GPU Support: ${USE_CUDA}")
+if(USE_CUDA)
+    message(STATUS "CUDA NVCC Flags: ${CUDA_NVCC_FLAGS}")
+endif()
+message(STATUS "OpenMP Support: ${WITH_OMP}")
+message(STATUS "Naive Kernel: ${USE_NAIVE_KERNEL}")
+message(STATUS "Debug Time: ${DEBUG_TIME}")
+message(STATUS "Debug Kernel: ${DEBUG_KERNEL}")
+message(STATUS "=====================================")
+message(STATUS "")
diff --git a/README.md b/README.md
index 11263ef..1ec3b88 100644
--- a/README.md
+++ b/README.md
@@ -57,6 +57,7 @@ Run cmake and build:
 ```bash
 cmake -DCUDA_TOOLKIT_ROOT_DIR=$CUDA_HOME ..
 make
+sudo make install #optional
 ```
 if it logs
 ```
@@ -68,6 +69,30 @@ please run `rm CMakeCache.txt` and cmake again.
 The C library should now be built along with test executables. If CUDA was detected, then `test_gpu` will be built;
 `test_cpu` will always be built.
 
+## Troubleshooting: `cuda_runtime_api.h: No such file or directory`
+
+If you encounter an error like this during installation:
+
+`
+fatal error: cuda_runtime_api.h: No such file or directory
+`
+
+This usually means the CUDA headers can't be found by the compiler. If your system has CUDA installed (e.g., version 12.9) and the file `cuda_runtime_api.h` exists somewhere like:
+
+`
+/usr/local/cuda-12.9/targets/x86_64-linux/include/cuda_runtime_api.h
+`
+
+Then you can fix this by manually setting the environment variables to point to the correct CUDA paths:
+
+```bash
+export CUDA_HOME=/usr/local/cuda-12.9
+export CFLAGS="-I$CUDA_HOME/targets/x86_64-linux/include"
+export LDFLAGS="-L$CUDA_HOME/targets/x86_64-linux/lib"
+export PATH="$CUDA_HOME/bin:$PATH"
+export LD_LIBRARY_PATH="$CUDA_HOME/lib64:$LD_LIBRARY_PATH"
+```
+
 ## Test
 To run the tests, make sure the CUDA libraries are in `LD_LIBRARY_PATH` (DYLD_LIBRARY_PATH for OSX).
 
diff --git a/include/detail/cpu_rnnt.h b/include/detail/cpu_rnnt.h
index 0042b34..f4e586f 100644
--- a/include/detail/cpu_rnnt.h
+++ b/include/detail/cpu_rnnt.h
@@ -20,9 +20,9 @@ class CpuRNNT {
 public:
     // Noncopyable
     CpuRNNT(int minibatch, int maxT, int maxU, int alphabet_size, void* workspace, 
-            int blank, int num_threads, bool batch_first) :
+            int blank, float fastemit_lambda, int num_threads, bool batch_first) :
         minibatch_(minibatch), maxT_(maxT), maxU_(maxU), alphabet_size_(alphabet_size), 
-        workspace_(workspace), blank_(blank), num_threads_(num_threads), batch_first(batch_first) {
+          workspace_(workspace), blank_(blank), fastemit_lambda_(fastemit_lambda), num_threads_(num_threads), batch_first(batch_first) {
 #if defined(RNNT_DISABLE_OMP) || defined(APPLE)
 #else
         if (num_threads > 0) {
@@ -82,6 +82,7 @@ class CpuRNNT {
     int alphabet_size_; // Number of characters plus blank
     void* workspace_;
     int blank_;
+    float fastemit_lambda_;
     int num_threads_;
     bool batch_first;
     
@@ -259,7 +260,10 @@ CpuRNNT<ProbT>::compute_betas_and_grad(ProbT* grad, const ProbT* const log_probs
             }
             if (u < U-1) {
                 ProbT g = alphas[idx(t, u)] + betas[idx(t, u+1)];
-                grad[idx(t, u, labels[u])] = -std::exp(log_probs[idx(t, u) * 2 + 1] + g - loglike);
+                grad[idx(t, u, labels[u])] = -(
+                                               (1. + fastemit_lambda_)
+                                               * std::exp(log_probs[idx(t, u) * 2 + 1] + g - loglike)
+                                               );
             }
         }
     }
diff --git a/include/detail/gpu_rnnt.h b/include/detail/gpu_rnnt.h
index dd7f874..c8e67df 100644
--- a/include/detail/gpu_rnnt.h
+++ b/include/detail/gpu_rnnt.h
@@ -20,9 +20,9 @@ class GpuRNNT {
 public:
     // Noncopyable
     GpuRNNT(int minibatch, int maxT, int maxU, int alphabet_size, void* workspace, 
-            int blank, int num_threads, CUstream stream) :
+            int blank, float fastemit_lambda, int num_threads, CUstream stream) :
         minibatch_(minibatch), maxT_(maxT), maxU_(maxU), alphabet_size_(alphabet_size), 
-        gpu_workspace(workspace), blank_(blank), num_threads_(num_threads), stream_(stream) {
+          gpu_workspace(workspace), blank_(blank), fastemit_lambda_(fastemit_lambda), num_threads_(num_threads), stream_(stream) {
 #if defined(RNNT_DISABLE_OMP) || defined(APPLE)
 #else
         if (num_threads > 0) {
@@ -65,6 +65,7 @@ class GpuRNNT {
     int alphabet_size_; // Number of characters plus blank
     void* gpu_workspace;
     int blank_;
+    float fastemit_lambda_;
     int num_threads_;
     CUstream stream_;
     
@@ -195,9 +196,16 @@ GpuRNNT<ProbT>::compute_cost_and_score(const ProbT* const acts,
         start = std::chrono::high_resolution_clock::now();
 #endif
         // TODO optimize gradient kernel
-        compute_grad_kernel<128, ProbT><<<minibatch_ * maxT_ * maxU_, 128, 0, stream_>>>(grads, 
-            acts, denom, alphas, betas, llForward, input_lengths, label_lengths, labels, 
-            minibatch_, maxT_, maxU_, alphabet_size_, blank_);
+
+        if (fastemit_lambda_ > 0.0f) {
+            compute_fastemit_grad_kernel<128, ProbT><<<minibatch_ * maxT_ * maxU_, 128, 0, stream_>>>(grads, 
+                acts, denom, alphas, betas, llForward, input_lengths, label_lengths, labels, 
+                minibatch_, maxT_, maxU_, alphabet_size_, blank_, fastemit_lambda_);
+        } else {
+            compute_grad_kernel<128, ProbT><<<minibatch_ * maxT_ * maxU_, 128, 0, stream_>>>(grads, 
+                acts, denom, alphas, betas, llForward, input_lengths, label_lengths, labels, 
+                minibatch_, maxT_, maxU_, alphabet_size_, blank_);
+        }
 #if defined(DEBUG_TIME)
         cudaStreamSynchronize(stream_);
         end = std::chrono::high_resolution_clock::now();
diff --git a/include/detail/gpu_rnnt_kernel.h b/include/detail/gpu_rnnt_kernel.h
index 738d7ef..ed0b0e6 100644
--- a/include/detail/gpu_rnnt_kernel.h
+++ b/include/detail/gpu_rnnt_kernel.h
@@ -177,3 +177,50 @@ __global__ void compute_grad_kernel(Tp* grads, const Tp* const acts, const Tp* c
         }
     }
 }
+
+template<int NT, typename Tp>
+__global__ void compute_fastemit_grad_kernel(Tp* grads, const Tp* const acts, const Tp* const denom, const Tp* alphas, const Tp* betas, const Tp* const logll, const int* const xlen, const int* const ylen, 
+    const int* const mlabels, const int minibatch, const int maxT, const int maxU, const int alphabet_size, const int blank_, const Tp fastemit_lambda) {
+    int tid = threadIdx.x; // alphabet dim
+    int idx = tid;
+    int col = blockIdx.x; // mb, t, u
+
+    int u = col % maxU;
+    int bt = (col - u) / maxU;
+    int t = bt % maxT;
+    int mb = (bt - t) / maxT;
+
+    const int T = xlen[mb];
+    const int U = ylen[mb] + 1;
+    const int* labels = mlabels + mb * (maxU - 1);
+
+    if (t < T && u < U) {
+        while (idx < alphabet_size) {
+            Tp logpk = denom[col] + acts[col * alphabet_size + idx];
+            // Tp logpk = logp(denom, acts, maxT, maxU, alphabet_size, mb, t, u, idx);
+            Tp grad = exp(alphas[col] + betas[col] + logpk - logll[mb]);
+
+            Tp logy_btu1 = rnnt_helper::neg_inf<Tp>(); // log(y(t,u)) + log(beta(t, u+1))
+            if (u < U-1) {
+                logy_btu1 = denom[col] + acts[col * alphabet_size + labels[u]] + betas[col+1];
+            }
+            grad += fastemit_lambda * exp(alphas[col] + logy_btu1 + logpk - logll[mb]);
+
+            // grad to last blank transition
+            if (idx == blank_ && t == T-1 && u == U-1) {
+                grad -= exp(alphas[col] + logpk - logll[mb]);
+                grad -= fastemit_lambda * exp(alphas[col] + logy_btu1 + logpk - logll[mb]);
+            }
+            if (idx == blank_ && t < T-1) {
+                grad -= exp(alphas[col] + logpk - logll[mb] + betas[col + maxU]);
+            }
+            if (u < U-1 && idx == labels[u]) {
+                grad -= exp(alphas[col] + logpk - logll[mb] + betas[col+1]);
+                grad -= fastemit_lambda * exp(alphas[col] + logy_btu1 - logll[mb]);
+            }
+            grads[col * alphabet_size + idx] = grad;
+
+            idx += NT;
+        }
+    }
+}
diff --git a/include/rnnt.h b/include/rnnt.h
index 4759e0a..207065f 100644
--- a/include/rnnt.h
+++ b/include/rnnt.h
@@ -61,6 +61,8 @@ struct rnntOptions {
 
     /// memory structure
     bool batch_first;
+
+    float fastemit_lambda;
 };
 
 /** Compute the RNN Transducer loss between a sequence
diff --git a/mxnet_binding b/mxnet_binding
deleted file mode 160000
index 75874b2..0000000
--- a/mxnet_binding
+++ /dev/null
@@ -1 +0,0 @@
-Subproject commit 75874b21a476dd9f9e5480841f0872f9d7c2723e
diff --git a/pytorch_binding/README.md b/pytorch_binding/README.md
index cce44e7..4be1252 100644
--- a/pytorch_binding/README.md
+++ b/pytorch_binding/README.md
@@ -10,12 +10,14 @@ Install [PyTorch](https://github.com/pytorch/pytorch#installation).
 (i.e. `libwarprnnt.so`).  This defaults to `../build`, so from within a
 new warp-transducer clone you could build WarpRNNT like this:
 
+
 ```bash
 git clone https://github.com/HawkAaron/warp-transducer
 cd warp-transducer
 mkdir build; cd build
 cmake ..
 make
+sudo make install # optional
 ```
 
 Otherwise, set `WARP_RNNT_PATH` to wherever you have `libwarprnnt.so`
@@ -30,7 +32,7 @@ export CUDA_HOME="/usr/local/cuda"
 Now install the bindings: (Please make sure the GCC version >= 4.9)
 ```
 cd pytorch_binding
-python setup.py install
+pip install . # pip install . --break-package-system
 ```
 
 If you try the above and get a dlopen error on OSX with anaconda3 (as recommended by pytorch):
@@ -89,3 +91,33 @@ forward(acts, labels, act_lens, label_lens):
     label_lens: Tensor of (batch) containing label length of each example
     """
 ```
+
+## Troubleshooting: `cuda_runtime_api.h: No such file or directory`
+
+If you encounter an error like this during installation:
+
+`
+fatal error: cuda_runtime_api.h: No such file or directory
+`
+
+This usually means the CUDA headers can't be found by the compiler. If your system has CUDA installed (e.g., version 12.9) and the file `cuda_runtime_api.h` exists somewhere like:
+
+`
+/usr/local/cuda-12.9/targets/x86_64-linux/include/cuda_runtime_api.h
+`
+
+Then you can fix this by manually setting the environment variables to point to the correct CUDA paths:
+
+```bash
+export CUDA_HOME=/usr/local/cuda-12.9
+export CFLAGS="-I$CUDA_HOME/targets/x86_64-linux/include"
+export LDFLAGS="-L$CUDA_HOME/targets/x86_64-linux/lib"
+export PATH="$CUDA_HOME/bin:$PATH"
+export LD_LIBRARY_PATH="$CUDA_HOME/lib64:$LD_LIBRARY_PATH"
+```
+
+Then run the installation again:
+
+```bash
+pip install .
+```
diff --git a/pytorch_binding/setup.py b/pytorch_binding/setup.py
index ad960ea..af8c094 100644
--- a/pytorch_binding/setup.py
+++ b/pytorch_binding/setup.py
@@ -1,4 +1,4 @@
-from distutils.version import LooseVersion
+from packaging.version import Version
 import os
 import platform
 import sys
@@ -8,7 +8,9 @@
 
 
 extra_compile_args = ['-fPIC']
-if LooseVersion(torch.__version__) >= LooseVersion("1.5.0"):
+if Version(torch.__version__) >= Version("2.1.0"):
+    extra_compile_args += ['-std=c++17']
+elif Version(torch.__version__) >= Version("1.5.0"):
     extra_compile_args += ['-std=c++14']
 else:
     extra_compile_args += ['-std=c++11']
@@ -39,12 +41,15 @@
 
 setup(
     name='warprnnt_pytorch',
-    version="0.1",
+    version="0.2.0",
     description="PyTorch wrapper for RNN-Transducer",
     url="https://github.com/HawkAaron/warp-transducer",
     author="Mingkun Huang",
     author_email="mingkunhuang95@gmail.com",
     packages=find_packages(),
+    install_requires=[
+        'packaging'
+    ],
     ext_modules=[
         CppExtension(
             name='warprnnt_pytorch.warp_rnnt',
diff --git a/pytorch_binding/src/binding.cpp b/pytorch_binding/src/binding.cpp
index 7aaf000..95f5ff8 100644
--- a/pytorch_binding/src/binding.cpp
+++ b/pytorch_binding/src/binding.cpp
@@ -5,8 +5,7 @@
 #include "rnnt.h"
 
 #ifdef WARPRNNT_ENABLE_GPU
-    #include "THC.h"
-    extern THCState* state;
+    #include "c10/cuda/CUDACachingAllocator.h"
 #endif
 
 int cpu_rnnt(torch::Tensor acts,
@@ -16,6 +15,7 @@ int cpu_rnnt(torch::Tensor acts,
             torch::Tensor costs,
             torch::Tensor grads,
             int blank_label,
+            float fastemit_lambda,
             int num_threads) {
 
     int maxT = acts.size(0);
@@ -23,16 +23,17 @@ int cpu_rnnt(torch::Tensor acts,
     int minibatch_size = acts.size(2);
     int alphabet_size = acts.size(3);
 
-	if (true) {
-		minibatch_size = acts.size(0);
-		maxT = acts.size(1);
-		maxU = acts.size(2);
-	}
+    if (true) {
+      minibatch_size = acts.size(0);
+      maxT = acts.size(1);
+      maxU = acts.size(2);
+    }
 
     rnntOptions options;
     memset(&options, 0, sizeof(options));
     options.maxT = maxT;
     options.maxU = maxU;
+    options.fastemit_lambda = fastemit_lambda;
     options.blank_label = blank_label;
     options.batch_first = true;
     options.loc = RNNT_CPU;
@@ -43,33 +44,33 @@ int cpu_rnnt(torch::Tensor acts,
 #endif
 
     size_t cpu_size_bytes = 0;
-    switch (acts.type().scalarType()) {
-      case torch::ScalarType::Float:
+    switch (acts.scalar_type()) {
+      case torch::kFloat:
         {
         get_workspace_size(maxT, maxU, minibatch_size,
                            false, &cpu_size_bytes);
 
         float* cpu_workspace = (float*) new unsigned char[cpu_size_bytes];
-        compute_rnnt_loss(acts.data<float>(), grads.data<float>(),
-                         labels.data<int>(), label_lengths.data<int>(),
-                         input_lengths.data<int>(), alphabet_size,
-                         minibatch_size, costs.data<float>(),
+        compute_rnnt_loss(acts.data_ptr<float>(), grads.data_ptr<float>(),
+                         labels.data_ptr<int>(), label_lengths.data_ptr<int>(),
+                         input_lengths.data_ptr<int>(), alphabet_size,
+                         minibatch_size, costs.data_ptr<float>(),
                          cpu_workspace, options);
 
         delete cpu_workspace;
         return 0;
         }
-      case torch::ScalarType::Double:
+      case torch::kDouble:
         {
         get_workspace_size(maxT, maxU, minibatch_size,
                            false, &cpu_size_bytes,
                            sizeof(double));
 
         double* cpu_workspace = (double*) new unsigned char[cpu_size_bytes];
-        compute_rnnt_loss_fp64(acts.data<double>(), grads.data<double>(),
-                         labels.data<int>(), label_lengths.data<int>(),
-                         input_lengths.data<int>(), alphabet_size,
-                         minibatch_size, costs.data<double>(),
+        compute_rnnt_loss_fp64(acts.data_ptr<double>(), grads.data_ptr<double>(),
+                         labels.data_ptr<int>(), label_lengths.data_ptr<int>(),
+                         input_lengths.data_ptr<int>(), alphabet_size,
+                         minibatch_size, costs.data_ptr<double>(),
                          cpu_workspace, options);
 
         delete cpu_workspace;
@@ -88,6 +89,7 @@ int gpu_rnnt(torch::Tensor acts,
             torch::Tensor costs,
             torch::Tensor grads,
             int blank_label,
+            float fastemit_lambda,
             int num_threads) {
 
     int minibatch_size = acts.size(0);
@@ -102,14 +104,15 @@ int gpu_rnnt(torch::Tensor acts,
     options.blank_label = blank_label;
     options.loc = RNNT_GPU;
     options.stream = at::cuda::getCurrentCUDAStream();
+    options.fastemit_lambda = fastemit_lambda;
     options.num_threads = num_threads;
 #if defined(RNNT_DISABLE_OMP) || defined(APPLE)
     // have to use at least one
     options.num_threads = std::max(options.num_threads, (unsigned int) 1);
 #endif
 
-    switch (acts.type().scalarType()) {
-      case torch::ScalarType::Float:
+    switch (acts.scalar_type()) {
+      case torch::kFloat:
         {
         size_t gpu_size_bytes;
         get_workspace_size(maxT, maxU, minibatch_size,
@@ -117,18 +120,22 @@ int gpu_rnnt(torch::Tensor acts,
 
         cudaSetDevice(acts.get_device());
 
-        void* gpu_workspace = THCudaMalloc(state, gpu_size_bytes);
+        void* gpu_workspace = c10::cuda::CUDACachingAllocator::raw_alloc(gpu_size_bytes);
+        if (gpu_workspace == nullptr) {
+            std::cerr << __FILE__ << ':' << __LINE__ << ": " << "failed to allocate GPU workspace" << std::endl;
+            return -1;
+        }
 
-        compute_rnnt_loss(acts.data<float>(), grads.data<float>(),
-                         labels.data<int>(), label_lengths.data<int>(),
-                         input_lengths.data<int>(), alphabet_size,
-                         minibatch_size, costs.data<float>(),
+        compute_rnnt_loss(acts.data_ptr<float>(), grads.data_ptr<float>(),
+                         labels.data_ptr<int>(), label_lengths.data_ptr<int>(),
+                         input_lengths.data_ptr<int>(), alphabet_size,
+                         minibatch_size, costs.data_ptr<float>(),
                          gpu_workspace, options);
 
-        THCudaFree(state, gpu_workspace);
+        c10::cuda::CUDACachingAllocator::raw_delete(gpu_workspace);
         return 0;
         }
-      case torch::ScalarType::Double:
+      case torch::kDouble:
         {
         size_t gpu_size_bytes;
         get_workspace_size(maxT, maxU, minibatch_size,
@@ -136,15 +143,19 @@ int gpu_rnnt(torch::Tensor acts,
 
         cudaSetDevice(acts.get_device());
 
-        void* gpu_workspace = THCudaMalloc(state, gpu_size_bytes);
+        void* gpu_workspace = c10::cuda::CUDACachingAllocator::raw_alloc(gpu_size_bytes);
+        if (gpu_workspace == nullptr) {
+            std::cerr << __FILE__ << ':' << __LINE__ << ": " << "failed to allocate GPU workspace" << std::endl;
+            return -1;
+        }
 
-        compute_rnnt_loss_fp64(acts.data<double>(), grads.data<double>(),
-                         labels.data<int>(), label_lengths.data<int>(),
-                         input_lengths.data<int>(), alphabet_size,
-                         minibatch_size, costs.data<double>(),
+        compute_rnnt_loss_fp64(acts.data_ptr<double>(), grads.data_ptr<double>(),
+                         labels.data_ptr<int>(), label_lengths.data_ptr<int>(),
+                         input_lengths.data_ptr<int>(), alphabet_size,
+                         minibatch_size, costs.data_ptr<double>(),
                          gpu_workspace, options);
 
-        THCudaFree(state, gpu_workspace);
+        c10::cuda::CUDACachingAllocator::raw_delete(gpu_workspace);
         return 0;
         }
       default:
diff --git a/pytorch_binding/test/test.py b/pytorch_binding/test/test.py
index 14feb09..c378ed0 100644
--- a/pytorch_binding/test/test.py
+++ b/pytorch_binding/test/test.py
@@ -23,28 +23,24 @@
 
 fn = rnntloss() if args.np else RNNTLoss(reduction='sum')
 
-gpu = 1
+#acts = autograd.Variable(acts, requires_grad=True)
 def wrap_and_call(acts, labels):
     acts = torch.FloatTensor(acts)
-    if use_cuda:
-        acts = acts.cuda(gpu)
-    #acts = autograd.Variable(acts, requires_grad=True)
+    device = torch.device("cuda" if use_cuda else "cpu")
+    acts = acts.to(device)
     acts.requires_grad = True
 
     lengths = [acts.shape[1]] * acts.shape[0]
     label_lengths = [len(l) for l in labels]
-    labels = torch.IntTensor(labels)
-    lengths = torch.IntTensor(lengths)
-    label_lengths = torch.IntTensor(label_lengths)
-    if use_cuda:
-        labels = labels.cuda(gpu)
-        lengths = lengths.cuda(gpu)
-        label_lengths = label_lengths.cuda(gpu)
+
+    labels = torch.IntTensor(labels).to(device)
+    lengths = torch.IntTensor(lengths).to(device)
+    label_lengths = torch.IntTensor(label_lengths).to(device)
 
     costs = fn(acts, labels, lengths, label_lengths)
     cost = torch.sum(costs)
     cost.backward()
-    # print(repr(acts.grad.data.cpu().numpy()))
+
     return costs.data.cpu().numpy(), acts.grad.data.cpu().numpy()
 
 
diff --git a/pytorch_binding/warprnnt_pytorch/__init__.py b/pytorch_binding/warprnnt_pytorch/__init__.py
index 76c6c30..8f1d261 100644
--- a/pytorch_binding/warprnnt_pytorch/__init__.py
+++ b/pytorch_binding/warprnnt_pytorch/__init__.py
@@ -9,12 +9,13 @@
 
 class _RNNT(Function):
     @staticmethod
-    def forward(ctx, acts, labels, act_lens, label_lens, blank, reduction):
+    def forward(ctx, acts, labels, act_lens, label_lens, blank, reduction, fastemit_lambda):
         """
         acts: Tensor of (batch x seqLength x labelLength x outputDim) containing output from network
         labels: 2 dimensional Tensor containing all the targets of the batch with zero padded
         act_lens: Tensor of size (batch) containing size of each output sequence from the network
         label_lens: Tensor of (batch) containing label length of each example
+        fastemit_lambda: Regularization parameter for FastEmit (https://arxiv.org/pdf/2010.11148.pdf)
         """
         is_cuda = acts.is_cuda
 
@@ -31,6 +32,7 @@ def forward(ctx, acts, labels, act_lens, label_lens, blank, reduction):
                   costs,
                   grads,
                   blank,
+                  fastemit_lambda,
                   0)
 
         if reduction in ['sum', 'mean']:
@@ -47,10 +49,10 @@ def forward(ctx, acts, labels, act_lens, label_lens, blank, reduction):
     @staticmethod
     def backward(ctx, grad_output):
         grad_output = grad_output.view(-1, 1, 1, 1).to(ctx.grads)
-        return ctx.grads.mul_(grad_output), None, None, None, None, None
+        return ctx.grads.mul_(grad_output), None, None, None, None, None, None
 
 
-def rnnt_loss(acts, labels, act_lens, label_lens, blank=0, reduction='mean'):
+def rnnt_loss(acts, labels, act_lens, label_lens, blank=0, reduction='mean', fastemit_lambda=0.001):
     """ RNN Transducer Loss
 
     Args:
@@ -63,11 +65,12 @@ def rnnt_loss(acts, labels, act_lens, label_lens, blank=0, reduction='mean'):
             'none' | 'mean' | 'sum'. 'none': no reduction will be applied, 
             'mean': the output losses will be divided by the target lengths and
             then the mean over the batch is taken. Default: 'mean'
+       fastemit_lambda: Regularization parameter for FastEmit (https://arxiv.org/pdf/2010.11148.pdf)
     """
     if not acts.is_cuda:
         acts = torch.nn.functional.log_softmax(acts, -1)
 
-    return _RNNT.apply(acts, labels, act_lens, label_lens, blank, reduction)
+    return _RNNT.apply(acts, labels, act_lens, label_lens, blank, reduction, fastemit_lambda)
 
 
 class RNNTLoss(Module):
@@ -79,10 +82,11 @@ class RNNTLoss(Module):
             'mean': the output losses will be divided by the target lengths and
             then the mean over the batch is taken. Default: 'mean'
     """
-    def __init__(self, blank=0, reduction='mean'):
+    def __init__(self, blank=0, fastemit_lambda=0.001, reduction='mean'):
         super(RNNTLoss, self).__init__()
         self.blank = blank
         self.reduction = reduction
+        self.fastemit_lambda = fastemit_lambda
         self.loss = _RNNT.apply
 
     def forward(self, acts, labels, act_lens, label_lens):
@@ -97,7 +101,7 @@ def forward(self, acts, labels, act_lens, label_lens):
             # log_softmax is computed within GPU version.
             acts = torch.nn.functional.log_softmax(acts, -1)
 
-        return self.loss(acts, labels, act_lens, label_lens, self.blank, self.reduction)
+        return self.loss(acts, labels, act_lens, label_lens, self.blank, self.reduction, self.fastemit_lambda)
 
 
 def check_type(var, t, name):
diff --git a/src/rnnt_entrypoint.cpp b/src/rnnt_entrypoint.cpp
deleted file mode 100644
index 0d742d8..0000000
--- a/src/rnnt_entrypoint.cpp
+++ /dev/null
@@ -1,187 +0,0 @@
-#include <cstddef>
-#include <iostream>
-#include <algorithm>
-
-#include <rnnt.h>
-
-#include "detail/cpu_rnnt.h"
-#ifdef __CUDACC__
-    #include "detail/gpu_rnnt.h"
-#endif
-
-extern "C" {
-
-int get_warprnnt_version() {
-    return 1;
-}
-
-const char* rnntGetStatusString(rnntStatus_t status) {
-    switch (status) {
-    case RNNT_STATUS_SUCCESS:
-        return "no error";
-    case RNNT_STATUS_MEMOPS_FAILED:
-        return "cuda memcpy or memset failed";
-    case RNNT_STATUS_INVALID_VALUE:
-        return "invalid value";
-    case RNNT_STATUS_EXECUTION_FAILED:
-        return "execution failed";
-
-    case RNNT_STATUS_UNKNOWN_ERROR:
-    default:
-        return "unknown error";
-
-    }
-
-}
-
-
-rnntStatus_t compute_rnnt_loss(const float* const activations, //BTUV
-                             float* gradients,
-                             const int* const flat_labels,
-                             const int* const label_lengths,
-                             const int* const input_lengths,
-                             int alphabet_size,
-                             int minibatch,
-                             float *costs,
-                             void *workspace,
-                             rnntOptions options) {
-
-    if (activations == nullptr ||
-        flat_labels == nullptr ||
-        label_lengths == nullptr ||
-        input_lengths == nullptr ||
-        costs == nullptr ||
-        workspace == nullptr ||
-        alphabet_size <= 0 ||
-        minibatch <= 0 ||
-        options.maxT <= 0 ||
-        options.maxU <= 0)
-        return RNNT_STATUS_INVALID_VALUE;
-
-    if (options.loc == RNNT_CPU) {
-        CpuRNNT<float> rnnt(minibatch, options.maxT, options.maxU, alphabet_size, workspace, 
-                                options.blank_label, options.num_threads, options.batch_first);
-
-        if (gradients != NULL)
-            return rnnt.cost_and_grad(activations, gradients,
-                                        costs,
-                                        flat_labels, label_lengths,
-                                        input_lengths);
-        else
-            return rnnt.score_forward(activations, costs, flat_labels,
-                                        label_lengths, input_lengths);
-    } else if (options.loc == RNNT_GPU) {
-#ifdef __CUDACC__
-        GpuRNNT<float> rnnt(minibatch, options.maxT, options.maxU, alphabet_size, workspace,
-                                options.blank_label, options.num_threads, options.stream);
-
-        if (gradients != NULL)
-            return rnnt.cost_and_grad(activations, gradients,
-                                        costs,
-                                        flat_labels, label_lengths,
-                                        input_lengths);
-        else
-            return rnnt.score_forward(activations, costs, flat_labels,
-                                        label_lengths, input_lengths);
-#else
-        std::cerr << "GPU execution requested, but not compiled with GPU support" << std::endl;
-        return RNNT_STATUS_EXECUTION_FAILED;
-#endif
-    } else {
-        return RNNT_STATUS_INVALID_VALUE;
-    }
-}
-
-
-rnntStatus_t get_workspace_size(int maxT, int maxU,
-                               int minibatch,
-                               bool gpu,
-                               size_t* size_bytes,
-                               size_t dtype_size)
-{
-    if (minibatch <= 0 ||
-        maxT <= 0 ||
-        maxU <= 0)
-        return RNNT_STATUS_INVALID_VALUE;
-
-    *size_bytes = 0;
-
-    // per minibatch memory
-    size_t per_minibatch_bytes = 0;
-
-    // alphas & betas
-    per_minibatch_bytes += dtype_size * maxT * maxU * 2;
-
-    if (!gpu) {
-        // blank & label log probability cache
-        per_minibatch_bytes += dtype_size * maxT * maxU * 2;
-    } else {
-        // softmax denominator
-        per_minibatch_bytes += dtype_size * maxT * maxU;
-        // forward-backward loglikelihood
-        per_minibatch_bytes += dtype_size * 2;
-    }
-
-    *size_bytes = per_minibatch_bytes * minibatch;
-
-    return RNNT_STATUS_SUCCESS;
-}
-
-rnntStatus_t compute_rnnt_loss_fp64(const double* const activations, //BTUV
-                             double* gradients,
-                             const int* const flat_labels,
-                             const int* const label_lengths,
-                             const int* const input_lengths,
-                             int alphabet_size,
-                             int minibatch,
-                             double *costs,
-                             void *workspace,
-                             rnntOptions options) {
-
-    if (activations == nullptr ||
-        flat_labels == nullptr ||
-        label_lengths == nullptr ||
-        input_lengths == nullptr ||
-        costs == nullptr ||
-        workspace == nullptr ||
-        alphabet_size <= 0 ||
-        minibatch <= 0 ||
-        options.maxT <= 0 ||
-        options.maxU <= 0)
-        return RNNT_STATUS_INVALID_VALUE;
-
-    if (options.loc == RNNT_CPU) {
-        CpuRNNT<double> rnnt(minibatch, options.maxT, options.maxU, alphabet_size, workspace, 
-                                options.blank_label, options.num_threads, options.batch_first);
-
-        if (gradients != NULL)
-            return rnnt.cost_and_grad(activations, gradients,
-                                        costs,
-                                        flat_labels, label_lengths,
-                                        input_lengths);
-        else
-            return rnnt.score_forward(activations, costs, flat_labels,
-                                        label_lengths, input_lengths);
-    } else if (options.loc == RNNT_GPU) {
-#ifdef __CUDACC__
-        GpuRNNT<double> rnnt(minibatch, options.maxT, options.maxU, alphabet_size, workspace,
-                                options.blank_label, options.num_threads, options.stream);
-
-        if (gradients != NULL)
-            return rnnt.cost_and_grad(activations, gradients,
-                                        costs,
-                                        flat_labels, label_lengths,
-                                        input_lengths);
-        else
-            return rnnt.score_forward(activations, costs, flat_labels,
-                                        label_lengths, input_lengths);
-#else
-        std::cerr << "GPU execution requested, but not compiled with GPU support" << std::endl;
-        return RNNT_STATUS_EXECUTION_FAILED;
-#endif
-    } else {
-        return RNNT_STATUS_INVALID_VALUE;
-    }
-}
-
-}
diff --git a/src/rnnt_entrypoint.cu b/src/rnnt_entrypoint.cu
deleted file mode 120000
index c483c3c..0000000
--- a/src/rnnt_entrypoint.cu
+++ /dev/null
@@ -1 +0,0 @@
-rnnt_entrypoint.cpp
\ No newline at end of file
diff --git a/src/rnnt_entrypoint.cu b/src/rnnt_entrypoint.cu
new file mode 100644
index 0000000..c213f4c
--- /dev/null
+++ b/src/rnnt_entrypoint.cu
@@ -0,0 +1,197 @@
+#include <cstddef>
+#include <iostream>
+#include <algorithm>
+
+#include <rnnt.h>
+
+#include "detail/cpu_rnnt.h"
+#ifdef __CUDACC__
+    #include "detail/gpu_rnnt.h"
+#endif
+
+extern "C" {
+
+int get_warprnnt_version() {
+    return 1;
+}
+
+const char* rnntGetStatusString(rnntStatus_t status) {
+    switch (status) {
+    case RNNT_STATUS_SUCCESS:
+        return "no error";
+    case RNNT_STATUS_MEMOPS_FAILED:
+        return "cuda memcpy or memset failed";
+    case RNNT_STATUS_INVALID_VALUE:
+        return "invalid value";
+    case RNNT_STATUS_EXECUTION_FAILED:
+        return "execution failed";
+
+    case RNNT_STATUS_UNKNOWN_ERROR:
+    default:
+        return "unknown error";
+
+    }
+
+}
+
+
+rnntStatus_t compute_rnnt_loss(const float* const activations, //BTUV
+                             float* gradients,
+                             const int* const flat_labels,
+                             const int* const label_lengths,
+                             const int* const input_lengths,
+                             int alphabet_size,
+                             int minibatch,
+                             float *costs,
+                             void *workspace,
+                             rnntOptions options) {
+
+    if (activations == nullptr ||
+        flat_labels == nullptr ||
+        label_lengths == nullptr ||
+        input_lengths == nullptr ||
+        costs == nullptr ||
+        workspace == nullptr ||
+        alphabet_size <= 0 ||
+        minibatch <= 0 ||
+        options.maxT <= 0 ||
+        options.maxU <= 0 ||
+        options.fastemit_lambda < 0)
+        return RNNT_STATUS_INVALID_VALUE;
+
+    if (options.loc == RNNT_CPU) {
+        CpuRNNT<float> rnnt(minibatch, options.maxT, options.maxU, alphabet_size, workspace, 
+                            options.blank_label, options.fastemit_lambda, options.num_threads, options.batch_first);
+
+        if (gradients != NULL)
+            return rnnt.cost_and_grad(activations, gradients,
+                                        costs,
+                                        flat_labels, label_lengths,
+                                        input_lengths);
+        else
+            return rnnt.score_forward(activations, costs, flat_labels,
+                                        label_lengths, input_lengths);
+    } else if (options.loc == RNNT_GPU) {
+#ifdef __CUDACC__
+        GpuRNNT<float> rnnt(minibatch, options.maxT, options.maxU, alphabet_size, workspace,
+                            options.blank_label, options.fastemit_lambda, options.num_threads, options.stream);
+
+        if (gradients != NULL)
+            return rnnt.cost_and_grad(activations, gradients,
+                                        costs,
+                                        flat_labels, label_lengths,
+                                        input_lengths);
+        else
+            return rnnt.score_forward(activations, costs, flat_labels,
+                                        label_lengths, input_lengths);
+#else
+        std::cerr << "GPU execution requested, but not compiled with GPU support" << std::endl;
+        return RNNT_STATUS_EXECUTION_FAILED;
+#endif
+    } else {
+        return RNNT_STATUS_INVALID_VALUE;
+    }
+}
+
+
+rnntStatus_t get_workspace_size(int maxT, int maxU,
+                               int minibatch,
+                               bool gpu,
+                               size_t* size_bytes,
+                               size_t dtype_size)
+{
+    if (minibatch <= 0 ||
+        maxT <= 0 ||
+        maxU <= 0)
+        return RNNT_STATUS_INVALID_VALUE;
+
+    *size_bytes = 0;
+
+    // per minibatch memory
+    size_t per_minibatch_bytes = 0;
+
+    // alphas & betas
+    per_minibatch_bytes += dtype_size * maxT * maxU * 2;
+
+    if (!gpu) {
+        // blank & label log probability cache
+        per_minibatch_bytes += dtype_size * maxT * maxU * 2;
+    } else {
+        // softmax denominator
+        per_minibatch_bytes += dtype_size * maxT * maxU;
+        // forward-backward loglikelihood
+        per_minibatch_bytes += dtype_size * 2;
+    }
+
+    *size_bytes = per_minibatch_bytes * minibatch;
+
+    return RNNT_STATUS_SUCCESS;
+}
+
+rnntStatus_t compute_rnnt_loss_fp64(const double* const activations, //BTUV
+                             double* gradients,
+                             const int* const flat_labels,
+                             const int* const label_lengths,
+                             const int* const input_lengths,
+                             int alphabet_size,
+                             int minibatch,
+                             double *costs,
+                             void *workspace,
+                             rnntOptions options) {
+
+    if (activations == nullptr ||
+        flat_labels == nullptr ||
+        label_lengths == nullptr ||
+        input_lengths == nullptr ||
+        costs == nullptr ||
+        workspace == nullptr ||
+        alphabet_size <= 0 ||
+        minibatch <= 0 ||
+        options.maxT <= 0 ||
+        options.maxU <= 0 ||
+        options.fastemit_lambda < 0)
+        return RNNT_STATUS_INVALID_VALUE;
+
+    if (options.loc == RNNT_CPU) {
+        CpuRNNT<double> rnnt(minibatch, options.maxT, options.maxU, alphabet_size, workspace, 
+                                options.blank_label, options.fastemit_lambda, options.num_threads, options.batch_first);
+
+        if (gradients != NULL)
+            return rnnt.cost_and_grad(activations, gradients,
+                                        costs,
+                                        flat_labels, label_lengths,
+                                        input_lengths);
+        else
+            return rnnt.score_forward(activations, costs, flat_labels,
+                                        label_lengths, input_lengths);
+    } else if (options.loc == RNNT_GPU) {
+#ifdef __CUDACC__
+        GpuRNNT<double> rnnt(minibatch, options.maxT, options.maxU, alphabet_size, workspace,
+                                options.blank_label, options.fastemit_lambda, options.num_threads, options.stream);
+
+        if (gradients != NULL)
+            return rnnt.cost_and_grad(activations, gradients,
+                                        costs,
+                                        flat_labels, label_lengths,
+                                        input_lengths);
+        else
+            return rnnt.score_forward(activations, costs, flat_labels,
+                                        label_lengths, input_lengths);
+#else
+        std::cerr << "GPU execution requested, but not compiled with GPU support" << std::endl;
+        return RNNT_STATUS_EXECUTION_FAILED;
+#endif
+    } else {
+        return RNNT_STATUS_INVALID_VALUE;
+    }
+}
+
+const char* rnnt_backend_mode() {
+#ifdef __CUDACC__
+    return "GPU";
+#else
+    return "CPU";
+#endif
+}
+
+}
diff --git a/tensorflow_binding/README.md b/tensorflow_binding/README.md
deleted file mode 100644
index b35ae4c..0000000
--- a/tensorflow_binding/README.md
+++ /dev/null
@@ -1,97 +0,0 @@
-# TensorFlow binding for WarpRNNT
-
-This package provides TensorFlow kernels that wrap the WarpRNNT
-library.
-
-## Installation
-
-To build the kernels it is necessary to have the TensorFlow source
-code available, since TensorFlow doesn't currently install the
-necessary headers to handle the SparseTensor that the CTCLoss op uses
-to input the labels.  You can retrieve the TensorFlow source from
-github.com:
-
-```bash
-git clone https://github.com/tensorflow/tensorflow.git
-```
-<!--
-Tell the build scripts where you have the TensorFlow source tree by
-setting the `TENSORFLOW_SRC_PATH` environment variable:
-
-```bash
-export TENSORFLOW_SRC_PATH=/path/to/tensorflow
-```
--->
-`WARP_RNNT_PATH` should be set to the location of a built WarpRNNT
-(i.e. `libwarprnnt.so`).  This defaults to `../build`, so from within a
-new warp-rnnt clone you could build WarpRNNT like this:
-
-```bash
-mkdir build; cd build
-cmake -DCUDA_TOOLKIT_ROOT_DIR=/path/to/cuda ..
-make
-```
-
-Otherwise, set `WARP_RNNT_PATH` to wherever you have `libwarprnnt.so`
-installed. If you have a GPU, you should also make sure that
-`CUDA_HOME` is set to the home cuda directory (i.e. where
-`include/cuda.h` and `lib/libcudart.so` live).
-
-You should now be able to use `setup.py` to install the package into
-your current Python environment:
-
-```bash
-CUDA=/path/to/cuda python setup.py install
-```
-
-You can run a few unit tests with `setup.py` as well if you want:
-
-```bash
-python setup.py test
-```
-
-## Using the kernels
-
-First import the module:
-
-```python
-import warprnnt_tensorflow
-```
-
-The WarpRNNT op is available via the `warprnnt_tensorflow.rnnt_loss` function:
-
-```python
-costs = warprnnt_tensorflow.rnnt_loss(acts, labels, input_lengths, label_lengths)
-```
-
-The `acts` is a 4 dimensional Tensor, `labels`
-is 2 dimensinal Tensor, and all the others are single dimension Tensors.
-See the main WarpRNNT documentation for more information.
-
-## Python interface
-```python
-def rnnt_loss(acts, labels, input_lengths, label_lengths, blank_label=0):
-    '''Computes the RNNT loss between a sequence of activations and a
-    ground truth labeling.
-    Args:
-        acts: A 4-D Tensor of floats.  The dimensions
-                     should be (B, T, U+1, V), where B is the minibatch index,
-                     T is the time index, U is the label sequence
-                     length (+1 means blank label prepanded), 
-                     and V indexes over activations for each 
-                     symbol in the alphabet.
-        labels: A 2-D Tensor of ints, a padded label sequences to make sure 
-                     labels for the minibatch are same length.
-        input_lengths: A 1-D Tensor of ints, the number of time steps
-                       for each sequence in the minibatch.
-        label_lengths: A 1-D Tensor of ints, the length of each label
-                       for each example in the minibatch.
-        blank_label: int, the label value/index that the RNNT
-                     calculation should use as the blank label
-    Returns:
-        1-D float Tensor, the cost of each example in the minibatch
-        (as negative log probabilities).
-    * This class performs the softmax operation internally.
-    * The label reserved for the blank symbol should be label 0.
-    '''
-```
diff --git a/tensorflow_binding/setup.py b/tensorflow_binding/setup.py
deleted file mode 100644
index 170f4d6..0000000
--- a/tensorflow_binding/setup.py
+++ /dev/null
@@ -1,149 +0,0 @@
-"""setup.py script for warp-rnnt TensorFlow wrapper"""
-
-from __future__ import print_function
-
-import os
-import platform
-import re
-import warnings
-import setuptools
-import sys
-import unittest
-from setuptools.command.build_ext import build_ext as orig_build_ext
-from distutils.version import LooseVersion
-
-# We need to import tensorflow to find where its include directory is.
-try:
-    import tensorflow as tf
-except ImportError:
-    raise RuntimeError("Tensorflow must be installed to build the tensorflow wrapper.")
-
-if "CUDA_HOME" not in os.environ:
-    print("CUDA_HOME not found in the environment so building "
-          "without GPU support. To build with GPU support "
-          "please define the CUDA_HOME environment variable. "
-          "This should be a path which contains include/cuda.h",
-          file=sys.stderr)
-    enable_gpu = False
-else:
-    enable_gpu = True
-
-if platform.system() == 'Darwin':
-    lib_ext = ".dylib"
-else:
-    lib_ext = ".so"
-
-warp_rnnt_path = "../build"
-if "WARP_RNNT_PATH" in os.environ:
-    warp_rnnt_path = os.environ["WARP_RNNT_PATH"]
-if not os.path.exists(os.path.join(warp_rnnt_path, "libwarprnnt"+lib_ext)):
-    print(("Could not find libwarprnnt.so in {}.\n"
-           "Build warp-rnnt and set WARP_RNNT_PATH to the location of"
-           " libwarprnnt.so (default is '../build')").format(warp_rnnt_path),
-          file=sys.stderr)
-    sys.exit(1)
-
-root_path = os.path.realpath(os.path.dirname(__file__))
-
-tf_include = tf.sysconfig.get_include()
-tf_src_dir = tf.sysconfig.get_lib() # os.environ["TENSORFLOW_SRC_PATH"]
-tf_includes = [tf_include, tf_src_dir]
-warp_rnnt_includes = [os.path.join(root_path, '../include')]
-include_dirs = tf_includes + warp_rnnt_includes
-
-if LooseVersion(tf.__version__) >= LooseVersion('1.4'):
-    nsync_dir = '../../external/nsync/public'
-    if LooseVersion(tf.__version__) >= LooseVersion('1.10'):
-        nsync_dir = 'external/nsync/public'
-    include_dirs += [os.path.join(tf_include, nsync_dir)]
-
-if os.getenv("TF_CXX11_ABI") is not None:
-    TF_CXX11_ABI = os.getenv("TF_CXX11_ABI")
-else:
-    warnings.warn("Assuming tensorflow was compiled without C++11 ABI. "
-                  "It is generally true if you are using binary pip package. "
-                  "If you compiled tensorflow from source with gcc >= 5 and didn't set "
-                  "-D_GLIBCXX_USE_CXX11_ABI=0 during compilation, you need to set "
-                  "environment variable TF_CXX11_ABI=1 when compiling this bindings. "
-                  "Also be sure to touch some files in src to trigger recompilation. "
-                  "Also, you need to set (or unsed) this environment variable if getting "
-                  "undefined symbol: _ZN10tensorflow... errors")
-    TF_CXX11_ABI = "0"
-
-extra_compile_args = ['-std=c++11', '-fPIC', '-D_GLIBCXX_USE_CXX11_ABI=' + TF_CXX11_ABI]
-# current tensorflow code triggers return type errors, silence those for now
-extra_compile_args += ['-Wno-return-type']
-if LooseVersion(tf.__version__) >= LooseVersion('1.4'):
-    extra_compile_args += tf.sysconfig.get_compile_flags()
-
-extra_link_args = []
-if LooseVersion(tf.__version__) >= LooseVersion('1.4'):
-    extra_link_args += tf.sysconfig.get_link_flags()
-
-if (enable_gpu):
-    extra_compile_args += ['-DWARPRNNT_ENABLE_GPU']
-    include_dirs += [os.path.join(os.environ["CUDA_HOME"], 'include')]
-
-    # mimic tensorflow cuda include setup so that their include command work
-    if not os.path.exists(os.path.join(root_path, "include")):
-        os.mkdir(os.path.join(root_path, "include"))
-
-    cuda_inc_path = os.path.join(root_path, "include/cuda")
-    if not os.path.exists(cuda_inc_path) or os.readlink(cuda_inc_path) != os.environ["CUDA_HOME"]:
-        if os.path.exists(cuda_inc_path):
-            os.remove(cuda_inc_path)
-        os.symlink(os.environ["CUDA_HOME"], cuda_inc_path)
-    include_dirs += [os.path.join(root_path, 'include')]
-
-# Ensure that all expected files and directories exist.
-for loc in include_dirs:
-    if not os.path.exists(loc):
-        print(("Could not find file or directory {}.\n"
-               "Check your environment variables and paths?").format(loc),
-              file=sys.stderr)
-        sys.exit(1)
-
-lib_srcs = ['src/warprnnt_op.cc']
-
-ext = setuptools.Extension('warprnnt_tensorflow.kernels',
-                           sources = lib_srcs,
-                           language = 'c++',
-                           include_dirs = include_dirs,
-                           library_dirs = [warp_rnnt_path],
-                           runtime_library_dirs = [os.path.realpath(warp_rnnt_path)],
-                           libraries = ['warprnnt'],
-                           extra_compile_args = extra_compile_args,
-                           extra_link_args = extra_link_args)
-
-class build_tf_ext(orig_build_ext):
-    def build_extensions(self):
-        if LooseVersion(tf.__version__) < LooseVersion('1.4'):
-            self.compiler.compiler_so.remove('-Wstrict-prototypes')
-        orig_build_ext.build_extensions(self)
-
-def discover_test_suite():
-    test_loader = unittest.TestLoader()
-    test_suite = test_loader.discover('tests', pattern='test_*.py')
-    return test_suite
-
-# Read the README.md file for the long description. This lets us avoid
-# duplicating the package description in multiple places in the source.
-README_PATH = os.path.join(os.path.dirname(__file__), "README.md")
-with open(README_PATH, "r") as handle:
-    # Extract everything between the first set of ## headlines
-    LONG_DESCRIPTION = re.search("#.*([^#]*)##", handle.read()).group(1).strip()
-
-setuptools.setup(
-    name = "warprnnt_tensorflow",
-    version = "0.1",
-    description = "TensorFlow wrapper for warp-transducer",
-    url="https://github.com/HawkAaron/warp-transducer",
-    long_description = LONG_DESCRIPTION,
-    author = "Mingkun Huang",
-    author_email = "mingkunhuang95@gmail.com",
-    license = "Apache",
-    packages = ["warprnnt_tensorflow"],
-    ext_modules = [ext],
-    cmdclass = {'build_ext': build_tf_ext},
-    test_suite = 'setup.discover_test_suite',
-)
diff --git a/tensorflow_binding/src/warprnnt_op.cc b/tensorflow_binding/src/warprnnt_op.cc
deleted file mode 100644
index e3d2210..0000000
--- a/tensorflow_binding/src/warprnnt_op.cc
+++ /dev/null
@@ -1,191 +0,0 @@
-#ifdef WARPRNNT_ENABLE_GPU
-#define EIGEN_USE_GPU
-#include <cuda.h>
-#endif
-
-#include "tensorflow/core/framework/op.h"
-#include "tensorflow/core/framework/op_kernel.h"
-#include "tensorflow/core/framework/bounds_check.h"
-#include "tensorflow/core/framework/allocator.h"
-#include "rnnt.h"
-
-
-REGISTER_OP("WarpRNNT")
-    .Input("acts: float32")
-    .Input("labels: int32")
-    .Input("input_lengths: int32")
-    .Input("label_lengths: int32")
-    .Attr("blank_label: int = 0")
-    .Output("costs: float32")
-    .Output("grads: float32");
-
-namespace tf = tensorflow;
-
-namespace warp_rnnt {
-
-class WarpRNNTOpBase : public tf::OpKernel {
-  public:
-    explicit WarpRNNTOpBase(tf::OpKernelConstruction* ctx) : tf::OpKernel(ctx) {
-        OP_REQUIRES_OK(ctx, ctx->GetAttr("blank_label", &blank_label_));
-    }
-
-    void Compute(tf::OpKernelContext* ctx) override {
-        // Grab the input tensors
-        const tf::Tensor* acts;
-        const tf::Tensor* labels;
-        const tf::Tensor* label_lengths;
-        const tf::Tensor* input_lengths;
-        OP_REQUIRES_OK(ctx, ctx->input("acts", &acts));
-        OP_REQUIRES_OK(ctx, ctx->input("labels", &labels));
-        OP_REQUIRES_OK(ctx, ctx->input("label_lengths", &label_lengths));
-        OP_REQUIRES_OK(ctx, ctx->input("input_lengths", &input_lengths));
-
-        OP_REQUIRES(ctx, acts->shape().dims() == 4,
-                    tf::errors::InvalidArgument("acts is not a 4-Tensor"));
-        OP_REQUIRES(ctx, labels->shape().dims() == 2,
-                    tf::errors::InvalidArgument("labels is not a 2-Tensor"));
-        OP_REQUIRES(ctx, tf::TensorShapeUtils::IsVector(label_lengths->shape()),
-                     tf::errors::InvalidArgument("label_lengths is not a vector"));
-        OP_REQUIRES(ctx, tf::TensorShapeUtils::IsVector(input_lengths->shape()),
-                     tf::errors::InvalidArgument("input_lengths is not a vector"));
-
-        const auto& acts_shape = acts->shape();
-        const auto batch_size = acts_shape.dim_size(0);
-        const auto max_time = acts_shape.dim_size(1);
-        const auto max_u = acts_shape.dim_size(2);
-        const auto num_classes_raw = acts_shape.dim_size(3);
-
-        auto acts_t = acts->tensor<float, 4>();
-        auto labels_t = labels->tensor<int32_t, 2>();
-
-        OP_REQUIRES(
-                ctx, tf::FastBoundsCheck(num_classes_raw, std::numeric_limits<int>::max()),
-                tf::errors::InvalidArgument("num_classes cannot exceed max int"));
-        const auto alphabet_size = static_cast<const int>(num_classes_raw);
-
-        OP_REQUIRES(
-                ctx, batch_size == input_lengths->dim_size(0),
-                tf::errors::InvalidArgument("len(input_lengths) != batch_size.  ",
-                                            "len(input_length):  ", input_lengths->dim_size(0),
-                                            " batch_size: ", batch_size));
-        auto input_lengths_t = input_lengths->vec<int32_t>();
-
-        OP_REQUIRES(
-                ctx, batch_size == label_lengths->dim_size(0),
-                tf::errors::InvalidArgument("len(label_lengths) != batch_size.  ",
-                                            "len(label_length):  ", label_lengths->dim_size(0),
-                                            " batch_size: ", batch_size));
-        auto label_lengths_t = label_lengths->vec<int32_t>();
-
-        // TODO check that labels are in the alphabet?
-        // Refer to line 185, we know that
-        // Tensor input_lengths is in GPU, so cannot compare with CPU variable
-        //for (int b = 0; b < batch_size; b++) {
-        //    OP_REQUIRES(ctx, input_lengths_t(b) <= max_time,
-        //                tf::errors::InvalidArgument("input_lengths(", b, ") <= ", max_time));
-        //}
-
-        tf::Tensor* costs = nullptr;
-        OP_REQUIRES_OK(ctx, ctx->allocate_output("costs", input_lengths->shape(), &costs));
-        auto costs_t = costs->vec<float>();
-
-        tf::Tensor* grads = nullptr;
-        OP_REQUIRES_OK(ctx, ctx->allocate_output("grads", acts->shape(), &grads));
-        set_zero(grads);
-        auto grads_t = grads->tensor<float, 4>();
-
-        auto options = create_options(ctx);
-        options.blank_label = blank_label_;
-        options.maxT = max_time;
-        options.maxU = max_u;
-
-        size_t workspace_size_bytes;
-        bool use_gpu = false;
-        if(options.loc == RNNT_GPU) {
-            use_gpu = true;
-        }
-        auto warp_status = get_workspace_size(max_time,
-                                              max_u,
-                                              batch_size,
-                                              use_gpu,
-                                              &workspace_size_bytes);
-
-        OP_REQUIRES(ctx, warp_status == RNNT_STATUS_SUCCESS,
-                    tf::errors::Internal("warp_rnnt error in get_workspace_size: ",
-                                         rnntGetStatusString(warp_status)));
-
-        auto workspace_shape = tf::TensorShape{static_cast<int64_t>(workspace_size_bytes)};
-        tf::Tensor workspace;
-        OP_REQUIRES_OK(ctx, ctx->allocate_temp(tf::DT_UINT8, workspace_shape, &workspace));
-        auto workspace_t = workspace.flat<uint8_t>();
-
-        // compute RNNT
-        warp_status = compute_rnnt_loss(acts_t.data(),
-                                        grads_t.data(),
-                                        labels_t.data(),
-                                        label_lengths_t.data(),
-                                        input_lengths_t.data(),
-                                        alphabet_size, batch_size,
-                                        costs_t.data(), workspace_t.data(), options);
-
-        OP_REQUIRES(ctx, warp_status == RNNT_STATUS_SUCCESS,
-                    tf::errors::Internal("warp_rnnt error in compute_rnnt_loss: ",
-                                         rnntGetStatusString(warp_status)));
-
-    }
-  private:
-    int blank_label_;
-    virtual void set_zero(tf::Tensor* t) = 0;
-    virtual rnntOptions create_options(tf::OpKernelContext* ctx) = 0;
-};
-
-class WarpRNNTOpCPU : public WarpRNNTOpBase {
-  public:
-    explicit WarpRNNTOpCPU(tf::OpKernelConstruction* ctx) : WarpRNNTOpBase(ctx) {
-    }
-
-  private:
-    void set_zero(tf::Tensor* t) override {
-        t->flat<float>().setZero();
-    }
-
-    rnntOptions create_options(tf::OpKernelContext* ctx) override {
-        auto options = rnntOptions{};
-        options.loc = RNNT_CPU;
-        options.batch_first = true;
-        options.num_threads = ctx->device()->tensorflow_cpu_worker_threads()->num_threads;
-        return options;
-    }
-};
-
-REGISTER_KERNEL_BUILDER(Name("WarpRNNT").Device(::tensorflow::DEVICE_CPU), WarpRNNTOpCPU);
-
-#ifdef WARPRNNT_ENABLE_GPU
-
-class WarpRNNTOpGPU : public WarpRNNTOpBase {
-  public:
-    explicit WarpRNNTOpGPU(tf::OpKernelConstruction* ctx) : WarpRNNTOpBase(ctx) {
-    }
-
-  private:
-    void set_zero(tf::Tensor* t) override {
-        // here is not need
-        // cudaMemset(t->flat<float>().data(), 0, t->NumElements()*sizeof(float));
-    }
-
-    rnntOptions create_options(tf::OpKernelContext* ctx) override {
-        auto cuda_stream = ctx->eigen_device<Eigen::GpuDevice>().stream();
-        auto options = rnntOptions{};
-        options.loc = RNNT_GPU;
-        options.stream = cuda_stream;
-        return options;
-    }
-};
-
-REGISTER_KERNEL_BUILDER(Name("WarpRNNT").Device(::tensorflow::DEVICE_GPU)
-                        .HostMemory("costs"),
-                        WarpRNNTOpGPU);
-#undef EIGEN_USE_GPU
-#endif
-
-}
diff --git a/tensorflow_binding/tests/test_basic.py b/tensorflow_binding/tests/test_basic.py
deleted file mode 100644
index d6d13e2..0000000
--- a/tensorflow_binding/tests/test_basic.py
+++ /dev/null
@@ -1,56 +0,0 @@
-import numpy as np
-import tensorflow as tf
-from warprnnt_tensorflow import rnnt_loss
-
-acts = tf.placeholder(tf.float32, [None, None, None, None])
-labels = tf.placeholder(tf.int32, [None, None])
-input_length = tf.placeholder(tf.int32, [None])
-label_length = tf.placeholder(tf.int32, [None])
-
-B = 2; T = 4; U = 3; V = 3; blank = 0
-
-logits = tf.nn.log_softmax(acts)
-costs = rnnt_loss(logits, labels, input_length, label_length, blank)
-grad = tf.gradients(costs, [acts])
-
-a = np.array([[[[0.06535690384862791, 0.7875301411923206, 0.08159176605666074],
-            [0.5297155426466327, 0.7506749639230854, 0.7541348379087998],
-            [0.6097641124736383, 0.8681404965673826, 0.6225318186056529]],
-
-            [[0.6685222872103057, 0.8580392805336061, 0.16453892311765583],
-            [0.989779515236694, 0.944298460961015, 0.6031678586829663],
-            [0.9467833543605416, 0.666202507295747, 0.28688179752461884]],
-
-            [[0.09418426230195986, 0.3666735970751962, 0.736168049462793],
-            [0.1666804425271342, 0.7141542198635192, 0.3993997272216727],
-            [0.5359823524146038, 0.29182076440286386, 0.6126422611507932]],
-
-            [[0.3242405528768486, 0.8007644367291621, 0.5241057606558068],
-            [0.779194617063042, 0.18331417220174862, 0.113745182072432],
-            [0.24022162381327106, 0.3394695622533106, 0.1341595066017014]]],
-
-        [[[0.5055615569388828, 0.051597282072282646, 0.6402903936686337],
-            [0.43073311517251, 0.8294731834714112, 0.1774668847323424],
-            [0.3207001991262245, 0.04288308912457006, 0.30280282975568984]],
-
-            [[0.6751777088333762, 0.569537369330242, 0.5584738347504452],
-            [0.08313242153985256, 0.06016544344162322, 0.10795752845152584],
-            [0.7486153608562472, 0.943918041459349, 0.4863558118797222]],
-
-            [[0.4181986264486809, 0.6524078485043804, 0.024242983423721887],
-            [0.13458171554507403, 0.3663418070512402, 0.2958297395361563],
-            [0.9236695822497084, 0.6899291482654177, 0.7418981733448822]],
-
-            [[0.25000547599982104, 0.6034295486281007, 0.9872887878887768],
-            [0.5926057265215715, 0.8846724004467684, 0.5434495396894328],
-            [0.6607698886038497, 0.3771277082495921, 0.3580209022231813]]]], dtype=np.float32)
-
-b = np.array([[1, 2], [1, 1]], dtype=np.int32)
-c = np.array([4, 4], dtype=np.int32)
-d = np.array([2, 2], dtype=np.int32)
-
-feed = {acts: a, labels: b, input_length: c, label_length: d}
-with tf.Session() as sess:
-    cost, grads = sess.run([costs, grad], feed_dict=feed)
-    print(cost)
-    print(grads)
\ No newline at end of file
diff --git a/tensorflow_binding/tests/test_warprnnt_op.py b/tensorflow_binding/tests/test_warprnnt_op.py
deleted file mode 100644
index f50d58e..0000000
--- a/tensorflow_binding/tests/test_warprnnt_op.py
+++ /dev/null
@@ -1,97 +0,0 @@
-import tensorflow as tf
-import numpy as np
-from warprnnt_tensorflow import rnnt_loss
-from tensorflow.python.client import device_lib
-
-def is_gpu_available():
-    """Returns whether Tensorflow can access a GPU."""
-    return any(x.device_type == 'GPU' for x in device_lib.list_local_devices())
-
-class WarpRNNTTest(tf.test.TestCase):
-
-    def _run_rnnt(self, acts, labels, input_lengths, label_lengths,
-                    expected_costs, expected_grads, blank, use_gpu=False):
-        self.assertEquals(acts.shape, expected_grads.shape)
-        acts_t = tf.constant(acts)
-        labels_t = tf.constant(labels)
-        input_lengths_t = tf.constant(input_lengths)
-        label_lengths_t = tf.constant(label_lengths)
-
-        logits = acts_t if use_gpu else tf.nn.log_softmax(acts_t)
-        costs = rnnt_loss(logits, labels_t, input_lengths_t, label_lengths_t, blank)
-
-        grads = tf.gradients(costs, [acts_t])[0]
-
-        with self.test_session(use_gpu=use_gpu) as sess:
-            (tf_costs, tf_grad) = sess.run([costs, grads])
-            self.assertAllClose(tf_costs, expected_costs, atol=1e-6)
-            self.assertAllClose(tf_grad, expected_grads, atol=1e-6)
-    
-    def test_forward(self):
-        # Softmax activations for the following inputs:
-        acts = np.array([0.1, 0.6, 0.1, 0.1, 0.1, 0.1, 
-                        0.1, 0.6, 0.1, 0.1, 0.1, 0.1, 
-                        0.2, 0.8, 0.1, 0.1, 0.6, 0.1, 
-                        0.1, 0.1, 0.1, 0.1, 0.2, 0.1, 
-                        0.1, 0.7, 0.1, 0.2, 0.1, 0.1], dtype=np.float32).reshape(1, 2, 3, 5)
-
-        labels = np.array([[1, 2]], dtype=np.int32)
-        input_lengths = np.array([2], dtype=np.int32)
-        label_lengths = np.array([2], dtype=np.int32)
-
-        acts_t = tf.constant(acts)
-        labels_t = tf.constant(labels)
-        input_lengths_t = tf.constant(input_lengths)
-        label_lengths_t = tf.constant(label_lengths)
-        acts_t = tf.nn.log_softmax(acts_t) # NOTE cpu
-        costs = rnnt_loss(acts_t, labels_t, input_lengths_t, label_lengths_t)
-        with self.test_session():
-            print(costs.eval())
-
-    def _test_multiple_batches(self, use_gpu):
-        B = 2; T = 4; U = 3; V = 3
-
-        acts = np.array([0.065357, 0.787530, 0.081592, 0.529716, 0.750675, 0.754135, 
-                        0.609764, 0.868140, 0.622532, 0.668522, 0.858039, 0.164539, 
-                        0.989780, 0.944298, 0.603168, 0.946783, 0.666203, 0.286882, 
-                        0.094184, 0.366674, 0.736168, 0.166680, 0.714154, 0.399400, 
-                        0.535982, 0.291821, 0.612642, 0.324241, 0.800764, 0.524106, 
-                        0.779195, 0.183314, 0.113745, 0.240222, 0.339470, 0.134160, 
-                        0.505562, 0.051597, 0.640290, 0.430733, 0.829473, 0.177467, 
-                        0.320700, 0.042883, 0.302803, 0.675178, 0.569537, 0.558474, 
-                        0.083132, 0.060165, 0.107958, 0.748615, 0.943918, 0.486356, 
-                        0.418199, 0.652408, 0.024243, 0.134582, 0.366342, 0.295830, 
-                        0.923670, 0.689929, 0.741898, 0.250005, 0.603430, 0.987289, 
-                        0.592606, 0.884672, 0.543450, 0.660770, 0.377128, 0.358021], dtype=np.float32).reshape(B, T, U, V);
-
-        expected_costs = np.array([4.28065, 3.93844], dtype=np.float32)
-        expected_grads = np.array([-0.186844, -0.062555, 0.249399, -0.203377, 0.202399, 0.000977,
-                                    -0.141016, 0.079123, 0.061893, -0.011552, -0.081280, 0.092832,
-                                    -0.154257, 0.229433, -0.075176, -0.246593, 0.146405, 0.100188,
-                                    -0.012918, -0.061593, 0.074512, -0.055986, 0.219831, -0.163845,
-                                    -0.497627, 0.209240, 0.288387, 0.013605, -0.030220, 0.016615,
-                                    0.113925, 0.062781, -0.176706, -0.667078, 0.367659, 0.299419,
-                                    -0.356344, -0.055347, 0.411691, -0.096922, 0.029459, 0.067463,
-                                    -0.063518, 0.027654, 0.035863, -0.154499, -0.073942, 0.228441,
-                                    -0.166790, -0.000088, 0.166878, -0.172370, 0.105565, 0.066804,
-                                    0.023875, -0.118256, 0.094381, -0.104707, -0.108934, 0.213642,
-                                    -0.369844, 0.180118, 0.189726, 0.025714, -0.079462, 0.053748,
-                                    0.122328, -0.238789, 0.116460, -0.598687, 0.302203, 0.296484], dtype=np.float32).reshape(B, T, U, V);
-
-        labels = np.array([[1, 2], [1, 1]], dtype=np.int32)
-        input_lengths = np.array([4, 4], dtype=np.int32)
-        label_lengths = np.array([2, 2], dtype=np.int32)
-
-        self._run_rnnt(acts, labels, input_lengths, label_lengths, expected_costs, expected_grads, 0, use_gpu)
-
-    def test_multiple_batches_cpu(self):
-        self._test_multiple_batches(use_gpu=False)
-    
-    def test_multiple_batches_gpu(self):
-        if (is_gpu_available()):
-            self._test_multiple_batches(use_gpu=True)
-        else:
-            print('Skipping GPU test, no gpus available')
-
-if __name__ == '__main__':
-    tf.test.main()
diff --git a/tensorflow_binding/warprnnt_tensorflow/__init__.py b/tensorflow_binding/warprnnt_tensorflow/__init__.py
deleted file mode 100644
index de45611..0000000
--- a/tensorflow_binding/warprnnt_tensorflow/__init__.py
+++ /dev/null
@@ -1,48 +0,0 @@
-import imp
-import tensorflow as tf
-from tensorflow.python.framework import ops
-
-lib_file = imp.find_module('kernels', __path__)[1]
-_warprnnt = tf.load_op_library(lib_file)
-
-
-def rnnt_loss(acts, labels, input_lengths, label_lengths, blank_label=0):
-    '''Computes the RNNT loss between a sequence of activations and a
-    ground truth labeling.
-    Args:
-        acts: A 4-D Tensor of floats.  The dimensions
-                     should be (B, T, U, V), where B is the minibatch index,
-                     T is the time index, U is the prediction network sequence
-                     length, and V indexes over activations for each 
-                     symbol in the alphabet.
-        labels: A 2-D Tensor of ints, a padded label sequences to make sure 
-                     labels for the minibatch are same length.
-        input_lengths: A 1-D Tensor of ints, the number of time steps
-                       for each sequence in the minibatch.
-        label_lengths: A 1-D Tensor of ints, the length of each label
-                       for each example in the minibatch.
-        blank_label: int, the label value/index that the RNNT
-                     calculation should use as the blank label
-    Returns:
-        1-D float Tensor, the cost of each example in the minibatch
-        (as negative log probabilities).
-    * This class performs the softmax operation internally.
-    * The label reserved for the blank symbol should be label 0.
-    '''
-    loss, _ = _warprnnt.warp_rnnt(acts, labels, input_lengths,
-                                  label_lengths, blank_label)
-    return loss
-
-
-@ops.RegisterGradient("WarpRNNT")
-def _RNNTLossGrad(op, grad_loss, _):
-    grad = op.outputs[1]
-    # NOTE since here we are batch first, cannot use _BroadcastMul
-    grad_loss = tf.reshape(grad_loss, (-1, 1, 1, 1))
-    return [grad_loss * grad, None, None, None]
-
-@ops.RegisterShape("WarpRNNT")
-def _RNNTLossShape(op):
-    inputs_shape = op.inputs[0].get_shape().with_rank(4)
-    batch_size = inputs_shape[0]
-    return [batch_size, inputs_shape]
\ No newline at end of file
diff --git a/tests/test_cpu.cpp b/tests/test_cpu.cpp
index 554a18b..fee4eb5 100644
--- a/tests/test_cpu.cpp
+++ b/tests/test_cpu.cpp
@@ -239,7 +239,7 @@ bool inf_test() {
     return status;
 }
 
-float numeric_grad(std::vector<float>& acts, std::vector<int>& flat_labels, std::vector<int>& label_lengths,
+void numeric_grad(std::vector<float>& acts, std::vector<int>& flat_labels, std::vector<int>& label_lengths,
                 std::vector<int> sizes, int alphabet_size, int minibatch, 
                 void* rnnt_cpu_workspace, rnntOptions& options, std::vector<float>& num_grad) {
 
@@ -403,4 +403,4 @@ int main(void) {
         std::cout << "Some or all tests fail" << std::endl;
         return 1;
     }
-}
\ No newline at end of file
+}
diff --git a/tests/test_time.cpp b/tests/test_time.cpp
index 3083b0e..ce77ba6 100644
--- a/tests/test_time.cpp
+++ b/tests/test_time.cpp
@@ -12,7 +12,7 @@
 
 #include "test.h"
 
-bool run_test(int B, int T, int L, int A, int num_threads) {
+void run_test(int B, int T, int L, int A, int num_threads) {
     std::mt19937 gen(2);
 
     auto start = std::chrono::high_resolution_clock::now();
@@ -119,4 +119,4 @@ int main(int argc, char** argv) {
     }
 
     run_test(B, T, L, A, num_threads);
-}
\ No newline at end of file
+}