diff --git a/.gitignore b/.gitignore
index db78efc..0053c11 100644
--- a/.gitignore
+++ b/.gitignore
@@ -7,6 +7,7 @@ __pycache__/
 
 # datasets
 *.zip
+papers/connectionist_temporal_classification/TIMIT
 
 # Distribution / packaging
 .Python
diff --git a/papers/connectionist_temporal_classification/Connectionist Temporal Classification.ipynb b/papers/connectionist_temporal_classification/Connectionist Temporal Classification.ipynb
new file mode 100644
index 0000000..350030a
--- /dev/null
+++ b/papers/connectionist_temporal_classification/Connectionist Temporal Classification.ipynb
@@ -0,0 +1,486 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "This is an implementation of the Connectionist Temporal Classification loss function:\n",
+    "\n",
+    "> Graves, A., Fernández, S., Gomez, F., & Schmidhuber, J. (2006, June). Connectionist temporal classification: labelling unsegmented sequence data with recurrent neural networks. In Proceedings of the 23rd international conference on Machine learning (pp. 369-376). ACM. ftp://ftp.idsia.ch/pub/juergen/icml2006.pdf\n",
+    "\n",
+    "This notebook only shows the learning procedure; no thorough testing is performed, and prefix search decoding is not implemented (contributions are welcome!).\n",
+    "\n",
+    "The original paper seems to use minibatches of size 1 instead of the 16 used here; otherwise there should be no significant differences.\n",
+    "\n",
+    "Please download the [TIMIT dataset](http://academictorrents.com/details/34e2b78745138186976cbc27939b1b34d18bd5b3) and place the `TIMIT.zip` file next to this notebook.\n",
+    "\n",
+    "The following Python packages are required:\n",
+    "- scipy\n",
+    "- lasagne\n",
+    "- matplotlib\n",
+    "- [sphfile](https://pypi.python.org/pypi/sphfile) (to read the sound files)\n",
+    "- [python_speech_features](https://github.com/jameslyons/python_speech_features) (to generate mfcc features)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%load_ext autoreload\n",
+    "%autoreload 2\n",
+    "%matplotlib notebook\n",
+    "\n",
+    "import os\n",
+    "os.environ['THEANO_FLAGS'] = \"device=cuda\"\n",
+    "#os.environ['CUDA_LAUNCH_BLOCKING'] = \"1\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pickle as pkl\n",
+    "import numpy as np\n",
+    "from zipfile import ZipFile\n",
+    "from sphfile import SPHFile\n",
+    "from python_speech_features import mfcc\n",
+    "import theano\n",
+    "import theano.tensor as T\n",
+    "import lasagne\n",
+    "from lasagne.layers import InputLayer, LSTMLayer, DenseLayer, ConcatLayer, GaussianNoiseLayer\n",
+    "from lasagne.init import Uniform\n",
+    "from lasagne.nonlinearities import tanh, sigmoid\n",
+    "import matplotlib\n",
+    "import matplotlib.pyplot as plt\n",
+    "from ctc import ctc_loss, log_softmax, ctc_backward\n",
+    "import time"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## small useful functions"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def smooth(x, w):\n",
+    "    # running mean whose window size grows with w in [0, 1],\n",
+    "    # using truncated windows near the edges\n",
+    "    window = int(np.ceil(len(x) / 2 * (1000 ** w - 1) / 999))\n",
+    "    window += 1 - window % 2\n",
+    "    \n",
+    "    if window < 3 or len(x) < window:\n",
+    "        return x\n",
+    "    \n",
+    "    edge_weights = np.arange(1, window // 2 + 1)\n",
+    "    return np.concatenate([\n",
+    "        np.cumsum(x[:window // 2]) / edge_weights,\n",
+    "        np.convolve(x, np.full([window], 1 / window), 'valid'),\n",
+    "        np.cumsum(x[:-window // 2:-1])[::-1] / edge_weights[::-1]])\n",
+    "\n",
+    "def argmax_decode(preds, exclude=()):\n",
+    "    # best-path decoding: collapse consecutive repeats, then drop the\n",
+    "    # excluded (blank) classes\n",
+    "    preds = np.argmax(preds, axis=1)\n",
+    "    decoded = [preds[0]]\n",
+    "    for v in preds:\n",
+    "        if v != decoded[-1]:\n",
+    "            decoded.append(v)\n",
+    "    \n",
+    "    return np.array([v for v in decoded if v not in exclude])"
+   ]
+  },
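+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "A minimal sanity check of `argmax_decode` on a toy input (illustrative values, not TIMIT data): consecutive repeats collapse, and the excluded blank class is what keeps genuine repetitions apart.\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "toy = np.eye(3)[[0, 0, 1, 1, 2, 1]]  # one-hot frame-wise predictions, class 2 acts as blank\n",
+    "assert (argmax_decode(toy, exclude=[2]) == [0, 1, 1]).all()"
+   ]
+  },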
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Prepare dataset"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "if not os.path.isdir(\"data/lisa/data/timit/raw/TIMIT\"):\n",
+    "    assert os.path.exists(\"TIMIT.zip\"), \"Missing data archive\"\n",
+    "    with ZipFile(\"TIMIT.zip\", 'r') as f:\n",
+    "        f.extractall(path=\".\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "files = []\n",
+    "train_subset = []\n",
+    "\n",
+    "for dirpath, _, filenames in os.walk(\"data/lisa/data/timit/raw/TIMIT\"):\n",
+    "    for f in filenames:\n",
+    "        if f.endswith(\"WAV\"):\n",
+    "            files.append(dirpath + \"/\" + f[:-4])\n",
+    "            # dirpath[31:36] is the TRAIN/TEST component of the path\n",
+    "            train_subset.append(dirpath[31:36] == \"TRAIN\")\n",
+    "\n",
+    "files = np.array(files)\n",
+    "train_subset = np.array(train_subset, dtype=np.bool)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Preprocessing"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "if not os.path.exists(\"preprocessed_dataset.pkl\"):\n",
+    "    features = []\n",
+    "    labels = []\n",
+    "\n",
+    "    for f in files:\n",
+    "        recording = SPHFile(f + \".WAV\")\n",
+    "        signal = recording.content\n",
+    "        samplerate = recording.format['sample_rate']\n",
+    "\n",
+    "        mfccfeats = mfcc(signal, samplerate=samplerate, winlen=0.01, winstep=0.005, \n",
+    "                         numcep=13, nfilt=26, appendEnergy=True)\n",
+    "        # time derivatives by central differences (one-sided at the edges)\n",
+    "        derivatives = np.concatenate([\n",
+    "            mfccfeats[1, None] - mfccfeats[0, None],\n",
+    "            .5 * mfccfeats[2:] - .5 * mfccfeats[0:-2],\n",
+    "            mfccfeats[-1, None] - mfccfeats[-2, None]], axis=0)\n",
+    "\n",
+    "        features.append(np.concatenate([mfccfeats, derivatives], axis=1).astype(np.float32))\n",
+    "\n",
+    "        with open(f + \".PHN\") as phonem_file:\n",
+    "            labels.append([l.split()[2] for l in phonem_file.readlines()])\n",
+    "\n",
+    "    m = np.mean(np.concatenate(features, axis=0))\n",
+    "    s = np.std(np.concatenate(features, axis=0))\n",
+    "\n",
+    "    for i in range(len(features)):\n",
+    "        features[i] = (features[i] - m) / s\n",
+    "\n",
+    "    vocabulary = set()\n",
+    "    for lseq in labels:\n",
+    "        vocabulary |= set(lseq)\n",
+    "\n",
+    "    vocabulary = list(vocabulary)\n",
+    "    # move the silence marker 'h#' to the last index so it can serve as blank\n",
+    "    vocabulary[-1], vocabulary[vocabulary.index('h#')] = \\\n",
+    "        vocabulary[vocabulary.index('h#')], vocabulary[-1]\n",
+    "    blank = len(vocabulary) - 1\n",
+    "\n",
+    "    for i in range(len(labels)):\n",
+    "        labels[i] = np.array([vocabulary.index(l) for l in labels[i]], dtype=np.int32)\n",
+    "    \n",
+    "    with open(\"preprocessed_dataset.pkl\", 'wb') as f:\n",
+    "        pkl.dump((features, labels, vocabulary, blank), f, -1)\n",
+    "\n",
+    "\n",
+    "with open(\"preprocessed_dataset.pkl\", 'rb') as f:\n",
+    "    features, labels, vocabulary, blank = pkl.load(f)"
+   ]
+  },
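+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "The derivative block above is the usual central-difference scheme with one-sided differences at the edges, i.e. what `np.gradient` computes along the time axis. A quick check on random data (illustrative only):\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "x = np.random.randn(10, 13)\n",
+    "d = np.concatenate([\n",
+    "    x[1, None] - x[0, None],\n",
+    "    .5 * x[2:] - .5 * x[0:-2],\n",
+    "    x[-1, None] - x[-2, None]], axis=0)\n",
+    "assert np.allclose(d, np.gradient(x, axis=0))"
+   ]
+  },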
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# let's go brutal and shove that in GPU memory\n",
+    "\n",
+    "n_sequences = len(features)\n",
+    "feat_size = features[0].shape[1]\n",
+    "max_duration = max(len(seq) for seq in features)\n",
+    "max_labels = max(len(seq) - 2 for seq in labels)  # -2 for init and final blank\n",
+    "\n",
+    "durations = np.array([len(seq) for seq in features], dtype=np.int32)\n",
+    "nlabels = np.array([len(seq) - 2 for seq in labels], dtype=np.int32)\n",
+    "all_features = np.zeros((n_sequences, max_duration, feat_size), dtype=np.float32)\n",
+    "for i in range(n_sequences):\n",
+    "    all_features[i, :durations[i]] = features[i]\n",
+    "all_labels = np.zeros((n_sequences, max_labels), dtype=np.int32)\n",
+    "for i in range(n_sequences):\n",
+    "    all_labels[i, :nlabels[i]] = labels[i][1:-1]\n",
+    "\n",
+    "durations_var = T.as_tensor_variable(durations, name=\"durations\")\n",
+    "all_features_var = T.as_tensor_variable(all_features, name=\"all_features\")\n",
+    "nlabels_var = T.as_tensor_variable(nlabels, name=\"nlabels\")\n",
+    "all_labels_var = T.as_tensor_variable(all_labels, name=\"all_labels\")\n",
+    "\n",
+    "minibatch_indexes = T.ivector()\n",
+    "batch_features = all_features_var[minibatch_indexes]\n",
+    "batch_durations = durations_var[minibatch_indexes]\n",
+    "batch_nlabels = nlabels_var[minibatch_indexes]\n",
+    "batch_labels = all_labels_var[minibatch_indexes]"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Model"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "batch_size = 16\n",
+    "\n",
+    "l_in = InputLayer(\n",
+    "    input_var=batch_features,\n",
+    "    shape=(batch_size, max_duration, feat_size))\n",
+    "\n",
+    "l_duration = InputLayer(input_var=batch_durations, shape=(batch_size,))\n",
+    "\n",
+    "# mask[i, t] is True for the first batch_durations[i] time steps\n",
+    "l_mask = lasagne.layers.ExpressionLayer(\n",
+    "    l_duration, \n",
+    "    lambda d: T.arange(max_duration)[None, :] < d[:, None])\n",
+    "\n",
+    "l_noise = GaussianNoiseLayer(l_in, sigma=0.6)\n",
+    "# l_noise = l_in\n",
+    "\n",
+    "l_fwlstm = LSTMLayer(\n",
+    "    l_noise, 100,\n",
+    "    ingate=lasagne.layers.Gate(W_cell=Uniform(0.1), nonlinearity=sigmoid),\n",
+    "    forgetgate=lasagne.layers.Gate(W_cell=Uniform(0.1), nonlinearity=sigmoid),\n",
+    "    cell=lasagne.layers.Gate(W_cell=Uniform(0.1), nonlinearity=tanh),\n",
+    "    outgate=lasagne.layers.Gate(W_cell=Uniform(0.1), nonlinearity=sigmoid),\n",
+    "    nonlinearity=tanh,\n",
+    "    mask_input=l_mask, peepholes=True)\n",
+    "l_bwlstm = LSTMLayer(\n",
+    "    l_noise, 100,\n",
+    "    ingate=lasagne.layers.Gate(W_cell=Uniform(0.1), nonlinearity=sigmoid),\n",
+    "    forgetgate=lasagne.layers.Gate(W_cell=Uniform(0.1), nonlinearity=sigmoid),\n",
+    "    cell=lasagne.layers.Gate(W_cell=Uniform(0.1), nonlinearity=tanh),\n",
+    "    outgate=lasagne.layers.Gate(W_cell=Uniform(0.1), nonlinearity=sigmoid),\n",
+    "    nonlinearity=tanh,\n",
+    "    mask_input=l_mask, peepholes=True, backwards=True)\n",
+    "\n",
+    "l_cat = ConcatLayer([l_fwlstm, l_bwlstm], axis=2)\n",
+    "\n",
+    "l_linout = DenseLayer(\n",
+    "    l_cat, len(vocabulary), \n",
+    "    nonlinearity=None,\n",
+    "    num_leading_axes=2)"
+   ]
+  },
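+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "A quick shape sanity check: Lasagne can infer the network's output shape from the static shapes declared above, so each batch of padded feature sequences should map to per-frame scores over the vocabulary.\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# expected: (batch_size, max_duration, len(vocabulary))\n",
+    "print(lasagne.layers.get_output_shape(l_linout))"
+   ]
+  },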
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Training"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# ctc_loss expects time-major inputs, hence the dimshuffle\n",
+    "train_output = lasagne.layers.get_output(\n",
+    "    l_linout, deterministic=False).dimshuffle(1, 0, 2)\n",
+    "\n",
+    "loss = ctc_loss(\n",
+    "    preds=train_output,\n",
+    "    durations=batch_durations,\n",
+    "    labels=batch_labels,\n",
+    "    label_sizes=batch_nlabels,\n",
+    "    blank=blank)\n",
+    "\n",
+    "params = lasagne.layers.get_all_params(l_linout, trainable=True)\n",
+    "grads = theano.grad(loss.sum(), params)\n",
+    "updates = lasagne.updates.adam(\n",
+    "    grads, params, \n",
+    "    learning_rate=1e-4)\n",
+    "update_fn = theano.function(\n",
+    "    [minibatch_indexes], \n",
+    "    loss,\n",
+    "    updates=updates)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "i = 0\n",
+    "nsteps = int(100 * n_sequences / batch_size)\n",
+    "params_history = []\n",
+    "loss_history = np.zeros((nsteps,))\n",
+    "\n",
+    "def update_plot(fig, ax1, ax2, loss_history):\n",
+    "    ax1.clear()\n",
+    "    ax1.set_xlim(0, len(loss_history))\n",
+    "    ax1.set_yscale('log')\n",
+    "    ax1.set_ylim(0.8 * np.percentile(loss_history, 1), \n",
+    "                 1.2 * np.percentile(loss_history, 99))\n",
+    "    ax1.grid(color='gray', linestyle='-', linewidth=1)\n",
+    "    ax1.grid(color='gray', linestyle=':', which='minor', linewidth=1)\n",
+    "    ax1.set_axisbelow(True)\n",
+    "    xticks = np.arange(len(loss_history))\n",
+    "    ax1.scatter(xticks, loss_history, marker='.', \n",
+    "                color='firebrick', edgecolor=\"none\", alpha=0.1)\n",
+    "    smooth_history = smooth(loss_history, 0.6)\n",
+    "    ax1.plot(xticks, smooth_history, linewidth=2, color='firebrick')\n",
+    "\n",
+    "    ax2.clear()\n",
+    "    ax2.set_yscale('log')\n",
+    "    ax2.set_ylim(0.8 * np.percentile(loss_history, 1), \n",
+    "                 1.2 * np.percentile(loss_history, 99))\n",
+    "    ax2.grid(False)\n",
+    "    ax2.yaxis.set_label_position(\"right\")\n",
+    "    ax2.set_yticks([], minor=True)\n",
+    "    ax2.set_yticks([smooth_history[-1]])\n",
+    "    ax2.get_yaxis().set_major_formatter(matplotlib.ticker.ScalarFormatter())\n",
+    "\n",
+    "    fig.canvas.draw()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "fig = plt.figure()\n",
+    "ax1 = fig.add_subplot(111)\n",
+    "xticks = np.arange(i)\n",
+    "ax1.set_xlim(0, i + 1)\n",
+    "ax1.set_ylim(0, 1)\n",
+    "ax2 = ax1.twinx()\n",
+    "\n",
+    "# Note: you can interrupt and resume the execution of this cell\n",
+    "while i < nsteps:\n",
+    "    t1 = time.time()\n",
+    "    batch_loss = np.mean(update_fn(\n",
+    "        np.random.choice(n_sequences, batch_size).astype(np.int32)))\n",
+    "    t2 = time.time()\n",
+    "    \n",
+    "    print(\"\\r{:<6d} loss = {:>5.0f}, (d={:1.2f})\".format(i, batch_loss, t2 - t1), end='', flush=True)\n",
+    "    loss_history[i] = batch_loss\n",
+    "\n",
+    "    if (i + 1) % 10 == 0: \n",
+    "        update_plot(fig, ax1, ax2, loss_history[:i + 1])\n",
+    "\n",
+    "#     if (i + 1) % 1000 == 0:\n",
+    "#         params_history.append(lasagne.layers.get_all_param_values(l_linout))\n",
+    "\n",
+    "    i += 1"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Evaluate model"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "test_output = lasagne.layers.get_output(l_linout, deterministic=True)\n",
+    "\n",
+    "logits_fn = theano.function(\n",
+    "    [minibatch_indexes],\n",
+    "    [batch_features, batch_durations, \n",
+    "     batch_labels, batch_nlabels, \n",
+    "     test_output])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "sequence = 3\n",
+    "\n",
+    "f, d, l, n, p = logits_fn(np.array([sequence], dtype=np.int32))\n",
+    "f = f[0, :d[0]]\n",
+    "l = l[0, :n[0]]\n",
+    "p = p[0, :d[0]]\n",
+    "# softmax over the classes\n",
+    "s = np.exp(p - np.max(p, axis=-1, keepdims=True)) \\\n",
+    "    / np.sum(np.exp(p - np.max(p, axis=-1, keepdims=True)), axis=-1, keepdims=True)\n",
+    "\n",
+    "fig = plt.figure()\n",
+    "ax = plt.subplot(111)\n",
+    "lines = []\n",
+    "\n",
+    "for c in np.argsort(vocabulary[:-1]):\n",
+    "    if c in l:\n",
+    "        line, = ax.plot(np.arange(len(p)), s[:, c], 
label=vocabulary[c], picker=5)\n", + " lines.append(line)\n", + "\n", + "ax.plot(np.arange(len(p)), s[:, -1], linestyle=\":\")\n", + "\n", + "ax.set_ylim(0.0, 1.2)\n", + "# ax.set_yscale('log')\n", + "ax.set_title('Select curve to see the label')\n", + "\n", + "ax.legend(\n", + " framealpha=1,\n", + " loc='upper center', bbox_to_anchor=(0.5, -0.2), ncol=8)\n", + "\n", + "fig.subplots_adjust(bottom=0.5)\n", + "fig.show()\n", + "\n", + "def onpick(event):\n", + " for line in lines:\n", + " line.set_alpha(0.3)\n", + " line.set_linewidth(2)\n", + " \n", + " event.artist.set_alpha(1)\n", + " event.artist.set_linewidth(2)\n", + " ax.set_title(event.artist.get_label())\n", + "\n", + "cid = fig.canvas.mpl_connect('pick_event', onpick)\n", + "\n", + "print(\"target : {}\".format(\", \".join(vocabulary[l_] for l_ in l)))\n", + "print(\"prediction: {}\".format(\", \".join(vocabulary[l_] for l_ in argmax_decode(s, [blank]))))" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.5" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/papers/connectionist_temporal_classification/ctc.py b/papers/connectionist_temporal_classification/ctc.py new file mode 100644 index 0000000..4535d1d --- /dev/null +++ b/papers/connectionist_temporal_classification/ctc.py @@ -0,0 +1,343 @@ +# Author: Nicolas Granger +# +# Implements the connectionist temporal classification loss from: +# Graves, A., Fernández, S., Gomez, F., & Schmidhuber, J. (2006, June). +# Connectionist temporal classification: labelling unsegmented sequence data +# with recurrent neural networks. In Proceedings of the 23rd international +# conference on Machine learning (pp. 369-376). ACM. 
+# ftp://ftp.idsia.ch/pub/juergen/icml2006.pdf + +import numpy as np +import theano +import theano.tensor as T +from theano.tensor import discrete_dtypes, continuous_dtypes + + +def isneginf(x, neginf=-1e9): + return x < neginf + + +def logaddexp(x, y, magnitude=20): + x, y = T.minimum(x, y), T.maximum(x, y) + diff = T.minimum(y - x, magnitude) + res = x + T.log(1 + T.exp(diff)) + return T.switch((y - x) > magnitude, y, res) + + +def logsumexp(x, axis, keepdims=False): + k = T.max(x, axis=axis, keepdims=True) + res = T.log(T.sum(T.exp(x - k), axis=axis, keepdims=keepdims)) + k + return T.switch(isneginf(k), -2e9, res) + + +def log_softmax(X, axis=-1, clip=None): + k = T.max(X, axis=axis, keepdims=True) + norm_X = X - k + + if clip is not None: + norm_X = T.maximum(norm_X, clip) + + log_sum_exp_X = logsumexp(norm_X, axis=axis, keepdims=True) + return norm_X - log_sum_exp_X + + +# Bits of the CTC algorithm --------------------------------------------------- + +def insert_alternating_blanks(labels, blank_label): + batch_size, label_size = labels.shape + blanked_labels = T.zeros((batch_size, 2 * label_size + 1), dtype=np.int32) + blanked_labels = T.set_subtensor(blanked_labels[:, 0::2], blank_label) + blanked_labels = T.set_subtensor(blanked_labels[:, 1:-1:2], labels) + return blanked_labels + + +def ctc_forward(log_odds, durations, + blanked_labels, label_sizes, not_repeated): + seqsize, batch_sz, _ = log_odds.shape + label_size = blanked_labels.shape[1] + + def step(t, a_tm1, log_odds_, + durations_, blanked_labels_, label_sizes_, not_repeated_): + y_t = log_odds_[t] + a_t = a_tm1 + a_t = T.set_subtensor( + a_t[:, 1:], + logaddexp(a_t[:, 1:], a_tm1[:, :-1])) + a_t = T.set_subtensor( + a_t[:, 2:], + logaddexp(a_t[:, 2:], T.switch(not_repeated_, a_tm1[:, :-2], -2e9))) + + # stop after a_T(|l'|) + mask = T.ge(t, durations_)[:, None] \ + + T.ge(T.arange(label_size)[None, :], + 2 * label_sizes_[:, None] + 1) + + a_t = T.switch( + isneginf(a_t) + mask, -2e9, + a_t + y_t[T.arange(batch_sz)[:, None], blanked_labels_]) + return a_t + + alpha_init = -2e9 * T.ones((batch_sz, label_size)) + alpha_init = T.set_subtensor(alpha_init[:, 0], 0) + + alphas, _ = theano.scan( + fn=step, + n_steps=seqsize, + strict=True, + sequences=[T.arange(seqsize)], + outputs_info=alpha_init, + non_sequences=[log_odds, durations, blanked_labels, label_sizes, + not_repeated], + name="ctc_forward") + + return alphas + + +def ctc_backward(log_odds, durations, blanked_labels, label_sizes, not_repeated): + seqsize, batch_sz, _ = log_odds.shape + label_size = blanked_labels.shape[1] + + def step(t, b_tp1, log_odds_, + durations_, blanked_labels_, label_sizes_, not_repeated_): + y_t = log_odds_[t] + + # increase b_{T+1}(|l'|) from 0 to 1 to initialize recursion + starter_t = T.eq(t, durations_ - 1)[:, None] \ + * T.eq((2 * label_sizes_)[:, None], + T.arange(label_size)[None, :]) + b_tp1_2lp1 = b_tp1[T.arange(batch_sz), 2 * label_sizes_] + b_tp1 = T.set_subtensor( + b_tp1_2lp1, + T.switch(T.eq(t, durations_ - 1), 0, b_tp1_2lp1)) + b_tp1 = T.switch(starter_t, 0, b_tp1) # initialize recursion + + b_t = b_tp1 + b_t = T.set_subtensor( + b_t[:, :-1], + logaddexp(b_t[:, :-1], b_tp1[:, 1:])) + b_t = T.set_subtensor( + b_t[:, :-2], + logaddexp(b_t[:, :-2], T.switch(not_repeated_, b_tp1[:, 2:], -2e9))) + b_t += y_t[T.arange(batch_sz)[:, None], blanked_labels_] + b_t = T.switch(isneginf(b_t), -2e9, b_t) + return b_t + + beta_init = -2e9 * T.ones((batch_sz, label_size)) + + betas, _ = theano.scan( + fn=step, + n_steps=seqsize, + strict=True, + 
sequences=[T.arange(seqsize)], + outputs_info=beta_init, + non_sequences=[log_odds, durations, blanked_labels, label_sizes, + not_repeated], + go_backwards=True, + name="ctc_backward") + betas = betas[::-1, :, :] + + return betas + + +# Theano Op ------------------------------------------------------------------- + +def ctc_propagate(linout, durations, blanked_labels, label_sizes, not_repeated): + _, batch_size, voca_size = linout.shape + + logits = log_softmax(linout) + betas = ctc_backward(logits, durations, + blanked_labels, label_sizes, not_repeated) + loss = - logaddexp(betas[0, :, 0], betas[0, :, 1]) + + # alphas = ctc_forward(logits, durations, + # blanked_labels, label_sizes, not_repeated) + # loss = - logaddexp( + # alphas[durations - 1, T.arange(batch_size), 2 * label_sizes - 1], + # alphas[durations - 1, T.arange(batch_size), 2 * label_sizes]) + + return loss, logits, betas + + +def ctc_backprop(durations, blanked_labels, label_sizes, not_repeated, + logits, betas, loss, output_gradient): + seq_size, batch_size, voca_size = logits.shape + + alphas = ctc_forward(logits, durations, + blanked_labels, label_sizes, not_repeated) + + # log(sum_{s \in lab(l, k)} a_t(s) b_t(s)) + def fwbw_sum_step(k, s, labels_, ab_): + s_view = s[:, T.arange(batch_size), labels_[:, k]] + ab_view = ab_[:, :, k] + next_sum = logaddexp(s_view, ab_view) + s = T.set_subtensor(s_view, next_sum) + return s + + ab = alphas + betas + fwbw_sum = theano.scan( + fn=fwbw_sum_step, + sequences=[T.arange(blanked_labels.shape[1])], + outputs_info=-2e9 * T.ones((seq_size, batch_size, voca_size)), + non_sequences=[blanked_labels, ab], + strict=True, + name="fwbw_sum")[0][-1] # should be unrolled if label_size is known + + A = loss[None, :, None] + logits \ + + logsumexp(fwbw_sum - logits, axis=2, keepdims=True) + B = loss[None, :, None] + fwbw_sum - logits + dloss_dy = T.exp(A) - T.exp(B) + + dloss_dy = T.switch(T.all(isneginf(fwbw_sum), axis=2, keepdims=True), + 0, dloss_dy) + + return dloss_dy * output_gradient[None, :, None] + + +def make_ctc_op(): + preds_var = T.tensor3() + durations_var = T.ivector() + blanked_labels_var = T.imatrix() + bool_matrix = T.TensorType("bool", (False, False)) + not_repeated_var = bool_matrix() + label_sizes_var = T.ivector() + + # linout, durations, labels, label_sizes, blank = inputs + # seq_size, batch_size, voca_size = linout.shape + # + # logits, blanked_labels, not_repeated, betas, loss = \ + # ctc_perform_graph(linout, durations, labels, label_sizes, blank) + + loss, logits, betas = ctc_propagate(preds_var, durations_var, blanked_labels_var, + label_sizes_var, not_repeated_var) + + def backprop_op1(inputs, output_gradients): + del inputs + return [ + output_gradients[0], + theano.gradient.disconnected_type(), + theano.gradient.disconnected_type(), + theano.gradient.disconnected_type(), + theano.gradient.disconnected_type()] + + op1 = theano.OpFromGraph( + inputs=[preds_var, durations_var, + blanked_labels_var, label_sizes_var, + not_repeated_var], + outputs=[preds_var, logits, betas, loss], + grad_overrides=backprop_op1, + inline=True, name="ctcLossOp1") + + def backprop_op2(inputs, output_gradients): + preds_var_, logits_, betas_, loss_, \ + durations_, blanked_labels_, label_sizes_, not_repeated_ = inputs + output_gradient, = output_gradients + + g = ctc_backprop(durations_, blanked_labels_, label_sizes_, not_repeated_, + logits_, betas_, loss_, output_gradient) + + return [ + g, + T.zeros_like(logits_), + # theano.gradient.disconnected_type(), + T.zeros_like(betas_), + # 
theano.gradient.disconnected_type(),
+            T.zeros_like(loss_),
+            theano.gradient.disconnected_type(),
+            theano.gradient.disconnected_type(),
+            theano.gradient.disconnected_type(),
+            theano.gradient.disconnected_type()]
+
+    preds, logits, betas, loss = op1(
+        preds_var, durations_var,
+        blanked_labels_var, label_sizes_var,
+        not_repeated_var)
+
+    op2 = theano.OpFromGraph(
+        inputs=[preds, logits, betas, loss,
+                durations_var, blanked_labels_var, label_sizes_var,
+                not_repeated_var],
+        # the zero-valued sums keep preds, logits and betas connected to the
+        # output so that backprop_op2 receives a gradient slot for every input
+        outputs=[loss + preds.sum() * 0 + logits.sum() * 0 + betas.sum() * 0],
+        grad_overrides=backprop_op2,
+        inline=True, name="ctcLossOp2")
+
+    return op1, op2
+
+
+# -----------------------------------------------------------------------------
+
+def ctc_loss(preds, durations, labels, label_sizes, blank=-1):
+    """Compute the Connectionist Temporal Classification loss [#graves2006]_.
+
+    .. math:: L = - \ln \left( \sum_{\pi \in \mathcal{B}^{-1}(l)} P(\pi | y)
+       \right)
+
+    where :math:`y` is the sequence of predictions, :math:`l` the target
+    label sequence without blanks or repetitions, :math:`\pi` is taken from
+    the ensemble of possible label assignments over the observations and
+    :math:`\mathcal{B}` is a function that removes blanks and repetitions
+    from a sequence of labels.
+
+    Parameters
+    ----------
+    preds : Theano shared variable, expression or numpy array
+        The input values for the softmax function with shape
+        duration x batch_size x nclasses.
+    durations: Theano shared variable, expression or numpy array
+        An _integer_ vector of size batch_size containing the actual length of
+        each sequence in preds.
+    labels: Theano shared variable, expression or numpy array
+        An _integer_ matrix of size batch_size x label_size containing the
+        target labels.
+    label_sizes: Theano shared variable, expression or numpy array
+        An _integer_ vector of size batch_size containing the actual length of
+        each sequence in labels.
+    blank:
+        The blank label class, by default the last index.
+
+    Returns
+    -------
+    Theano tensor
+        A vector expression with the CTC loss of each sequence.
+
+    References
+    ----------
+    .. [#graves2006] Graves, A., Fernández, S., Gomez, F., & Schmidhuber, J.
+       (2006, June). Connectionist temporal classification: labelling
+       unsegmented sequence data with recurrent neural networks. In
+       Proceedings of the 23rd international conference on Machine learning
+       (pp. 369-376). ACM. ftp://ftp.idsia.ch/pub/juergen/icml2006.pdf
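+
+    Examples
+    --------
+    A toy invocation with illustrative shapes and values (the blank class
+    defaults to the last index):
+
+    >>> preds = np.random.randn(5, 2, 4).astype(np.float32)
+    >>> durations = np.array([5, 3], dtype=np.int32)
+    >>> labels = np.array([[1, 2], [1, 1]], dtype=np.int32)
+    >>> label_sizes = np.array([2, 2], dtype=np.int32)
+    >>> ctc_loss(preds, durations, labels, label_sizes).eval().shape
+    (2,)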
+    """
+    preds = T.as_tensor_variable(preds)
+    durations = T.as_tensor_variable(durations)
+    labels = T.as_tensor_variable(labels)
+    label_sizes = T.as_tensor_variable(label_sizes)
+    blank = T.cast(T.as_tensor_variable(blank), 'int32')
+
+    if not (preds.dtype in continuous_dtypes and preds.ndim == 3):
+        raise ValueError("preds must be continuous with 3 dimensions")
+    if not (durations.dtype in discrete_dtypes and durations.ndim == 1):
+        raise ValueError("durations must be an integer vector")
+    if not (labels.dtype in discrete_dtypes and labels.ndim == 2):
+        raise ValueError("labels must be an integer matrix")
+    if not (label_sizes.dtype in discrete_dtypes and label_sizes.ndim == 1):
+        raise ValueError("label_sizes must be an integer vector")
+    if not (blank.dtype in discrete_dtypes and blank.ndim == 0):
+        raise ValueError("blank must be an integer value")
+
+    voca_size = T.cast(preds.shape[2], 'int32')
+    labels = labels % voca_size
+    blank = blank % voca_size
+
+    op1, op2 = make_ctc_op()
+
+    blanked_labels = insert_alternating_blanks(labels, blank)
+    not_repeated = T.neq(blanked_labels[:, 2:], blanked_labels[:, :-2])
+
+    preds, logits, betas, loss = op1(preds, durations,
+                                     blanked_labels, label_sizes,
+                                     not_repeated)
+    loss = op2(preds, logits, betas, loss,
+               durations, blanked_labels, label_sizes, not_repeated)
+
+    return loss
diff --git a/papers/connectionist_temporal_classification/test_ctc.py b/papers/connectionist_temporal_classification/test_ctc.py
new file mode 100644
index 0000000..204d5a5
--- /dev/null
+++ b/papers/connectionist_temporal_classification/test_ctc.py
@@ -0,0 +1,181 @@
+import numpy as np
+import theano
+import theano.tensor as T
+from theano.tests import unittest_tools
+
+from papers.connectionist_temporal_classification.ctc import ctc_loss, isneginf
+
+
+# def test_forward_backward():
+#     batch_size = 6
+#     label_size = 7
+#     voca_size = 5
+#     seq_size = 10
+#
+#     label_lengths = np.random.randint(0, label_size,
+#                                       size=(batch_size,), dtype=np.int32)
+#     label_lengths[0] = label_size  # extremum case
+#     label_lengths[1] = 0  # extremum case
+#     labels = np.array(
+#         [np.random.randint(0, voca_size - 1, size=label_size, dtype=np.int32)
+#          for _ in range(batch_size)])
+#     for i in range(batch_size):
+#         labels[i, label_lengths[i]:] = -1
+#
+#     seq_durations = np.array([
+#         np.random.randint(max(1, label_lengths[i]), seq_size)
+#         for i in range(batch_size)], dtype=np.int32)
+#
+#     linear_out = np.random.randn(seq_size, batch_size, voca_size) \
+#         .astype(np.float32)
+#
+#     blank_class = -1
+#     blank_class = np.mod(blank_class, voca_size)
+#
+#     labels = np.mod(labels, voca_size)
+#
+#     log_odds = log_softmax(linear_out)
+#     blanked_labels = insert_alternating_blanks(T.mod(labels, voca_size),
+#                                                blank_class)
+#     not_repeated = T.neq(blanked_labels[:, 2:], blanked_labels[:, :-2])
+#
+#     alphas = ctc_forward(log_odds, seq_durations,
+#                          blanked_labels, label_lengths, not_repeated)
+#     betas = ctc_backward(log_odds, seq_durations,
+#                          blanked_labels, label_lengths, not_repeated)
+#
+#     preds = log_softmax(linear_out)
+#
+#     y_blanks = preds[:, T.arange(batch_size)[:, None], blanked_labels]
+#     p_l = T.sum(T.exp(alphas + betas - y_blanks), axis=2)
+#
+#     alphas = alphas.eval()
+#     betas = betas.eval()
+#     preds = preds.eval()
+#
+#     for i in range(batch_size):
+#         assert np.allclose(alphas[0, i, 0], preds[0, i, -1])
+#         if label_lengths[i] > 0:
+#             assert np.allclose(alphas[0, i, 1], preds[0, i, labels[i, 0]])
+#         else:
+#             assert isneginf(alphas[0, i, 1])
+#         assert 
np.all(isneginf(alphas[0, i, 2:])) +# +# for i in range(batch_size): +# t = seq_durations[i] - 1 +# l = label_lengths[i] +# assert np.allclose(betas[t, i, 2 * l], preds[t, i, -1]) +# if l > 0: +# assert np.allclose(betas[t, i, 2 * l - 1], +# preds[t, i, labels[i, l - 1]]) +# assert np.all(isneginf(betas[t, i, :max(l - 2, 0)])) +# else: +# assert np.all(isneginf(betas[t, i, 1:])) +# +# p_l = p_l.eval() +# +# for i in range(batch_size): +# assert (np.allclose(p_l[:seq_durations[i], i], p_l[0, i])) +# a, b = max(0, 2 * label_lengths[i] - 1), 2 * label_lengths[i] + 1 +# p_li = np.exp(alphas[seq_durations[i] - 1, i, a:b]).sum() +# assert np.allclose(p_li, p_l[0, i]) +# p_li = np.exp(betas[0, i, :2]).sum() +# assert np.allclose(p_li, p_l[0, i]) + + +def test_simple_precomputed(): + # Test obtained from Torch tutorial at: + # https://github.com/baidu-research/warp-ctc/blob/master/torch_binding/TUTORIAL.md + + linear_out = np.asarray([ + [[0, 0, 0, 0, 0], [1, 2, 3, 4, 5], [-5, -4, -3, -2, -1]], + [[0, 0, 0, 0, 0], [6, 7, 8, 9, 10], [-10, -9, -8, -7, -6]], + [[0, 0, 0, 0, 0], [11, 12, 13, 14, 15], [-15, -14, -13, -12, -11]] + ], dtype=np.float32) + + seq_sizes = np.asarray([1, 3, 3], dtype=np.int32) + + labels = np.asarray([[1, 0], [3, 3], [2, 3]], dtype=np.int32) + + label_sizes = np.asarray([1, 2, 2], dtype=np.int32) + + expected_losses = np.asarray([1.609437943, 7.355742931, 4.938849926], + dtype=np.float32) + + blank = 0 + + expected_grad = np.asarray([ + [[0.2, -0.8, 0.2, 0.2, 0.2], + [ 0.01165623125, 0.03168492019, 0.08612854034, -0.7658783197, 0.636408627], + [-0.02115798369, 0.03168492019, -0.8810571432, 0.2341216654, 0.636408627]], + [[0, 0, 0, 0, 0], + [-0.9883437753, 0.03168492019, 0.08612854034, 0.2341216654, 0.636408627], + [-0.02115798369, 0.03168492019, -0.1891518533, -0.4577836394, 0.636408627]], + [[0, 0, 0, 0, 0], + [0.01165623125, 0.03168492019, 0.08612854034, -0.7658783197, 0.636408627], + [-0.02115798369, 0.03168492019, 0.08612854034, -0.7330639958, 0.636408627]] + ], dtype=np.float32) + + linear_out_var = T.as_tensor_variable(linear_out) + losses = ctc_loss( + linear_out_var, seq_sizes, labels, label_sizes, blank) + + assert np.allclose(losses.eval(), expected_losses, atol=1) + + grad = theano.grad(losses.sum(), wrt=linear_out_var) + + assert np.allclose(grad.eval(), expected_grad, rtol=.001, atol=1) + + +def test_random(): + batch_size = 16 + label_size = 5 + voca_size = 4 + seq_size = 20 + + label_sizes = np.random.randint( + 0, label_size, size=(batch_size,), dtype=np.int32) + label_sizes[0] = label_size + label_sizes[1] = 0 + label_sizes[2] = 5 + label_sizes[3] = 5 + + labels = np.random.randint( + 0, voca_size - 1, + size=(batch_size, label_size), dtype=np.int32) + labels[3] = 0 + + seq_sizes = np.array([ + np.random.randint(max(1, label_sizes[i]), seq_size) + for i in range(batch_size)], dtype=np.int32) + seq_sizes[2] = 4 + + linear_out = np.random.randn( + seq_size, batch_size, voca_size).astype(np.float32) + + # check edge cases + # TODO + + # check the gradient can be computed at all + linear_out_var = T.tensor3() + preds = T.nnet.softmax( + linear_out_var.reshape((-1, voca_size)) + ).reshape((seq_size, batch_size, voca_size)) + + g = theano.grad(ctc_loss(preds, seq_sizes, + labels, label_sizes).sum(), + wrt=linear_out_var).eval( + {linear_out_var: linear_out.astype(np.float32)}) + assert not np.any(np.isnan(g)) + + # check correctness against finite difference approximation + def f(linear_out_): + preds_ = T.nnet.softmax( + linear_out_.reshape((-1, voca_size)) + 
).reshape((seq_size, batch_size, voca_size)) + loss = ctc_loss(preds_, seq_sizes, labels, label_sizes) + # prevent finite differences from failing + loss = T.switch(isneginf(-loss), 0, loss) + return loss + + unittest_tools.verify_grad(f, [linear_out], abs_tol=0.05, rel_tol=0.05)
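+
+
+def test_brute_force_tiny():
+    # A cross-check on a tiny problem against brute-force enumeration of all
+    # alignments (a sketch added here, not from the original test suite; it
+    # assumes ctc_loss takes pre-softmax scores, as documented above, and sums
+    # P(pi|y) over every path that collapses to the target, as in Graves et
+    # al. 2006).
+    from itertools import product
+
+    seq_size, voca_size = 4, 3
+    blank = voca_size - 1
+    target = [0, 1]
+    labels = np.array([target], dtype=np.int32)
+    label_sizes = np.array([len(target)], dtype=np.int32)
+    seq_sizes = np.array([seq_size], dtype=np.int32)
+    linear_out = np.random.RandomState(0) \
+        .randn(seq_size, 1, voca_size).astype(np.float32)
+
+    # frame-wise posteriors, computed the same way as inside ctc_loss
+    e = np.exp(linear_out[:, 0] - linear_out[:, 0].max(axis=1, keepdims=True))
+    probs = e / e.sum(axis=1, keepdims=True)
+
+    def collapse(path):
+        # remove repetitions, then blanks
+        out, prev = [], None
+        for c in path:
+            if c != prev:
+                out.append(c)
+            prev = c
+        return [c for c in out if c != blank]
+
+    total = sum(
+        np.prod([probs[t, c] for t, c in enumerate(path)])
+        for path in product(range(voca_size), repeat=seq_size)
+        if collapse(path) == target)
+
+    loss = ctc_loss(linear_out, seq_sizes, labels, label_sizes, blank).eval()
+    assert np.allclose(loss[0], -np.log(total), atol=1e-3)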