diff --git a/.gitignore b/.gitignore
index db78efc..0053c11 100644
--- a/.gitignore
+++ b/.gitignore
@@ -7,6 +7,7 @@ __pycache__/
 
 # datasets
 *.zip
+papers/connectionist_temporal_classification/TIMIT
 
 # Distribution / packaging
 .Python
diff --git a/papers/connectionist_temporal_classification/Connectionist Temporal Classification.ipynb b/papers/connectionist_temporal_classification/Connectionist Temporal Classification.ipynb
new file mode 100644
index 0000000..350030a
--- /dev/null
+++ b/papers/connectionist_temporal_classification/Connectionist Temporal Classification.ipynb
@@ -0,0 +1,486 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "This is an implementation of the Connectionist Temporal Classification loss function:\n",
+    "\n",
+    "> Graves, A., Fernández, S., Gomez, F., & Schmidhuber, J. (2006, June). Connectionist temporal classification: labelling unsegmented sequence data with recurrent neural networks. In Proceedings of the 23rd international conference on Machine learning (pp. 369-376). ACM. ftp://ftp.idsia.ch/pub/juergen/icml2006.pdf\n",
+    "\n",
+    "This notebook only shows the learning procedure; no thorough testing is performed, and prefix search decoding is not implemented (contributions are welcome!).\n",
+    "\n",
+    "The original paper seems to use minibatches of size 1 instead of the 16 used here; otherwise there should be no significant differences.\n",
+    "\n",
+    "Please download the [TIMIT dataset](http://academictorrents.com/details/34e2b78745138186976cbc27939b1b34d18bd5b3) and place the `TIMIT.zip` file next to this notebook.\n",
+    "\n",
+    "The following Python packages are required:\n",
+    "- scipy\n",
+    "- lasagne\n",
+    "- matplotlib\n",
+    "- [sphfile](https://pypi.python.org/pypi/sphfile) (to read the sound files)\n",
+    "- [python_speech_features](https://github.com/jameslyons/python_speech_features) (to generate mfcc features)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%load_ext autoreload\n",
+    "%autoreload 2\n",
+    "%matplotlib notebook\n",
+    "\n",
+    "import os\n",
+    "os.environ['THEANO_FLAGS'] = \"device=cuda\"\n",
+    "#os.environ['CUDA_LAUNCH_BLOCKING'] = \"1\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pickle as pkl\n",
+    "import numpy as np\n",
+    "from zipfile import ZipFile\n",
+    "from sphfile import SPHFile\n",
+    "from python_speech_features import mfcc\n",
+    "import theano\n",
+    "import theano.tensor as T\n",
+    "import lasagne\n",
+    "from lasagne.layers import InputLayer, LSTMLayer, DenseLayer, ConcatLayer, GaussianNoiseLayer\n",
+    "from lasagne.init import Uniform\n",
+    "from lasagne.nonlinearities import tanh, sigmoid\n",
+    "import matplotlib\n",
+    "import matplotlib.pyplot as plt\n",
+    "from ctc import ctc_loss, log_softmax, ctc_backward\n",
+    "import time"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## small useful functions"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def smooth(x, w):\n",
+    "    # running mean whose window size grows with w in [0, 1],\n",
+    "    # using truncated windows near the edges\n",
+    "    window = int(np.ceil(len(x) / 2 * (1000 ** w - 1) / 999))\n",
+    "    window += 1 - window % 2\n",
+    "    \n",
+    "    if window < 3 or len(x) < window:\n",
+    "        return x\n",
+    "    \n",
+    "    edge_weights = np.arange(1, window // 2 + 1)\n",
+    "    return np.concatenate([\n",
+    "        np.cumsum(x[:window // 2]) / edge_weights,\n",
+    "        np.convolve(x, np.full([window], 1 / window), 'valid'),\n",
+    "        np.cumsum(x[:-window // 2:-1])[::-1] / edge_weights[::-1]])\n",
+    "\n",
+    "def argmax_decode(preds, exclude=()):\n",
+    "    # best-path decoding: collapse consecutive repeats, then drop the\n",
+    "    # excluded (blank) classes\n",
+    "    preds = np.argmax(preds, axis=1)\n",
+    "    decoded = [preds[0]]\n",
+    "    for v in preds:\n",
+    "        if v != decoded[-1]:\n",
+    "            decoded.append(v)\n",
+    "    \n",
+    "    return np.array([v for v in decoded if v not in exclude])"
+   ]
+  },
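+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "A minimal sanity check of `argmax_decode` on a toy input (illustrative values, not TIMIT data): consecutive repeats collapse, and the excluded blank class is what keeps genuine repetitions apart.\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "toy = np.eye(3)[[0, 0, 1, 1, 2, 1]]  # one-hot frame-wise predictions, class 2 acts as blank\n",
+    "assert (argmax_decode(toy, exclude=[2]) == [0, 1, 1]).all()"
+   ]
+  },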
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Prepare dataset"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "if not os.path.isdir(\"data/lisa/data/timit/raw/TIMIT\"):\n",
+    "    assert os.path.exists(\"TIMIT.zip\"), \"Missing data archive\"\n",
+    "    with ZipFile(\"TIMIT.zip\", 'r') as f:\n",
+    "        f.extractall(path=\".\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "files = []\n",
+    "train_subset = []\n",
+    "\n",
+    "for dirpath, _, filenames in os.walk(\"data/lisa/data/timit/raw/TIMIT\"):\n",
+    "    for f in filenames:\n",
+    "        if f.endswith(\"WAV\"):\n",
+    "            files.append(dirpath + \"/\" + f[:-4])\n",
+    "            # dirpath[31:36] is the TRAIN/TEST component of the path\n",
+    "            train_subset.append(dirpath[31:36] == \"TRAIN\")\n",
+    "\n",
+    "files = np.array(files)\n",
+    "train_subset = np.array(train_subset, dtype=np.bool)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Preprocessing"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "if not os.path.exists(\"preprocessed_dataset.pkl\"):\n",
+    "    features = []\n",
+    "    labels = []\n",
+    "\n",
+    "    for f in files:\n",
+    "        recording = SPHFile(f + \".WAV\")\n",
+    "        signal = recording.content\n",
+    "        samplerate = recording.format['sample_rate']\n",
+    "\n",
+    "        mfccfeats = mfcc(signal, samplerate=samplerate, winlen=0.01, winstep=0.005, \n",
+    "                         numcep=13, nfilt=26, appendEnergy=True)\n",
+    "        # time derivatives by central differences (one-sided at the edges)\n",
+    "        derivatives = np.concatenate([\n",
+    "            mfccfeats[1, None] - mfccfeats[0, None],\n",
+    "            .5 * mfccfeats[2:] - .5 * mfccfeats[0:-2],\n",
+    "            mfccfeats[-1, None] - mfccfeats[-2, None]], axis=0)\n",
+    "\n",
+    "        features.append(np.concatenate([mfccfeats, derivatives], axis=1).astype(np.float32))\n",
+    "\n",
+    "        with open(f + \".PHN\") as phonem_file:\n",
+    "            labels.append([l.split()[2] for l in phonem_file.readlines()])\n",
+    "\n",
+    "    m = np.mean(np.concatenate(features, axis=0))\n",
+    "    s = np.std(np.concatenate(features, axis=0))\n",
+    "\n",
+    "    for i in range(len(features)):\n",
+    "        features[i] = (features[i] - m) / s\n",
+    "\n",
+    "    vocabulary = set()\n",
+    "    for lseq in labels:\n",
+    "        vocabulary |= set(lseq)\n",
+    "\n",
+    "    vocabulary = list(vocabulary)\n",
+    "    # move the silence marker 'h#' to the last index so it can serve as blank\n",
+    "    vocabulary[-1], vocabulary[vocabulary.index('h#')] = \\\n",
+    "        vocabulary[vocabulary.index('h#')], vocabulary[-1]\n",
+    "    blank = len(vocabulary) - 1\n",
+    "\n",
+    "    for i in range(len(labels)):\n",
+    "        labels[i] = np.array([vocabulary.index(l) for l in labels[i]], dtype=np.int32)\n",
+    "    \n",
+    "    with open(\"preprocessed_dataset.pkl\", 'wb') as f:\n",
+    "        pkl.dump((features, labels, vocabulary, blank), f, -1)\n",
+    "\n",
+    "\n",
+    "with open(\"preprocessed_dataset.pkl\", 'rb') as f:\n",
+    "    features, labels, vocabulary, blank = pkl.load(f)"
+   ]
+  },
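+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "The derivative block above is the usual central-difference scheme with one-sided differences at the edges, i.e. what `np.gradient` computes along the time axis. A quick check on random data (illustrative only):\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "x = np.random.randn(10, 13)\n",
+    "d = np.concatenate([\n",
+    "    x[1, None] - x[0, None],\n",
+    "    .5 * x[2:] - .5 * x[0:-2],\n",
+    "    x[-1, None] - x[-2, None]], axis=0)\n",
+    "assert np.allclose(d, np.gradient(x, axis=0))"
+   ]
+  },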
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# let's go brutal and shove that in GPU memory\n",
+    "\n",
+    "n_sequences = len(features)\n",
+    "feat_size = features[0].shape[1]\n",
+    "max_duration = max(len(seq) for seq in features)\n",
+    "max_labels = max(len(seq) - 2 for seq in labels)  # -2 for init and final blank\n",
+    "\n",
+    "durations = np.array([len(seq) for seq in features], dtype=np.int32)\n",
+    "nlabels = np.array([len(seq) - 2 for seq in labels], dtype=np.int32)\n",
+    "all_features = np.zeros((n_sequences, max_duration, feat_size), dtype=np.float32)\n",
+    "for i in range(n_sequences):\n",
+    "    all_features[i, :durations[i]] = features[i]\n",
+    "all_labels = np.zeros((n_sequences, max_labels), dtype=np.int32)\n",
+    "for i in range(n_sequences):\n",
+    "    all_labels[i, :nlabels[i]] = labels[i][1:-1]\n",
+    "\n",
+    "durations_var = T.as_tensor_variable(durations, name=\"durations\")\n",
+    "all_features_var = T.as_tensor_variable(all_features, name=\"all_features\")\n",
+    "nlabels_var = T.as_tensor_variable(nlabels, name=\"nlabels\")\n",
+    "all_labels_var = T.as_tensor_variable(all_labels, name=\"all_labels\")\n",
+    "\n",
+    "minibatch_indexes = T.ivector()\n",
+    "batch_features = all_features_var[minibatch_indexes]\n",
+    "batch_durations = durations_var[minibatch_indexes]\n",
+    "batch_nlabels = nlabels_var[minibatch_indexes]\n",
+    "batch_labels = all_labels_var[minibatch_indexes]"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Model"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "batch_size = 16\n",
+    "\n",
+    "l_in = InputLayer(\n",
+    "    input_var=batch_features,\n",
+    "    shape=(batch_size, max_duration, feat_size))\n",
+    "\n",
+    "l_duration = InputLayer(input_var=batch_durations, shape=(batch_size,))\n",
+    "\n",
+    "# mask[i, t] is True for the first batch_durations[i] time steps\n",
+    "l_mask = lasagne.layers.ExpressionLayer(\n",
+    "    l_duration, \n",
+    "    lambda d: T.arange(max_duration)[None, :] < d[:, None])\n",
+    "\n",
+    "l_noise = GaussianNoiseLayer(l_in, sigma=0.6)\n",
+    "# l_noise = l_in\n",
+    "\n",
+    "l_fwlstm = LSTMLayer(\n",
+    "    l_noise, 100,\n",
+    "    ingate=lasagne.layers.Gate(W_cell=Uniform(0.1), nonlinearity=sigmoid),\n",
+    "    forgetgate=lasagne.layers.Gate(W_cell=Uniform(0.1), nonlinearity=sigmoid),\n",
+    "    cell=lasagne.layers.Gate(W_cell=Uniform(0.1), nonlinearity=tanh),\n",
+    "    outgate=lasagne.layers.Gate(W_cell=Uniform(0.1), nonlinearity=sigmoid),\n",
+    "    nonlinearity=tanh,\n",
+    "    mask_input=l_mask, peepholes=True)\n",
+    "l_bwlstm = LSTMLayer(\n",
+    "    l_noise, 100,\n",
+    "    ingate=lasagne.layers.Gate(W_cell=Uniform(0.1), nonlinearity=sigmoid),\n",
+    "    forgetgate=lasagne.layers.Gate(W_cell=Uniform(0.1), nonlinearity=sigmoid),\n",
+    "    cell=lasagne.layers.Gate(W_cell=Uniform(0.1), nonlinearity=tanh),\n",
+    "    outgate=lasagne.layers.Gate(W_cell=Uniform(0.1), nonlinearity=sigmoid),\n",
+    "    nonlinearity=tanh,\n",
+    "    mask_input=l_mask, peepholes=True, backwards=True)\n",
+    "\n",
+    "l_cat = ConcatLayer([l_fwlstm, l_bwlstm], axis=2)\n",
+    "\n",
+    "l_linout = DenseLayer(\n",
+    "    l_cat, len(vocabulary), \n",
+    "    nonlinearity=None,\n",
+    "    num_leading_axes=2)"
+   ]
+  },
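+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "A quick shape sanity check: Lasagne can infer the network's output shape from the static shapes declared above, so each batch of padded feature sequences should map to per-frame scores over the vocabulary.\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# expected: (batch_size, max_duration, len(vocabulary))\n",
+    "print(lasagne.layers.get_output_shape(l_linout))"
+   ]
+  },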
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Training"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# ctc_loss expects time-major inputs, hence the dimshuffle\n",
+    "train_output = lasagne.layers.get_output(\n",
+    "    l_linout, deterministic=False).dimshuffle(1, 0, 2)\n",
+    "\n",
+    "loss = ctc_loss(\n",
+    "    preds=train_output,\n",
+    "    durations=batch_durations,\n",
+    "    labels=batch_labels,\n",
+    "    label_sizes=batch_nlabels,\n",
+    "    blank=blank)\n",
+    "\n",
+    "params = lasagne.layers.get_all_params(l_linout, trainable=True)\n",
+    "grads = theano.grad(loss.sum(), params)\n",
+    "updates = lasagne.updates.adam(\n",
+    "    grads, params, \n",
+    "    learning_rate=1e-4)\n",
+    "update_fn = theano.function(\n",
+    "    [minibatch_indexes], \n",
+    "    loss,\n",
+    "    updates=updates)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "i = 0\n",
+    "nsteps = int(100 * n_sequences / batch_size)\n",
+    "params_history = []\n",
+    "loss_history = np.zeros((nsteps,))\n",
+    "\n",
+    "def update_plot(fig, ax1, ax2, loss_history):\n",
+    "    ax1.clear()\n",
+    "    ax1.set_xlim(0, len(loss_history))\n",
+    "    ax1.set_yscale('log')\n",
+    "    ax1.set_ylim(0.8 * np.percentile(loss_history, 1), \n",
+    "                 1.2 * np.percentile(loss_history, 99))\n",
+    "    ax1.grid(color='gray', linestyle='-', linewidth=1)\n",
+    "    ax1.grid(color='gray', linestyle=':', which='minor', linewidth=1)\n",
+    "    ax1.set_axisbelow(True)\n",
+    "    xticks = np.arange(len(loss_history))\n",
+    "    ax1.scatter(xticks, loss_history, marker='.', \n",
+    "                color='firebrick', edgecolor=\"none\", alpha=0.1)\n",
+    "    smooth_history = smooth(loss_history, 0.6)\n",
+    "    ax1.plot(xticks, smooth_history, linewidth=2, color='firebrick')\n",
+    "\n",
+    "    ax2.clear()\n",
+    "    ax2.set_yscale('log')\n",
+    "    ax2.set_ylim(0.8 * np.percentile(loss_history, 1), \n",
+    "                 1.2 * np.percentile(loss_history, 99))\n",
+    "    ax2.grid(False)\n",
+    "    ax2.yaxis.set_label_position(\"right\")\n",
+    "    ax2.set_yticks([], minor=True)\n",
+    "    ax2.set_yticks([smooth_history[-1]])\n",
+    "    ax2.get_yaxis().set_major_formatter(matplotlib.ticker.ScalarFormatter())\n",
+    "\n",
+    "    fig.canvas.draw()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "fig = plt.figure()\n",
+    "ax1 = fig.add_subplot(111)\n",
+    "xticks = np.arange(i)\n",
+    "ax1.set_xlim(0, i + 1)\n",
+    "ax1.set_ylim(0, 1)\n",
+    "ax2 = ax1.twinx()\n",
+    "\n",
+    "# Note: you can interrupt and resume the execution of this cell\n",
+    "while i < nsteps:\n",
+    "    t1 = time.time()\n",
+    "    batch_loss = np.mean(update_fn(\n",
+    "        np.random.choice(n_sequences, batch_size).astype(np.int32)))\n",
+    "    t2 = time.time()\n",
+    "    \n",
+    "    print(\"\\r{:<6d} loss = {:>5.0f}, (d={:1.2f})\".format(i, batch_loss, t2 - t1), end='', flush=True)\n",
+    "    loss_history[i] = batch_loss\n",
+    "\n",
+    "    if (i + 1) % 10 == 0: \n",
+    "        update_plot(fig, ax1, ax2, loss_history[:i + 1])\n",
+    "\n",
+    "#     if (i + 1) % 1000 == 0:\n",
+    "#         params_history.append(lasagne.layers.get_all_param_values(l_linout))\n",
+    "\n",
+    "    i += 1"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Evaluate model"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "test_output = lasagne.layers.get_output(l_linout, deterministic=True)\n",
+    "\n",
+    "logits_fn = theano.function(\n",
+    "    [minibatch_indexes],\n",
+    "    [batch_features, batch_durations, \n",
+    "     batch_labels, batch_nlabels, \n",
+    "     test_output])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "sequence = 3\n",
+    "\n",
+    "f, d, l, n, p = logits_fn(np.array([sequence], dtype=np.int32))\n",
+    "f = f[0, :d[0]]\n",
+    "l = l[0, :n[0]]\n",
+    "p = p[0, :d[0]]\n",
+    "# softmax over the classes\n",
+    "s = np.exp(p - np.max(p, axis=-1, keepdims=True)) \\\n",
+    "    / np.sum(np.exp(p - np.max(p, axis=-1, keepdims=True)), axis=-1, keepdims=True)\n",
+    "\n",
+    "fig = plt.figure()\n",
+    "ax = plt.subplot(111)\n",
+    "lines = []\n",
+    "\n",
+    "for c in np.argsort(vocabulary[:-1]):\n",
+    "    if c in l:\n",
+    "        line, = ax.plot(np.arange(len(p)), s[:, c], 
label=vocabulary[c], picker=5)\n", + " lines.append(line)\n", + "\n", + "ax.plot(np.arange(len(p)), s[:, -1], linestyle=\":\")\n", + "\n", + "ax.set_ylim(0.0, 1.2)\n", + "# ax.set_yscale('log')\n", + "ax.set_title('Select curve to see the label')\n", + "\n", + "ax.legend(\n", + " framealpha=1,\n", + " loc='upper center', bbox_to_anchor=(0.5, -0.2), ncol=8)\n", + "\n", + "fig.subplots_adjust(bottom=0.5)\n", + "fig.show()\n", + "\n", + "def onpick(event):\n", + " for line in lines:\n", + " line.set_alpha(0.3)\n", + " line.set_linewidth(2)\n", + " \n", + " event.artist.set_alpha(1)\n", + " event.artist.set_linewidth(2)\n", + " ax.set_title(event.artist.get_label())\n", + "\n", + "cid = fig.canvas.mpl_connect('pick_event', onpick)\n", + "\n", + "print(\"target : {}\".format(\", \".join(vocabulary[l_] for l_ in l)))\n", + "print(\"prediction: {}\".format(\", \".join(vocabulary[l_] for l_ in argmax_decode(s, [blank]))))" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.5" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/papers/connectionist_temporal_classification/ctc.py b/papers/connectionist_temporal_classification/ctc.py new file mode 100644 index 0000000..4535d1d --- /dev/null +++ b/papers/connectionist_temporal_classification/ctc.py @@ -0,0 +1,343 @@ +# Author: Nicolas Granger +# +# Implements the connectionist temporal classification loss from: +# Graves, A., Fernández, S., Gomez, F., & Schmidhuber, J. (2006, June). +# Connectionist temporal classification: labelling unsegmented sequence data +# with recurrent neural networks. In Proceedings of the 23rd international +# conference on Machine learning (pp. 369-376). ACM. 
+# ftp://ftp.idsia.ch/pub/juergen/icml2006.pdf + +import numpy as np +import theano +import theano.tensor as T +from theano.tensor import discrete_dtypes, continuous_dtypes + + +def isneginf(x, neginf=-1e9): + return x < neginf + + +def logaddexp(x, y, magnitude=20): + x, y = T.minimum(x, y), T.maximum(x, y) + diff = T.minimum(y - x, magnitude) + res = x + T.log(1 + T.exp(diff)) + return T.switch((y - x) > magnitude, y, res) + + +def logsumexp(x, axis, keepdims=False): + k = T.max(x, axis=axis, keepdims=True) + res = T.log(T.sum(T.exp(x - k), axis=axis, keepdims=keepdims)) + k + return T.switch(isneginf(k), -2e9, res) + + +def log_softmax(X, axis=-1, clip=None): + k = T.max(X, axis=axis, keepdims=True) + norm_X = X - k + + if clip is not None: + norm_X = T.maximum(norm_X, clip) + + log_sum_exp_X = logsumexp(norm_X, axis=axis, keepdims=True) + return norm_X - log_sum_exp_X + + +# Bits of the CTC algorithm --------------------------------------------------- + +def insert_alternating_blanks(labels, blank_label): + batch_size, label_size = labels.shape + blanked_labels = T.zeros((batch_size, 2 * label_size + 1), dtype=np.int32) + blanked_labels = T.set_subtensor(blanked_labels[:, 0::2], blank_label) + blanked_labels = T.set_subtensor(blanked_labels[:, 1:-1:2], labels) + return blanked_labels + + +def ctc_forward(log_odds, durations, + blanked_labels, label_sizes, not_repeated): + seqsize, batch_sz, _ = log_odds.shape + label_size = blanked_labels.shape[1] + + def step(t, a_tm1, log_odds_, + durations_, blanked_labels_, label_sizes_, not_repeated_): + y_t = log_odds_[t] + a_t = a_tm1 + a_t = T.set_subtensor( + a_t[:, 1:], + logaddexp(a_t[:, 1:], a_tm1[:, :-1])) + a_t = T.set_subtensor( + a_t[:, 2:], + logaddexp(a_t[:, 2:], T.switch(not_repeated_, a_tm1[:, :-2], -2e9))) + + # stop after a_T(|l'|) + mask = T.ge(t, durations_)[:, None] \ + + T.ge(T.arange(label_size)[None, :], + 2 * label_sizes_[:, None] + 1) + + a_t = T.switch( + isneginf(a_t) + mask, -2e9, + a_t + y_t[T.arange(batch_sz)[:, None], blanked_labels_]) + return a_t + + alpha_init = -2e9 * T.ones((batch_sz, label_size)) + alpha_init = T.set_subtensor(alpha_init[:, 0], 0) + + alphas, _ = theano.scan( + fn=step, + n_steps=seqsize, + strict=True, + sequences=[T.arange(seqsize)], + outputs_info=alpha_init, + non_sequences=[log_odds, durations, blanked_labels, label_sizes, + not_repeated], + name="ctc_forward") + + return alphas + + +def ctc_backward(log_odds, durations, blanked_labels, label_sizes, not_repeated): + seqsize, batch_sz, _ = log_odds.shape + label_size = blanked_labels.shape[1] + + def step(t, b_tp1, log_odds_, + durations_, blanked_labels_, label_sizes_, not_repeated_): + y_t = log_odds_[t] + + # increase b_{T+1}(|l'|) from 0 to 1 to initialize recursion + starter_t = T.eq(t, durations_ - 1)[:, None] \ + * T.eq((2 * label_sizes_)[:, None], + T.arange(label_size)[None, :]) + b_tp1_2lp1 = b_tp1[T.arange(batch_sz), 2 * label_sizes_] + b_tp1 = T.set_subtensor( + b_tp1_2lp1, + T.switch(T.eq(t, durations_ - 1), 0, b_tp1_2lp1)) + b_tp1 = T.switch(starter_t, 0, b_tp1) # initialize recursion + + b_t = b_tp1 + b_t = T.set_subtensor( + b_t[:, :-1], + logaddexp(b_t[:, :-1], b_tp1[:, 1:])) + b_t = T.set_subtensor( + b_t[:, :-2], + logaddexp(b_t[:, :-2], T.switch(not_repeated_, b_tp1[:, 2:], -2e9))) + b_t += y_t[T.arange(batch_sz)[:, None], blanked_labels_] + b_t = T.switch(isneginf(b_t), -2e9, b_t) + return b_t + + beta_init = -2e9 * T.ones((batch_sz, label_size)) + + betas, _ = theano.scan( + fn=step, + n_steps=seqsize, + strict=True, + 
sequences=[T.arange(seqsize)], + outputs_info=beta_init, + non_sequences=[log_odds, durations, blanked_labels, label_sizes, + not_repeated], + go_backwards=True, + name="ctc_backward") + betas = betas[::-1, :, :] + + return betas + + +# Theano Op ------------------------------------------------------------------- + +def ctc_propagate(linout, durations, blanked_labels, label_sizes, not_repeated): + _, batch_size, voca_size = linout.shape + + logits = log_softmax(linout) + betas = ctc_backward(logits, durations, + blanked_labels, label_sizes, not_repeated) + loss = - logaddexp(betas[0, :, 0], betas[0, :, 1]) + + # alphas = ctc_forward(logits, durations, + # blanked_labels, label_sizes, not_repeated) + # loss = - logaddexp( + # alphas[durations - 1, T.arange(batch_size), 2 * label_sizes - 1], + # alphas[durations - 1, T.arange(batch_size), 2 * label_sizes]) + + return loss, logits, betas + + +def ctc_backprop(durations, blanked_labels, label_sizes, not_repeated, + logits, betas, loss, output_gradient): + seq_size, batch_size, voca_size = logits.shape + + alphas = ctc_forward(logits, durations, + blanked_labels, label_sizes, not_repeated) + + # log(sum_{s \in lab(l, k)} a_t(s) b_t(s)) + def fwbw_sum_step(k, s, labels_, ab_): + s_view = s[:, T.arange(batch_size), labels_[:, k]] + ab_view = ab_[:, :, k] + next_sum = logaddexp(s_view, ab_view) + s = T.set_subtensor(s_view, next_sum) + return s + + ab = alphas + betas + fwbw_sum = theano.scan( + fn=fwbw_sum_step, + sequences=[T.arange(blanked_labels.shape[1])], + outputs_info=-2e9 * T.ones((seq_size, batch_size, voca_size)), + non_sequences=[blanked_labels, ab], + strict=True, + name="fwbw_sum")[0][-1] # should be unrolled if label_size is known + + A = loss[None, :, None] + logits \ + + logsumexp(fwbw_sum - logits, axis=2, keepdims=True) + B = loss[None, :, None] + fwbw_sum - logits + dloss_dy = T.exp(A) - T.exp(B) + + dloss_dy = T.switch(T.all(isneginf(fwbw_sum), axis=2, keepdims=True), + 0, dloss_dy) + + return dloss_dy * output_gradient[None, :, None] + + +def make_ctc_op(): + preds_var = T.tensor3() + durations_var = T.ivector() + blanked_labels_var = T.imatrix() + bool_matrix = T.TensorType("bool", (False, False)) + not_repeated_var = bool_matrix() + label_sizes_var = T.ivector() + + # linout, durations, labels, label_sizes, blank = inputs + # seq_size, batch_size, voca_size = linout.shape + # + # logits, blanked_labels, not_repeated, betas, loss = \ + # ctc_perform_graph(linout, durations, labels, label_sizes, blank) + + loss, logits, betas = ctc_propagate(preds_var, durations_var, blanked_labels_var, + label_sizes_var, not_repeated_var) + + def backprop_op1(inputs, output_gradients): + del inputs + return [ + output_gradients[0], + theano.gradient.disconnected_type(), + theano.gradient.disconnected_type(), + theano.gradient.disconnected_type(), + theano.gradient.disconnected_type()] + + op1 = theano.OpFromGraph( + inputs=[preds_var, durations_var, + blanked_labels_var, label_sizes_var, + not_repeated_var], + outputs=[preds_var, logits, betas, loss], + grad_overrides=backprop_op1, + inline=True, name="ctcLossOp1") + + def backprop_op2(inputs, output_gradients): + preds_var_, logits_, betas_, loss_, \ + durations_, blanked_labels_, label_sizes_, not_repeated_ = inputs + output_gradient, = output_gradients + + g = ctc_backprop(durations_, blanked_labels_, label_sizes_, not_repeated_, + logits_, betas_, loss_, output_gradient) + + return [ + g, + T.zeros_like(logits_), + # theano.gradient.disconnected_type(), + T.zeros_like(betas_), + # 
theano.gradient.disconnected_type(),
+            T.zeros_like(loss_),
+            theano.gradient.disconnected_type(),
+            theano.gradient.disconnected_type(),
+            theano.gradient.disconnected_type(),
+            theano.gradient.disconnected_type()]
+
+    preds, logits, betas, loss = op1(
+        preds_var, durations_var,
+        blanked_labels_var, label_sizes_var,
+        not_repeated_var)
+
+    op2 = theano.OpFromGraph(
+        inputs=[preds, logits, betas, loss,
+                durations_var, blanked_labels_var, label_sizes_var,
+                not_repeated_var],
+        # the zero-valued sums keep preds, logits and betas connected to the
+        # output so that backprop_op2 receives a gradient slot for every input
+        outputs=[loss + preds.sum() * 0 + logits.sum() * 0 + betas.sum() * 0],
+        grad_overrides=backprop_op2,
+        inline=True, name="ctcLossOp2")
+
+    return op1, op2
+
+
+# -----------------------------------------------------------------------------
+
+def ctc_loss(preds, durations, labels, label_sizes, blank=-1):
+    """Compute the Connectionist Temporal Classification loss [#graves2006]_.
+
+    .. math:: L = - \ln \left( \sum_{\pi \in \mathcal{B}^{-1}(l)} P(\pi | y)
+       \right)
+
+    where :math:`y` is the sequence of predictions, :math:`l` the target
+    label sequence without blanks or repetitions, :math:`\pi` is taken from
+    the ensemble of possible label assignments over the observations and
+    :math:`\mathcal{B}` is a function that removes blanks and repetitions
+    from a sequence of labels.
+
+    Parameters
+    ----------
+    preds : Theano shared variable, expression or numpy array
+        The input values for the softmax function with shape
+        duration x batch_size x nclasses.
+    durations: Theano shared variable, expression or numpy array
+        An _integer_ vector of size batch_size containing the actual length of
+        each sequence in preds.
+    labels: Theano shared variable, expression or numpy array
+        An _integer_ matrix of size batch_size x label_size containing the
+        target labels.
+    label_sizes: Theano shared variable, expression or numpy array
+        An _integer_ vector of size batch_size containing the actual length of
+        each sequence in labels.
+    blank:
+        The blank label class, by default the last index.
+
+    Returns
+    -------
+    Theano tensor
+        A vector expression with the CTC loss of each sequence.
+
+    References
+    ----------
+    .. [#graves2006] Graves, A., Fernández, S., Gomez, F., & Schmidhuber, J.
+       (2006, June). Connectionist temporal classification: labelling
+       unsegmented sequence data with recurrent neural networks. In
+       Proceedings of the 23rd international conference on Machine learning
+       (pp. 369-376). ACM. ftp://ftp.idsia.ch/pub/juergen/icml2006.pdf
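+
+    Examples
+    --------
+    A toy invocation with illustrative shapes and values (the blank class
+    defaults to the last index):
+
+    >>> preds = np.random.randn(5, 2, 4).astype(np.float32)
+    >>> durations = np.array([5, 3], dtype=np.int32)
+    >>> labels = np.array([[1, 2], [1, 1]], dtype=np.int32)
+    >>> label_sizes = np.array([2, 2], dtype=np.int32)
+    >>> ctc_loss(preds, durations, labels, label_sizes).eval().shape
+    (2,)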
+    """
+    preds = T.as_tensor_variable(preds)
+    durations = T.as_tensor_variable(durations)
+    labels = T.as_tensor_variable(labels)
+    label_sizes = T.as_tensor_variable(label_sizes)
+    blank = T.cast(T.as_tensor_variable(blank), 'int32')
+
+    if not (preds.dtype in continuous_dtypes and preds.ndim == 3):
+        raise ValueError("preds must be continuous with 3 dimensions")
+    if not (durations.dtype in discrete_dtypes and durations.ndim == 1):
+        raise ValueError("durations must be an integer vector")
+    if not (labels.dtype in discrete_dtypes and labels.ndim == 2):
+        raise ValueError("labels must be an integer matrix")
+    if not (label_sizes.dtype in discrete_dtypes and label_sizes.ndim == 1):
+        raise ValueError("label_sizes must be an integer vector")
+    if not (blank.dtype in discrete_dtypes and blank.ndim == 0):
+        raise ValueError("blank must be an integer value")
+
+    voca_size = T.cast(preds.shape[2], 'int32')
+    labels = labels % voca_size
+    blank = blank % voca_size
+
+    op1, op2 = make_ctc_op()
+
+    blanked_labels = insert_alternating_blanks(labels, blank)
+    not_repeated = T.neq(blanked_labels[:, 2:], blanked_labels[:, :-2])
+
+    preds, logits, betas, loss = op1(preds, durations,
+                                     blanked_labels, label_sizes,
+                                     not_repeated)
+    loss = op2(preds, logits, betas, loss,
+               durations, blanked_labels, label_sizes, not_repeated)
+
+    return loss
diff --git a/papers/connectionist_temporal_classification/test_ctc.py b/papers/connectionist_temporal_classification/test_ctc.py
new file mode 100644
index 0000000..204d5a5
--- /dev/null
+++ b/papers/connectionist_temporal_classification/test_ctc.py
@@ -0,0 +1,181 @@
+import numpy as np
+import theano
+import theano.tensor as T
+from theano.tests import unittest_tools
+
+from papers.connectionist_temporal_classification.ctc import ctc_loss, isneginf
+
+
+# def test_forward_backward():
+#     batch_size = 6
+#     label_size = 7
+#     voca_size = 5
+#     seq_size = 10
+#
+#     label_lengths = np.random.randint(0, label_size,
+#                                       size=(batch_size,), dtype=np.int32)
+#     label_lengths[0] = label_size  # extremum case
+#     label_lengths[1] = 0  # extremum case
+#     labels = np.array(
+#         [np.random.randint(0, voca_size - 1, size=label_size, dtype=np.int32)
+#          for _ in range(batch_size)])
+#     for i in range(batch_size):
+#         labels[i, label_lengths[i]:] = -1
+#
+#     seq_durations = np.array([
+#         np.random.randint(max(1, label_lengths[i]), seq_size)
+#         for i in range(batch_size)], dtype=np.int32)
+#
+#     linear_out = np.random.randn(seq_size, batch_size, voca_size) \
+#         .astype(np.float32)
+#
+#     blank_class = -1
+#     blank_class = np.mod(blank_class, voca_size)
+#
+#     labels = np.mod(labels, voca_size)
+#
+#     log_odds = log_softmax(linear_out)
+#     blanked_labels = insert_alternating_blanks(T.mod(labels, voca_size),
+#                                                blank_class)
+#     not_repeated = T.neq(blanked_labels[:, 2:], blanked_labels[:, :-2])
+#
+#     alphas = ctc_forward(log_odds, seq_durations,
+#                          blanked_labels, label_lengths, not_repeated)
+#     betas = ctc_backward(log_odds, seq_durations,
+#                          blanked_labels, label_lengths, not_repeated)
+#
+#     preds = log_softmax(linear_out)
+#
+#     y_blanks = preds[:, T.arange(batch_size)[:, None], blanked_labels]
+#     p_l = T.sum(T.exp(alphas + betas - y_blanks), axis=2)
+#
+#     alphas = alphas.eval()
+#     betas = betas.eval()
+#     preds = preds.eval()
+#
+#     for i in range(batch_size):
+#         assert np.allclose(alphas[0, i, 0], preds[0, i, -1])
+#         if label_lengths[i] > 0:
+#             assert np.allclose(alphas[0, i, 1], preds[0, i, labels[i, 0]])
+#         else:
+#             assert isneginf(alphas[0, i, 1])
+#         assert 
np.all(isneginf(alphas[0, i, 2:])) +# +# for i in range(batch_size): +# t = seq_durations[i] - 1 +# l = label_lengths[i] +# assert np.allclose(betas[t, i, 2 * l], preds[t, i, -1]) +# if l > 0: +# assert np.allclose(betas[t, i, 2 * l - 1], +# preds[t, i, labels[i, l - 1]]) +# assert np.all(isneginf(betas[t, i, :max(l - 2, 0)])) +# else: +# assert np.all(isneginf(betas[t, i, 1:])) +# +# p_l = p_l.eval() +# +# for i in range(batch_size): +# assert (np.allclose(p_l[:seq_durations[i], i], p_l[0, i])) +# a, b = max(0, 2 * label_lengths[i] - 1), 2 * label_lengths[i] + 1 +# p_li = np.exp(alphas[seq_durations[i] - 1, i, a:b]).sum() +# assert np.allclose(p_li, p_l[0, i]) +# p_li = np.exp(betas[0, i, :2]).sum() +# assert np.allclose(p_li, p_l[0, i]) + + +def test_simple_precomputed(): + # Test obtained from Torch tutorial at: + # https://github.com/baidu-research/warp-ctc/blob/master/torch_binding/TUTORIAL.md + + linear_out = np.asarray([ + [[0, 0, 0, 0, 0], [1, 2, 3, 4, 5], [-5, -4, -3, -2, -1]], + [[0, 0, 0, 0, 0], [6, 7, 8, 9, 10], [-10, -9, -8, -7, -6]], + [[0, 0, 0, 0, 0], [11, 12, 13, 14, 15], [-15, -14, -13, -12, -11]] + ], dtype=np.float32) + + seq_sizes = np.asarray([1, 3, 3], dtype=np.int32) + + labels = np.asarray([[1, 0], [3, 3], [2, 3]], dtype=np.int32) + + label_sizes = np.asarray([1, 2, 2], dtype=np.int32) + + expected_losses = np.asarray([1.609437943, 7.355742931, 4.938849926], + dtype=np.float32) + + blank = 0 + + expected_grad = np.asarray([ + [[0.2, -0.8, 0.2, 0.2, 0.2], + [ 0.01165623125, 0.03168492019, 0.08612854034, -0.7658783197, 0.636408627], + [-0.02115798369, 0.03168492019, -0.8810571432, 0.2341216654, 0.636408627]], + [[0, 0, 0, 0, 0], + [-0.9883437753, 0.03168492019, 0.08612854034, 0.2341216654, 0.636408627], + [-0.02115798369, 0.03168492019, -0.1891518533, -0.4577836394, 0.636408627]], + [[0, 0, 0, 0, 0], + [0.01165623125, 0.03168492019, 0.08612854034, -0.7658783197, 0.636408627], + [-0.02115798369, 0.03168492019, 0.08612854034, -0.7330639958, 0.636408627]] + ], dtype=np.float32) + + linear_out_var = T.as_tensor_variable(linear_out) + losses = ctc_loss( + linear_out_var, seq_sizes, labels, label_sizes, blank) + + assert np.allclose(losses.eval(), expected_losses, atol=1) + + grad = theano.grad(losses.sum(), wrt=linear_out_var) + + assert np.allclose(grad.eval(), expected_grad, rtol=.001, atol=1) + + +def test_random(): + batch_size = 16 + label_size = 5 + voca_size = 4 + seq_size = 20 + + label_sizes = np.random.randint( + 0, label_size, size=(batch_size,), dtype=np.int32) + label_sizes[0] = label_size + label_sizes[1] = 0 + label_sizes[2] = 5 + label_sizes[3] = 5 + + labels = np.random.randint( + 0, voca_size - 1, + size=(batch_size, label_size), dtype=np.int32) + labels[3] = 0 + + seq_sizes = np.array([ + np.random.randint(max(1, label_sizes[i]), seq_size) + for i in range(batch_size)], dtype=np.int32) + seq_sizes[2] = 4 + + linear_out = np.random.randn( + seq_size, batch_size, voca_size).astype(np.float32) + + # check edge cases + # TODO + + # check the gradient can be computed at all + linear_out_var = T.tensor3() + preds = T.nnet.softmax( + linear_out_var.reshape((-1, voca_size)) + ).reshape((seq_size, batch_size, voca_size)) + + g = theano.grad(ctc_loss(preds, seq_sizes, + labels, label_sizes).sum(), + wrt=linear_out_var).eval( + {linear_out_var: linear_out.astype(np.float32)}) + assert not np.any(np.isnan(g)) + + # check correctness against finite difference approximation + def f(linear_out_): + preds_ = T.nnet.softmax( + linear_out_.reshape((-1, voca_size)) + 
).reshape((seq_size, batch_size, voca_size)) + loss = ctc_loss(preds_, seq_sizes, labels, label_sizes) + # prevent finite differences from failing + loss = T.switch(isneginf(-loss), 0, loss) + return loss + + unittest_tools.verify_grad(f, [linear_out], abs_tol=0.05, rel_tol=0.05)
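+
+
+def test_brute_force_tiny():
+    # A cross-check on a tiny problem against brute-force enumeration of all
+    # alignments (a sketch added here, not from the original test suite; it
+    # assumes ctc_loss takes pre-softmax scores, as documented above, and sums
+    # P(pi|y) over every path that collapses to the target, as in Graves et
+    # al. 2006).
+    from itertools import product
+
+    seq_size, voca_size = 4, 3
+    blank = voca_size - 1
+    target = [0, 1]
+    labels = np.array([target], dtype=np.int32)
+    label_sizes = np.array([len(target)], dtype=np.int32)
+    seq_sizes = np.array([seq_size], dtype=np.int32)
+    linear_out = np.random.RandomState(0) \
+        .randn(seq_size, 1, voca_size).astype(np.float32)
+
+    # frame-wise posteriors, computed the same way as inside ctc_loss
+    e = np.exp(linear_out[:, 0] - linear_out[:, 0].max(axis=1, keepdims=True))
+    probs = e / e.sum(axis=1, keepdims=True)
+
+    def collapse(path):
+        # remove repetitions, then blanks
+        out, prev = [], None
+        for c in path:
+            if c != prev:
+                out.append(c)
+            prev = c
+        return [c for c in out if c != blank]
+
+    total = sum(
+        np.prod([probs[t, c] for t, c in enumerate(path)])
+        for path in product(range(voca_size), repeat=seq_size)
+        if collapse(path) == target)
+
+    loss = ctc_loss(linear_out, seq_sizes, labels, label_sizes, blank).eval()
+    assert np.allclose(loss[0], -np.log(total), atol=1e-3)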