From 296bfd1e88a91122e7628272d51ce0c75ac87fbf Mon Sep 17 00:00:00 2001 From: Christopher Beckham Date: Thu, 19 May 2016 20:43:14 -0400 Subject: [PATCH 1/6] add stochastic depth example --- examples/stochastic_depth_layers.ipynb | 412 +++++++++++++++++++++++++ 1 file changed, 412 insertions(+) create mode 100644 examples/stochastic_depth_layers.ipynb diff --git a/examples/stochastic_depth_layers.ipynb b/examples/stochastic_depth_layers.ipynb new file mode 100644 index 0000000..c667d59 --- /dev/null +++ b/examples/stochastic_depth_layers.ipynb @@ -0,0 +1,412 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 29, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "import theano\n", + "from theano import tensor as T\n", + "import lasagne\n", + "from lasagne.layers import *\n", + "from lasagne.nonlinearities import *\n", + "from lasagne.objectives import *\n", + "from lasagne.regularization import *\n", + "from lasagne.random import get_rng\n", + "from lasagne.updates import *\n", + "from lasagne.init import *\n", + "from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams\n", + "from urllib import urlretrieve\n", + "import cPickle as pickle\n", + "import gzip" + ] + }, + { + "cell_type": "code", + "execution_count": 59, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "\"\"\"\n", + " Binomial dropout layer\n", + "\n", + " Samples a binomial mask on the first axis (i.e. batch size)\n", + " and multiplies it with the input. This has the effect of\n", + " zeroing the output for some examples in the batch (according\n", + " to the survival probability p)\n", + " \n", + " Parameters\n", + " ----------\n", + " \n", + " incoming : a :class:`Layer` instance\n", + " p : float\n", + " The survival probability for an example in the batch\n", + "\n", + "\"\"\"\n", + "class BinomialDropLayer(Layer):\n", + " def __init__(self, incoming, p=0.5, **kwargs):\n", + " super(BinomialDropLayer, self).__init__(incoming, **kwargs)\n", + " self._srng = RandomStreams(get_rng().randint(1, 2147462579))\n", + " self.p = p\n", + "\n", + " def get_output_for(self, input, deterministic=False, **kwargs):\n", + " if deterministic:\n", + " return self.p*input\n", + " else:\n", + " mask = self._srng.binomial(n=1, p=(self.p), size=(input.shape[0],),\n", + " dtype=input.dtype)\n", + " mask = mask.dimshuffle(0,'x','x','x')\n", + " return mask*input" + ] + }, + { + "cell_type": "code", + "execution_count": 60, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "\"\"\"\n", + " http://arxiv.org/abs/1603.09382\n", + " \n", + " \"...we replace the identity connections in these blocks\n", + " by an averaging pooling layer followed by zero paddings\n", + " to match the dimensions.\"\n", + " \n", + " To explain this method, let us consider two consecutive\n", + " convolution layers, `conv1` and `conv2`. Let us assume\n", + " they have different output shapes. To create the\n", + " identity connection between `conv1` and `conv2`, do 2x2\n", + " average pooling on `conv1`. Then, pad the result so that\n", + " it has the same dimensions as `conv2`. Afterwards, we have\n", + " to see if the final result has the same number of feature\n", + " maps as `conv2`; if not, we have to add all-zero feature\n", + " maps to either side of the result. 
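> A quick sanity check of the semantics above, outside the patch (numpy only, array shapes illustrative): `BinomialDropLayer` draws one Bernoulli value per example and broadcasts it over the remaining axes, so the `deterministic` branch's `p*input` is exactly the expectation of the train-time output.

```python
import numpy as np

rng = np.random.RandomState(0)
p = 0.5
x = np.abs(rng.randn(2000, 8, 8, 8)).astype("float32")
# one Bernoulli draw per example, broadcast over channels/rows/cols,
# mirroring mask.dimshuffle(0, 'x', 'x', 'x') in the layer above
mask = rng.binomial(n=1, p=p, size=(x.shape[0], 1, 1, 1)).astype("float32")
print (mask * x).mean(), (p * x).mean()  # nearly equal: E[mask*x] = p*E[x]
```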
Then, we construct a \n", + " binomial drop layer so that we can compute the final \n", + " equation:\n", + " \n", + " binomial_mask*conv2 + id(conv1)\n", + " \n", + " If we pass this through a nonlinearity layer, we can then\n", + " do: g( binomial_mask*conv2 + id(conv1) )\n", + " \n", + " Parameters\n", + " ----------\n", + " \n", + " incoming : a :class:`Layer` instance\n", + " p : float\n", + " The survival probability for the binomial mask\n", + "\n", + "\"\"\"\n", + "def stochastic_depth_block(incoming, p, nonlinearity=linear):\n", + " layer_before_incoming = None\n", + " for prev_layer in get_all_layers(incoming)[::-1][1::]:\n", + " if \"ignore\" not in prev_layer.name and not isinstance(prev_layer, NonlinearityLayer):\n", + " layer_before_incoming = prev_layer\n", + " break\n", + " if layer_before_incoming == None:\n", + " raise Exception(\"Cannot find an appropriate layer before layer: %s\" % incoming.name)\n", + " \n", + " if layer_before_incoming.output_shape != incoming.output_shape: \n", + " l_pool = Pool2DLayer(layer_before_incoming, pool_size=(2,2), mode=\"average_inc_pad\", name=\"ignore_pool\")\n", + " if (l_pool.output_shape[2] % 2 == 1 and incoming.output_shape[2] % 2 == 0) or \\\n", + " (l_pool.output_shape[2] % 2 == 0 and incoming.output_shape[2] % 2 == 1):\n", + " l_pad = pad( l_pool, width=((0,1),(0,1)), name=\"ignore_prelim_pad\" )\n", + " else:\n", + " l_pad = l_pool\n", + " nd1 = (incoming.output_shape[2]-l_pad.output_shape[2])/2\n", + " if nd1 > 0:\n", + " l_pad = pad(l_pad, width=(nd1,nd1), name=\"ignore_pad\")\n", + " # what if the layer_before_incoming num feature maps is\n", + " # less than the incoming_layer num feature maps?\n", + " if layer_before_incoming.output_shape[1] < incoming.output_shape[1]:\n", + " diff_in_fms = incoming.output_shape[1]-layer_before_incoming.output_shape[1]\n", + " if diff_in_fms % 2 == 0: \n", + " width_tp = ((diff_in_fms/2, diff_in_fms/2),)\n", + " else:\n", + " width_tp = (((diff_in_fms/2)+1, diff_in_fms/2),)\n", + " l_pad = pad(\n", + " l_pad, \n", + " batch_ndim=1, \n", + " width=width_tp,\n", + " name=\"ignore_fm_pad\"\n", + " )\n", + " l_binom_drop = BinomialDropLayer(incoming, p=p, name=\"ignore_binom\")\n", + " l_sum = ElemwiseSumLayer([l_binom_drop, l_pad], name=\"ignore_elemsum\")\n", + " l_nonlinearity = NonlinearityLayer(l_sum, nonlinearity=nonlinearity, name=\"ignore_nonlinearity\")\n", + " return l_nonlinearity\n", + " else:\n", + " l_binom_drop = BinomialDropLayer(incoming, p=p, name=\"ignore_binom\")\n", + " l_sum = ElemwiseSumLayer([l_binom_drop, layer_before_incoming], name=\"ignore_elemsum\")\n", + " l_nonlinearity = NonlinearityLayer(l_sum, nonlinearity=nonlinearity, name=\"ignore_nonlinearity\")\n", + " return l_nonlinearity" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let us create a simple convolution network" + ] + }, + { + "cell_type": "code", + "execution_count": 61, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "input (None, 1, 28, 28)\n", + "l_conv1 (None, 8, 26, 26)\n", + "ignore_binom (None, 8, 26, 26)\n", + "ignore_pool (None, 1, 14, 14)\n", + "ignore_pad (None, 1, 26, 26)\n", + "ignore_fm_pad (None, 8, 26, 26)\n", + "ignore_elemsum (None, 8, 26, 26)\n", + "ignore_nonlinearity (None, 8, 26, 26)\n", + "l_mp1 (None, 8, 13, 13)\n", + "ignore_binom (None, 8, 13, 13)\n", + "ignore_pool (None, 8, 13, 13)\n", + "ignore_elemsum (None, 8, 13, 13)\n", + "ignore_nonlinearity (None, 8, 13, 13)\n", + 
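> The feature-map padding arithmetic in `stochastic_depth_block` above is the fiddliest part; here is the same rule as a standalone helper (helper name is ours, not the patch's), checked against the shapes printed below:

```python
def fm_pad_widths(n_in, n_out):
    # zero feature maps added before/after the channel axis so that
    # left + n_in + right == n_out (the width_tp logic above)
    diff = n_out - n_in
    if diff % 2 == 0:
        return (diff // 2, diff // 2)
    return (diff // 2 + 1, diff // 2)

print fm_pad_widths(1, 8)   # (4, 3): the ignore_fm_pad step after l_conv1
print fm_pad_widths(8, 16)  # (4, 4): the ignore_fm_pad step after l_conv3
```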
"l_conv2 (None, 8, 11, 11)\n", + "ignore_binom (None, 8, 11, 11)\n", + "ignore_pool (None, 8, 6, 6)\n", + "ignore_prelim_pad (None, 8, 7, 7)\n", + "ignore_pad (None, 8, 11, 11)\n", + "ignore_elemsum (None, 8, 11, 11)\n", + "ignore_nonlinearity (None, 8, 11, 11)\n", + "l_mp2 (None, 8, 5, 5)\n", + "ignore_binom (None, 8, 5, 5)\n", + "ignore_pool (None, 8, 5, 5)\n", + "ignore_elemsum (None, 8, 5, 5)\n", + "ignore_nonlinearity (None, 8, 5, 5)\n", + "l_conv3 (None, 16, 3, 3)\n", + "ignore_binom (None, 16, 3, 3)\n", + "ignore_pool (None, 8, 2, 2)\n", + "ignore_prelim_pad (None, 8, 3, 3)\n", + "ignore_fm_pad (None, 16, 3, 3)\n", + "ignore_elemsum (None, 16, 3, 3)\n", + "ignore_nonlinearity (None, 16, 3, 3)\n", + "l_fc (None, 10)\n", + "num of params: 3282\n" + ] + } + ], + "source": [ + "l_in = InputLayer( (None, 1, 28, 28), name=\"input\" )\n", + "\n", + "l_conv1 = Conv2DLayer(l_in, num_filters=8, filter_size=3, name=\"l_conv1\", nonlinearity=None)\n", + "l_sd1 = stochastic_depth_block(l_conv1, p=0.5, nonlinearity=rectify)\n", + "\n", + "l_mp1 = MaxPool2DLayer(l_sd1, pool_size=(2,2), name=\"l_mp1\")\n", + "l_sd2 = stochastic_depth_block(l_mp1, p=0.5)\n", + "\n", + "l_conv2 = Conv2DLayer(l_sd2, num_filters=8, filter_size=3, name=\"l_conv2\", nonlinearity=None)\n", + "l_sd3 = stochastic_depth_block(l_conv2, p=0.5, nonlinearity=rectify)\n", + "\n", + "l_mp2 = MaxPool2DLayer(l_sd3, pool_size=(2,2), name=\"l_mp2\")\n", + "l_sd4 = stochastic_depth_block(l_mp2, p=0.5)\n", + "\n", + "l_conv3 = Conv2DLayer(l_sd4, num_filters=16, filter_size=3, name=\"l_conv3\", nonlinearity=None)\n", + "l_sd5 = stochastic_depth_block(l_conv3, p=0.5, nonlinearity=rectify)\n", + "\n", + "l_fc = DenseLayer(l_sd5, num_units=10, nonlinearity=softmax, name=\"l_fc\")\n", + "\n", + "l_out = l_fc\n", + "\n", + "for layer in get_all_layers(l_out):\n", + " print layer.name, layer.output_shape\n", + "print \"num of params: %i\" % count_params(l_out)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Ok, let's download the MNIST dataset" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "url_ret = urlretrieve(\"http://www.iro.umontreal.ca/~lisa/deep/data/mnist/mnist.pkl.gz\", \"/tmp/mnist.pkl.gz\")" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "with gzip.open(\"/tmp/mnist.pkl.gz\") as f:\n", + " dat = pickle.load(f)\n", + "train_data, _, _ = dat\n", + "X_train, y_train = train_data\n", + "X_train = X_train.reshape( (X_train.shape[0], 1, 28, 28) ).astype( theano.config.floatX )\n", + "y_train = y_train.astype(\"int32\")" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "((50000, 1, 28, 28), (50000,))" + ] + }, + "execution_count": 33, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "X_train.shape, y_train.shape" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Define the Lasagne-related stuff we'll need for training the network" + ] + }, + { + "cell_type": "code", + "execution_count": 62, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "X = T.tensor4('X')\n", + "y = T.ivector('y')\n", + "\n", + "net_out = get_output(l_out, X)\n", + "net_out_det = get_output(l_out, X, deterministic=True)\n", + "loss = categorical_crossentropy(net_out, y).mean()\n", + 
"params = get_all_params(l_out, trainable=True)\n", + "grads = T.grad(loss, params)\n", + "updates = nesterov_momentum(grads, params, learning_rate=0.01, momentum=0.9)\n", + "train_fn = theano.function(inputs=[X, y], outputs=loss, updates=updates)\n", + "out_fn = theano.function(inputs=[X], outputs=net_out_det)" + ] + }, + { + "cell_type": "code", + "execution_count": 63, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "1 1.04170554822\n", + "2 0.838029498493\n", + "3 0.794951380409\n", + "4 0.759657711097\n", + "5 0.733128651309\n", + "6 0.715255246341\n", + "7 0.718663093263\n", + "8 0.710821818392\n", + "9 0.721151590125\n", + "10 0.707448014221\n" + ] + } + ], + "source": [ + "bs = 32 \n", + "n_batches = X_train.shape[0] // bs\n", + "num_epochs = 10\n", + "for epoch in range(0, num_epochs):\n", + " train_losses = []\n", + " for b in range(0, n_batches):\n", + " train_losses.append( train_fn(X_train[b*bs : (b+1)*bs], y_train[b*bs : (b+1)*bs]) )\n", + " print (epoch+1), np.mean(train_losses)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "What accuracy do we get on the training set? (Between 0 and 1)" + ] + }, + { + "cell_type": "code", + "execution_count": 64, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "0.92969999999999997" + ] + }, + "execution_count": 64, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "np.sum( np.argmax( out_fn(X_train), axis=1 ) == y_train ) / float(X_train.shape[0])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 2", + "language": "python", + "name": "python2" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 2 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython2", + "version": "2.7.11" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} From 7ba6dc16f663070d3b55cb1ca35ee50fc44c625b Mon Sep 17 00:00:00 2001 From: Christopher Beckham Date: Sun, 22 May 2016 15:51:19 -0400 Subject: [PATCH 2/6] overhaul example and add py script --- examples/stochastic_depth_layers.ipynb | 748 +++++++++++++++++-------- examples/stochastic_depth_layers.py | 230 ++++++++ 2 files changed, 750 insertions(+), 228 deletions(-) create mode 100644 examples/stochastic_depth_layers.py diff --git a/examples/stochastic_depth_layers.ipynb b/examples/stochastic_depth_layers.ipynb index c667d59..25459da 100644 --- a/examples/stochastic_depth_layers.ipynb +++ b/examples/stochastic_depth_layers.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 29, + "execution_count": 55, "metadata": { "collapsed": false }, @@ -21,252 +21,551 @@ "from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams\n", "from urllib import urlretrieve\n", "import cPickle as pickle\n", - "import gzip" + "import gzip\n", + "import imp\n", + "import os\n", + "from time import time" ] }, { "cell_type": "code", - "execution_count": 59, + "execution_count": 58, "metadata": { - "collapsed": true + "collapsed": false }, "outputs": [], "source": [ - "\"\"\"\n", - " Binomial dropout layer\n", - "\n", - " Samples a binomial mask on the first axis (i.e. batch size)\n", - " and multiplies it with the input. 
This has the effect of\n", - " zeroing the output for some examples in the batch (according\n", - " to the survival probability p)\n", - " \n", - " Parameters\n", - " ----------\n", - " \n", - " incoming : a :class:`Layer` instance\n", - " p : float\n", - " The survival probability for an example in the batch\n", - "\n", - "\"\"\"\n", "class BinomialDropLayer(Layer):\n", - " def __init__(self, incoming, p=0.5, **kwargs):\n", + " def __init__(self, incoming, nonlinearity=rectify, survival_p=0.5,\n", + " **kwargs):\n", " super(BinomialDropLayer, self).__init__(incoming, **kwargs)\n", + " self.nonlinearity = (identity if nonlinearity is None\n", + " else nonlinearity)\n", " self._srng = RandomStreams(get_rng().randint(1, 2147462579))\n", - " self.p = p\n", + " self.p = 1-survival_p\n", "\n", " def get_output_for(self, input, deterministic=False, **kwargs):\n", " if deterministic:\n", " return self.p*input\n", " else:\n", - " mask = self._srng.binomial(n=1, p=(self.p), size=(input.shape[0],),\n", - " dtype=input.dtype)\n", + " #mask = self._srng.binomial(n=1, p=(self.p), size=(input.shape[0],),\n", + " # dtype=input.dtype)\n", + " mask = T.zeros((input.shape[0],)) + self._srng.uniform( (1,), 0, 1)[0]\n", " mask = mask.dimshuffle(0,'x','x','x')\n", " return mask*input" ] }, { "cell_type": "code", - "execution_count": 60, + "execution_count": 59, "metadata": { "collapsed": true }, "outputs": [], "source": [ - "\"\"\"\n", - " http://arxiv.org/abs/1603.09382\n", - " \n", - " \"...we replace the identity connections in these blocks\n", - " by an averaging pooling layer followed by zero paddings\n", - " to match the dimensions.\"\n", - " \n", - " To explain this method, let us consider two consecutive\n", - " convolution layers, `conv1` and `conv2`. Let us assume\n", - " they have different output shapes. To create the\n", - " identity connection between `conv1` and `conv2`, do 2x2\n", - " average pooling on `conv1`. Then, pad the result so that\n", - " it has the same dimensions as `conv2`. Afterwards, we have\n", - " to see if the final result has the same number of feature\n", - " maps as `conv2`; if not, we have to add all-zero feature\n", - " maps to either side of the result. 
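> Two things look off in the rewritten `BinomialDropLayer` above. The live branch multiplies the input by a *uniform* draw in [0, 1) rather than a 0/1 Bernoulli sample (the binomial line is commented out), and since `self.p = 1 - survival_p`, the deterministic branch scales by the death probability instead of the survival probability; at `survival_p = 0.5` both equal 0.5, which hides the asymmetry. The stored `nonlinearity` is also never applied. A corrected sketch under those readings (class name ours):

```python
import theano.tensor as T
from lasagne.layers import Layer
from lasagne.random import get_rng
from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams

class StochasticDepthDropLayer(Layer):
    """Keeps its whole input with probability survival_p, else zeros it."""
    def __init__(self, incoming, survival_p=0.5, **kwargs):
        super(StochasticDepthDropLayer, self).__init__(incoming, **kwargs)
        self._srng = RandomStreams(get_rng().randint(1, 2147462579))
        self.survival_p = survival_p

    def get_output_for(self, input, deterministic=False, **kwargs):
        if deterministic:
            # test time: scale by the mask's expectation, survival_p
            return self.survival_p * input
        # train time: a single 0/1 draw shared by the whole minibatch
        keep = self._srng.binomial(n=1, p=self.survival_p, size=(1,),
                                   dtype=input.dtype)[0]
        return keep * input
```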
Then, we construct a \n", - " binomial drop layer so that we can compute the final \n", - " equation:\n", - " \n", - " binomial_mask*conv2 + id(conv1)\n", - " \n", - " If we pass this through a nonlinearity layer, we can then\n", - " do: g( binomial_mask*conv2 + id(conv1) )\n", - " \n", - " Parameters\n", - " ----------\n", - " \n", - " incoming : a :class:`Layer` instance\n", - " p : float\n", - " The survival probability for the binomial mask\n", + "class IfElseDropLayer(Layer):\n", + " def __init__(self, incoming, nonlinearity=rectify, survival_p=0.5,\n", + " **kwargs):\n", + " super(IfElseDropLayer, self).__init__(incoming, **kwargs)\n", + " self.nonlinearity = (identity if nonlinearity is None\n", + " else nonlinearity)\n", + " self._srng = RandomStreams(get_rng().randint(1, 2147462579))\n", + " self.p = 1-survival_p\n", "\n", - "\"\"\"\n", - "def stochastic_depth_block(incoming, p, nonlinearity=linear):\n", - " layer_before_incoming = None\n", - " for prev_layer in get_all_layers(incoming)[::-1][1::]:\n", - " if \"ignore\" not in prev_layer.name and not isinstance(prev_layer, NonlinearityLayer):\n", - " layer_before_incoming = prev_layer\n", - " break\n", - " if layer_before_incoming == None:\n", - " raise Exception(\"Cannot find an appropriate layer before layer: %s\" % incoming.name)\n", - " \n", - " if layer_before_incoming.output_shape != incoming.output_shape: \n", - " l_pool = Pool2DLayer(layer_before_incoming, pool_size=(2,2), mode=\"average_inc_pad\", name=\"ignore_pool\")\n", - " if (l_pool.output_shape[2] % 2 == 1 and incoming.output_shape[2] % 2 == 0) or \\\n", - " (l_pool.output_shape[2] % 2 == 0 and incoming.output_shape[2] % 2 == 1):\n", - " l_pad = pad( l_pool, width=((0,1),(0,1)), name=\"ignore_prelim_pad\" )\n", + " def get_output_for(self, input, deterministic=False, **kwargs):\n", + " if deterministic:\n", + " return self.p*input\n", " else:\n", - " l_pad = l_pool\n", - " nd1 = (incoming.output_shape[2]-l_pad.output_shape[2])/2\n", - " if nd1 > 0:\n", - " l_pad = pad(l_pad, width=(nd1,nd1), name=\"ignore_pad\")\n", - " # what if the layer_before_incoming num feature maps is\n", - " # less than the incoming_layer num feature maps?\n", - " if layer_before_incoming.output_shape[1] < incoming.output_shape[1]:\n", - " diff_in_fms = incoming.output_shape[1]-layer_before_incoming.output_shape[1]\n", - " if diff_in_fms % 2 == 0: \n", - " width_tp = ((diff_in_fms/2, diff_in_fms/2),)\n", - " else:\n", - " width_tp = (((diff_in_fms/2)+1, diff_in_fms/2),)\n", - " l_pad = pad(\n", - " l_pad, \n", - " batch_ndim=1, \n", - " width=width_tp,\n", - " name=\"ignore_fm_pad\"\n", - " )\n", - " l_binom_drop = BinomialDropLayer(incoming, p=p, name=\"ignore_binom\")\n", - " l_sum = ElemwiseSumLayer([l_binom_drop, l_pad], name=\"ignore_elemsum\")\n", - " l_nonlinearity = NonlinearityLayer(l_sum, nonlinearity=nonlinearity, name=\"ignore_nonlinearity\")\n", - " return l_nonlinearity\n", - " else:\n", - " l_binom_drop = BinomialDropLayer(incoming, p=p, name=\"ignore_binom\")\n", - " l_sum = ElemwiseSumLayer([l_binom_drop, layer_before_incoming], name=\"ignore_elemsum\")\n", - " l_nonlinearity = NonlinearityLayer(l_sum, nonlinearity=nonlinearity, name=\"ignore_nonlinearity\")\n", - " return l_nonlinearity" + " return ifelse(\n", + " T.lt(self._srng.uniform( (1,), 0, 1)[0], self.p),\n", + " input,\n", + " T.zeros(input.shape)\n", + " )" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Let us create a simple convolution network" + "There is a difference between this residual block 
method and the one that is defined in [link]. When the number of filters is different to the layer's output shape (or the stride is different), instead of using a convolution to make things compatible, we use an average pooling with a pool size of 1 and a the defined stride, followed by (if necessary) adding extra zero-padded feature maps. This is because this is how the authors in [link] have defined it." ] }, { "cell_type": "code", - "execution_count": 61, + "execution_count": 38, "metadata": { "collapsed": false }, "outputs": [ { - "name": "stdout", - "output_type": "stream", - "text": [ - "input (None, 1, 28, 28)\n", - "l_conv1 (None, 8, 26, 26)\n", - "ignore_binom (None, 8, 26, 26)\n", - "ignore_pool (None, 1, 14, 14)\n", - "ignore_pad (None, 1, 26, 26)\n", - "ignore_fm_pad (None, 8, 26, 26)\n", - "ignore_elemsum (None, 8, 26, 26)\n", - "ignore_nonlinearity (None, 8, 26, 26)\n", - "l_mp1 (None, 8, 13, 13)\n", - "ignore_binom (None, 8, 13, 13)\n", - "ignore_pool (None, 8, 13, 13)\n", - "ignore_elemsum (None, 8, 13, 13)\n", - "ignore_nonlinearity (None, 8, 13, 13)\n", - "l_conv2 (None, 8, 11, 11)\n", - "ignore_binom (None, 8, 11, 11)\n", - "ignore_pool (None, 8, 6, 6)\n", - "ignore_prelim_pad (None, 8, 7, 7)\n", - "ignore_pad (None, 8, 11, 11)\n", - "ignore_elemsum (None, 8, 11, 11)\n", - "ignore_nonlinearity (None, 8, 11, 11)\n", - "l_mp2 (None, 8, 5, 5)\n", - "ignore_binom (None, 8, 5, 5)\n", - "ignore_pool (None, 8, 5, 5)\n", - "ignore_elemsum (None, 8, 5, 5)\n", - "ignore_nonlinearity (None, 8, 5, 5)\n", - "l_conv3 (None, 16, 3, 3)\n", - "ignore_binom (None, 16, 3, 3)\n", - "ignore_pool (None, 8, 2, 2)\n", - "ignore_prelim_pad (None, 8, 3, 3)\n", - "ignore_fm_pad (None, 16, 3, 3)\n", - "ignore_elemsum (None, 16, 3, 3)\n", - "ignore_nonlinearity (None, 16, 3, 3)\n", - "l_fc (None, 10)\n", - "num of params: 3282\n" - ] + "data": { + "text/plain": [ + "\"\\n\\nprint('Building model...')\\nmodel = nn.Sequential()\\n------> 3, 32,32\\nmodel:add(cudnn.SpatialConvolution(3, 16, 3,3, 1,1, 1,1)\\n :init('weight', nninit.kaiming, {gain = 'relu'})\\n :init('bias', nninit.constant, 0))\\nmodel:add(cudnn.SpatialBatchNormalization(16))\\nmodel:add(cudnn.ReLU(true))\\n------> 16, 32,32 First Group\\nfor i=1,opt.N do addResidualDrop(model, nil, 16) end\\n------> 32, 16,16 Second Group\\naddResidualDrop(model, nil, 16, 32, 2)\\nfor i=1,opt.N-1 do addResidualDrop(model, nil, 32) end\\n------> 64, 8,8 Third Group\\naddResidualDrop(model, nil, 32, 64, 2)\\nfor i=1,opt.N-1 do addResidualDrop(model, nil, 64) end\\n------> 10, 8,8 Pooling, Linear, Softmax\\nmodel:add(nn.SpatialAveragePooling(8,8)):add(nn.Reshape(64))\\nif opt.dataset == 'cifar10' or opt.dataset == 'svhn' then\\n model:add(nn.Linear(64, 10))\\nelseif opt.dataset == 'cifar100' then\\n model:add(nn.Linear(64, 100))\\nelse\\n print('Invalid argument for dataset!')\\nend\\n\\n\\n\"" + ] + }, + "execution_count": 38, + "metadata": {}, + "output_type": "execute_result" } ], "source": [ - "l_in = InputLayer( (None, 1, 28, 28), name=\"input\" )\n", - "\n", - "l_conv1 = Conv2DLayer(l_in, num_filters=8, filter_size=3, name=\"l_conv1\", nonlinearity=None)\n", - "l_sd1 = stochastic_depth_block(l_conv1, p=0.5, nonlinearity=rectify)\n", - "\n", - "l_mp1 = MaxPool2DLayer(l_sd1, pool_size=(2,2), name=\"l_mp1\")\n", - "l_sd2 = stochastic_depth_block(l_mp1, p=0.5)\n", - "\n", - "l_conv2 = Conv2DLayer(l_sd2, num_filters=8, filter_size=3, name=\"l_conv2\", nonlinearity=None)\n", - "l_sd3 = stochastic_depth_block(l_conv2, p=0.5, nonlinearity=rectify)\n", - 
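> To make the pool-size-1 shortcut concrete: in the first downsampling block the identity path must go from (None, 16, 32, 32) to (None, 32, 16, 16). A standalone shape check with Lasagne, following what `residual_block` does:

```python
from lasagne.layers import InputLayer, Pool2DLayer, pad

l = InputLayer((None, 16, 32, 32))
# a 1x1 "average" pool with stride 2 averages nothing; it simply subsamples
# every other pixel, halving the spatial dimensions without parameters
l = Pool2DLayer(l, pool_size=1, stride=2, mode="average_inc_pad")
print l.output_shape  # (None, 16, 16, 16)
# then add zero feature maps on the channel axis, 8 per side here
l = pad(l, batch_ndim=1, width=((8, 8),))
print l.output_shape  # (None, 32, 16, 16)
```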
"\n", - "l_mp2 = MaxPool2DLayer(l_sd3, pool_size=(2,2), name=\"l_mp2\")\n", - "l_sd4 = stochastic_depth_block(l_mp2, p=0.5)\n", - "\n", - "l_conv3 = Conv2DLayer(l_sd4, num_filters=16, filter_size=3, name=\"l_conv3\", nonlinearity=None)\n", - "l_sd5 = stochastic_depth_block(l_conv3, p=0.5, nonlinearity=rectify)\n", + "\"\"\"\n", "\n", - "l_fc = DenseLayer(l_sd5, num_units=10, nonlinearity=softmax, name=\"l_fc\")\n", + "print('Building model...')\n", + "model = nn.Sequential()\n", + "------> 3, 32,32\n", + "model:add(cudnn.SpatialConvolution(3, 16, 3,3, 1,1, 1,1)\n", + " :init('weight', nninit.kaiming, {gain = 'relu'})\n", + " :init('bias', nninit.constant, 0))\n", + "model:add(cudnn.SpatialBatchNormalization(16))\n", + "model:add(cudnn.ReLU(true))\n", + "------> 16, 32,32 First Group\n", + "for i=1,opt.N do addResidualDrop(model, nil, 16) end\n", + "------> 32, 16,16 Second Group\n", + "addResidualDrop(model, nil, 16, 32, 2)\n", + "for i=1,opt.N-1 do addResidualDrop(model, nil, 32) end\n", + "------> 64, 8,8 Third Group\n", + "addResidualDrop(model, nil, 32, 64, 2)\n", + "for i=1,opt.N-1 do addResidualDrop(model, nil, 64) end\n", + "------> 10, 8,8 Pooling, Linear, Softmax\n", + "model:add(nn.SpatialAveragePooling(8,8)):add(nn.Reshape(64))\n", + "if opt.dataset == 'cifar10' or opt.dataset == 'svhn' then\n", + " model:add(nn.Linear(64, 10))\n", + "elseif opt.dataset == 'cifar100' then\n", + " model:add(nn.Linear(64, 100))\n", + "else\n", + " print('Invalid argument for dataset!')\n", + "end\n", "\n", - "l_out = l_fc\n", "\n", - "for layer in get_all_layers(l_out):\n", - " print layer.name, layer.output_shape\n", - "print \"num of params: %i\" % count_params(l_out)" + "\"\"\"" ] }, { - "cell_type": "markdown", - "metadata": {}, + "cell_type": "code", + "execution_count": 60, + "metadata": { + "collapsed": true + }, + "outputs": [], "source": [ - "Ok, let's download the MNIST dataset" + "def residual_block(layer, num_filters, filter_size=3, stride=1, num_layers=2, survival_p=0.5):\n", + " #print \"input =\", layer.output_shape\n", + " conv = layer\n", + " if (num_filters != layer.output_shape[1]) or (stride != 1):\n", + " layer = Pool2DLayer(layer, pool_size=1, stride=stride, mode=\"average_inc_pad\")\n", + " diff = num_filters-layer.output_shape[1]\n", + " if diff % 2 == 0: \n", + " width_tp = ((diff/2, diff/2),)\n", + " else:\n", + " width_tp = (((diff/2)+1, diff/2),)\n", + " layer = pad(\n", + " layer, \n", + " batch_ndim=1, \n", + " width=width_tp\n", + " )\n", + " #print \"layer =\", layer.output_shape\n", + " for _ in range(num_layers):\n", + " conv = Conv2DLayer(conv, num_filters, filter_size, stride=stride, pad='same')\n", + " #print \"conv =\", conv.output_shape\n", + " stride = 1\n", + " nonlinearity = conv.nonlinearity\n", + " conv.nonlinearity = lasagne.nonlinearities.identity\n", + " conv = BinomialDropLayer(conv, survival_p=survival_p)\n", + " return NonlinearityLayer(ElemwiseSumLayer([conv, layer]), nonlinearity)" ] }, { "cell_type": "code", - "execution_count": 27, + "execution_count": 63, "metadata": { "collapsed": false }, "outputs": [], "source": [ - "url_ret = urlretrieve(\"http://www.iro.umontreal.ca/~lisa/deep/data/mnist/mnist.pkl.gz\", \"/tmp/mnist.pkl.gz\")" + "# architecture from:\n", + "# https://github.com/yueatsprograms/Stochastic_Depth/blob/master/main.lua\n", + "survival_p = 0.5\n", + "layer = InputLayer( (None, 3, 32, 32) )\n", + "layer = Conv2DLayer(layer, num_filters=16, filter_size=3, stride=1, pad='same')\n", + "#layer = Pool2DLayer(layer, 2)\n", + "for _ in 
range(18):\n", + " layer = residual_block(layer, 16, survival_p=survival_p)\n", + "layer = residual_block(layer, 32, stride=2, survival_p=survival_p)\n", + "for _ in range(18):\n", + " layer = residual_block(layer, 32, survival_p=survival_p)\n", + "layer = residual_block(layer, 64, stride=2, survival_p=survival_p)\n", + "for _ in range(18):\n", + " layer = residual_block(layer, 64, survival_p=survival_p)\n", + "layer = Pool2DLayer(layer, pool_size=8, stride=1, mode=\"average_inc_pad\")\n", + "layer = DenseLayer(layer, num_units=10, nonlinearity=softmax)" + ] + }, + { + "cell_type": "code", + "execution_count": 64, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " (None, 3, 32, 32)\n", + " (None, 16, 32, 32)\n", + " (None, 16, 32, 32)\n", + " (None, 16, 32, 32)\n", + "<__main__.BinomialDropLayer object at 0x108d01790> (None, 16, 32, 32)\n", + " (None, 16, 32, 32)\n", + " (None, 16, 32, 32)\n", + " (None, 16, 32, 32)\n", + " (None, 16, 32, 32)\n", + "<__main__.BinomialDropLayer object at 0x14a3a4d10> (None, 16, 32, 32)\n", + " (None, 16, 32, 32)\n", + " (None, 16, 32, 32)\n", + " (None, 16, 32, 32)\n", + " (None, 16, 32, 32)\n", + "<__main__.BinomialDropLayer object at 0x14a510050> (None, 16, 32, 32)\n", + " (None, 16, 32, 32)\n", + " (None, 16, 32, 32)\n", + " (None, 16, 32, 32)\n", + " (None, 16, 32, 32)\n", + "<__main__.BinomialDropLayer object at 0x147e24650> (None, 16, 32, 32)\n", + " (None, 16, 32, 32)\n", + " (None, 16, 32, 32)\n", + " (None, 16, 32, 32)\n", + " (None, 16, 32, 32)\n", + "<__main__.BinomialDropLayer object at 0x12049b750> (None, 16, 32, 32)\n", + " (None, 16, 32, 32)\n", + " (None, 16, 32, 32)\n", + " (None, 16, 32, 32)\n", + " (None, 16, 32, 32)\n", + "<__main__.BinomialDropLayer object at 0x1204bc150> (None, 16, 32, 32)\n", + " (None, 16, 32, 32)\n", + " (None, 16, 32, 32)\n", + " (None, 16, 32, 32)\n", + " (None, 16, 32, 32)\n", + "<__main__.BinomialDropLayer object at 0x1204baf50> (None, 16, 32, 32)\n", + " (None, 16, 32, 32)\n", + " (None, 16, 32, 32)\n", + " (None, 16, 32, 32)\n", + " (None, 16, 32, 32)\n", + "<__main__.BinomialDropLayer object at 0x1204ba8d0> (None, 16, 32, 32)\n", + " (None, 16, 32, 32)\n", + " (None, 16, 32, 32)\n", + " (None, 16, 32, 32)\n", + " (None, 16, 32, 32)\n", + "<__main__.BinomialDropLayer object at 0x1204ba5d0> (None, 16, 32, 32)\n", + " (None, 16, 32, 32)\n", + " (None, 16, 32, 32)\n", + " (None, 16, 32, 32)\n", + " (None, 16, 32, 32)\n", + "<__main__.BinomialDropLayer object at 0x1204bc650> (None, 16, 32, 32)\n", + " (None, 16, 32, 32)\n", + " (None, 16, 32, 32)\n", + " (None, 16, 32, 32)\n", + " (None, 16, 32, 32)\n", + "<__main__.BinomialDropLayer object at 0x1204bce50> (None, 16, 32, 32)\n", + " (None, 16, 32, 32)\n", + " (None, 16, 32, 32)\n", + " (None, 16, 32, 32)\n", + " (None, 16, 32, 32)\n", + "<__main__.BinomialDropLayer object at 0x12049b1d0> (None, 16, 32, 32)\n", + " (None, 16, 32, 32)\n", + " (None, 16, 32, 32)\n", + " (None, 16, 32, 32)\n", + " (None, 16, 32, 32)\n", + "<__main__.BinomialDropLayer object at 0x1204882d0> (None, 16, 32, 32)\n", + " (None, 16, 32, 32)\n", + " (None, 16, 32, 32)\n", + " (None, 16, 32, 32)\n", + " (None, 16, 32, 32)\n", + "<__main__.BinomialDropLayer object at 0x120488310> (None, 16, 32, 32)\n", + " (None, 16, 32, 32)\n", + " (None, 16, 32, 32)\n", + " (None, 16, 32, 32)\n", + " (None, 16, 32, 32)\n", + "<__main__.BinomialDropLayer object at 0x120175050> (None, 16, 32, 32)\n", + " (None, 16, 32, 32)\n", + " (None, 
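> The architecture above keeps `survival_p = 0.5` for every block (what the stochastic depth paper calls the uniform rule). The paper's recommended setting is linear decay, p_l = 1 - (l/L)(1 - p_L) with p_L = 0.5, so early blocks almost always survive and only the deepest ones are dropped half the time. Note also that the cell builds 18 + 19 + 19 = 56 blocks, while the Lua reference's groups come to 3N = 54 for N = 18 (its downsampling block is followed by N-1, not N, plain blocks). A pure-Python sketch of the schedule:

```python
def survival_prob(l, L, p_L=0.5):
    # linear decay rule from the stochastic depth paper:
    # p_l = 1 - (l / L) * (1 - p_L)
    return 1.0 - (float(l) / L) * (1.0 - p_L)

L = 56  # number of residual blocks built in the cell above
print survival_prob(1, L)       # ~0.99: first block nearly always kept
print survival_prob(L // 2, L)  # 0.75
print survival_prob(L, L)       # 0.50: last block dropped half the time
```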
16, 32, 32)\n", + " (None, 16, 32, 32)\n", + " (None, 16, 32, 32)\n", + "<__main__.BinomialDropLayer object at 0x120175990> (None, 16, 32, 32)\n", + " (None, 16, 32, 32)\n", + " (None, 16, 32, 32)\n", + " (None, 16, 32, 32)\n", + " (None, 16, 32, 32)\n", + "<__main__.BinomialDropLayer object at 0x1201754d0> (None, 16, 32, 32)\n", + " (None, 16, 32, 32)\n", + " (None, 16, 32, 32)\n", + " (None, 16, 32, 32)\n", + " (None, 16, 32, 32)\n", + "<__main__.BinomialDropLayer object at 0x120160f90> (None, 16, 32, 32)\n", + " (None, 16, 32, 32)\n", + " (None, 16, 32, 32)\n", + " (None, 32, 16, 16)\n", + " (None, 32, 16, 16)\n", + "<__main__.BinomialDropLayer object at 0x120160ed0> (None, 32, 16, 16)\n", + " (None, 16, 16, 16)\n", + " (None, 32, 16, 16)\n", + " (None, 32, 16, 16)\n", + " (None, 32, 16, 16)\n", + " (None, 32, 16, 16)\n", + " (None, 32, 16, 16)\n", + "<__main__.BinomialDropLayer object at 0x120160690> (None, 32, 16, 16)\n", + " (None, 32, 16, 16)\n", + " (None, 32, 16, 16)\n", + " (None, 32, 16, 16)\n", + " (None, 32, 16, 16)\n", + "<__main__.BinomialDropLayer object at 0x1201458d0> (None, 32, 16, 16)\n", + " (None, 32, 16, 16)\n", + " (None, 32, 16, 16)\n", + " (None, 32, 16, 16)\n", + " (None, 32, 16, 16)\n", + "<__main__.BinomialDropLayer object at 0x120145f90> (None, 32, 16, 16)\n", + " (None, 32, 16, 16)\n", + " (None, 32, 16, 16)\n", + " (None, 32, 16, 16)\n", + " (None, 32, 16, 16)\n", + "<__main__.BinomialDropLayer object at 0x1201452d0> (None, 32, 16, 16)\n", + " (None, 32, 16, 16)\n", + " (None, 32, 16, 16)\n", + " (None, 32, 16, 16)\n", + " (None, 32, 16, 16)\n", + "<__main__.BinomialDropLayer object at 0x12014e350> (None, 32, 16, 16)\n", + " (None, 32, 16, 16)\n", + " (None, 32, 16, 16)\n", + " (None, 32, 16, 16)\n", + " (None, 32, 16, 16)\n", + "<__main__.BinomialDropLayer object at 0x12014ed10> (None, 32, 16, 16)\n", + " (None, 32, 16, 16)\n", + " (None, 32, 16, 16)\n", + " (None, 32, 16, 16)\n", + " (None, 32, 16, 16)\n", + "<__main__.BinomialDropLayer object at 0x12014e510> (None, 32, 16, 16)\n", + " (None, 32, 16, 16)\n", + " (None, 32, 16, 16)\n", + " (None, 32, 16, 16)\n", + " (None, 32, 16, 16)\n", + "<__main__.BinomialDropLayer object at 0x1204dab90> (None, 32, 16, 16)\n", + " (None, 32, 16, 16)\n", + " (None, 32, 16, 16)\n", + " (None, 32, 16, 16)\n", + " (None, 32, 16, 16)\n", + "<__main__.BinomialDropLayer object at 0x1204dad50> (None, 32, 16, 16)\n", + " (None, 32, 16, 16)\n", + " (None, 32, 16, 16)\n", + " (None, 32, 16, 16)\n", + " (None, 32, 16, 16)\n", + "<__main__.BinomialDropLayer object at 0x1204da1d0> (None, 32, 16, 16)\n", + " (None, 32, 16, 16)\n", + " (None, 32, 16, 16)\n", + " (None, 32, 16, 16)\n", + " (None, 32, 16, 16)\n", + "<__main__.BinomialDropLayer object at 0x1204db5d0> (None, 32, 16, 16)\n", + " (None, 32, 16, 16)\n", + " (None, 32, 16, 16)\n", + " (None, 32, 16, 16)\n", + " (None, 32, 16, 16)\n", + "<__main__.BinomialDropLayer object at 0x1204dbd90> (None, 32, 16, 16)\n", + " (None, 32, 16, 16)\n", + " (None, 32, 16, 16)\n", + " (None, 32, 16, 16)\n", + " (None, 32, 16, 16)\n", + "<__main__.BinomialDropLayer object at 0x1204db350> (None, 32, 16, 16)\n", + " (None, 32, 16, 16)\n", + " (None, 32, 16, 16)\n", + " (None, 32, 16, 16)\n", + " (None, 32, 16, 16)\n", + "<__main__.BinomialDropLayer object at 0x1204ffd10> (None, 32, 16, 16)\n", + " (None, 32, 16, 16)\n", + " (None, 32, 16, 16)\n", + " (None, 32, 16, 16)\n", + " (None, 32, 16, 16)\n", + "<__main__.BinomialDropLayer object at 0x1204ffa50> (None, 32, 16, 16)\n", + " (None, 32, 16, 
16)\n", + " (None, 32, 16, 16)\n", + " (None, 32, 16, 16)\n", + " (None, 32, 16, 16)\n", + "<__main__.BinomialDropLayer object at 0x1204ff1d0> (None, 32, 16, 16)\n", + " (None, 32, 16, 16)\n", + " (None, 32, 16, 16)\n", + " (None, 32, 16, 16)\n", + " (None, 32, 16, 16)\n", + "<__main__.BinomialDropLayer object at 0x1204f5e10> (None, 32, 16, 16)\n", + " (None, 32, 16, 16)\n", + " (None, 32, 16, 16)\n", + " (None, 32, 16, 16)\n", + " (None, 32, 16, 16)\n", + "<__main__.BinomialDropLayer object at 0x1204f5a10> (None, 32, 16, 16)\n", + " (None, 32, 16, 16)\n", + " (None, 32, 16, 16)\n", + " (None, 64, 8, 8)\n", + " (None, 64, 8, 8)\n", + "<__main__.BinomialDropLayer object at 0x1204f5190> (None, 64, 8, 8)\n", + " (None, 32, 8, 8)\n", + " (None, 64, 8, 8)\n", + " (None, 64, 8, 8)\n", + " (None, 64, 8, 8)\n", + " (None, 64, 8, 8)\n", + " (None, 64, 8, 8)\n", + "<__main__.BinomialDropLayer object at 0x124db5e50> (None, 64, 8, 8)\n", + " (None, 64, 8, 8)\n", + " (None, 64, 8, 8)\n", + " (None, 64, 8, 8)\n", + " (None, 64, 8, 8)\n", + "<__main__.BinomialDropLayer object at 0x124db5a10> (None, 64, 8, 8)\n", + " (None, 64, 8, 8)\n", + " (None, 64, 8, 8)\n", + " (None, 64, 8, 8)\n", + " (None, 64, 8, 8)\n", + "<__main__.BinomialDropLayer object at 0x124db53d0> (None, 64, 8, 8)\n", + " (None, 64, 8, 8)\n", + " (None, 64, 8, 8)\n", + " (None, 64, 8, 8)\n", + " (None, 64, 8, 8)\n", + "<__main__.BinomialDropLayer object at 0x124db1910> (None, 64, 8, 8)\n", + " (None, 64, 8, 8)\n", + " (None, 64, 8, 8)\n", + " (None, 64, 8, 8)\n", + " (None, 64, 8, 8)\n", + "<__main__.BinomialDropLayer object at 0x124db1650> (None, 64, 8, 8)\n", + " (None, 64, 8, 8)\n", + " (None, 64, 8, 8)\n", + " (None, 64, 8, 8)\n", + " (None, 64, 8, 8)\n", + "<__main__.BinomialDropLayer object at 0x124db1490> (None, 64, 8, 8)\n", + " (None, 64, 8, 8)\n", + " (None, 64, 8, 8)\n", + " (None, 64, 8, 8)\n", + " (None, 64, 8, 8)\n", + "<__main__.BinomialDropLayer object at 0x124da0350> (None, 64, 8, 8)\n", + " (None, 64, 8, 8)\n", + " (None, 64, 8, 8)\n", + " (None, 64, 8, 8)\n", + " (None, 64, 8, 8)\n", + "<__main__.BinomialDropLayer object at 0x124da0710> (None, 64, 8, 8)\n", + " (None, 64, 8, 8)\n", + " (None, 64, 8, 8)\n", + " (None, 64, 8, 8)\n", + " (None, 64, 8, 8)\n", + "<__main__.BinomialDropLayer object at 0x1145df510> (None, 64, 8, 8)\n", + " (None, 64, 8, 8)\n", + " (None, 64, 8, 8)\n", + " (None, 64, 8, 8)\n", + " (None, 64, 8, 8)\n", + "<__main__.BinomialDropLayer object at 0x149a291d0> (None, 64, 8, 8)\n", + " (None, 64, 8, 8)\n", + " (None, 64, 8, 8)\n", + " (None, 64, 8, 8)\n", + " (None, 64, 8, 8)\n", + "<__main__.BinomialDropLayer object at 0x114702690> (None, 64, 8, 8)\n", + " (None, 64, 8, 8)\n", + " (None, 64, 8, 8)\n", + " (None, 64, 8, 8)\n", + " (None, 64, 8, 8)\n", + "<__main__.BinomialDropLayer object at 0x125a9dc90> (None, 64, 8, 8)\n", + " (None, 64, 8, 8)\n", + " (None, 64, 8, 8)\n", + " (None, 64, 8, 8)\n", + " (None, 64, 8, 8)\n", + "<__main__.BinomialDropLayer object at 0x12758b110> (None, 64, 8, 8)\n", + " (None, 64, 8, 8)\n", + " (None, 64, 8, 8)\n", + " (None, 64, 8, 8)\n", + " (None, 64, 8, 8)\n", + "<__main__.BinomialDropLayer object at 0x12d30d310> (None, 64, 8, 8)\n", + " (None, 64, 8, 8)\n", + " (None, 64, 8, 8)\n", + " (None, 64, 8, 8)\n", + " (None, 64, 8, 8)\n", + "<__main__.BinomialDropLayer object at 0x108d0c850> (None, 64, 8, 8)\n", + " (None, 64, 8, 8)\n", + " (None, 64, 8, 8)\n", + " (None, 64, 8, 8)\n", + " (None, 64, 8, 8)\n", + "<__main__.BinomialDropLayer object at 0x108db92d0> (None, 64, 
8, 8)\n", + " (None, 64, 8, 8)\n", + " (None, 64, 8, 8)\n", + " (None, 64, 8, 8)\n", + " (None, 64, 8, 8)\n", + "<__main__.BinomialDropLayer object at 0x14978b2d0> (None, 64, 8, 8)\n", + " (None, 64, 8, 8)\n", + " (None, 64, 8, 8)\n", + " (None, 64, 8, 8)\n", + " (None, 64, 8, 8)\n", + "<__main__.BinomialDropLayer object at 0x124d8cad0> (None, 64, 8, 8)\n", + " (None, 64, 8, 8)\n", + " (None, 64, 8, 8)\n", + " (None, 64, 1, 1)\n", + " (None, 10)\n" + ] + } + ], + "source": [ + "for layer in get_all_layers(layer):\n", + " print layer, layer.output_shape" ] }, { "cell_type": "code", - "execution_count": 32, + "execution_count": 14, "metadata": { "collapsed": false }, "outputs": [], "source": [ - "with gzip.open(\"/tmp/mnist.pkl.gz\") as f:\n", - " dat = pickle.load(f)\n", - "train_data, _, _ = dat\n", - "X_train, y_train = train_data\n", - "X_train = X_train.reshape( (X_train.shape[0], 1, 28, 28) ).astype( theano.config.floatX )\n", - "y_train = y_train.astype(\"int32\")" + "cifar10_loader = imp.load_source(\"cifar10_loader\", \"../papers/deep_residual_learning/Deep_Residual_Learning_CIFAR-10.py\")" ] }, { "cell_type": "code", - "execution_count": 33, + "execution_count": 24, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "#curr_dir = os.getcwd()\n", + "#os.chdir(\"../papers/deep_residual_learning/\")\n", + "data = cifar10_loader.load_data()\n", + "X_train_and_valid, y_train_and_valid, X_test, y_test = \\\n", + " data[\"X_train\"][0:50000], data[\"Y_train\"][0:50000], data[\"X_test\"], data[\"Y_test\"]\n", + "#os.chdir(curr_dir)" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/anaconda/lib/python2.7/site-packages/ipykernel/__main__.py:1: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future\n", + " if __name__ == '__main__':\n", + "/anaconda/lib/python2.7/site-packages/ipykernel/__main__.py:2: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future\n", + " from ipykernel import kernelapp as app\n", + "/anaconda/lib/python2.7/site-packages/ipykernel/__main__.py:3: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future\n", + " app.launch_new_instance()\n", + "/anaconda/lib/python2.7/site-packages/ipykernel/__main__.py:4: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future\n" + ] + } + ], + "source": [ + "X_train = X_train_and_valid[ 0 : 0.9*X_train_and_valid.shape[0] ]\n", + "y_train = y_train_and_valid[ 0 : 0.9*y_train_and_valid.shape[0] ]\n", + "X_valid = X_train_and_valid[ 0.9*X_train_and_valid.shape[0] :: ]\n", + "y_valid = y_train_and_valid[ 0.9*y_train_and_valid.shape[0] :: ]" + ] + }, + { + "cell_type": "code", + "execution_count": 27, "metadata": { "collapsed": false }, @@ -274,30 +573,23 @@ { "data": { "text/plain": [ - "((50000, 1, 28, 28), (50000,))" + "(90000, 3, 32, 32)" ] }, - "execution_count": 33, + "execution_count": 27, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "X_train.shape, y_train.shape" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Define the Lasagne-related stuff we'll need for training the network" + "X_train.shape" ] }, { "cell_type": "code", - "execution_count": 62, + "execution_count": 51, "metadata": { - 
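> The VisibleDeprecationWarnings above come from slicing with float indices (`0.9*X_train_and_valid.shape[0]` is a float); casting the split point once gives the same 90/10 split and silences them:

```python
n_train = int(0.9 * X_train_and_valid.shape[0])
X_train, y_train = X_train_and_valid[:n_train], y_train_and_valid[:n_train]
X_valid, y_valid = X_train_and_valid[n_train:], y_train_and_valid[n_train:]
```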
"collapsed": true + "collapsed": false }, "outputs": [], "source": [ @@ -311,12 +603,27 @@ "grads = T.grad(loss, params)\n", "updates = nesterov_momentum(grads, params, learning_rate=0.01, momentum=0.9)\n", "train_fn = theano.function(inputs=[X, y], outputs=loss, updates=updates)\n", - "out_fn = theano.function(inputs=[X], outputs=net_out_det)" + "eval_fn = theano.function(inputs=[X, y], outputs=loss)\n", + "preds_fn = theano.function(inputs=[X], outputs=net_out_det)" ] }, { "cell_type": "code", - "execution_count": 63, + "execution_count": 57, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "#X_train = X_train[0:10]\n", + "#y_train = y_train[0:10]\n", + "#X_valid = X_valid[0:10]\n", + "#y_valid = y_valid[0:10]" + ] + }, + { + "cell_type": "code", + "execution_count": 56, "metadata": { "collapsed": false }, @@ -325,57 +632,42 @@ "name": "stdout", "output_type": "stream", "text": [ - "1 1.04170554822\n", - "2 0.838029498493\n", - "3 0.794951380409\n", - "4 0.759657711097\n", - "5 0.733128651309\n", - "6 0.715255246341\n", - "7 0.718663093263\n", - "8 0.710821818392\n", - "9 0.721151590125\n", - "10 0.707448014221\n" + "epoch,avg_train_loss,valid_loss,valid_acc,time\n", + "1,nan,inf,0.200000,0.808203\n", + "2,nan,inf,0.200000,0.669914\n", + "3,nan,inf,0.200000,0.633933\n", + "4,nan,inf,0.200000,0.646192\n", + "5,nan,inf,0.200000,0.618688\n" + ] + }, + { + "ename": "KeyboardInterrupt", + "evalue": "", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 9\u001b[0m \u001b[0mtrain_losses\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mappend\u001b[0m\u001b[0;34m(\u001b[0m \u001b[0mtrain_fn\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mX_train\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mb\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0mbs\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mb\u001b[0m\u001b[0;34m+\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0mbs\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my_train\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mb\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0mbs\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mb\u001b[0m\u001b[0;34m+\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0mbs\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 10\u001b[0m \u001b[0mvalid_loss\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0meval_fn\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mX_valid\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my_valid\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 11\u001b[0;31m \u001b[0mvalid_preds\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0margmax\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mpreds_fn\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mX_valid\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0maxis\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 12\u001b[0m \u001b[0mvalid_acc\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msum\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mvalid_preds\u001b[0m \u001b[0;34m==\u001b[0m 
\u001b[0my_valid\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0;36m1.0\u001b[0m \u001b[0;34m/\u001b[0m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0my_valid\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 13\u001b[0m \u001b[0;32mprint\u001b[0m \u001b[0;34m\"%i,%f,%f,%f,%f\"\u001b[0m \u001b[0;34m%\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0mepoch\u001b[0m\u001b[0;34m+\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmean\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtrain_losses\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mvalid_loss\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mvalid_acc\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtime\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m-\u001b[0m\u001b[0mt0\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m//anaconda/lib/python2.7/site-packages/Theano-0.8.0.dev0-py2.7.egg/theano/compile/function_module.pyc\u001b[0m in \u001b[0;36m__call__\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 857\u001b[0m \u001b[0mt0_fn\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtime\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtime\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 858\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 859\u001b[0;31m \u001b[0moutputs\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfn\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 860\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mException\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 861\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mhasattr\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfn\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'position_of_error'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m//anaconda/lib/python2.7/site-packages/Theano-0.8.0.dev0-py2.7.egg/theano/gof/op.pyc\u001b[0m in \u001b[0;36mrval\u001b[0;34m(p, i, o, n)\u001b[0m\n\u001b[1;32m 905\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mparams\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0mgraph\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mNoParams\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 906\u001b[0m \u001b[0;31m# default arguments are stored in the closure of `rval`\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 907\u001b[0;31m \u001b[0;32mdef\u001b[0m \u001b[0mrval\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mp\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mp\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mi\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mnode_input_storage\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mo\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mnode_output_storage\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mn\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mnode\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 908\u001b[0m \u001b[0mr\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mp\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mn\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0mx\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mx\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mi\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m 
\u001b[0mo\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 909\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mo\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mnode\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0moutputs\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;31mKeyboardInterrupt\u001b[0m: " ] } ], "source": [ - "bs = 32 \n", - "n_batches = X_train.shape[0] // bs\n", + "batch_size = 128\n", + "n_batches = X_train.shape[0] // batch_size\n", "num_epochs = 10\n", + "print \"epoch,avg_train_loss,valid_loss,valid_acc,time\"\n", "for epoch in range(0, num_epochs):\n", + " t0 = time()\n", " train_losses = []\n", " for b in range(0, n_batches):\n", - " train_losses.append( train_fn(X_train[b*bs : (b+1)*bs], y_train[b*bs : (b+1)*bs]) )\n", - " print (epoch+1), np.mean(train_losses)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "What accuracy do we get on the training set? (Between 0 and 1)" - ] - }, - { - "cell_type": "code", - "execution_count": 64, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "data": { - "text/plain": [ - "0.92969999999999997" - ] - }, - "execution_count": 64, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "np.sum( np.argmax( out_fn(X_train), axis=1 ) == y_train ) / float(X_train.shape[0])" + " train_losses.append( train_fn(X_train[b*bs:(b+1)*bs], y_train[b*bs:(b+1)*bs]) )\n", + " valid_loss = eval_fn(X_valid, y_valid)\n", + " valid_preds = np.argmax(preds_fn(X_valid),axis=1)\n", + " valid_acc = np.sum(valid_preds == y_valid)*1.0 / len(y_valid)\n", + " print \"%i,%f,%f,%f,%f\" % (epoch+1, np.mean(train_losses), valid_loss, valid_acc, time()-t0)" ] }, { diff --git a/examples/stochastic_depth_layers.py b/examples/stochastic_depth_layers.py new file mode 100644 index 0000000..3f49ca1 --- /dev/null +++ b/examples/stochastic_depth_layers.py @@ -0,0 +1,230 @@ + +# coding: utf-8 + +# In[55]: + +import theano +from theano import tensor as T +import lasagne +from lasagne.layers import * +from lasagne.nonlinearities import * +from lasagne.objectives import * +from lasagne.regularization import * +from lasagne.random import get_rng +from lasagne.updates import * +from lasagne.init import * +from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams +from urllib import urlretrieve +import cPickle as pickle +import gzip +import imp +import os +from time import time + + +# In[58]: + +class BinomialDropLayer(Layer): + def __init__(self, incoming, nonlinearity=rectify, survival_p=0.5, + **kwargs): + super(BinomialDropLayer, self).__init__(incoming, **kwargs) + self.nonlinearity = (identity if nonlinearity is None + else nonlinearity) + self._srng = RandomStreams(get_rng().randint(1, 2147462579)) + self.p = 1-survival_p + + def get_output_for(self, input, deterministic=False, **kwargs): + if deterministic: + return self.p*input + else: + #mask = self._srng.binomial(n=1, p=(self.p), size=(input.shape[0],), + # dtype=input.dtype) + mask = T.zeros((input.shape[0],)) + self._srng.uniform( (1,), 0, 1)[0] + mask = mask.dimshuffle(0,'x','x','x') + return mask*input + + +# In[59]: + +class IfElseDropLayer(Layer): + def __init__(self, incoming, nonlinearity=rectify, survival_p=0.5, + **kwargs): + super(IfElseDropLayer, self).__init__(incoming, **kwargs) + self.nonlinearity = (identity if nonlinearity is None + else nonlinearity) + self._srng = RandomStreams(get_rng().randint(1, 2147462579)) + self.p = 1-survival_p + + def get_output_for(self, input, deterministic=False, 
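> The training cell above sets `batch_size = 128` but slices with `bs`, which this revision no longer defines; in a fresh kernel that is a NameError, and in the recorded session `bs` was presumably still bound to 32 from the earlier MNIST loop, so each "epoch" covered only `n_batches * 32` examples. A consistent version of the inner loop:

```python
batch_size = 128
n_batches = X_train.shape[0] // batch_size
train_losses = []
for b in range(n_batches):
    sl = slice(b * batch_size, (b + 1) * batch_size)  # always batch_size
    train_losses.append(train_fn(X_train[sl], y_train[sl]))
```

> The nan/inf losses in the output are a separate problem; the non-binary mask noted earlier is one suspect, and clipping the softmax output before the cross-entropy is a standard guard either way (a hedged tweak to the loss line, not a confirmed root cause):

```python
eps = 1e-7
loss = categorical_crossentropy(T.clip(net_out, eps, 1 - eps), y).mean()
```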
**kwargs): + if deterministic: + return self.p*input + else: + return ifelse( + T.lt(self._srng.uniform( (1,), 0, 1)[0], self.p), + input, + T.zeros(input.shape) + ) + + +# There is a difference between this residual block method and the one that is defined in [link]. When the number of filters is different to the layer's output shape (or the stride is different), instead of using a convolution to make things compatible, we use an average pooling with a pool size of 1 and a the defined stride, followed by (if necessary) adding extra zero-padded feature maps. This is because this is how the authors in [link] have defined it. + +# In[38]: + +""" + +print('Building model...') +model = nn.Sequential() +------> 3, 32,32 +model:add(cudnn.SpatialConvolution(3, 16, 3,3, 1,1, 1,1) + :init('weight', nninit.kaiming, {gain = 'relu'}) + :init('bias', nninit.constant, 0)) +model:add(cudnn.SpatialBatchNormalization(16)) +model:add(cudnn.ReLU(true)) +------> 16, 32,32 First Group +for i=1,opt.N do addResidualDrop(model, nil, 16) end +------> 32, 16,16 Second Group +addResidualDrop(model, nil, 16, 32, 2) +for i=1,opt.N-1 do addResidualDrop(model, nil, 32) end +------> 64, 8,8 Third Group +addResidualDrop(model, nil, 32, 64, 2) +for i=1,opt.N-1 do addResidualDrop(model, nil, 64) end +------> 10, 8,8 Pooling, Linear, Softmax +model:add(nn.SpatialAveragePooling(8,8)):add(nn.Reshape(64)) +if opt.dataset == 'cifar10' or opt.dataset == 'svhn' then + model:add(nn.Linear(64, 10)) +elseif opt.dataset == 'cifar100' then + model:add(nn.Linear(64, 100)) +else + print('Invalid argument for dataset!') +end + + +""" + + +# In[60]: + +def residual_block(layer, num_filters, filter_size=3, stride=1, num_layers=2, survival_p=0.5): + #print "input =", layer.output_shape + conv = layer + if (num_filters != layer.output_shape[1]) or (stride != 1): + layer = Pool2DLayer(layer, pool_size=1, stride=stride, mode="average_inc_pad") + diff = num_filters-layer.output_shape[1] + if diff % 2 == 0: + width_tp = ((diff/2, diff/2),) + else: + width_tp = (((diff/2)+1, diff/2),) + layer = pad( + layer, + batch_ndim=1, + width=width_tp + ) + #print "layer =", layer.output_shape + for _ in range(num_layers): + conv = Conv2DLayer(conv, num_filters, filter_size, stride=stride, pad='same') + #print "conv =", conv.output_shape + stride = 1 + nonlinearity = conv.nonlinearity + conv.nonlinearity = lasagne.nonlinearities.identity + conv = BinomialDropLayer(conv, survival_p=survival_p) + return NonlinearityLayer(ElemwiseSumLayer([conv, layer]), nonlinearity) + + +# In[63]: + +# architecture from: +# https://github.com/yueatsprograms/Stochastic_Depth/blob/master/main.lua +survival_p = 0.5 +layer = InputLayer( (None, 3, 32, 32) ) +layer = Conv2DLayer(layer, num_filters=16, filter_size=3, stride=1, pad='same') +#layer = Pool2DLayer(layer, 2) +for _ in range(18): + layer = residual_block(layer, 16, survival_p=survival_p) +layer = residual_block(layer, 32, stride=2, survival_p=survival_p) +for _ in range(18): + layer = residual_block(layer, 32, survival_p=survival_p) +layer = residual_block(layer, 64, stride=2, survival_p=survival_p) +for _ in range(18): + layer = residual_block(layer, 64, survival_p=survival_p) +layer = Pool2DLayer(layer, pool_size=8, stride=1, mode="average_inc_pad") +layer = DenseLayer(layer, num_units=10, nonlinearity=softmax) + + +# In[64]: + +for layer in get_all_layers(layer): + print layer, layer.output_shape + + +# In[14]: + +cifar10_loader = imp.load_source("cifar10_loader", 
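> One more divergence from the Lua reference quoted above: the reference wraps every convolution in batch normalization (`cudnn.SpatialBatchNormalization`), which `residual_block` omits. If parity is wanted, Lasagne's `batch_norm` helper is the usual route; a hedged sketch of just the changed line (the block's later nonlinearity bookkeeping would need a matching adjustment):

```python
from lasagne.layers import batch_norm

# inside residual_block's loop, replacing the plain Conv2DLayer call:
conv = batch_norm(Conv2DLayer(conv, num_filters, filter_size,
                              stride=stride, pad='same'))
```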
"../papers/deep_residual_learning/Deep_Residual_Learning_CIFAR-10.py") + + +# In[24]: + +#curr_dir = os.getcwd() +#os.chdir("../papers/deep_residual_learning/") +data = cifar10_loader.load_data() +X_train_and_valid, y_train_and_valid, X_test, y_test = data["X_train"][0:50000], data["Y_train"][0:50000], data["X_test"], data["Y_test"] +#os.chdir(curr_dir) + + +# In[26]: + +X_train = X_train_and_valid[ 0 : 0.9*X_train_and_valid.shape[0] ] +y_train = y_train_and_valid[ 0 : 0.9*y_train_and_valid.shape[0] ] +X_valid = X_train_and_valid[ 0.9*X_train_and_valid.shape[0] :: ] +y_valid = y_train_and_valid[ 0.9*y_train_and_valid.shape[0] :: ] + + +# In[27]: + +X_train.shape + + +# In[51]: + +X = T.tensor4('X') +y = T.ivector('y') + +net_out = get_output(l_out, X) +net_out_det = get_output(l_out, X, deterministic=True) +loss = categorical_crossentropy(net_out, y).mean() +params = get_all_params(l_out, trainable=True) +grads = T.grad(loss, params) +updates = nesterov_momentum(grads, params, learning_rate=0.01, momentum=0.9) +train_fn = theano.function(inputs=[X, y], outputs=loss, updates=updates) +eval_fn = theano.function(inputs=[X, y], outputs=loss) +preds_fn = theano.function(inputs=[X], outputs=net_out_det) + + +# In[57]: + +#X_train = X_train[0:10] +#y_train = y_train[0:10] +#X_valid = X_valid[0:10] +#y_valid = y_valid[0:10] + + +# In[56]: + +batch_size = 128 +n_batches = X_train.shape[0] // batch_size +num_epochs = 10 +print "epoch,avg_train_loss,valid_loss,valid_acc,time" +for epoch in range(0, num_epochs): + t0 = time() + train_losses = [] + for b in range(0, n_batches): + train_losses.append( train_fn(X_train[b*bs:(b+1)*bs], y_train[b*bs:(b+1)*bs]) ) + valid_loss = eval_fn(X_valid, y_valid) + valid_preds = np.argmax(preds_fn(X_valid),axis=1) + valid_acc = np.sum(valid_preds == y_valid)*1.0 / len(y_valid) + print "%i,%f,%f,%f,%f" % (epoch+1, np.mean(train_losses), valid_loss, valid_acc, time()-t0) + + +# In[ ]: + + + From 56fc6b67f9a4e0b634af3e7b800d56f92eb22e0c Mon Sep 17 00:00:00 2001 From: Christopher Beckham Date: Sun, 22 May 2016 17:31:29 -0400 Subject: [PATCH 3/6] clean py script --- examples/stochastic_depth_layers.py | 183 ++++++++++------------------ 1 file changed, 65 insertions(+), 118 deletions(-) diff --git a/examples/stochastic_depth_layers.py b/examples/stochastic_depth_layers.py index 3f49ca1..160347a 100644 --- a/examples/stochastic_depth_layers.py +++ b/examples/stochastic_depth_layers.py @@ -1,10 +1,6 @@ - -# coding: utf-8 - -# In[55]: - import theano from theano import tensor as T +from theano.ifelse import ifelse import lasagne from lasagne.layers import * from lasagne.nonlinearities import * @@ -14,16 +10,27 @@ from lasagne.updates import * from lasagne.init import * from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams -from urllib import urlretrieve -import cPickle as pickle -import gzip import imp import os from time import time +""" + Binomial dropout layer + + Samples binomial(p,n=1) R.V. and multiplies the input tensor by + this value. On its own, this layer is useless as it + essentially either multiplies everything by one (i.e. do nothing), + or it makes every value in the tensor zero (lose all information). + This layer is intended to be used. 
From 56fc6b67f9a4e0b634af3e7b800d56f92eb22e0c Mon Sep 17 00:00:00 2001
From: Christopher Beckham
Date: Sun, 22 May 2016 17:31:29 -0400
Subject: [PATCH 3/6] clean py script

---
 examples/stochastic_depth_layers.py | 183 ++++++++++------------------
 1 file changed, 65 insertions(+), 118 deletions(-)

diff --git a/examples/stochastic_depth_layers.py b/examples/stochastic_depth_layers.py
index 3f49ca1..160347a 100644
--- a/examples/stochastic_depth_layers.py
+++ b/examples/stochastic_depth_layers.py
@@ -1,10 +1,6 @@
-
-# coding: utf-8
-
-# In[55]:
-
 import theano
 from theano import tensor as T
+from theano.ifelse import ifelse
 import lasagne
 from lasagne.layers import *
 from lasagne.nonlinearities import *
 from lasagne.objectives import *
 from lasagne.regularization import *
@@ -14,16 +10,28 @@
 from lasagne.updates import *
 from lasagne.init import *
 from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams
-from urllib import urlretrieve
-import cPickle as pickle
-import gzip
 import imp
 import os
 from time import time
+import numpy as np
+
+"""
+    Binomial dropout layer
+
+    Samples a binomial(n=1, p) random variable and multiplies the input
+    tensor by it. On its own this layer is of little use: it either
+    multiplies everything by one (i.e. does nothing) or zeros every value
+    in the tensor (losing all information). It is intended to be used
+    inside a residual block, where it randomly drops the block's output.
+
+    Parameters
+    ----------
+
+    incoming : a :class:`Layer` instance
+    p : float
+        The survival probability for the input tensor
+
+"""
 
-# In[58]:
-
 class BinomialDropLayer(Layer):
     def __init__(self, incoming, nonlinearity=rectify, survival_p=0.5,
                  **kwargs):
@@ -39,13 +46,11 @@ def get_output_for(self, input, deterministic=False, **kwargs):
         else:
             #mask = self._srng.binomial(n=1, p=(self.p), size=(input.shape[0],),
             #                           dtype=input.dtype)
+            # apply the same thing to all examples in the minibatch
             mask = T.zeros((input.shape[0],)) + self._srng.uniform( (1,), 0, 1)[0]
             mask = mask.dimshuffle(0,'x','x','x')
             return mask*input
-
-# In[59]:
-
 class IfElseDropLayer(Layer):
     def __init__(self, incoming, nonlinearity=rectify, survival_p=0.5,
                  **kwargs):
@@ -66,46 +71,20 @@ def get_output_for(self, input, deterministic=False, **kwargs):
             )
 
 
-# There is a difference between this residual block method and the one defined in https://github.com/Lasagne/Lasagne/issues/531. When the number of filters is different from the layer's output shape (or the stride is different), instead of using a convolution to make things compatible, we use an average pooling with a pool size of 1 and the defined stride, followed by (if necessary) adding extra zero-padded feature maps. This is how the authors of https://github.com/yueatsprograms/Stochastic_Depth/blob/master/ResidualDrop.lua have defined it.
-
-# In[38]:
-
 """
+There is a difference between this residual block method and the one that is defined in:
 
-print('Building model...')
-model = nn.Sequential()
-------> 3, 32,32
-model:add(cudnn.SpatialConvolution(3, 16, 3,3, 1,1, 1,1)
-            :init('weight', nninit.kaiming, {gain = 'relu'})
-            :init('bias', nninit.constant, 0))
-model:add(cudnn.SpatialBatchNormalization(16))
-model:add(cudnn.ReLU(true))
-------> 16, 32,32 First Group
-for i=1,opt.N do addResidualDrop(model, nil, 16) end
-------> 32, 16,16 Second Group
-addResidualDrop(model, nil, 16, 32, 2)
-for i=1,opt.N-1 do addResidualDrop(model, nil, 32) end
-------> 64, 8,8 Third Group
-addResidualDrop(model, nil, 32, 64, 2)
-for i=1,opt.N-1 do addResidualDrop(model, nil, 64) end
-------> 10, 8,8 Pooling, Linear, Softmax
-model:add(nn.SpatialAveragePooling(8,8)):add(nn.Reshape(64))
-if opt.dataset == 'cifar10' or opt.dataset == 'svhn' then
-    model:add(nn.Linear(64, 10))
-elseif opt.dataset == 'cifar100' then
-    model:add(nn.Linear(64, 100))
-else
-    print('Invalid argument for dataset!')
-end
+https://github.com/Lasagne/Lasagne/issues/531
 
+When the number of filters is different from the layer's output shape (or the stride is
+different), instead of using a convolution to make things compatible, we use an average
+pooling with a pool size of 1 and the defined stride, followed by (if necessary) adding
+extra zero-padded feature maps. This is how the authors have defined it in the reference
+implementation:
+https://github.com/yueatsprograms/Stochastic_Depth/blob/master/ResidualDrop.lua
 """
-
-# In[60]:
-
 def residual_block(layer, num_filters, filter_size=3, stride=1, num_layers=2, survival_p=0.5):
-    #print "input =", layer.output_shape
     conv = layer
     if (num_filters != layer.output_shape[1]) or (stride != 1):
         layer = Pool2DLayer(layer, pool_size=1, stride=stride, mode="average_inc_pad")
@@ -119,112 +98,80 @@ def residual_block(layer, num_filters, filter_size=3, stride=1, num_layers=2, su
             batch_ndim=1,
             width=width_tp
         )
-    #print "layer =", layer.output_shape
     for _ in range(num_layers):
         conv = Conv2DLayer(conv, num_filters, filter_size, stride=stride, pad='same')
-        #print "conv =", conv.output_shape
         stride = 1
     nonlinearity = conv.nonlinearity
     conv.nonlinearity = lasagne.nonlinearities.identity
     conv = BinomialDropLayer(conv, survival_p=survival_p)
     return NonlinearityLayer(ElemwiseSumLayer([conv, layer]), nonlinearity)
 
-
-# In[63]:
-
-# architecture from:
-# https://github.com/yueatsprograms/Stochastic_Depth/blob/master/main.lua
-survival_p = 0.5
-layer = InputLayer( (None, 3, 32, 32) )
-layer = Conv2DLayer(layer, num_filters=16, filter_size=3, stride=1, pad='same')
-#layer = Pool2DLayer(layer, 2)
-for _ in range(18):
-    layer = residual_block(layer, 16, survival_p=survival_p)
-layer = residual_block(layer, 32, stride=2, survival_p=survival_p)
-for _ in range(18):
-    layer = residual_block(layer, 32, survival_p=survival_p)
-layer = residual_block(layer, 64, stride=2, survival_p=survival_p)
-for _ in range(18):
-    layer = residual_block(layer, 64, survival_p=survival_p)
-layer = Pool2DLayer(layer, pool_size=8, stride=1, mode="average_inc_pad")
-layer = DenseLayer(layer, num_units=10, nonlinearity=softmax)
-
-
-# In[64]:
-
-for layer in get_all_layers(layer):
-    print layer, layer.output_shape
-
-
-# In[14]:
-
-cifar10_loader = imp.load_source("cifar10_loader", "../papers/deep_residual_learning/Deep_Residual_Learning_CIFAR-10.py")
-
-
-# In[24]:
-
-#curr_dir = os.getcwd()
-#os.chdir("../papers/deep_residual_learning/")
+def get_net():
+    # Architecture from:
+    # https://github.com/yueatsprograms/Stochastic_Depth/blob/master/main.lua
+    N = 18
+    survival_p = 0.5
+    layer = InputLayer( (None, 3, 32, 32) )
+    layer = Conv2DLayer(layer, num_filters=16, filter_size=3, stride=1, pad='same')
+    #layer = Pool2DLayer(layer, 2)
+    for _ in range(N):
+        layer = residual_block(layer, 16, survival_p=survival_p)
+    layer = residual_block(layer, 32, stride=2, survival_p=survival_p)
+    for _ in range(N):
+        layer = residual_block(layer, 32, survival_p=survival_p)
+    layer = residual_block(layer, 64, stride=2, survival_p=survival_p)
+    for _ in range(N):
+        layer = residual_block(layer, 64, survival_p=survival_p)
+    layer = Pool2DLayer(layer, pool_size=8, stride=1, mode="average_inc_pad")
+    layer = DenseLayer(layer, num_units=10, nonlinearity=softmax)
+    for layer in get_all_layers(layer):
+        print layer, layer.output_shape
+    print "number of params:", count_params(layer)
+    return layer
+
+cifar10_loader = imp.load_source(
+    "cifar10_loader", "../papers/deep_residual_learning/Deep_Residual_Learning_CIFAR-10.py")
+curr_dir = os.getcwd()
+os.chdir("../papers/deep_residual_learning/")
 data = cifar10_loader.load_data()
-X_train_and_valid, y_train_and_valid, X_test, y_test = data["X_train"][0:50000], data["Y_train"][0:50000], data["X_test"], data["Y_test"]
-#os.chdir(curr_dir)
-
-
-# In[26]:
+X_train_and_valid, y_train_and_valid, X_test, y_test = \
data["X_train"][0:50000], data["Y_train"][0:50000], data["X_test"], data["Y_test"] +os.chdir(curr_dir) X_train = X_train_and_valid[ 0 : 0.9*X_train_and_valid.shape[0] ] y_train = y_train_and_valid[ 0 : 0.9*y_train_and_valid.shape[0] ] X_valid = X_train_and_valid[ 0.9*X_train_and_valid.shape[0] :: ] y_valid = y_train_and_valid[ 0.9*y_train_and_valid.shape[0] :: ] - -# In[27]: - -X_train.shape - - -# In[51]: - X = T.tensor4('X') y = T.ivector('y') -net_out = get_output(l_out, X) -net_out_det = get_output(l_out, X, deterministic=True) +layer = get_net() +net_out = get_output(layer, X) +net_out_det = get_output(layer, X, deterministic=True) loss = categorical_crossentropy(net_out, y).mean() -params = get_all_params(l_out, trainable=True) +params = get_all_params(layer, trainable=True) grads = T.grad(loss, params) updates = nesterov_momentum(grads, params, learning_rate=0.01, momentum=0.9) train_fn = theano.function(inputs=[X, y], outputs=loss, updates=updates) eval_fn = theano.function(inputs=[X, y], outputs=loss) preds_fn = theano.function(inputs=[X], outputs=net_out_det) - -# In[57]: - -#X_train = X_train[0:10] -#y_train = y_train[0:10] -#X_valid = X_valid[0:10] -#y_valid = y_valid[0:10] - - -# In[56]: - batch_size = 128 n_batches = X_train.shape[0] // batch_size num_epochs = 10 print "epoch,avg_train_loss,valid_loss,valid_acc,time" for epoch in range(0, num_epochs): - t0 = time() + # shuffle examples + idxs = [x for x in range(0, X_train.shape[0])] + np.random.shuffle(idxs) + X_train = X_train[idxs] + y_train = y_train[idxs] train_losses = [] + t0 = time() for b in range(0, n_batches): train_losses.append( train_fn(X_train[b*bs:(b+1)*bs], y_train[b*bs:(b+1)*bs]) ) valid_loss = eval_fn(X_valid, y_valid) valid_preds = np.argmax(preds_fn(X_valid),axis=1) valid_acc = np.sum(valid_preds == y_valid)*1.0 / len(y_valid) print "%i,%f,%f,%f,%f" % (epoch+1, np.mean(train_losses), valid_loss, valid_acc, time()-t0) - - -# In[ ]: - - - From bb4879a46e97f4493019d20ad86216793f372dd3 Mon Sep 17 00:00:00 2001 From: Christopher Beckham Date: Sun, 22 May 2016 18:17:44 -0400 Subject: [PATCH 4/6] fix bugs --- examples/stochastic_depth_layers.py | 18 ++++++------------ 1 file changed, 6 insertions(+), 12 deletions(-) diff --git a/examples/stochastic_depth_layers.py b/examples/stochastic_depth_layers.py index 160347a..87f258e 100644 --- a/examples/stochastic_depth_layers.py +++ b/examples/stochastic_depth_layers.py @@ -32,13 +32,10 @@ """ class BinomialDropLayer(Layer): - def __init__(self, incoming, nonlinearity=rectify, survival_p=0.5, - **kwargs): + def __init__(self, incoming, p=0.5, **kwargs): super(BinomialDropLayer, self).__init__(incoming, **kwargs) - self.nonlinearity = (identity if nonlinearity is None - else nonlinearity) self._srng = RandomStreams(get_rng().randint(1, 2147462579)) - self.p = 1-survival_p + self.p = p def get_output_for(self, input, deterministic=False, **kwargs): if deterministic: @@ -47,25 +44,22 @@ def get_output_for(self, input, deterministic=False, **kwargs): #mask = self._srng.binomial(n=1, p=(self.p), size=(input.shape[0],), # dtype=input.dtype) # apply the same thing to all examples in the minibatch - mask = T.zeros((input.shape[0],)) + self._srng.uniform( (1,), 0, 1)[0] + mask = T.zeros((input.shape[0],)) + _srng.binomial((1,), p=self.p, dtype=input.dtype)[0] mask = mask.dimshuffle(0,'x','x','x') return mask*input class IfElseDropLayer(Layer): - def __init__(self, incoming, nonlinearity=rectify, survival_p=0.5, - **kwargs): + def __init__(self, incoming, p=0.5, **kwargs): 
From bb4879a46e97f4493019d20ad86216793f372dd3 Mon Sep 17 00:00:00 2001
From: Christopher Beckham
Date: Sun, 22 May 2016 18:17:44 -0400
Subject: [PATCH 4/6] fix bugs

---
 examples/stochastic_depth_layers.py | 18 ++++++------------
 1 file changed, 6 insertions(+), 12 deletions(-)

diff --git a/examples/stochastic_depth_layers.py b/examples/stochastic_depth_layers.py
index 160347a..87f258e 100644
--- a/examples/stochastic_depth_layers.py
+++ b/examples/stochastic_depth_layers.py
@@ -32,13 +32,10 @@
 """
 class BinomialDropLayer(Layer):
-    def __init__(self, incoming, nonlinearity=rectify, survival_p=0.5,
-                 **kwargs):
+    def __init__(self, incoming, p=0.5, **kwargs):
         super(BinomialDropLayer, self).__init__(incoming, **kwargs)
-        self.nonlinearity = (identity if nonlinearity is None
-                             else nonlinearity)
         self._srng = RandomStreams(get_rng().randint(1, 2147462579))
-        self.p = 1-survival_p
+        self.p = p
 
     def get_output_for(self, input, deterministic=False, **kwargs):
         if deterministic:
@@ -47,25 +44,22 @@ def get_output_for(self, input, deterministic=False, **kwargs):
             #mask = self._srng.binomial(n=1, p=(self.p), size=(input.shape[0],),
             #                           dtype=input.dtype)
             # apply the same thing to all examples in the minibatch
-            mask = T.zeros((input.shape[0],)) + self._srng.uniform( (1,), 0, 1)[0]
+            mask = T.zeros((input.shape[0],)) + _srng.binomial((1,), p=self.p, dtype=input.dtype)[0]
             mask = mask.dimshuffle(0,'x','x','x')
             return mask*input
 
 class IfElseDropLayer(Layer):
-    def __init__(self, incoming, nonlinearity=rectify, survival_p=0.5,
-                 **kwargs):
+    def __init__(self, incoming, p=0.5, **kwargs):
         super(IfElseDropLayer, self).__init__(incoming, **kwargs)
-        self.nonlinearity = (identity if nonlinearity is None
-                             else nonlinearity)
         self._srng = RandomStreams(get_rng().randint(1, 2147462579))
-        self.p = 1-survival_p
+        self.p = p
 
     def get_output_for(self, input, deterministic=False, **kwargs):
         if deterministic:
             return self.p*input
         else:
             return ifelse(
-                T.lt(self._srng.uniform( (1,), 0, 1)[0], self.p),
+                T.lt(_srng.binomial((1,), p=self.p, dtype=input.dtype)[0], self.p),
                 input,
                 T.zeros(input.shape)
             )

From f96d3078a68cbcd7d87f0bd4d01efd4adb02c97e Mon Sep 17 00:00:00 2001
From: Christopher Beckham
Date: Mon, 23 May 2016 09:57:58 -0400
Subject: [PATCH 5/6] bug fixes

---
 examples/stochastic_depth_layers.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/examples/stochastic_depth_layers.py b/examples/stochastic_depth_layers.py
index 87f258e..ab31da2 100644
--- a/examples/stochastic_depth_layers.py
+++ b/examples/stochastic_depth_layers.py
@@ -44,7 +44,7 @@ def get_output_for(self, input, deterministic=False, **kwargs):
             #mask = self._srng.binomial(n=1, p=(self.p), size=(input.shape[0],),
             #                           dtype=input.dtype)
             # apply the same thing to all examples in the minibatch
-            mask = T.zeros((input.shape[0],)) + _srng.binomial((1,), p=self.p, dtype=input.dtype)[0]
+            mask = T.zeros((input.shape[0],)) + self._srng.binomial((1,), p=self.p, dtype=input.dtype)[0]
             mask = mask.dimshuffle(0,'x','x','x')
             return mask*input
 
@@ -59,7 +59,7 @@ def get_output_for(self, input, deterministic=False, **kwargs):
             return self.p*input
         else:
             return ifelse(
-                T.lt(_srng.binomial((1,), p=self.p, dtype=input.dtype)[0], self.p),
+                T.lt(self._srng.uniform( (1,), 0, 1)[0], self.p),
                 input,
                 T.zeros(input.shape)
             )
@@ -97,7 +97,7 @@ def residual_block(layer, num_filters, filter_size=3, stride=1, num_layers=2, su
         stride = 1
     nonlinearity = conv.nonlinearity
     conv.nonlinearity = lasagne.nonlinearities.identity
-    conv = BinomialDropLayer(conv, survival_p=survival_p)
+    conv = IfElseDropLayer(conv, p=survival_p)
     return NonlinearityLayer(ElemwiseSumLayer([conv, layer]), nonlinearity)
 
@@ -151,8 +151,8 @@ def get_net():
     eval_fn = theano.function(inputs=[X, y], outputs=loss)
     preds_fn = theano.function(inputs=[X], outputs=net_out_det)
 
-batch_size = 128
-n_batches = X_train.shape[0] // batch_size
+bs = 128
+n_batches = X_train.shape[0] // bs
 num_epochs = 10
 print "epoch,avg_train_loss,valid_loss,valid_acc,time"
 for epoch in range(0, num_epochs):
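Note: patches 4 and 5 switch between the mask-multiply formulation (BinomialDropLayer) and the ifelse one (IfElseDropLayer). The practical difference is that theano.ifelse.ifelse is a lazy op: only the branch selected at run time is evaluated, so a dropped block can skip its forward computation entirely, whereas multiplying by a zero mask still computes the block's output and then discards it. A minimal self-contained sketch of the lazy branch (assuming the Theano ~0.8 API used throughout this example):

    import theano
    import theano.tensor as T
    from theano.ifelse import ifelse

    x = T.vector('x')
    u = T.scalar('u')   # stand-in for the layer's uniform sample
    p = 0.5             # survival probability

    # keep the branch when u < p, otherwise replace it with zeros;
    # ifelse only evaluates the branch the condition selects
    out = ifelse(T.lt(u, p), x, T.zeros_like(x))
    f = theano.function([x, u], out)

    print(f([1., 2., 3.], 0.25))   # branch survives -> [ 1.  2.  3.]
    print(f([1., 2., 3.], 0.75))   # branch dropped  -> [ 0.  0.  0.]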
From 49747c6e05330c0aebf7a6f42efbd98778d2717e Mon Sep 17 00:00:00 2001
From: Christopher Beckham
Date: Tue, 24 May 2016 12:05:24 -0400
Subject: [PATCH 6/6] make resblock method more like in stochastic depth lua
 code

---
 examples/stochastic_depth_layers.py | 29 ++++++++++++++---------------
 1 file changed, 14 insertions(+), 15 deletions(-)

diff --git a/examples/stochastic_depth_layers.py b/examples/stochastic_depth_layers.py
index ab31da2..d75e9d0 100644
--- a/examples/stochastic_depth_layers.py
+++ b/examples/stochastic_depth_layers.py
@@ -78,27 +78,26 @@ def get_output_for(self, input, deterministic=False, **kwargs):
 https://github.com/yueatsprograms/Stochastic_Depth/blob/master/ResidualDrop.lua
 """
-def residual_block(layer, num_filters, filter_size=3, stride=1, num_layers=2, survival_p=0.5):
+def residual_block(layer, n_out_channels, stride=1, survival_p=0.5):
     conv = layer
-    if (num_filters != layer.output_shape[1]) or (stride != 1):
+    if stride > 1:
         layer = Pool2DLayer(layer, pool_size=1, stride=stride, mode="average_inc_pad")
-        diff = num_filters-layer.output_shape[1]
+    if (n_out_channels != layer.output_shape[1]):
+        diff = n_out_channels-layer.output_shape[1]
         if diff % 2 == 0:
             width_tp = ((diff/2, diff/2),)
         else:
             width_tp = (((diff/2)+1, diff/2),)
-        layer = pad(
-            layer,
-            batch_ndim=1,
-            width=width_tp
-        )
-    for _ in range(num_layers):
-        conv = Conv2DLayer(conv, num_filters, filter_size, stride=stride, pad='same')
-        stride = 1
-    nonlinearity = conv.nonlinearity
-    conv.nonlinearity = lasagne.nonlinearities.identity
-    conv = IfElseDropLayer(conv, p=survival_p)
-    return NonlinearityLayer(ElemwiseSumLayer([conv, layer]), nonlinearity)
+        layer = pad(layer, batch_ndim=1, width=width_tp)
+    conv = Conv2DLayer(conv, num_filters=n_out_channels,
+        filter_size=(3,3), stride=(stride,stride), pad=(1,1), nonlinearity=linear)
+    conv = BatchNormLayer(conv)
+    conv = NonlinearityLayer(conv, nonlinearity=rectify)
+    conv = Conv2DLayer(conv, num_filters=n_out_channels,
+        filter_size=(3,3), stride=(1,1), pad=(1,1), nonlinearity=linear)
+    conv = BatchNormLayer(conv)
+    conv = BinomialDropLayer(conv, p=survival_p)
+    return NonlinearityLayer(ElemwiseSumLayer([conv, layer]), nonlinearity=rectify)
 
 def get_net():
     # Architecture from:
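Note: throughout these patches every block uses a constant survival_p = 0.5. The stochastic depth paper (http://arxiv.org/abs/1603.09382) also proposes a linear-decay schedule, p_l = 1 - (l/L)*(1 - p_L), where early blocks survive almost always and the deepest block survives with probability p_L. That schedule is not implemented here, but computing it is plain arithmetic; the sketch below assumes L = 56, matching the 56 residual blocks built by get_net() (three groups of 18 plus the two transition blocks):

    L = 56      # number of residual blocks
    p_L = 0.5   # survival probability of the deepest block
    survival_ps = [1.0 - (float(l) / L) * (1.0 - p_L) for l in range(1, L + 1)]
    print(survival_ps[0])    # ~0.9911 for the first block
    print(survival_ps[-1])   # 0.5 for the last block
    print(sum(survival_ps))  # 41.75 blocks active in expectation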